fix: report curl failures and non-2xx HTTP responses as AssertionFailureError

This commit is contained in:
yanlong.wang 2025-01-15 19:29:59 +08:00
parent 6be6051aa7
commit 80b9a6a5a0
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 21 additions and 8 deletions

View File

@ -857,12 +857,12 @@ export class CrawlerHost extends RPCHost {
nominalUrl?: URL, nominalUrl?: URL,
urlValidMs?: number urlValidMs?: number
) { ) {
if (crawlerOptions.engine?.toLowerCase().includes('lm')) { const engine = crawlerOptions.engine?.toLowerCase() || '';
if (engine.includes('lm')) {
const output: FormattedPage = { const output: FormattedPage = {
title: snapshot.title, title: snapshot.title,
content: snapshot.parsed?.textContent, content: snapshot.parsed?.textContent,
url: snapshot.href, url: snapshot.href,
pageshotUrl: snapshot.pageshotUrl,
[Symbol.dispose]: () => undefined, [Symbol.dispose]: () => undefined,
}; };

View File

@ -2,10 +2,11 @@ import { marshalErrorLike } from 'civkit/lang';
import { AsyncService } from 'civkit/async-service'; import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import { Curl } from 'node-libcurl'; import { Curl, HeaderInfo } from 'node-libcurl';
import { PageSnapshot, ScrappingOptions } from './puppeteer'; import { PageSnapshot, ScrappingOptions } from './puppeteer';
import { Logger } from '../shared/services/logger'; import { Logger } from '../shared/services/logger';
import { JSDomControl } from './jsdom'; import { JSDomControl } from './jsdom';
import { AssertionFailureError } from 'civkit';
@singleton() @singleton()
export class CurlControl extends AsyncService { export class CurlControl extends AsyncService {
@ -26,7 +27,11 @@ export class CurlControl extends AsyncService {
} }
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) { async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
const html = await new Promise<string>((resolve, reject) => { const result = await new Promise<{
statusCode: number,
data: string,
headers: Buffer | HeaderInfo[],
}>((resolve, reject) => {
const curl = new Curl(); const curl = new Curl();
curl.setOpt('URL', urlToCrawl.toString()); curl.setOpt('URL', urlToCrawl.toString());
curl.setOpt(Curl.option.FOLLOWLOCATION, true); curl.setOpt(Curl.option.FOLLOWLOCATION, true);
@ -52,23 +57,31 @@ export class CurlControl extends AsyncService {
} }
curl.on('end', (statusCode, data, headers) => { curl.on('end', (statusCode, data, headers) => {
this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers }); this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
resolve(data.toString()); resolve({
statusCode,
data: data.toString(),
headers,
});
curl.close(); curl.close();
}); });
curl.on('error', (err) => { curl.on('error', (err) => {
this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) }); this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
curl.close(); curl.close();
reject(err); reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
}); });
curl.perform(); curl.perform();
}); });
if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
}
const snapshot = { const snapshot = {
href: urlToCrawl.toString(), href: urlToCrawl.toString(),
html: html, html: result.data,
title: '', title: '',
text: '', text: '',
} as PageSnapshot; } as PageSnapshot;