fix: curl with errors

This commit is contained in:
yanlong.wang 2025-01-15 19:29:59 +08:00
parent 6be6051aa7
commit 80b9a6a5a0
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 21 additions and 8 deletions

View File

@ -857,12 +857,12 @@ export class CrawlerHost extends RPCHost {
nominalUrl?: URL,
urlValidMs?: number
) {
if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
const engine = crawlerOptions.engine?.toLowerCase() || '';
if (engine.includes('lm')) {
const output: FormattedPage = {
title: snapshot.title,
content: snapshot.parsed?.textContent,
url: snapshot.href,
pageshotUrl: snapshot.pageshotUrl,
[Symbol.dispose]: () => undefined,
};

View File

@ -2,10 +2,11 @@ import { marshalErrorLike } from 'civkit/lang';
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { Curl } from 'node-libcurl';
import { Curl, HeaderInfo } from 'node-libcurl';
import { PageSnapshot, ScrappingOptions } from './puppeteer';
import { Logger } from '../shared/services/logger';
import { JSDomControl } from './jsdom';
import { AssertionFailureError } from 'civkit';
@singleton()
export class CurlControl extends AsyncService {
@ -26,7 +27,11 @@ export class CurlControl extends AsyncService {
}
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
const html = await new Promise<string>((resolve, reject) => {
const result = await new Promise<{
statusCode: number,
data: string,
headers: Buffer | HeaderInfo[],
}>((resolve, reject) => {
const curl = new Curl();
curl.setOpt('URL', urlToCrawl.toString());
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
@ -52,23 +57,31 @@ export class CurlControl extends AsyncService {
}
curl.on('end', (statusCode, data, headers) => {
this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers });
resolve(data.toString());
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
resolve({
statusCode,
data: data.toString(),
headers,
});
curl.close();
});
curl.on('error', (err) => {
this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
curl.close();
reject(err);
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
});
curl.perform();
});
if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
}
const snapshot = {
href: urlToCrawl.toString(),
html: html,
html: result.data,
title: '',
text: '',
} as PageSnapshot;