mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 04:55:54 +08:00
fix: curl with errors
This commit is contained in:
parent
6be6051aa7
commit
80b9a6a5a0
@ -857,12 +857,12 @@ export class CrawlerHost extends RPCHost {
|
|||||||
nominalUrl?: URL,
|
nominalUrl?: URL,
|
||||||
urlValidMs?: number
|
urlValidMs?: number
|
||||||
) {
|
) {
|
||||||
if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
|
const engine = crawlerOptions.engine?.toLowerCase() || '';
|
||||||
|
if (engine.includes('lm')) {
|
||||||
const output: FormattedPage = {
|
const output: FormattedPage = {
|
||||||
title: snapshot.title,
|
title: snapshot.title,
|
||||||
content: snapshot.parsed?.textContent,
|
content: snapshot.parsed?.textContent,
|
||||||
url: snapshot.href,
|
url: snapshot.href,
|
||||||
pageshotUrl: snapshot.pageshotUrl,
|
|
||||||
[Symbol.dispose]: () => undefined,
|
[Symbol.dispose]: () => undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2,10 +2,11 @@ import { marshalErrorLike } from 'civkit/lang';
|
|||||||
import { AsyncService } from 'civkit/async-service';
|
import { AsyncService } from 'civkit/async-service';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
|
|
||||||
import { Curl } from 'node-libcurl';
|
import { Curl, HeaderInfo } from 'node-libcurl';
|
||||||
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
import { JSDomControl } from './jsdom';
|
import { JSDomControl } from './jsdom';
|
||||||
|
import { AssertionFailureError } from 'civkit';
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class CurlControl extends AsyncService {
|
export class CurlControl extends AsyncService {
|
||||||
@ -26,7 +27,11 @@ export class CurlControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
|
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
|
||||||
const html = await new Promise<string>((resolve, reject) => {
|
const result = await new Promise<{
|
||||||
|
statusCode: number,
|
||||||
|
data: string,
|
||||||
|
headers: Buffer | HeaderInfo[],
|
||||||
|
}>((resolve, reject) => {
|
||||||
const curl = new Curl();
|
const curl = new Curl();
|
||||||
curl.setOpt('URL', urlToCrawl.toString());
|
curl.setOpt('URL', urlToCrawl.toString());
|
||||||
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
||||||
@ -52,23 +57,31 @@ export class CurlControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
curl.on('end', (statusCode, data, headers) => {
|
curl.on('end', (statusCode, data, headers) => {
|
||||||
this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers });
|
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
|
||||||
resolve(data.toString());
|
resolve({
|
||||||
|
statusCode,
|
||||||
|
data: data.toString(),
|
||||||
|
headers,
|
||||||
|
});
|
||||||
curl.close();
|
curl.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
curl.on('error', (err) => {
|
curl.on('error', (err) => {
|
||||||
this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
||||||
curl.close();
|
curl.close();
|
||||||
reject(err);
|
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
|
||||||
});
|
});
|
||||||
|
|
||||||
curl.perform();
|
curl.perform();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
|
||||||
|
throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
||||||
|
}
|
||||||
|
|
||||||
const snapshot = {
|
const snapshot = {
|
||||||
href: urlToCrawl.toString(),
|
href: urlToCrawl.toString(),
|
||||||
html: html,
|
html: result.data,
|
||||||
title: '',
|
title: '',
|
||||||
text: '',
|
text: '',
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user