feat: fetch page content by curl (#1119)

* feat: fetch url without script data

* refactor: rename X-Agent to X-Engine

Co-Authored-By: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* refactor: rename X-Agent to X-Engine

Co-Authored-By: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* refactor: rename X-Agent to X-Engine header and property (#1122)

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* refactor: rename X-Agent to X-Engine while preserving user-agent functionality (#1123)

- Remove duplicate X-Engine header definition
- Restore userAgent threadLocal.set
- Restore overrideUserAgent in crawler options
- Maintain engine-related changes

Link to Devin run: https://app.devin.ai/sessions/cd65e5d9466049a28a92002267c48e8b

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* fix: remove duplicate engine declarations in scrapping-options.ts (#1124)

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>
This commit is contained in:
Sha Zhou 2025-01-08 19:25:14 +08:00 committed by GitHub
parent 2606c445d9
commit 6c23342cbf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 967 additions and 9 deletions

File diff suppressed because it is too large Load Diff

View File

@ -49,6 +49,7 @@
"linkedom": "^0.18.4",
"maxmind": "^4.3.18",
"minio": "^7.1.3",
"node-libcurl": "^4.1.0",
"openai": "^4.20.0",
"pdfjs-dist": "^4.2.67",
"puppeteer": "^23.3.0",

View File

@ -9,6 +9,7 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
import { Curl } from 'node-libcurl';
const pNormalizeUrl = import("@esm2cjs/normalize-url");
import { Crawled } from '../db/crawled';
import { randomUUID } from 'crypto';
@ -28,6 +29,7 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
targetSelector?: string | string[];
removeSelector?: string | string[];
keepImgDataUrl?: boolean;
engine?: string;
}
const indexProto = {
@ -588,6 +590,58 @@ export class CrawlerHost extends RPCHost {
return;
}
// Curl engine branch: when the requested engine (compared case-insensitively) is 'curl',
// fetch the raw response body with node-libcurl, wrap it in a minimal snapshot, yield the
// narrowed snapshot, and return so that none of the code after this branch runs.
if (crawlerOpts?.engine?.toLowerCase() === 'curl') {
// Wrap node-libcurl's event-based API ('end' / 'error') in a Promise of the response body.
const html = await new Promise<string>((resolve, reject) => {
const curl = new Curl();
curl.setOpt('URL', urlToCrawl.toString());
// Follow HTTP redirects (libcurl FOLLOWLOCATION).
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
// The options below are only forwarded to curl when present (truthy) on crawlOpts.
if (crawlOpts?.timeoutMs) {
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
}
if (crawlOpts?.overrideUserAgent) {
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
}
if (crawlOpts?.extraHeaders) {
// HTTPHEADER receives a list of "Name: value" strings built from the header map.
curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
}
if (crawlOpts?.proxyUrl) {
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
}
if (crawlOpts?.cookies) {
// NOTE(review): join('; ') assumes each element is already a "name=value" string.
// If the elements are cookie objects this would produce "[object Object]" — confirm
// against the declared type of ScrappingOptions.cookies.
curl.setOpt(Curl.option.COOKIE, crawlOpts.cookies.join('; '));
}
if (crawlOpts?.referer) {
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
}
// 'end' fires when the transfer completes. The promise resolves with the body for ANY
// status code; statusCode is only logged here, never checked.
// NOTE(review): non-2xx responses are therefore treated as page content — confirm intended.
curl.on('end', (statusCode, data, headers) => {
this.logger.info(`Successfully requested ${urlToCrawl} by curl`, { statusCode, headers });
// NOTE(review): toString() is called without an explicit encoding; presumably the body
// is UTF-8 — confirm for pages served in other charsets.
resolve(data.toString());
// Release the curl handle once the result has been handed over.
curl.close();
});
// 'error' is the failure path: log, reject the promise, and release the handle.
curl.on('error', (err) => {
this.logger.error(`Failed to request ${urlToCrawl} by curl`, { err: marshalErrorLike(err) });
reject(err);
curl.close();
});
// Start the transfer; completion is reported through the handlers registered above.
curl.perform();
});
// Minimal stand-in snapshot: only href and html are populated, title and text are left empty.
// The `as PageSnapshot` cast means any other PageSnapshot fields are simply absent.
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: html,
title: '',
text: '',
} as PageSnapshot;
// Hand the snapshot to jsdomControl.narrowSnapshot together with the scrapping options and
// yield its result as the single output of this branch.
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
}
let cache;
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
@ -706,6 +760,7 @@ export class CrawlerHost extends RPCHost {
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('userAgent', opts.userAgent);
this.threadLocal.set('engine', opts.engine);
if (opts.timeout) {
this.threadLocal.set('timeout', opts.timeout * 1000);
}
@ -720,6 +775,7 @@ export class CrawlerHost extends RPCHost {
targetSelector: opts.targetSelector,
waitForSelector: opts.waitForSelector,
overrideUserAgent: opts.userAgent,
engine: opts.engine,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
withIframe: opts.withIframe,
withShadowDom: opts.withShadowDom,

View File

@ -180,7 +180,12 @@ class Viewport extends AutoCastable {
description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
in: 'header',
schema: { type: 'string' }
}
},
'X-Engine': {
description: 'Specify the engine to use for crawling.\n\nDefault: puppeteer, supported: puppeteer, curl',
in: 'header',
schema: { type: 'string' }
},
}
}
}
@ -272,6 +277,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
userAgent?: string;
@Prop({ default: 'puppeteer' })
engine?: string;
@Prop({
arrayOf: String,
})
@ -376,6 +384,11 @@ export class CrawlerOptions extends AutoCastable {
const overrideUserAgent = ctx?.req.get('x-user-agent');
instance.userAgent ??= overrideUserAgent;
const engine = ctx?.req.get('x-engine');
if (engine) {
instance.engine = engine;
}
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
if (keepImgDataUrl !== undefined) {
instance.keepImgDataUrl = Boolean(keepImgDataUrl);