mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-06 08:36:02 +08:00
feat: fetch page content by curl (#1119)
* feat: fetch url without script data

* refactor: rename X-Agent to X-Engine

  Co-Authored-By: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* refactor: rename X-Agent to X-Engine

  Co-Authored-By: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* refactor: rename X-Agent to X-Engine header and property (#1122)

  Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
  Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* refactor: rename X-Agent to X-Engine while preserving user-agent functionality (#1123)

  - Remove duplicate X-Engine header definition
  - Restore userAgent threadLocal.set
  - Restore overrideUserAgent in crawler options
  - Maintain engine-related changes

  Link to Devin run: https://app.devin.ai/sessions/cd65e5d9466049a28a92002267c48e8b

  Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
  Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

* fix: remove duplicate engine declarations in scrapping-options.ts (#1124)

  Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
  Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: yanlong.wang@jina.ai <yanlong.wang@jina.ai>
This commit is contained in:
parent
2606c445d9
commit
6c23342cbf
904
backend/functions/package-lock.json
generated
904
backend/functions/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -49,6 +49,7 @@
|
||||
"linkedom": "^0.18.4",
|
||||
"maxmind": "^4.3.18",
|
||||
"minio": "^7.1.3",
|
||||
"node-libcurl": "^4.1.0",
|
||||
"openai": "^4.20.0",
|
||||
"pdfjs-dist": "^4.2.67",
|
||||
"puppeteer": "^23.3.0",
|
||||
|
@ -9,6 +9,7 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
import { Request, Response } from 'express';
|
||||
import { Curl } from 'node-libcurl';
|
||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { randomUUID } from 'crypto';
|
||||
@ -28,6 +29,7 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
targetSelector?: string | string[];
|
||||
removeSelector?: string | string[];
|
||||
keepImgDataUrl?: boolean;
|
||||
engine?: string;
|
||||
}
|
||||
|
||||
const indexProto = {
|
||||
@ -588,6 +590,58 @@ export class CrawlerHost extends RPCHost {
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawlerOpts?.engine?.toLowerCase() === 'curl') {
|
||||
const html = await new Promise<string>((resolve, reject) => {
|
||||
const curl = new Curl();
|
||||
curl.setOpt('URL', urlToCrawl.toString());
|
||||
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
||||
|
||||
if (crawlOpts?.timeoutMs) {
|
||||
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
|
||||
}
|
||||
if (crawlOpts?.overrideUserAgent) {
|
||||
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
|
||||
}
|
||||
if (crawlOpts?.extraHeaders) {
|
||||
curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
|
||||
}
|
||||
if (crawlOpts?.proxyUrl) {
|
||||
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
|
||||
}
|
||||
if (crawlOpts?.cookies) {
|
||||
curl.setOpt(Curl.option.COOKIE, crawlOpts.cookies.join('; '));
|
||||
}
|
||||
if (crawlOpts?.referer) {
|
||||
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
|
||||
}
|
||||
|
||||
|
||||
curl.on('end', (statusCode, data, headers) => {
|
||||
this.logger.info(`Successfully requested ${urlToCrawl} by curl`, { statusCode, headers });
|
||||
resolve(data.toString());
|
||||
curl.close();
|
||||
});
|
||||
|
||||
curl.on('error', (err) => {
|
||||
this.logger.error(`Failed to request ${urlToCrawl} by curl`, { err: marshalErrorLike(err) });
|
||||
reject(err);
|
||||
curl.close();
|
||||
});
|
||||
|
||||
curl.perform();
|
||||
});
|
||||
|
||||
const fakeSnapshot = {
|
||||
href: urlToCrawl.toString(),
|
||||
html: html,
|
||||
title: '',
|
||||
text: '',
|
||||
} as PageSnapshot;
|
||||
|
||||
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
||||
return;
|
||||
}
|
||||
|
||||
let cache;
|
||||
|
||||
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
||||
@ -706,6 +760,7 @@ export class CrawlerHost extends RPCHost {
|
||||
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||
this.threadLocal.set('userAgent', opts.userAgent);
|
||||
this.threadLocal.set('engine', opts.engine);
|
||||
if (opts.timeout) {
|
||||
this.threadLocal.set('timeout', opts.timeout * 1000);
|
||||
}
|
||||
@ -720,6 +775,7 @@ export class CrawlerHost extends RPCHost {
|
||||
targetSelector: opts.targetSelector,
|
||||
waitForSelector: opts.waitForSelector,
|
||||
overrideUserAgent: opts.userAgent,
|
||||
engine: opts.engine,
|
||||
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
||||
withIframe: opts.withIframe,
|
||||
withShadowDom: opts.withShadowDom,
|
||||
|
@ -180,7 +180,12 @@ class Viewport extends AutoCastable {
|
||||
description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
}
|
||||
},
|
||||
'X-Engine': {
|
||||
description: 'Specify the engine to use for crawling.\n\nDefault: puppeteer, supported: puppeteer, curl',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -272,6 +277,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
userAgent?: string;
|
||||
|
||||
@Prop({ default: 'puppeteer' })
|
||||
engine?: string;
|
||||
|
||||
@Prop({
|
||||
arrayOf: String,
|
||||
})
|
||||
@ -376,6 +384,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||
instance.userAgent ??= overrideUserAgent;
|
||||
|
||||
const engine = ctx?.req.get('x-engine');
|
||||
if (engine) {
|
||||
instance.engine = engine;
|
||||
}
|
||||
|
||||
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
||||
if (keepImgDataUrl !== undefined) {
|
||||
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
||||
|
Loading…
x
Reference in New Issue
Block a user