mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 06:45:53 +08:00
fix: side load context bridging
This commit is contained in:
parent
e92ff33ad0
commit
8597daa96b
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-c44153f",
|
"civkit": "^0.8.4-6ed9027",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -4095,9 +4095,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.4-c44153f",
|
"version": "0.8.4-6ed9027",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-c44153f.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz",
|
||||||
"integrity": "sha512-VBElW71aAqqP0G+8F460hZfnDrn4kMCxTCn+FaFqGG2B0TmNkfwjVZL9VuDRNtSzNBbEO9rRKLJG1iw4y8sZxQ==",
|
"integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==",
|
||||||
"license": "AGPL",
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
|
@ -25,7 +25,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-c44153f",
|
"civkit": "^0.8.4-6ed9027",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
|
@ -17,6 +17,7 @@ import { isIP } from 'net';
|
|||||||
import { CurlControl } from './curl';
|
import { CurlControl } from './curl';
|
||||||
import { readFile } from 'fs/promises';
|
import { readFile } from 'fs/promises';
|
||||||
import { BlackHoleDetector } from './blackhole-detector';
|
import { BlackHoleDetector } from './blackhole-detector';
|
||||||
|
import { AsyncLocalContext } from './async-context';
|
||||||
const tldExtract = require('tld-extract');
|
const tldExtract = require('tld-extract');
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
@ -468,8 +469,11 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
|
|
||||||
circuitBreakerHosts: Set<string> = new Set();
|
circuitBreakerHosts: Set<string> = new Set();
|
||||||
|
|
||||||
|
lifeCycleTrack = new WeakMap();
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
|
protected asyncLocalContext: AsyncLocalContext,
|
||||||
protected curlControl: CurlControl,
|
protected curlControl: CurlControl,
|
||||||
protected blackHoleDetector: BlackHoleDetector,
|
protected blackHoleDetector: BlackHoleDetector,
|
||||||
) {
|
) {
|
||||||
@ -774,6 +778,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
const pdfUrls: string[] = [];
|
const pdfUrls: string[] = [];
|
||||||
let navigationResponse: HTTPResponse | undefined;
|
let navigationResponse: HTTPResponse | undefined;
|
||||||
const page = await this.getNextPage();
|
const page = await this.getNextPage();
|
||||||
|
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
|
||||||
this.pagePhase.set(page, 'active');
|
this.pagePhase.set(page, 'active');
|
||||||
page.on('response', (resp) => {
|
page.on('response', (resp) => {
|
||||||
this.blackHoleDetector.itWorked();
|
this.blackHoleDetector.itWorked();
|
||||||
@ -805,6 +810,19 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
if (!options.proxyResources) {
|
if (!options.proxyResources) {
|
||||||
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
||||||
if (!isDocRequest) {
|
if (!isDocRequest) {
|
||||||
|
if (options.extraHeaders) {
|
||||||
|
const overrides = req.continueRequestOverrides();
|
||||||
|
const continueArgs = [{
|
||||||
|
...overrides,
|
||||||
|
headers: {
|
||||||
|
...req.headers(),
|
||||||
|
...overrides?.headers,
|
||||||
|
...options.extraHeaders,
|
||||||
|
}
|
||||||
|
}, 1] as const;
|
||||||
|
|
||||||
|
return req.continue(continueArgs[0], continueArgs[1]);
|
||||||
|
}
|
||||||
const overrides = req.continueRequestOverrides();
|
const overrides = req.continueRequestOverrides();
|
||||||
|
|
||||||
return req.continue(overrides, 0);
|
return req.continue(overrides, 0);
|
||||||
@ -830,54 +848,69 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
||||||
|
const ctx = this.lifeCycleTrack.get(page);
|
||||||
|
if (proxy && ctx) {
|
||||||
|
return this.asyncLocalContext.bridge(ctx, async () => {
|
||||||
|
try {
|
||||||
|
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
||||||
|
...options,
|
||||||
|
method: req.method(),
|
||||||
|
body: req.postData(),
|
||||||
|
extraHeaders: {
|
||||||
|
...req.headers(),
|
||||||
|
...options.extraHeaders,
|
||||||
|
},
|
||||||
|
proxyUrl: proxy
|
||||||
|
});
|
||||||
|
if (req.isInterceptResolutionHandled()) {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
if (proxy) {
|
if (curled.chain.length === 1) {
|
||||||
try {
|
if (!curled.file) {
|
||||||
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
return req.respond({
|
||||||
...options,
|
status: curled.status,
|
||||||
method: req.method(),
|
headers: _.omit(curled.headers, 'result'),
|
||||||
body: req.postData(),
|
contentType: curled.contentType,
|
||||||
extraHeaders: {
|
}, 999);
|
||||||
...req.headers(),
|
}
|
||||||
...options.extraHeaders,
|
const body = await readFile(await curled.file.filePath);
|
||||||
},
|
if (req.isInterceptResolutionHandled()) {
|
||||||
proxyUrl: proxy
|
return;
|
||||||
});
|
};
|
||||||
if (req.isInterceptResolutionHandled()) {
|
|
||||||
return;
|
|
||||||
};
|
|
||||||
|
|
||||||
if (curled.chain.length === 1) {
|
|
||||||
if (!curled.file) {
|
|
||||||
return req.respond({
|
return req.respond({
|
||||||
status: curled.status,
|
status: curled.status,
|
||||||
headers: _.omit(curled.headers, 'result'),
|
headers: _.omit(curled.headers, 'result'),
|
||||||
contentType: curled.contentType,
|
contentType: curled.contentType,
|
||||||
|
body: Uint8Array.from(body),
|
||||||
}, 999);
|
}, 999);
|
||||||
}
|
}
|
||||||
const body = await readFile(await curled.file.filePath);
|
options.sideLoad ??= curled.sideLoadOpts;
|
||||||
if (req.isInterceptResolutionHandled()) {
|
_.merge(options.sideLoad, curled.sideLoadOpts);
|
||||||
return;
|
const firstReq = curled.chain[0];
|
||||||
};
|
|
||||||
return req.respond({
|
return req.respond({
|
||||||
status: curled.status,
|
status: firstReq.result!.code,
|
||||||
headers: _.omit(curled.headers, 'result'),
|
headers: _.omit(firstReq, 'result'),
|
||||||
contentType: curled.contentType,
|
|
||||||
body: Uint8Array.from(body),
|
|
||||||
}, 999);
|
}, 999);
|
||||||
|
} catch (err: any) {
|
||||||
|
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
|
||||||
}
|
}
|
||||||
options.sideLoad ??= curled.sideLoadOpts;
|
if (req.isInterceptResolutionHandled()) {
|
||||||
_.merge(options.sideLoad, curled.sideLoadOpts);
|
return;
|
||||||
const firstReq = curled.chain[0];
|
};
|
||||||
|
const overrides = req.continueRequestOverrides();
|
||||||
|
const continueArgs = [{
|
||||||
|
...overrides,
|
||||||
|
headers: {
|
||||||
|
...req.headers(),
|
||||||
|
...overrides?.headers,
|
||||||
|
...options.extraHeaders,
|
||||||
|
}
|
||||||
|
}, 1] as const;
|
||||||
|
|
||||||
return req.respond({
|
return req.continue(continueArgs[0], continueArgs[1]);
|
||||||
status: firstReq.result!.code,
|
});
|
||||||
headers: _.omit(firstReq, 'result'),
|
|
||||||
}, 999);
|
|
||||||
} catch (err: any) {
|
|
||||||
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (req.isInterceptResolutionHandled()) {
|
if (req.isInterceptResolutionHandled()) {
|
||||||
@ -895,25 +928,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
|
|
||||||
return req.continue(continueArgs[0], continueArgs[1]);
|
return req.continue(continueArgs[0], continueArgs[1]);
|
||||||
});
|
});
|
||||||
if (options.extraHeaders) {
|
|
||||||
page.on('request', async (req) => {
|
|
||||||
if (req.isInterceptResolutionHandled()) {
|
|
||||||
return;
|
|
||||||
};
|
|
||||||
|
|
||||||
const overrides = req.continueRequestOverrides();
|
|
||||||
const continueArgs = [{
|
|
||||||
...overrides,
|
|
||||||
headers: {
|
|
||||||
...req.headers(),
|
|
||||||
...overrides?.headers,
|
|
||||||
...options.extraHeaders,
|
|
||||||
}
|
|
||||||
}, 1] as const;
|
|
||||||
|
|
||||||
return req.continue(continueArgs[0], continueArgs[1]);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
let pageScriptEvaluations: Promise<unknown>[] = [];
|
let pageScriptEvaluations: Promise<unknown>[] = [];
|
||||||
let frameScriptEvaluations: Promise<unknown>[] = [];
|
let frameScriptEvaluations: Promise<unknown>[] = [];
|
||||||
if (options.injectPageScripts?.length) {
|
if (options.injectPageScripts?.length) {
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 16521fd4a55f983c050d4cdd0c24a8ac400901d1
|
Subproject commit 20417f5bb7f8c773a835304f0624a180b558ff65
|
Loading…
x
Reference in New Issue
Block a user