mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 02:05:58 +08:00
fix: side load context bridging
This commit is contained in:
parent
e92ff33ad0
commit
8597daa96b
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-c44153f",
|
||||
"civkit": "^0.8.4-6ed9027",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -4095,9 +4095,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.8.4-c44153f",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-c44153f.tgz",
|
||||
"integrity": "sha512-VBElW71aAqqP0G+8F460hZfnDrn4kMCxTCn+FaFqGG2B0TmNkfwjVZL9VuDRNtSzNBbEO9rRKLJG1iw4y8sZxQ==",
|
||||
"version": "0.8.4-6ed9027",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz",
|
||||
"integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
|
@ -25,7 +25,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-c44153f",
|
||||
"civkit": "^0.8.4-6ed9027",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
|
@ -17,6 +17,7 @@ import { isIP } from 'net';
|
||||
import { CurlControl } from './curl';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { BlackHoleDetector } from './blackhole-detector';
|
||||
import { AsyncLocalContext } from './async-context';
|
||||
const tldExtract = require('tld-extract');
|
||||
|
||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||
@ -468,8 +469,11 @@ export class PuppeteerControl extends AsyncService {
|
||||
|
||||
circuitBreakerHosts: Set<string> = new Set();
|
||||
|
||||
lifeCycleTrack = new WeakMap();
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected asyncLocalContext: AsyncLocalContext,
|
||||
protected curlControl: CurlControl,
|
||||
protected blackHoleDetector: BlackHoleDetector,
|
||||
) {
|
||||
@ -774,6 +778,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
const pdfUrls: string[] = [];
|
||||
let navigationResponse: HTTPResponse | undefined;
|
||||
const page = await this.getNextPage();
|
||||
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
|
||||
this.pagePhase.set(page, 'active');
|
||||
page.on('response', (resp) => {
|
||||
this.blackHoleDetector.itWorked();
|
||||
@ -805,6 +810,19 @@ export class PuppeteerControl extends AsyncService {
|
||||
if (!options.proxyResources) {
|
||||
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
||||
if (!isDocRequest) {
|
||||
if (options.extraHeaders) {
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
}
|
||||
const overrides = req.continueRequestOverrides();
|
||||
|
||||
return req.continue(overrides, 0);
|
||||
@ -830,54 +848,69 @@ export class PuppeteerControl extends AsyncService {
|
||||
}
|
||||
|
||||
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
||||
const ctx = this.lifeCycleTrack.get(page);
|
||||
if (proxy && ctx) {
|
||||
return this.asyncLocalContext.bridge(ctx, async () => {
|
||||
try {
|
||||
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
||||
...options,
|
||||
method: req.method(),
|
||||
body: req.postData(),
|
||||
extraHeaders: {
|
||||
...req.headers(),
|
||||
...options.extraHeaders,
|
||||
},
|
||||
proxyUrl: proxy
|
||||
});
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
if (proxy) {
|
||||
try {
|
||||
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
||||
...options,
|
||||
method: req.method(),
|
||||
body: req.postData(),
|
||||
extraHeaders: {
|
||||
...req.headers(),
|
||||
...options.extraHeaders,
|
||||
},
|
||||
proxyUrl: proxy
|
||||
});
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
if (curled.chain.length === 1) {
|
||||
if (!curled.file) {
|
||||
if (curled.chain.length === 1) {
|
||||
if (!curled.file) {
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
}, 999);
|
||||
}
|
||||
const body = await readFile(await curled.file.filePath);
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
body: Uint8Array.from(body),
|
||||
}, 999);
|
||||
}
|
||||
const body = await readFile(await curled.file.filePath);
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
options.sideLoad ??= curled.sideLoadOpts;
|
||||
_.merge(options.sideLoad, curled.sideLoadOpts);
|
||||
const firstReq = curled.chain[0];
|
||||
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
body: Uint8Array.from(body),
|
||||
status: firstReq.result!.code,
|
||||
headers: _.omit(firstReq, 'result'),
|
||||
}, 999);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
|
||||
}
|
||||
options.sideLoad ??= curled.sideLoadOpts;
|
||||
_.merge(options.sideLoad, curled.sideLoadOpts);
|
||||
const firstReq = curled.chain[0];
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.respond({
|
||||
status: firstReq.result!.code,
|
||||
headers: _.omit(firstReq, 'result'),
|
||||
}, 999);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
|
||||
|
||||
}
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
}
|
||||
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
@ -895,25 +928,6 @@ export class PuppeteerControl extends AsyncService {
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
if (options.extraHeaders) {
|
||||
page.on('request', async (req) => {
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
}
|
||||
let pageScriptEvaluations: Promise<unknown>[] = [];
|
||||
let frameScriptEvaluations: Promise<unknown>[] = [];
|
||||
if (options.injectPageScripts?.length) {
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 16521fd4a55f983c050d4cdd0c24a8ac400901d1
|
||||
Subproject commit 20417f5bb7f8c773a835304f0624a180b558ff65
|
Loading…
x
Reference in New Issue
Block a user