mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 06:15:58 +08:00
fix: work around locale setting bug
This commit is contained in:
parent
9242bb393a
commit
ebc09003d1
44
backend/functions/package-lock.json
generated
44
backend/functions/package-lock.json
generated
@ -15,7 +15,7 @@
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"civkit": "^0.8.1-bb8d850",
|
||||
"civkit": "^0.8.2-c9ca977",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -33,7 +33,7 @@
|
||||
"puppeteer": "^23.3.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
||||
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||
"puppeteer-page-proxy": "^1.3.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
@ -3729,9 +3729,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.8.1-bb8d850",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-bb8d850.tgz",
|
||||
"integrity": "sha512-b80LGS/jnpPgEHlbPk9QyDqHAxmA/VH7pyc428HYXtcXQOiqhkMaFanOU78sCctxjpzYqnzOkNDwcPl1PdzgFw==",
|
||||
"version": "0.8.2-c9ca977",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-c9ca977.tgz",
|
||||
"integrity": "sha512-wuJ6zs88qx5WiIxSBErH3f8wBArSkT2goHHvfm5ZLqL17v6rkS4iQWx2+YhJfhmfBzqU8oAZI1QD9v2LY1awBg==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
@ -3754,6 +3754,7 @@
|
||||
"koa-compose": "^4.1.0",
|
||||
"libmagic-ffi": "^0.1.4",
|
||||
"mime": "^3.0.0",
|
||||
"minimatch": "^10.0.1",
|
||||
"minio": "^7.0.33",
|
||||
"node-object-hash": "^3.0.0",
|
||||
"node-schedule": "^2.1.1",
|
||||
@ -3770,6 +3771,32 @@
|
||||
"tsyringe": "^4"
|
||||
}
|
||||
},
|
||||
"node_modules/civkit/node_modules/brace-expansion": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/civkit/node_modules/minimatch": {
|
||||
"version": "10.0.1",
|
||||
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz",
|
||||
"integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==",
|
||||
"license": "ISC",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"brace-expansion": "^2.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": "20 || >=22"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/isaacs"
|
||||
}
|
||||
},
|
||||
"node_modules/cjs-module-lexer": {
|
||||
"version": "1.2.3",
|
||||
"resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
|
||||
@ -9795,9 +9822,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/puppeteer-extra-plugin-page-proxy": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz",
|
||||
"integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==",
|
||||
"version": "1.3.1",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-1.3.1.tgz",
|
||||
"integrity": "sha512-5+mCJkJIsNHqryP8YhZMO+yYRSIfNMfyOuLFPlj4EtRGTNdly+jsOCwaizaMBuA/ItvXqWJ054KfqqOJKvuRMQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"debug": "^4.1.1",
|
||||
"got": "^11.8.5",
|
||||
|
@ -35,7 +35,7 @@
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"civkit": "^0.8.1-bb8d850",
|
||||
"civkit": "^0.8.2-c9ca977",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -53,7 +53,7 @@
|
||||
"puppeteer": "^23.3.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
||||
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||
"puppeteer-page-proxy": "^1.3.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
|
@ -692,6 +692,11 @@ export class CrawlerHost extends RPCHost {
|
||||
referer: opts.referer,
|
||||
};
|
||||
|
||||
if (opts.locale) {
|
||||
crawlOpts.extraHeaders ??= {};
|
||||
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
||||
}
|
||||
|
||||
return crawlOpts;
|
||||
}
|
||||
|
||||
|
@ -46,7 +46,7 @@ export interface PageSnapshot {
|
||||
href: string;
|
||||
rebase?: string;
|
||||
html: string;
|
||||
shadowExpanded?: string
|
||||
shadowExpanded?: string;
|
||||
text: string;
|
||||
status?: number;
|
||||
statusText?: string;
|
||||
@ -75,6 +75,7 @@ export interface ScrappingOptions {
|
||||
timeoutMs?: number;
|
||||
locale?: string;
|
||||
referer?: string;
|
||||
extraHeaders?: Record<string, string>;
|
||||
}
|
||||
|
||||
|
||||
@ -581,14 +582,34 @@ if (window.self === window.top) {
|
||||
pdfUrls.push(url);
|
||||
}
|
||||
});
|
||||
if (options?.extraHeaders) {
|
||||
page.on('request', async (req) => {
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
}
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||
|
||||
this.logger.info(`Locale setting: ${options?.locale}`);
|
||||
if (options?.locale) {
|
||||
await page.setExtraHTTPHeaders({
|
||||
'Accept-Language': options?.locale
|
||||
});
|
||||
// Add headers via request interception to work around this bug
|
||||
// https://github.com/puppeteer/puppeteer/issues/10235
|
||||
// await page.setExtraHTTPHeaders({
|
||||
// 'Accept-Language': options?.locale
|
||||
// });
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, "language", {
|
||||
@ -605,7 +626,10 @@ if (window.self === window.top) {
|
||||
}
|
||||
|
||||
if (options?.proxyUrl) {
|
||||
await page.useProxy(options.proxyUrl);
|
||||
await page.useProxy(options.proxyUrl, {
|
||||
headers: options.extraHeaders,
|
||||
interceptResolutionPriority: 2,
|
||||
});
|
||||
}
|
||||
if (options?.cookies) {
|
||||
const mapped = options.cookies.map((x) => {
|
||||
|
Loading…
x
Reference in New Issue
Block a user