mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 04:16:02 +08:00
fix: walk around locale setting bug
This commit is contained in:
parent
9242bb393a
commit
ebc09003d1
44
backend/functions/package-lock.json
generated
44
backend/functions/package-lock.json
generated
@ -15,7 +15,7 @@
|
|||||||
"archiver": "^6.0.1",
|
"archiver": "^6.0.1",
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.8.1-bb8d850",
|
"civkit": "^0.8.2-c9ca977",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -33,7 +33,7 @@
|
|||||||
"puppeteer": "^23.3.0",
|
"puppeteer": "^23.3.0",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||||
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||||
"puppeteer-page-proxy": "^1.3.0",
|
"puppeteer-page-proxy": "^1.3.0",
|
||||||
"robots-parser": "^3.0.1",
|
"robots-parser": "^3.0.1",
|
||||||
@ -3729,9 +3729,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.1-bb8d850",
|
"version": "0.8.2-c9ca977",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-bb8d850.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-c9ca977.tgz",
|
||||||
"integrity": "sha512-b80LGS/jnpPgEHlbPk9QyDqHAxmA/VH7pyc428HYXtcXQOiqhkMaFanOU78sCctxjpzYqnzOkNDwcPl1PdzgFw==",
|
"integrity": "sha512-wuJ6zs88qx5WiIxSBErH3f8wBArSkT2goHHvfm5ZLqL17v6rkS4iQWx2+YhJfhmfBzqU8oAZI1QD9v2LY1awBg==",
|
||||||
"license": "AGPL",
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
@ -3754,6 +3754,7 @@
|
|||||||
"koa-compose": "^4.1.0",
|
"koa-compose": "^4.1.0",
|
||||||
"libmagic-ffi": "^0.1.4",
|
"libmagic-ffi": "^0.1.4",
|
||||||
"mime": "^3.0.0",
|
"mime": "^3.0.0",
|
||||||
|
"minimatch": "^10.0.1",
|
||||||
"minio": "^7.0.33",
|
"minio": "^7.0.33",
|
||||||
"node-object-hash": "^3.0.0",
|
"node-object-hash": "^3.0.0",
|
||||||
"node-schedule": "^2.1.1",
|
"node-schedule": "^2.1.1",
|
||||||
@ -3770,6 +3771,32 @@
|
|||||||
"tsyringe": "^4"
|
"tsyringe": "^4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/civkit/node_modules/brace-expansion": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||||
|
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"dependencies": {
|
||||||
|
"balanced-match": "^1.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/civkit/node_modules/minimatch": {
|
||||||
|
"version": "10.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz",
|
||||||
|
"integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==",
|
||||||
|
"license": "ISC",
|
||||||
|
"optional": true,
|
||||||
|
"dependencies": {
|
||||||
|
"brace-expansion": "^2.0.1"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": "20 || >=22"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/sponsors/isaacs"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/cjs-module-lexer": {
|
"node_modules/cjs-module-lexer": {
|
||||||
"version": "1.2.3",
|
"version": "1.2.3",
|
||||||
"resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
|
"resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
|
||||||
@ -9795,9 +9822,10 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/puppeteer-extra-plugin-page-proxy": {
|
"node_modules/puppeteer-extra-plugin-page-proxy": {
|
||||||
"version": "2.0.0",
|
"version": "1.3.1",
|
||||||
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-1.3.1.tgz",
|
||||||
"integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==",
|
"integrity": "sha512-5+mCJkJIsNHqryP8YhZMO+yYRSIfNMfyOuLFPlj4EtRGTNdly+jsOCwaizaMBuA/ItvXqWJ054KfqqOJKvuRMQ==",
|
||||||
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"debug": "^4.1.1",
|
"debug": "^4.1.1",
|
||||||
"got": "^11.8.5",
|
"got": "^11.8.5",
|
||||||
|
@ -35,7 +35,7 @@
|
|||||||
"archiver": "^6.0.1",
|
"archiver": "^6.0.1",
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.8.1-bb8d850",
|
"civkit": "^0.8.2-c9ca977",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -53,7 +53,7 @@
|
|||||||
"puppeteer": "^23.3.0",
|
"puppeteer": "^23.3.0",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||||
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||||
"puppeteer-page-proxy": "^1.3.0",
|
"puppeteer-page-proxy": "^1.3.0",
|
||||||
"robots-parser": "^3.0.1",
|
"robots-parser": "^3.0.1",
|
||||||
|
@ -692,6 +692,11 @@ export class CrawlerHost extends RPCHost {
|
|||||||
referer: opts.referer,
|
referer: opts.referer,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (opts.locale) {
|
||||||
|
crawlOpts.extraHeaders ??= {};
|
||||||
|
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
||||||
|
}
|
||||||
|
|
||||||
return crawlOpts;
|
return crawlOpts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,7 +46,7 @@ export interface PageSnapshot {
|
|||||||
href: string;
|
href: string;
|
||||||
rebase?: string;
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
shadowExpanded?: string
|
shadowExpanded?: string;
|
||||||
text: string;
|
text: string;
|
||||||
status?: number;
|
status?: number;
|
||||||
statusText?: string;
|
statusText?: string;
|
||||||
@ -75,6 +75,7 @@ export interface ScrappingOptions {
|
|||||||
timeoutMs?: number;
|
timeoutMs?: number;
|
||||||
locale?: string;
|
locale?: string;
|
||||||
referer?: string;
|
referer?: string;
|
||||||
|
extraHeaders?: Record<string, string>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -581,14 +582,34 @@ if (window.self === window.top) {
|
|||||||
pdfUrls.push(url);
|
pdfUrls.push(url);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
if (options?.extraHeaders) {
|
||||||
|
page.on('request', async (req) => {
|
||||||
|
if (req.isInterceptResolutionHandled()) {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
const overrides = req.continueRequestOverrides();
|
||||||
|
const continueArgs = [{
|
||||||
|
...overrides,
|
||||||
|
headers: {
|
||||||
|
...overrides?.headers,
|
||||||
|
...options.extraHeaders,
|
||||||
|
}
|
||||||
|
}, 1] as const;
|
||||||
|
|
||||||
|
return req.continue(continueArgs[0], continueArgs[1]);
|
||||||
|
});
|
||||||
|
}
|
||||||
const sn = this.snMap.get(page);
|
const sn = this.snMap.get(page);
|
||||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||||
|
|
||||||
this.logger.info(`Locale setting: ${options?.locale}`);
|
this.logger.info(`Locale setting: ${options?.locale}`);
|
||||||
if (options?.locale) {
|
if (options?.locale) {
|
||||||
await page.setExtraHTTPHeaders({
|
// Add headers via request interception to walk around this bug
|
||||||
'Accept-Language': options?.locale
|
// https://github.com/puppeteer/puppeteer/issues/10235
|
||||||
});
|
// await page.setExtraHTTPHeaders({
|
||||||
|
// 'Accept-Language': options?.locale
|
||||||
|
// });
|
||||||
|
|
||||||
await page.evaluateOnNewDocument(() => {
|
await page.evaluateOnNewDocument(() => {
|
||||||
Object.defineProperty(navigator, "language", {
|
Object.defineProperty(navigator, "language", {
|
||||||
@ -605,7 +626,10 @@ if (window.self === window.top) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (options?.proxyUrl) {
|
if (options?.proxyUrl) {
|
||||||
await page.useProxy(options.proxyUrl);
|
await page.useProxy(options.proxyUrl, {
|
||||||
|
headers: options.extraHeaders,
|
||||||
|
interceptResolutionPriority: 2,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
if (options?.cookies) {
|
if (options?.cookies) {
|
||||||
const mapped = options.cookies.map((x) => {
|
const mapped = options.cookies.map((x) => {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user