fix: work around locale setting bug

This commit is contained in:
yanlong.wang 2024-10-29 15:09:20 +08:00
parent 9242bb393a
commit ebc09003d1
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 72 additions and 15 deletions

View File

@ -15,7 +15,7 @@
"archiver": "^6.0.1", "archiver": "^6.0.1",
"axios": "^1.3.3", "axios": "^1.3.3",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"civkit": "^0.8.1-bb8d850", "civkit": "^0.8.2-c9ca977",
"core-js": "^3.37.1", "core-js": "^3.37.1",
"cors": "^2.8.5", "cors": "^2.8.5",
"dayjs": "^1.11.9", "dayjs": "^1.11.9",
@ -33,7 +33,7 @@
"puppeteer": "^23.3.0", "puppeteer": "^23.3.0",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3", "puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0", "puppeteer-extra-plugin-page-proxy": "^1.3.1",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-page-proxy": "^1.3.0", "puppeteer-page-proxy": "^1.3.0",
"robots-parser": "^3.0.1", "robots-parser": "^3.0.1",
@ -3729,9 +3729,9 @@
} }
}, },
"node_modules/civkit": { "node_modules/civkit": {
"version": "0.8.1-bb8d850", "version": "0.8.2-c9ca977",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-bb8d850.tgz", "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-c9ca977.tgz",
"integrity": "sha512-b80LGS/jnpPgEHlbPk9QyDqHAxmA/VH7pyc428HYXtcXQOiqhkMaFanOU78sCctxjpzYqnzOkNDwcPl1PdzgFw==", "integrity": "sha512-wuJ6zs88qx5WiIxSBErH3f8wBArSkT2goHHvfm5ZLqL17v6rkS4iQWx2+YhJfhmfBzqU8oAZI1QD9v2LY1awBg==",
"license": "AGPL", "license": "AGPL",
"dependencies": { "dependencies": {
"lodash": "^4.17.21", "lodash": "^4.17.21",
@ -3754,6 +3754,7 @@
"koa-compose": "^4.1.0", "koa-compose": "^4.1.0",
"libmagic-ffi": "^0.1.4", "libmagic-ffi": "^0.1.4",
"mime": "^3.0.0", "mime": "^3.0.0",
"minimatch": "^10.0.1",
"minio": "^7.0.33", "minio": "^7.0.33",
"node-object-hash": "^3.0.0", "node-object-hash": "^3.0.0",
"node-schedule": "^2.1.1", "node-schedule": "^2.1.1",
@ -3770,6 +3771,32 @@
"tsyringe": "^4" "tsyringe": "^4"
} }
}, },
"node_modules/civkit/node_modules/brace-expansion": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
"license": "MIT",
"optional": true,
"dependencies": {
"balanced-match": "^1.0.0"
}
},
"node_modules/civkit/node_modules/minimatch": {
"version": "10.0.1",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz",
"integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==",
"license": "ISC",
"optional": true,
"dependencies": {
"brace-expansion": "^2.0.1"
},
"engines": {
"node": "20 || >=22"
},
"funding": {
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/cjs-module-lexer": { "node_modules/cjs-module-lexer": {
"version": "1.2.3", "version": "1.2.3",
"resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz", "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
@ -9795,9 +9822,10 @@
} }
}, },
"node_modules/puppeteer-extra-plugin-page-proxy": { "node_modules/puppeteer-extra-plugin-page-proxy": {
"version": "2.0.0", "version": "1.3.1",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz", "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-1.3.1.tgz",
"integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==", "integrity": "sha512-5+mCJkJIsNHqryP8YhZMO+yYRSIfNMfyOuLFPlj4EtRGTNdly+jsOCwaizaMBuA/ItvXqWJ054KfqqOJKvuRMQ==",
"license": "MIT",
"dependencies": { "dependencies": {
"debug": "^4.1.1", "debug": "^4.1.1",
"got": "^11.8.5", "got": "^11.8.5",

View File

@ -35,7 +35,7 @@
"archiver": "^6.0.1", "archiver": "^6.0.1",
"axios": "^1.3.3", "axios": "^1.3.3",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"civkit": "^0.8.1-bb8d850", "civkit": "^0.8.2-c9ca977",
"core-js": "^3.37.1", "core-js": "^3.37.1",
"cors": "^2.8.5", "cors": "^2.8.5",
"dayjs": "^1.11.9", "dayjs": "^1.11.9",
@ -53,7 +53,7 @@
"puppeteer": "^23.3.0", "puppeteer": "^23.3.0",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3", "puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0", "puppeteer-extra-plugin-page-proxy": "^1.3.1",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-page-proxy": "^1.3.0", "puppeteer-page-proxy": "^1.3.0",
"robots-parser": "^3.0.1", "robots-parser": "^3.0.1",

View File

@ -692,6 +692,11 @@ export class CrawlerHost extends RPCHost {
referer: opts.referer, referer: opts.referer,
}; };
if (opts.locale) {
crawlOpts.extraHeaders ??= {};
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
}
return crawlOpts; return crawlOpts;
} }

View File

@ -46,7 +46,7 @@ export interface PageSnapshot {
href: string; href: string;
rebase?: string; rebase?: string;
html: string; html: string;
shadowExpanded?: string shadowExpanded?: string;
text: string; text: string;
status?: number; status?: number;
statusText?: string; statusText?: string;
@ -75,6 +75,7 @@ export interface ScrappingOptions {
timeoutMs?: number; timeoutMs?: number;
locale?: string; locale?: string;
referer?: string; referer?: string;
extraHeaders?: Record<string, string>;
} }
@ -581,14 +582,34 @@ if (window.self === window.top) {
pdfUrls.push(url); pdfUrls.push(url);
} }
}); });
if (options?.extraHeaders) {
page.on('request', async (req) => {
if (req.isInterceptResolutionHandled()) {
return;
};
const overrides = req.continueRequestOverrides();
const continueArgs = [{
...overrides,
headers: {
...overrides?.headers,
...options.extraHeaders,
}
}, 1] as const;
return req.continue(continueArgs[0], continueArgs[1]);
});
}
const sn = this.snMap.get(page); const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
this.logger.info(`Locale setting: ${options?.locale}`); this.logger.info(`Locale setting: ${options?.locale}`);
if (options?.locale) { if (options?.locale) {
await page.setExtraHTTPHeaders({ // Add headers via request interception to work around this bug
'Accept-Language': options?.locale // https://github.com/puppeteer/puppeteer/issues/10235
}); // await page.setExtraHTTPHeaders({
// 'Accept-Language': options?.locale
// });
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, "language", { Object.defineProperty(navigator, "language", {
@ -605,7 +626,10 @@ if (window.self === window.top) {
} }
if (options?.proxyUrl) { if (options?.proxyUrl) {
await page.useProxy(options.proxyUrl); await page.useProxy(options.proxyUrl, {
headers: options.extraHeaders,
interceptResolutionPriority: 2,
});
} }
if (options?.cookies) { if (options?.cookies) {
const mapped = options.cookies.map((x) => { const mapped = options.cookies.map((x) => {