From ebc09003d1f0e40ced2b24f891465350b8861f0a Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Tue, 29 Oct 2024 15:09:20 +0800 Subject: [PATCH] fix: walk around locale setting bug --- backend/functions/package-lock.json | 44 +++++++++++++++---- backend/functions/package.json | 4 +- .../functions/src/cloud-functions/crawler.ts | 5 +++ backend/functions/src/services/puppeteer.ts | 34 +++++++++++--- 4 files changed, 72 insertions(+), 15 deletions(-) diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index 4b2ce1f..664a62f 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -15,7 +15,7 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", - "civkit": "^0.8.1-bb8d850", + "civkit": "^0.8.2-c9ca977", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -33,7 +33,7 @@ "puppeteer": "^23.3.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", - "puppeteer-extra-plugin-page-proxy": "^2.0.0", + "puppeteer-extra-plugin-page-proxy": "^1.3.1", "puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-page-proxy": "^1.3.0", "robots-parser": "^3.0.1", @@ -3729,9 +3729,9 @@ } }, "node_modules/civkit": { - "version": "0.8.1-bb8d850", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-bb8d850.tgz", - "integrity": "sha512-b80LGS/jnpPgEHlbPk9QyDqHAxmA/VH7pyc428HYXtcXQOiqhkMaFanOU78sCctxjpzYqnzOkNDwcPl1PdzgFw==", + "version": "0.8.2-c9ca977", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-c9ca977.tgz", + "integrity": "sha512-wuJ6zs88qx5WiIxSBErH3f8wBArSkT2goHHvfm5ZLqL17v6rkS4iQWx2+YhJfhmfBzqU8oAZI1QD9v2LY1awBg==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", @@ -3754,6 +3754,7 @@ "koa-compose": "^4.1.0", "libmagic-ffi": "^0.1.4", "mime": "^3.0.0", + "minimatch": "^10.0.1", "minio": "^7.0.33", "node-object-hash": "^3.0.0", "node-schedule": "^2.1.1", @@ -3770,6 +3771,32 @@ "tsyringe": "^4" } }, + 
"node_modules/civkit/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "license": "MIT", + "optional": true, + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/civkit/node_modules/minimatch": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz", + "integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==", + "license": "ISC", + "optional": true, + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/cjs-module-lexer": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz", @@ -9795,9 +9822,10 @@ } }, "node_modules/puppeteer-extra-plugin-page-proxy": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz", - "integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==", + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-1.3.1.tgz", + "integrity": "sha512-5+mCJkJIsNHqryP8YhZMO+yYRSIfNMfyOuLFPlj4EtRGTNdly+jsOCwaizaMBuA/ItvXqWJ054KfqqOJKvuRMQ==", + "license": "MIT", "dependencies": { "debug": "^4.1.1", "got": "^11.8.5", diff --git a/backend/functions/package.json b/backend/functions/package.json index 648f4af..901bfe9 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -35,7 +35,7 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", - "civkit": "^0.8.1-bb8d850", + "civkit": "^0.8.2-c9ca977", "core-js": 
"^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -53,7 +53,7 @@ "puppeteer": "^23.3.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", - "puppeteer-extra-plugin-page-proxy": "^2.0.0", + "puppeteer-extra-plugin-page-proxy": "^1.3.1", "puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-page-proxy": "^1.3.0", "robots-parser": "^3.0.1", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 872bcd7..8301448 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -692,6 +692,11 @@ export class CrawlerHost extends RPCHost { referer: opts.referer, }; + if (opts.locale) { + crawlOpts.extraHeaders ??= {}; + crawlOpts.extraHeaders['Accept-Language'] = opts.locale; + } + return crawlOpts; } diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 1f3a7f5..19ef4fc 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -46,7 +46,7 @@ export interface PageSnapshot { href: string; rebase?: string; html: string; - shadowExpanded?: string + shadowExpanded?: string; text: string; status?: number; statusText?: string; @@ -75,6 +75,7 @@ export interface ScrappingOptions { timeoutMs?: number; locale?: string; referer?: string; + extraHeaders?: Record<string, string>; } @@ -581,14 +582,34 @@ if (window.self === window.top) { pdfUrls.push(url); } }); + if (options?.extraHeaders) { + page.on('request', async (req) => { + if (req.isInterceptResolutionHandled()) { + return; + }; + + const overrides = req.continueRequestOverrides(); + const continueArgs = [{ + ...overrides, + headers: { + ...overrides?.headers, + ...options.extraHeaders, + } + }, 1] as const; + + return req.continue(continueArgs[0], continueArgs[1]); + }); + } const sn = this.snMap.get(page); this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
this.logger.info(`Locale setting: ${options?.locale}`); if (options?.locale) { - await page.setExtraHTTPHeaders({ - 'Accept-Language': options?.locale - }); + // Add headers via request interception to work around this bug + // https://github.com/puppeteer/puppeteer/issues/10235 + // await page.setExtraHTTPHeaders({ + // 'Accept-Language': options?.locale + // }); await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, "language", { @@ -605,7 +626,10 @@ if (window.self === window.top) { } if (options?.proxyUrl) { - await page.useProxy(options.proxyUrl); + await page.useProxy(options.proxyUrl, { + headers: options.extraHeaders, + interceptResolutionPriority: 2, + }); } if (options?.cookies) { const mapped = options.cookies.map((x) => {