From dbeb69582a5171e011c069e61fda630f982b7fff Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Sat, 13 Apr 2024 22:27:50 +0800 Subject: [PATCH] puppeteer stealth --- backend/functions/package-lock.json | 320 +++++++++++++++++- backend/functions/package.json | 2 + .../functions/src/cloud-functions/crawler.ts | 20 +- backend/functions/src/index.ts | 11 +- backend/functions/src/services/puppeteer.ts | 17 +- 5 files changed, 357 insertions(+), 13 deletions(-) diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index fea0f89..93c40d4 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -27,6 +27,8 @@ "minio": "^7.1.3", "openai": "^4.20.0", "puppeteer": "^22.6.3", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", "stripe": "^11.11.0", "tiktoken": "^1.0.10", "turndown": "^7.1.3", @@ -2526,6 +2528,14 @@ "@types/node": "*" } }, + "node_modules/@types/debug": { + "version": "4.1.12", + "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", + "integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==", + "dependencies": { + "@types/ms": "*" + } + }, "node_modules/@types/express": { "version": "4.17.3", "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz", @@ -2673,6 +2683,11 @@ "integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==", "optional": true }, + "node_modules/@types/ms": { + "version": "0.7.34", + "resolved": "https://registry.npmjs.org/@types/ms/-/ms-0.7.34.tgz", + "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==" + }, "node_modules/@types/node": { "version": "18.19.31", "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.31.tgz", @@ -3234,6 +3249,14 @@ "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" }, + "node_modules/arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/array-buffer-byte-length": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.1.tgz", @@ -4076,6 +4099,21 @@ "node": ">=12" } }, + "node_modules/clone-deep": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz", + "integrity": "sha512-we+NuQo2DHhSl+DP6jlUiAhyAjBQrYnpOk15rN6c6JSPScjiCLh8IbSU+VTcph6YS3o7mASE8a0+gbZ7ChLpgg==", + "dependencies": { + "for-own": "^0.1.3", + "is-plain-object": "^2.0.1", + "kind-of": "^3.0.2", + "lazy-cache": "^1.0.3", + "shallow-clone": "^0.1.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -4466,8 +4504,6 @@ "version": "4.3.1", "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", - "dev": true, - "peer": true, "engines": { "node": ">=0.10.0" } @@ -5739,6 +5775,25 @@ "is-callable": "^1.1.3" } }, + "node_modules/for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha512-SKmowqGTJoPzLO1T0BBJpkfp3EMacCMOuH40hOUbrbzElVktk4DioXVM99QkLCyKoiuOmyjgcWMpVz2xjE7LZw==", + "dependencies": { + "for-in": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/form-data": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", @@ -6786,6 +6841,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" + }, "node_modules/is-callable": { "version": "1.2.7", "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", @@ -6839,6 +6899,14 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-extglob": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", @@ -6948,6 +7016,17 @@ "node": ">=8" } }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-regex": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", @@ -7064,6 +7143,14 @@ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "dev": true }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -8049,6 +8136,17 @@ "json-buffer": "3.0.1" } }, + "node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/klaw": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/klaw/-/klaw-3.0.0.tgz", @@ -8184,6 +8282,14 @@ "unicode-9.0.0": "0.7.0" } }, + "node_modules/lazy-cache": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", + "integrity": "sha512-RE2g0b5VGZsOCFOCgP7omTRYFqydmZkBwl5oNnQ1lDYC57uyO9KqNnNVxT7COSHTxrRCWVcAVOcbjk+tvh/rgQ==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/lazystream": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/lazystream/-/lazystream-1.0.1.tgz", @@ -8504,6 +8610,19 @@ "optional": true, "peer": true }, + "node_modules/merge-deep": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.3.tgz", + "integrity": "sha512-qtmzAS6t6grwEkNrunqTBdn0qKwFgNWvlxUbAV8es9M7Ot1EbyApytCnvE0jALPa46ZpKDUo527kKiaWplmlFA==", + "dependencies": { + "arr-union": "^3.1.0", + "clone-deep": "^0.2.4", + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/merge-descriptors": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", @@ -8672,6 +8791,26 @@ "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==" }, + "node_modules/mixin-object": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz", + "integrity": "sha512-ALGF1Jt9ouehcaXaHhn6t1yGWRqGaHkPFndtFVHfZXOvkIZ/yoGaSi0AHVTafb3ZBGg4dr/bDwnaEKqCXzchMA==", + "dependencies": { + "for-in": "^0.1.3", + "is-extendable": "^0.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/mixin-object/node_modules/for-in": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz", + "integrity": "sha512-F0to7vbBSHP8E3l6dCjxNOLuSFAACIxFy3UehTUlG7svlXi37HHsDkyVcHo0Pq8QwrE+pXvWSVX3ZT1T9wAZ9g==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/mkdirp": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-1.0.4.tgz", @@ -9719,6 +9858,150 @@ "node": ">=18" } }, + "node_modules/puppeteer-extra": { + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz", + "integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + "deepmerge": "^4.2.2" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "@types/puppeteer": "*", + "puppeteer": "*", + "puppeteer-core": "*" + }, + "peerDependenciesMeta": { + "@types/puppeteer": { + "optional": true + }, + "puppeteer": { + "optional": true + }, + "puppeteer-core": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.2.3.tgz", + "integrity": "sha512-6RNy0e6pH8vaS3akPIKGg28xcryKscczt4wIl0ePciZENGE2yoaQJNd17UiEbdmh5/6WW6dPcfRWT9lxBwCi2Q==", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + "merge-deep": "^3.0.1" + }, + "engines": { + "node": ">=9.11.2" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-stealth": { + "version": "2.11.2", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz", + "integrity": "sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==", + "dependencies": { + "debug": "^4.1.1", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-preferences": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-data-dir/-/puppeteer-extra-plugin-user-data-dir-2.4.1.tgz", + "integrity": "sha512-kH1GnCcqEDoBXO7epAse4TBPJh9tEpVEK/vkedKfjOVOhZAvLkHGc9swMs5ChrJbRnf8Hdpug6TJlEuimXNQ+g==", + "dependencies": { + "debug": "^4.1.1", + "fs-extra": "^10.0.0", + "puppeteer-extra-plugin": "^3.2.3", + "rimraf": "^3.0.2" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/fs-extra": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz", + "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==", + "dependencies": { + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/puppeteer-extra-plugin-user-preferences": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-preferences/-/puppeteer-extra-plugin-user-preferences-2.4.1.tgz", + "integrity": "sha512-i1oAZxRbc1bk8MZufKCruCEC3CCafO9RKMkkodZltI4OqibLFXF3tj6HZ4LZ9C5vCXZjYcDWazgtY69mnmrQ9A==", + "dependencies": { + "debug": "^4.1.1", + "deepmerge": "^4.2.2", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-data-dir": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, "node_modules/pure-rand": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", @@ -10314,6 +10597,39 @@ "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==" }, + "node_modules/shallow-clone": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", + "integrity": "sha512-J1zdXCky5GmNnuauESROVu31MQSnLoYvlyEn6j2Ztk6Q5EHFIhxkMhYcv6vuDzl2XEzoRr856QwzMgWM/TmZgw==", + "dependencies": { + "is-extendable": "^0.1.1", + "kind-of": "^2.0.1", + "lazy-cache": "^0.2.3", + "mixin-object": "^2.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/kind-of": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz", + "integrity": "sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==", + "dependencies": { + "is-buffer": "^1.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/lazy-cache": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz", + "integrity": "sha512-gkX52wvU/R8DVMMt78ATVPFMJqfW8FPz1GZ1sVHBVQHmu/WvhIWE4cE1GBzhJNFicDeYhnwp6Rl35BcAIM3YOQ==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", diff --git a/backend/functions/package.json b/backend/functions/package.json index 6fb7f83..427bd42 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -47,6 +47,8 @@ "minio": "^7.1.3", "openai": "^4.20.0", "puppeteer": "^22.6.3", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", "stripe": "^11.11.0", "tiktoken": "^1.0.10", "turndown": "^7.1.3", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 205caf2..b09aae7 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -1,4 +1,4 @@ -import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError } from 'civkit'; +import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError, ParamValidationError } from 'civkit'; import { singleton } from 'tsyringe'; import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import _ from 'lodash'; @@ -32,11 +32,11 @@ export class CrawlerHost extends RPCHost { const toBeTurnedToMd = snapshot.parsed?.content; const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : ''; - const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text.trim(); + const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim(); const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), - url: snapshot.href.trim(), + url: snapshot.href?.trim(), content: contentText.trim(), toString() { @@ -80,7 +80,15 @@ ${this.content} }, ) { const noSlashURL = ctx.req.url.slice(1); - const urlToCrawl = new URL(normalizeUrl(noSlashURL)); + let urlToCrawl; + try { + urlToCrawl = new URL(normalizeUrl(noSlashURL.trim())); + } catch (err) { + throw new ParamValidationError({ + message: `${err}`, + path: 'url' + }); + } const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']); const noCache = Boolean(ctx.req.headers['x-no-cache']); @@ -125,7 +133,7 @@ ${this.content} if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { lastScrapped = scrapped; - if (!scrapped?.parsed?.content) { + if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { continue; } @@ -143,7 +151,7 @@ ${this.content} for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { lastScrapped = scrapped; - if (!scrapped?.parsed?.content) { + if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { continue; } diff --git a/backend/functions/src/index.ts b/backend/functions/src/index.ts index 3ef6869..251a721 100644 --- a/backend/functions/src/index.ts +++ b/backend/functions/src/index.ts @@ -21,6 +21,7 @@ initializeApp(); import { loadModulesDynamically, registry } from './shared'; import path from 'path'; +import { ApplicationError } from 'civkit'; loadModulesDynamically(path.resolve(__dirname, 'cloud-functions')); Object.assign(exports, registry.exportAll()); @@ -31,4 +32,12 @@ Object.assign(exports, registry.exportGrouped({ registry.title = 'reader'; registry.version = '0.1.0'; -process.on('unhandledRejection', () => 'no big deal'); +process.on('unhandledRejection', (err) => { + // Walk around Firebase runtime bug. + if (err instanceof ApplicationError) { + // Application error shall not crash the process; + return; + } + + throw err; +}); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index c3e2c2d..176876d 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -1,11 +1,13 @@ import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit'; import { container, singleton } from 'tsyringe'; -import puppeteer, { Browser } from 'puppeteer'; +import type { Browser } from 'puppeteer'; import { Logger } from '../shared/services/logger'; import genericPool from 'generic-pool'; import os from 'os'; import fs from 'fs'; import { Crawled } from '../db/crawled'; +import puppeteer from 'puppeteer-extra'; +import puppeteerStealth from 'puppeteer-extra-plugin-stealth'; const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); @@ -31,6 +33,12 @@ export interface PageSnapshot { } const md5Hasher = new HashManager('md5', 'hex'); +puppeteer.use(puppeteerStealth()); +// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override'); +// puppeteer.use(puppeteerUAOverride({ +// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)` +// })) + @singleton() export class PuppeteerControl extends AsyncService { @@ -77,9 +85,10 @@ export class PuppeteerControl extends AsyncService { headless: true, timeout: 10_000 }).catch((err) => { - this.logger.error(`Unknown firebase issue, just die fast, quitting process.`, { err }); + this.logger.error(`Unknown firebase issue, just die fast.`, { err }); process.nextTick(() => { - process.exit(1); + this.emit('error', err); + // process.exit(1); }); return Promise.reject(err); }); @@ -100,7 +109,7 @@ export class PuppeteerControl extends AsyncService { const preparations = []; // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); - preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); + // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); preparations.push(page.setBypassCSP(true)); preparations.push(page.setViewport({ width: 1920, height: 1080 })); preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {