diff --git a/backend/firebase.json b/backend/firebase.json index 6dc10e9..a54c720 100644 --- a/backend/firebase.json +++ b/backend/firebase.json @@ -13,7 +13,8 @@ ".git", "*.log", "*.local", - ".secret.*" + ".secret.*", + ".firebase-emu" ], "predeploy": [ "npm --prefix \"$RESOURCE_DIR\" run build:clean", diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index b18a71f..ec2087c 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -29,7 +29,10 @@ "puppeteer": "^22.6.3", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", + "puppeteer-extra-plugin-page-proxy": "^2.0.0", "puppeteer-extra-plugin-stealth": "^2.11.2", + "puppeteer-page-proxy": "^1.3.0", + "set-cookie-parser": "^2.6.0", "stripe": "^11.11.0", "tiktoken": "^1.0.10", "turndown": "^7.1.3", @@ -42,6 +45,7 @@ "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", "@types/node": "^18", + "@types/set-cookie-parser": "^2.4.7", "@typescript-eslint/eslint-plugin": "^5.12.0", "@typescript-eslint/parser": "^5.12.0", "eslint": "^8.9.0", @@ -1986,6 +1990,17 @@ "dev": true, "peer": true }, + "node_modules/@sindresorhus/is": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-4.6.0.tgz", + "integrity": "sha512-t09vSN3MdfsyCHoFcTRCH/iUtG7OJ0CsjzB8cjAmKc/va/kIgeDI/TxsigdncE/4be734m0cvIYwNaV4i2XqAw==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/is?sponsor=1" + } + }, "node_modules/@sinonjs/commons": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz", @@ -2006,6 +2021,17 @@ "@sinonjs/commons": "^3.0.0" } }, + "node_modules/@szmarczak/http-timer": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz", + "integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==", + "dependencies": { + "defer-to-connect": "^2.0.0" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/@tootallnate/once": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz", @@ -2091,6 +2117,17 @@ "@types/node": "*" } }, + "node_modules/@types/cacheable-request": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz", + "integrity": "sha512-IQ3EbTzGxIigb1I3qPZc1rWJnH0BmSKv5QYTalEwweFvyBDLSAe24zP0le/hyi7ecGfZVlIVAg4BZqb8WBwKqw==", + "dependencies": { + "@types/http-cache-semantics": "*", + "@types/keyv": "^3.1.4", + "@types/node": "*", + "@types/responselike": "^1.0.0" + } + }, "node_modules/@types/caseless": { "version": "0.12.5", "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz", @@ -2161,6 +2198,11 @@ "@types/node": "*" } }, + "node_modules/@types/http-cache-semantics": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz", + "integrity": "sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA==" + }, "node_modules/@types/http-errors": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz", @@ -2213,6 +2255,14 @@ "@types/node": "*" } }, + "node_modules/@types/keyv": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz", + "integrity": "sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/lodash": { "version": "4.17.0", "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz", @@ -2294,6 +2344,14 @@ "node": ">= 0.12" } }, + "node_modules/@types/responselike": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/@types/responselike/-/responselike-1.0.3.tgz", + "integrity": "sha512-H/+L+UkTV33uf49PH5pCAUBVPNj2nDBXTN+qS1dOwyyg24l3CcicicCA7ca+HMvJBZcFgl5r8e+RR6elsb4Lyw==", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/semver": { "version": "7.5.8", "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.8.tgz", @@ -2319,6 +2377,15 @@ "@types/send": "*" } }, + "node_modules/@types/set-cookie-parser": { + "version": "2.4.7", + "resolved": "https://registry.npmjs.org/@types/set-cookie-parser/-/set-cookie-parser-2.4.7.tgz", + "integrity": "sha512-+ge/loa0oTozxip6zmhRIk8Z/boU51wl9Q6QdLZcokIGMzY5lFXYy/x7Htj2HTC6/KZP1hUbZ1ekx8DYXICvWg==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/stack-utils": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz", @@ -3447,6 +3514,45 @@ "node": ">= 6.0.0" } }, + "node_modules/cacheable-lookup": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz", + "integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==", + "engines": { + "node": ">=10.6.0" + } + }, + "node_modules/cacheable-request": { + "version": "7.0.4", + "resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-7.0.4.tgz", + "integrity": "sha512-v+p6ongsrp0yTGbJXjgxPow2+DL93DASP4kXCDKb8/bwRtt9OEF3whggkkDkGNzgcWy2XaF4a8nZglC7uElscg==", + "dependencies": { + "clone-response": "^1.0.2", + "get-stream": "^5.1.0", + "http-cache-semantics": "^4.0.0", + "keyv": "^4.0.0", + "lowercase-keys": "^2.0.0", + "normalize-url": "^6.0.1", + "responselike": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/cacheable-request/node_modules/get-stream": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", + "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/call-bind": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", @@ -3642,6 +3748,25 @@ "node": ">=0.10.0" } }, + "node_modules/clone-response": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.3.tgz", + "integrity": "sha512-ROoL94jJH2dUVML2Y/5PEDNaSHgeOdSDicUyS7izcF63G6sTc/FTjLub4b8Il9S8S0beOfYt0TaA5qvFK+w0wA==", + "dependencies": { + "mimic-response": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/clone-response/node_modules/mimic-response": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz", + "integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==", + "engines": { + "node": ">=4" + } + }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -4003,6 +4128,17 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/decompress-response/node_modules/mimic-response": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", + "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/dedent": { "version": "1.5.3", "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz", @@ -4046,6 +4182,14 @@ "node": ">=0.10.0" } }, + "node_modules/defer-to-connect": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-2.0.1.tgz", + "integrity": "sha512-4tvttepXG1VaYGrRibk5EwJd1t4udunSOVMdLSAL6mId1ix438oPwPZMALY41FCijukO1L0twNcGsdzS7dHgDg==", + "engines": { + "node": ">=10" + } + }, "node_modules/define-data-property": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", @@ -5837,6 +5981,30 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/got": { + "version": "11.8.6", + "resolved": "https://registry.npmjs.org/got/-/got-11.8.6.tgz", + "integrity": "sha512-6tfZ91bOr7bOXnK7PRDCGBLa1H4U080YHNaAQ2KsMGlLEzRbk44nsZF2E1IeRc3vtJHPVbKCYgdFbaGO2ljd8g==", + "dependencies": { + "@sindresorhus/is": "^4.0.0", + "@szmarczak/http-timer": "^4.0.5", + "@types/cacheable-request": "^6.0.1", + "@types/responselike": "^1.0.0", + "cacheable-lookup": "^5.0.3", + "cacheable-request": "^7.0.2", + "decompress-response": "^6.0.0", + "http2-wrapper": "^1.0.0-beta.5.2", + "lowercase-keys": "^2.0.0", + "p-cancelable": "^2.0.0", + "responselike": "^2.0.0" + }, + "engines": { + "node": ">=10.19.0" + }, + "funding": { + "url": "https://github.com/sindresorhus/got?sponsor=1" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -6024,6 +6192,11 @@ "node": ">= 0.6" } }, + "node_modules/http-cache-semantics": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz", + "integrity": "sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==" + }, "node_modules/http-errors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", @@ -6067,6 +6240,18 @@ "node": ">= 14" } }, + "node_modules/http2-wrapper": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz", + "integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==", + "dependencies": { + "quick-lru": "^5.1.1", + "resolve-alpn": "^1.0.0" + }, + "engines": { + "node": ">=10.19.0" + } + }, "node_modules/https-proxy-agent": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", @@ -7391,8 +7576,7 @@ "node_modules/json-buffer": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", - "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==", - "dev": true + "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==" }, "node_modules/json-parse-even-better-errors": { "version": "2.3.1", @@ -7550,7 +7734,6 @@ "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", - "dev": true, "dependencies": { "json-buffer": "3.0.1" } @@ -7869,6 +8052,14 @@ "integrity": "sha512-BFRuQUqc7x2NWxfJBCyUrN8iYUYznzL9JROmRz1gZ6KlOIgmoD+njPVbb+VNn2nGMKggMsK79iUNErillsrx7w==", "optional": true }, + "node_modules/lowercase-keys": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-2.0.0.tgz", + "integrity": "sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==", + "engines": { + "node": ">=8" + } + }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -8054,17 +8245,6 @@ "node": ">=6" } }, - "node_modules/mimic-response": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", - "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/minimatch": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", @@ -8423,6 +8603,17 @@ "node": ">=0.10.0" } }, + "node_modules/normalize-url": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-6.1.0.tgz", + "integrity": "sha512-DlL+XwOy3NxAQ8xuC0okPgK46iuVNAK01YN7RueYBqqFeGsBjV9XmCAzAdgt+667bCl5kPh9EqKKDwnaPG1I7A==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/npm-run-path": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz", @@ -8632,6 +8823,14 @@ "node": ">= 0.8.0" } }, + "node_modules/p-cancelable": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz", + "integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==", + "engines": { + "node": ">=8" + } + }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -9243,6 +9442,11 @@ "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", "integrity": "sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ==" }, + "node_modules/psl": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", + "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==" + }, "node_modules/pump": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", @@ -9367,6 +9571,58 @@ } } }, + "node_modules/puppeteer-extra-plugin-page-proxy": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz", + "integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==", + "dependencies": { + "debug": "^4.1.1", + "got": "^11.8.5", + "http-proxy-agent": "^5.0.0", + "https-proxy-agent": "^5.0.1", + "puppeteer-extra-plugin": "^3.2.3", + "socks-proxy-agent": "^7.0.0", + "tough-cookie": "^4.1.2" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-page-proxy/node_modules/http-proxy-agent": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz", + "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==", + "dependencies": { + "@tootallnate/once": "2", + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/puppeteer-extra-plugin-page-proxy/node_modules/socks-proxy-agent": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz", + "integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==", + "dependencies": { + "agent-base": "^6.0.2", + "debug": "^4.3.3", + "socks": "^2.6.2" + }, + "engines": { + "node": ">= 10" + } + }, "node_modules/puppeteer-extra-plugin-stealth": { "version": "2.11.2", "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz", @@ -9457,6 +9713,44 @@ } } }, + "node_modules/puppeteer-page-proxy": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/puppeteer-page-proxy/-/puppeteer-page-proxy-1.3.0.tgz", + "integrity": "sha512-PDpLjJfcUKiLvUZ3yQJeUcP1d+7nW17s2LZIrKH0gyxEN4zTGkCvB9/HwquPgYq5YcVi8QugsvBckP/K9Vn/iw==", + "dependencies": { + "got": "^11.8.5", + "http-proxy-agent": "^5.0.0", + "https-proxy-agent": "^5.0.1", + "socks-proxy-agent": "^7.0.0", + "tough-cookie": "^4.1.2" + } + }, + "node_modules/puppeteer-page-proxy/node_modules/http-proxy-agent": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz", + "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==", + "dependencies": { + "@tootallnate/once": "2", + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/puppeteer-page-proxy/node_modules/socks-proxy-agent": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz", + "integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==", + "dependencies": { + "agent-base": "^6.0.2", + "debug": "^4.3.3", + "socks": "^2.6.2" + }, + "engines": { + "node": ">= 10" + } + }, "node_modules/pure-rand": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", @@ -9523,6 +9817,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/querystringify": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz", + "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==" + }, "node_modules/queue-microtask": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", @@ -9554,6 +9853,17 @@ "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==", "optional": true }, + "node_modules/quick-lru": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz", + "integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/range-parser": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", @@ -9709,6 +10019,11 @@ "node": ">=0.10.0" } }, + "node_modules/requires-port": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", + "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==" + }, "node_modules/resolve": { "version": "1.22.8", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", @@ -9726,6 +10041,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/resolve-alpn": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/resolve-alpn/-/resolve-alpn-1.2.1.tgz", + "integrity": "sha512-0a1F4l73/ZFZOakJnQ3FvkJ2+gSTQWz/r2KE5OdDY0TxPm5h4GkqkWWfM47T7HsbnOtcJVEF4epCVy6u7Q3K+g==" + }, "node_modules/resolve-cwd": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz", @@ -9767,6 +10087,17 @@ "node": ">=10" } }, + "node_modules/responselike": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/responselike/-/responselike-2.0.1.tgz", + "integrity": "sha512-4gl03wn3hj1HP3yzgdI7d3lCkF95F21Pz4BPGvKHinyQzALR5CapwC8yIi0Rh58DEMQ/SguC03wFj2k0M/mHhw==", + "dependencies": { + "lowercase-keys": "^2.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/retry": { "version": "0.13.1", "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", @@ -10029,6 +10360,11 @@ "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==" }, + "node_modules/set-cookie-parser": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.6.0.tgz", + "integrity": "sha512-RVnVQxTXuerk653XfuliOxBP81Sf0+qfQE73LIYKcyMYHG94AuH0kgrQpRDuTZnSmjpysHmzxJXKNfa6PjFhyQ==" + }, "node_modules/set-function-length": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", @@ -10767,6 +11103,28 @@ "node": ">=0.6" } }, + "node_modules/tough-cookie": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz", + "integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==", + "dependencies": { + "psl": "^1.1.33", + "punycode": "^2.1.1", + "universalify": "^0.2.0", + "url-parse": "^1.5.3" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/tough-cookie/node_modules/universalify": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz", + "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==", + "engines": { + "node": ">= 4.0.0" + } + }, "node_modules/tr46": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz", @@ -11127,6 +11485,15 @@ "punycode": "^2.1.0" } }, + "node_modules/url-parse": { + "version": "1.5.10", + "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz", + "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==", + "dependencies": { + "querystringify": "^2.1.1", + "requires-port": "^1.0.0" + } + }, "node_modules/urlpattern-polyfill": { "version": "10.0.0", "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz", diff --git a/backend/functions/package.json b/backend/functions/package.json index b0acf7b..13d8283 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -49,7 +49,10 @@ "puppeteer": "^22.6.3", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", + "puppeteer-extra-plugin-page-proxy": "^2.0.0", "puppeteer-extra-plugin-stealth": "^2.11.2", + "puppeteer-page-proxy": "^1.3.0", + "set-cookie-parser": "^2.6.0", "stripe": "^11.11.0", "tiktoken": "^1.0.10", "turndown": "^7.1.3", @@ -62,6 +65,7 @@ "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", "@types/node": "^18", + "@types/set-cookie-parser": "^2.4.7", "@typescript-eslint/eslint-plugin": "^5.12.0", "@typescript-eslint/parser": "^5.12.0", "eslint": "^8.9.0", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 46c21a7..b1904da 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -1,51 +1,25 @@ -import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError, ParamValidationError } from 'civkit'; +import { + assignTransferProtocolMeta, marshalErrorLike, + RPCHost, RPCReflection, + HashManager, + AssertionFailureError, ParamValidationError, +} from 'civkit'; import { singleton } from 'tsyringe'; -import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared'; +import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import _ from 'lodash'; -import { PageSnapshot, PuppeteerControl } from '../services/puppeteer'; +import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { Request, Response } from 'express'; import normalizeUrl from "@esm2cjs/normalize-url"; import { AltTextService } from '../services/alt-text'; import TurndownService from 'turndown'; +import { parseString as parseSetCookieString } from 'set-cookie-parser'; +import { CookieParam } from 'puppeteer'; +import { Crawled } from '../db/crawled'; +import { tidyMarkdown } from '../utils/markdown'; +import { cleanAttribute } from '../utils/misc'; +import { randomUUID } from 'crypto'; -function tidyMarkdown(markdown: string): string { - - // Step 1: Handle complex broken links with text and optional images spread across multiple lines - let normalizedMarkdown = markdown.replace(/\[\s*([^]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => { - // Remove internal new lines and excessive spaces within the text - text = text.replace(/\s+/g, ' ').trim(); - url = url.replace(/\s+/g, '').trim(); - return `[${text}](${url})`; - }); - - normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => { - // Normalize by removing excessive spaces and new lines - text = text.replace(/\s+/g, ' ').trim(); - alt = alt ? alt.replace(/\s+/g, ' ').trim() : ''; - imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : ''; - linkUrl = linkUrl.replace(/\s+/g, '').trim(); - if (imgUrl) { - return `[${text} ![${alt}](${imgUrl})](${linkUrl})`; - } else { - return `[${text}](${linkUrl})`; - } - }); - - // Step 2: Normalize regular links that may be broken across lines - normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => { - text = text.replace(/\s+/g, ' ').trim(); - url = url.replace(/\s+/g, '').trim(); - return `[${text}](${url})`; - }); - - // Step 3: Replace more than two consecutive empty lines with exactly two empty lines - normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n'); - - // Step 4: Remove leading spaces from each line - normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, ''); - - return normalizedMarkdown.trim(); -} +const md5Hasher = new HashManager('md5', 'hex'); @singleton() export class CrawlerHost extends RPCHost { @@ -53,12 +27,29 @@ export class CrawlerHost extends RPCHost { turnDownPlugins = [require('turndown-plugin-gfm').tables]; + cacheRetentionMs = 1000 * 3600 * 24 * 7; + cacheValidMs = 1000 * 300; + urlValidMs = 1000 * 3600 * 4; + constructor( protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, protected altTextService: AltTextService, + protected firebaseObjectStorage: FirebaseStorageBucketControl, ) { super(...arguments); + + puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => { + if (!snapshot.title?.trim()) { + return; + } + if (options.cookies?.length) { + // Potential privacy issue, dont cache if cookies are used + return; + } + + await this.setToCache(options.url, snapshot); + }); } override async init() { @@ -67,16 +58,51 @@ export class CrawlerHost extends RPCHost { this.emit('ready'); } - async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) { + async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & { + screenshotUrl?: string; + }, nominalUrl?: URL) { + if (mode === 'screenshot') { + if (snapshot.screenshot && !snapshot.screenshotUrl) { + const fid = `instant-screenshots/${randomUUID()}`; + await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, { + metadata: { + contentType: 'image/png', + } + }); + snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs); + } + + return { + screenshotUrl: snapshot.screenshotUrl, + toString() { + return this.screenshotUrl; + } + }; + } + if (mode === 'html') { + return { + html: snapshot.html, + toString() { + return this.html; + } + }; + } + if (mode === 'text') { + return { + text: snapshot.text, + toString() { + return this.text; + } + }; + } + const toBeTurnedToMd = snapshot.parsed?.content; let turnDownService = new TurndownService(); for (const plugin of this.turnDownPlugins) { turnDownService = turnDownService.use(plugin); } - - let contentText = ''; - if (toBeTurnedToMd) { - const urlToAltMap: { [k: string]: string | undefined; } = {}; + const urlToAltMap: { [k: string]: string | undefined; } = {}; + if (snapshot.imgs?.length) { const tasks = (snapshot.imgs || []).map(async (x) => { const r = await this.altTextService.getAltText(x).catch((err: any) => { this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) }); @@ -88,25 +114,27 @@ export class CrawlerHost extends RPCHost { }); await Promise.all(tasks); - let imgIdx = 0; - - turnDownService.addRule('img-generated-alt', { - filter: 'img', - replacement: (_content, node) => { - const src = (node.getAttribute('src') || '').trim(); - const alt = cleanAttribute(node.getAttribute('alt')); - if (!src) { - return ''; - } - const mapped = urlToAltMap[src]; - imgIdx++; - if (mapped) { - return `![Image ${imgIdx}: ${mapped || alt}](${src})`; - } - return `![Image ${imgIdx}: ${alt}](${src})`; + } + let imgIdx = 0; + turnDownService.addRule('img-generated-alt', { + filter: 'img', + replacement: (_content, node) => { + const src = (node.getAttribute('src') || '').trim(); + const alt = cleanAttribute(node.getAttribute('alt')); + if (!src) { + return ''; } - }); + const mapped = urlToAltMap[src]; + imgIdx++; + if (mapped) { + return `![Image ${imgIdx}: ${mapped || alt}](${src})`; + } + return `![Image ${imgIdx}: ${alt}](${src})`; + } + }); + let contentText = ''; + if (toBeTurnedToMd) { try { contentText = turnDownService.turndown(toBeTurnedToMd).trim(); } catch (err) { @@ -141,7 +169,7 @@ export class CrawlerHost extends RPCHost { const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), - url: nominalUrl || snapshot.href?.trim(), + url: nominalUrl?.toString() || snapshot.href?.trim(), content: cleanText, publishedTime: snapshot.parsed?.publishedTime || undefined, @@ -171,6 +199,7 @@ ${this.content} timeoutSeconds: 540, concurrency: 4, }, + tags: ['Crawler'], httpMethod: ['get', 'post'], returnType: [String, OutputServerEventStream], }) @@ -181,6 +210,57 @@ ${this.content} concurrency: 21, maxInstances: 476, }, + openapi: { + operation: { + parameters: { + 'Accept': { + description: `Specifies your preference for the response format. \n\n` + + `Supported formats:\n` + + `- text/event-stream\n` + + `- application/json or text/json\n` + + `- text/plain` + , + in: 'header', + schema: { type: 'string' } + }, + 'X-No-Cache': { + description: `Ignores internal cache if this header is specified with a value.`, + in: 'header', + schema: { type: 'string' } + }, + 'X-Respond-With': { + description: `Specifies the form factor of the crawled data you prefer. \n\n` + + `Supported formats:\n` + + `- markdown\n` + + `- html\n` + + `- text\n` + + `- screenshot\n\n` + + `Defaults to: markdown` + , + in: 'header', + schema: { type: 'string' } + }, + 'X-Proxy-Url': { + description: `Specifies your custom proxy if you prefer to use one. \n\n` + + `Supported protocols:\n` + + `- http\n` + + `- https\n` + + `- socks4\n` + + `- socks5\n\n` + + `For authentication, https://user:pass@host:port`, + in: 'header', + schema: { type: 'string' } + }, + 'X-Set-Cookie': { + description: `Sets cookie(s) to the headless browser for your request. \n\n` + + `Syntax is the same with standard Set-Cookie`, + in: 'header', + schema: { type: 'string' } + }, + } + } + }, + tags: ['Crawler'], httpMethod: ['get', 'post'], returnType: [String, OutputServerEventStream], }) @@ -207,27 +287,41 @@ ${this.content} path: 'url' }); } - const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']); - const noCache = Boolean(ctx.req.headers['x-no-cache']); + + const customMode = ctx.req.get('x-respond-with') || 'markdown'; + const noCache = Boolean(ctx.req.get('x-no-cache')); + const cookies: CookieParam[] = []; + const setCookieHeaders = ctx.req.headers['x-set-cookie']; + if (Array.isArray(setCookieHeaders)) { + for (const setCookie of setCookieHeaders) { + cookies.push({ + ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam, + domain: urlToCrawl.hostname, + }); + } + } else if (setCookieHeaders) { + cookies.push({ + ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam, + domain: urlToCrawl.hostname, + }); + } + + const crawlOpts: ScrappingOptions = { + proxyUrl: ctx.req.get('x-proxy-url'), + cookies, + }; if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); rpcReflect.return(sseStream); try { - for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { + for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) { if (!scrapped) { continue; } - const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString()); - - if (scrapped.screenshot && screenshotEnabled) { - sseStream.write({ - event: 'screenshot', - data: scrapped.screenshot.toString('base64'), - }); - } + const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl); sseStream.write({ event: 'data', @@ -235,7 +329,7 @@ ${this.content} }); } } catch (err: any) { - this.logger.error(`Failed to crawl ${urlToCrawl.toString()}`, { err: marshalErrorLike(err) }); + this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) }); sseStream.write({ event: 'error', data: marshalErrorLike(err), @@ -249,13 +343,13 @@ ${this.content} let lastScrapped; if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { - for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { + for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) { lastScrapped = scrapped; if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { continue; } - const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString()); + const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl); return formatted; } @@ -264,16 +358,22 @@ ${this.content} throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); } - return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString()); + return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl); } - for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { + for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) { lastScrapped = scrapped; if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { continue; } - const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString()); + const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl); + if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { + + return assignTransferProtocolMeta(`${formatted}`, + { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } + ); + } return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); } @@ -282,12 +382,111 @@ ${this.content} throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); } - return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`; + const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl); + if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { + + return assignTransferProtocolMeta(`${formatted}`, + { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } + ); + } + + return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); } + getUrlDigest(urlToCrawl: URL) { + const normalizedURL = new URL(urlToCrawl); + normalizedURL.hash = ''; + const normalizedUrl = normalizedURL.toString().toLowerCase(); + const digest = md5Hasher.hash(normalizedUrl.toString()); + + return digest; + } + + async queryCache(urlToCrawl: URL) { + const digest = this.getUrlDigest(urlToCrawl); + + const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0]; + + if (cache) { + const age = Date.now() - cache.createdAt.valueOf(); + const stale = cache.createdAt.valueOf() > (Date.now() - this.cacheValidMs); + this.logger.info(`${stale ? 'Only stale ' : ''}Cache exists for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, { + url: urlToCrawl, digest, age, stale + }); + + const r = cache.snapshot; + + return { + isFresh: !stale, + snapshot: { + ...r, + screenshot: undefined, + screenshotUrl: cache.screenshotAvailable ? + await this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs) : undefined, + } as PageSnapshot & { screenshotUrl?: string; } + }; + } + + return undefined; + } + + async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) { + const digest = this.getUrlDigest(urlToCrawl); + + this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href }); + const nowDate = new Date(); + + const cache = Crawled.from({ + _id: randomUUID(), + url: urlToCrawl.toString(), + createdAt: nowDate, + expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), + urlPathDigest: digest, + snapshot: { + ...snapshot, + screenshot: null + }, + }); + + if (snapshot.screenshot) { + await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, { + metadata: { + contentType: 'image/png', + } + }); + cache.screenshotAvailable = true; + } + const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { + this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); + + return undefined; + }); + + return r; + } + + async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) { + let cache; + if (!noCache && !crawlOpts.cookies?.length) { + cache = await this.queryCache(urlToCrawl); + } + + if (cache?.isFresh) { + yield cache.snapshot; + + return; + } + + try { + yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts); + } catch (err: any) { + if (cache) { + this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) }); + yield cache.snapshot; + return; + } + throw err; + } + } } - -function cleanAttribute(attribute: string) { - return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; -} diff --git a/backend/functions/src/db/crawled.ts b/backend/functions/src/db/crawled.ts index 60801c5..f05ad1c 100644 --- a/backend/functions/src/db/crawled.ts +++ b/backend/functions/src/db/crawled.ts @@ -1,6 +1,7 @@ import { Also, parseJSONText, Prop } from 'civkit'; import { FirestoreRecord } from '../shared/lib/firestore'; import _ from 'lodash'; +import type { PageSnapshot } from '../services/puppeteer'; @Also({ dictOf: Object @@ -21,7 +22,10 @@ export class Crawled extends FirestoreRecord { urlPathDigest!: string; @Prop() - snapshot!: any; + snapshot!: PageSnapshot & { screenshot: never; }; + + @Prop() + screenshotAvailable?: boolean; @Prop() createdAt!: Date; diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 1c1e8b7..4be3a6d 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -1,13 +1,17 @@ -import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit'; -import { container, singleton } from 'tsyringe'; -import type { Browser, Page } from 'puppeteer'; -import { Logger } from '../shared/services/logger'; -import genericPool from 'generic-pool'; import os from 'os'; import fs from 'fs'; -import { Crawled } from '../db/crawled'; +import { container, singleton } from 'tsyringe'; +import genericPool from 'generic-pool'; +import { AsyncService, Defer, marshalErrorLike, AssertionFailureError } from 'civkit'; +import { Logger } from '../shared/services/logger'; + +import type { Browser, CookieParam, Page } from 'puppeteer'; import puppeteer from 'puppeteer-extra'; +import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; +import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy'; + + const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); export interface ImgBrief { @@ -42,7 +46,12 @@ export interface PageSnapshot { screenshot?: Buffer; imgs?: ImgBrief[]; } -const md5Hasher = new HashManager('md5', 'hex'); + +export interface ScrappingOptions { + proxyUrl?: string; + cookies?: CookieParam[]; +} + const puppeteerStealth = require('puppeteer-extra-plugin-stealth'); puppeteer.use(puppeteerStealth()); @@ -51,9 +60,13 @@ puppeteer.use(puppeteerStealth()); // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`, // platform: `Linux`, // })) -const puppeteerBlockResources = require('puppeteer-extra-plugin-block-resources'); + puppeteer.use(puppeteerBlockResources({ blockedTypes: new Set(['media']), + interceptResolutionPriority: 1, +})); +puppeteer.use(puppeteerPageProxy({ + interceptResolutionPriority: 1, })); @singleton() @@ -74,7 +87,7 @@ export class PuppeteerControl extends AsyncService { return page.browser().connected && !page.isClosed(); } }, { - max: Math.max(1 + Math.floor(os.freemem() / (1024 * 1024 * 1024)), 16), + max: Math.max(1 + Math.floor(os.totalmem() / (384 * 1024 * 1024)), 16), min: 1, acquireTimeoutMillis: 60_000, testOnBorrow: true, @@ -88,7 +101,7 @@ export class PuppeteerControl extends AsyncService { override async init() { await this.dependencyReady(); - + this.logger.info(`PuppeteerControl initializing with pool size ${this.pagePool.max}`, { poolSize: this.pagePool.max }); this.pagePool.start(); if (this.browser) { @@ -128,7 +141,10 @@ export class PuppeteerControl extends AsyncService { // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); preparations.push(page.setBypassCSP(true)); preparations.push(page.setViewport({ width: 1024, height: 1024 })); - preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => { + preparations.push(page.exposeFunction('reportSnapshot', (snapshot: PageSnapshot) => { + if (snapshot.href === 'about:blank') { + return; + } page.emit('snapshot', snapshot); })); preparations.push(page.evaluateOnNewDocument(READABILITY_JS)); @@ -166,40 +182,39 @@ function giveSnapshot() { const elem = document.createElement('div'); elem.innerHTML = parsed.content; r.imgs = briefImgs(elem); + } else { + const allImgs = briefImgs(); + if (allImgs.length === 1) { + r.imgs = allImgs; + } } return r; } `)); - preparations.push(page.evaluateOnNewDocument(() => { - let aftershot: any; - const handlePageLoad = () => { - // @ts-expect-error - if (document.readyState !== 'complete' && document.readyState !== 'interactive') { - return; - } - // @ts-expect-error - const parsed = giveSnapshot(); - if (parsed) { - // @ts-expect-error - window.reportSnapshot(parsed); - } else { - if (aftershot) { - clearTimeout(aftershot); - } - aftershot = setTimeout(() => { - // @ts-expect-error - window.reportSnapshot(giveSnapshot()); - }, 500); - } - }; - // setInterval(handlePageLoad, 1000); - // @ts-expect-error - document.addEventListener('readystatechange', handlePageLoad); - // @ts-expect-error - document.addEventListener('load', handlePageLoad); - })); - + preparations.push(page.evaluateOnNewDocument(` +let aftershot = undefined; +const handlePageLoad = () => { + if (document.readyState !== 'complete') { + return; + } + const parsed = giveSnapshot(); + window.reportSnapshot(parsed); + if (!parsed.text) { + if (aftershot) { + clearTimeout(aftershot); + } + aftershot = setTimeout(() => { + const r = giveSnapshot(); + if (r && r.text) { + window.reportSnapshot(r); + } + }, 500); + } +}; +document.addEventListener('readystatechange', handlePageLoad); +document.addEventListener('load', handlePageLoad); +`)); await Promise.all(preparations); // TODO: further setup the page; @@ -207,41 +222,23 @@ function giveSnapshot() { return page; } - async *scrap(url: string, noCache: string | boolean = false): AsyncGenerator { - const parsedUrl = new URL(url); + async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator { // parsedUrl.search = ''; - parsedUrl.hash = ''; - const normalizedUrl = parsedUrl.toString().toLowerCase(); - const digest = md5Hasher.hash(normalizedUrl); - this.logger.info(`Scraping ${url}, normalized digest: ${digest}`, { url, digest }); + const url = parsedUrl.toString(); + + this.logger.info(`Scraping ${url}`, { url }); let snapshot: PageSnapshot | undefined; let screenshot: Buffer | undefined; - if (!noCache) { - const cached = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0]; - - if (cached && cached.createdAt.valueOf() > (Date.now() - 1000 * 300)) { - const age = Date.now() - cached.createdAt.valueOf(); - this.logger.info(`Cache hit for ${url}, normalized digest: ${digest}, ${age}ms old`, { url, digest, age }); - snapshot = { - ...cached.snapshot - }; - if (snapshot) { - delete snapshot.screenshot; - } - - screenshot = cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined; - yield { - ...cached.snapshot, - screenshot: cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined - }; - - return; - } + const page = await this.pagePool.acquire(); + if (options.proxyUrl) { + await page.useProxy(options.proxyUrl); + } + if (options.cookies) { + await page.setCookie(...options.cookies); } - const page = await this.pagePool.acquire(); let nextSnapshotDeferred = Defer(); let finalized = false; const hdl = (s: any) => { @@ -262,48 +259,43 @@ function giveSnapshot() { cause: err, })); }).finally(async () => { - finalized = true; if (!snapshot?.html) { + finalized = true; return; } - screenshot = await page.screenshot({ - type: 'jpeg', - quality: 75, - }); snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; + screenshot = await page.screenshot(); if (!snapshot.title || !snapshot.parsed?.content) { const salvaged = await this.salvage(url, page); if (salvaged) { - screenshot = await page.screenshot({ - type: 'jpeg', - quality: 75, - }); snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; + screenshot = await page.screenshot(); } } - this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href }); - const nowDate = new Date(); - Crawled.save( - Crawled.from({ - url, - createdAt: nowDate, - expireAt: new Date(nowDate.valueOf() + 1000 * 3600 * 24 * 7), - urlPathDigest: digest, - snapshot: { ...snapshot, screenshot: screenshot?.toString('base64') || '' }, - }).degradeForFireStore() - ).catch((err) => { - this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) }); - }); + finalized = true; + this.logger.info(`Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); + this.emit( + 'crawled', + { ...snapshot, screenshot }, + { ...options, url: parsedUrl } + ); }); try { + let lastHTML = snapshot?.html; while (true) { await Promise.race([nextSnapshotDeferred.promise, gotoPromise]); if (finalized) { yield { ...snapshot, screenshot } as PageSnapshot; break; } - yield snapshot; + if (snapshot?.title && snapshot?.html !== lastHTML) { + screenshot = await page.screenshot(); + lastHTML = snapshot.html; + } + if (snapshot || screenshot) { + yield { ...snapshot, screenshot } as PageSnapshot; + } } } finally { gotoPromise.finally(() => { @@ -333,6 +325,8 @@ function giveSnapshot() { this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) }); }); + this.logger.info(`Salvation completed.`); + return true; } } diff --git a/backend/functions/src/utils/markdown.ts b/backend/functions/src/utils/markdown.ts new file mode 100644 index 0000000..73dfa39 --- /dev/null +++ b/backend/functions/src/utils/markdown.ts @@ -0,0 +1,39 @@ + +export function tidyMarkdown(markdown: string): string { + + // Step 1: Handle complex broken links with text and optional images spread across multiple lines + let normalizedMarkdown = markdown.replace(/\[\s*([^]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => { + // Remove internal new lines and excessive spaces within the text + text = text.replace(/\s+/g, ' ').trim(); + url = url.replace(/\s+/g, '').trim(); + return `[${text}](${url})`; + }); + + normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => { + // Normalize by removing excessive spaces and new lines + text = text.replace(/\s+/g, ' ').trim(); + alt = alt ? alt.replace(/\s+/g, ' ').trim() : ''; + imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : ''; + linkUrl = linkUrl.replace(/\s+/g, '').trim(); + if (imgUrl) { + return `[${text} ![${alt}](${imgUrl})](${linkUrl})`; + } else { + return `[${text}](${linkUrl})`; + } + }); + + // Step 2: Normalize regular links that may be broken across lines + normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => { + text = text.replace(/\s+/g, ' ').trim(); + url = url.replace(/\s+/g, '').trim(); + return `[${text}](${url})`; + }); + + // Step 3: Replace more than two consecutive empty lines with exactly two empty lines + normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n'); + + // Step 4: Remove leading spaces from each line + normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, ''); + + return normalizedMarkdown.trim(); +} diff --git a/backend/functions/src/utils/misc.ts b/backend/functions/src/utils/misc.ts new file mode 100644 index 0000000..a522fc7 --- /dev/null +++ b/backend/functions/src/utils/misc.ts @@ -0,0 +1,3 @@ +export function cleanAttribute(attribute: string) { + return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; +}