Merge branch 'main' of github.com:jina-ai/url2text

This commit is contained in:
yanlong.wang 2024-04-24 19:21:50 +08:00
commit ae99af50aa
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
8 changed files with 804 additions and 193 deletions

View File

@ -13,7 +13,8 @@
".git",
"*.log",
"*.local",
".secret.*"
".secret.*",
".firebase-emu"
],
"predeploy": [
"npm --prefix \"$RESOURCE_DIR\" run build:clean",

View File

@ -29,7 +29,10 @@
"puppeteer": "^22.6.3",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-page-proxy": "^1.3.0",
"set-cookie-parser": "^2.6.0",
"stripe": "^11.11.0",
"tiktoken": "^1.0.10",
"turndown": "^7.1.3",
@ -42,6 +45,7 @@
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^18",
"@types/set-cookie-parser": "^2.4.7",
"@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0",
@ -1986,6 +1990,17 @@
"dev": true,
"peer": true
},
"node_modules/@sindresorhus/is": {
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-4.6.0.tgz",
"integrity": "sha512-t09vSN3MdfsyCHoFcTRCH/iUtG7OJ0CsjzB8cjAmKc/va/kIgeDI/TxsigdncE/4be734m0cvIYwNaV4i2XqAw==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sindresorhus/is?sponsor=1"
}
},
"node_modules/@sinonjs/commons": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz",
@ -2006,6 +2021,17 @@
"@sinonjs/commons": "^3.0.0"
}
},
"node_modules/@szmarczak/http-timer": {
"version": "4.0.6",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
"integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
"dependencies": {
"defer-to-connect": "^2.0.0"
},
"engines": {
"node": ">=10"
}
},
"node_modules/@tootallnate/once": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz",
@ -2091,6 +2117,17 @@
"@types/node": "*"
}
},
"node_modules/@types/cacheable-request": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
"integrity": "sha512-IQ3EbTzGxIigb1I3qPZc1rWJnH0BmSKv5QYTalEwweFvyBDLSAe24zP0le/hyi7ecGfZVlIVAg4BZqb8WBwKqw==",
"dependencies": {
"@types/http-cache-semantics": "*",
"@types/keyv": "^3.1.4",
"@types/node": "*",
"@types/responselike": "^1.0.0"
}
},
"node_modules/@types/caseless": {
"version": "0.12.5",
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz",
@ -2161,6 +2198,11 @@
"@types/node": "*"
}
},
"node_modules/@types/http-cache-semantics": {
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
"integrity": "sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA=="
},
"node_modules/@types/http-errors": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
@ -2213,6 +2255,14 @@
"@types/node": "*"
}
},
"node_modules/@types/keyv": {
"version": "3.1.4",
"resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
"integrity": "sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/lodash": {
"version": "4.17.0",
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
@ -2294,6 +2344,14 @@
"node": ">= 0.12"
}
},
"node_modules/@types/responselike": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/@types/responselike/-/responselike-1.0.3.tgz",
"integrity": "sha512-H/+L+UkTV33uf49PH5pCAUBVPNj2nDBXTN+qS1dOwyyg24l3CcicicCA7ca+HMvJBZcFgl5r8e+RR6elsb4Lyw==",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/semver": {
"version": "7.5.8",
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.8.tgz",
@ -2319,6 +2377,15 @@
"@types/send": "*"
}
},
"node_modules/@types/set-cookie-parser": {
"version": "2.4.7",
"resolved": "https://registry.npmjs.org/@types/set-cookie-parser/-/set-cookie-parser-2.4.7.tgz",
"integrity": "sha512-+ge/loa0oTozxip6zmhRIk8Z/boU51wl9Q6QdLZcokIGMzY5lFXYy/x7Htj2HTC6/KZP1hUbZ1ekx8DYXICvWg==",
"dev": true,
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/stack-utils": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz",
@ -3447,6 +3514,45 @@
"node": ">= 6.0.0"
}
},
"node_modules/cacheable-lookup": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
"integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
"engines": {
"node": ">=10.6.0"
}
},
"node_modules/cacheable-request": {
"version": "7.0.4",
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-7.0.4.tgz",
"integrity": "sha512-v+p6ongsrp0yTGbJXjgxPow2+DL93DASP4kXCDKb8/bwRtt9OEF3whggkkDkGNzgcWy2XaF4a8nZglC7uElscg==",
"dependencies": {
"clone-response": "^1.0.2",
"get-stream": "^5.1.0",
"http-cache-semantics": "^4.0.0",
"keyv": "^4.0.0",
"lowercase-keys": "^2.0.0",
"normalize-url": "^6.0.1",
"responselike": "^2.0.0"
},
"engines": {
"node": ">=8"
}
},
"node_modules/cacheable-request/node_modules/get-stream": {
"version": "5.2.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
"integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
"dependencies": {
"pump": "^3.0.0"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/call-bind": {
"version": "1.0.7",
"resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz",
@ -3642,6 +3748,25 @@
"node": ">=0.10.0"
}
},
"node_modules/clone-response": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.3.tgz",
"integrity": "sha512-ROoL94jJH2dUVML2Y/5PEDNaSHgeOdSDicUyS7izcF63G6sTc/FTjLub4b8Il9S8S0beOfYt0TaA5qvFK+w0wA==",
"dependencies": {
"mimic-response": "^1.0.0"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/clone-response/node_modules/mimic-response": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
"integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==",
"engines": {
"node": ">=4"
}
},
"node_modules/co": {
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@ -4003,6 +4128,17 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/decompress-response/node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/dedent": {
"version": "1.5.3",
"resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz",
@ -4046,6 +4182,14 @@
"node": ">=0.10.0"
}
},
"node_modules/defer-to-connect": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-2.0.1.tgz",
"integrity": "sha512-4tvttepXG1VaYGrRibk5EwJd1t4udunSOVMdLSAL6mId1ix438oPwPZMALY41FCijukO1L0twNcGsdzS7dHgDg==",
"engines": {
"node": ">=10"
}
},
"node_modules/define-data-property": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
@ -5837,6 +5981,30 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/got": {
"version": "11.8.6",
"resolved": "https://registry.npmjs.org/got/-/got-11.8.6.tgz",
"integrity": "sha512-6tfZ91bOr7bOXnK7PRDCGBLa1H4U080YHNaAQ2KsMGlLEzRbk44nsZF2E1IeRc3vtJHPVbKCYgdFbaGO2ljd8g==",
"dependencies": {
"@sindresorhus/is": "^4.0.0",
"@szmarczak/http-timer": "^4.0.5",
"@types/cacheable-request": "^6.0.1",
"@types/responselike": "^1.0.0",
"cacheable-lookup": "^5.0.3",
"cacheable-request": "^7.0.2",
"decompress-response": "^6.0.0",
"http2-wrapper": "^1.0.0-beta.5.2",
"lowercase-keys": "^2.0.0",
"p-cancelable": "^2.0.0",
"responselike": "^2.0.0"
},
"engines": {
"node": ">=10.19.0"
},
"funding": {
"url": "https://github.com/sindresorhus/got?sponsor=1"
}
},
"node_modules/graceful-fs": {
"version": "4.2.11",
"resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
@ -6024,6 +6192,11 @@
"node": ">= 0.6"
}
},
"node_modules/http-cache-semantics": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz",
"integrity": "sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ=="
},
"node_modules/http-errors": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@ -6067,6 +6240,18 @@
"node": ">= 14"
}
},
"node_modules/http2-wrapper": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
"integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
"dependencies": {
"quick-lru": "^5.1.1",
"resolve-alpn": "^1.0.0"
},
"engines": {
"node": ">=10.19.0"
}
},
"node_modules/https-proxy-agent": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
@ -7391,8 +7576,7 @@
"node_modules/json-buffer": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz",
"integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==",
"dev": true
"integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="
},
"node_modules/json-parse-even-better-errors": {
"version": "2.3.1",
@ -7550,7 +7734,6 @@
"version": "4.5.4",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
"integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==",
"dev": true,
"dependencies": {
"json-buffer": "3.0.1"
}
@ -7869,6 +8052,14 @@
"integrity": "sha512-BFRuQUqc7x2NWxfJBCyUrN8iYUYznzL9JROmRz1gZ6KlOIgmoD+njPVbb+VNn2nGMKggMsK79iUNErillsrx7w==",
"optional": true
},
"node_modules/lowercase-keys": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-2.0.0.tgz",
"integrity": "sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==",
"engines": {
"node": ">=8"
}
},
"node_modules/lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
@ -8054,17 +8245,6 @@
"node": ">=6"
}
},
"node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimatch": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
@ -8423,6 +8603,17 @@
"node": ">=0.10.0"
}
},
"node_modules/normalize-url": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-6.1.0.tgz",
"integrity": "sha512-DlL+XwOy3NxAQ8xuC0okPgK46iuVNAK01YN7RueYBqqFeGsBjV9XmCAzAdgt+667bCl5kPh9EqKKDwnaPG1I7A==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/npm-run-path": {
"version": "4.0.1",
"resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
@ -8632,6 +8823,14 @@
"node": ">= 0.8.0"
}
},
"node_modules/p-cancelable": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
"engines": {
"node": ">=8"
}
},
"node_modules/p-limit": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
@ -9243,6 +9442,11 @@
"resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz",
"integrity": "sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ=="
},
"node_modules/psl": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz",
"integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag=="
},
"node_modules/pump": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
@ -9367,6 +9571,58 @@
}
}
},
"node_modules/puppeteer-extra-plugin-page-proxy": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz",
"integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==",
"dependencies": {
"debug": "^4.1.1",
"got": "^11.8.5",
"http-proxy-agent": "^5.0.0",
"https-proxy-agent": "^5.0.1",
"puppeteer-extra-plugin": "^3.2.3",
"socks-proxy-agent": "^7.0.0",
"tough-cookie": "^4.1.2"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-page-proxy/node_modules/http-proxy-agent": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
"integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
"dependencies": {
"@tootallnate/once": "2",
"agent-base": "6",
"debug": "4"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/puppeteer-extra-plugin-page-proxy/node_modules/socks-proxy-agent": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz",
"integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==",
"dependencies": {
"agent-base": "^6.0.2",
"debug": "^4.3.3",
"socks": "^2.6.2"
},
"engines": {
"node": ">= 10"
}
},
"node_modules/puppeteer-extra-plugin-stealth": {
"version": "2.11.2",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
@ -9457,6 +9713,44 @@
}
}
},
"node_modules/puppeteer-page-proxy": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/puppeteer-page-proxy/-/puppeteer-page-proxy-1.3.0.tgz",
"integrity": "sha512-PDpLjJfcUKiLvUZ3yQJeUcP1d+7nW17s2LZIrKH0gyxEN4zTGkCvB9/HwquPgYq5YcVi8QugsvBckP/K9Vn/iw==",
"dependencies": {
"got": "^11.8.5",
"http-proxy-agent": "^5.0.0",
"https-proxy-agent": "^5.0.1",
"socks-proxy-agent": "^7.0.0",
"tough-cookie": "^4.1.2"
}
},
"node_modules/puppeteer-page-proxy/node_modules/http-proxy-agent": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
"integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
"dependencies": {
"@tootallnate/once": "2",
"agent-base": "6",
"debug": "4"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/puppeteer-page-proxy/node_modules/socks-proxy-agent": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz",
"integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==",
"dependencies": {
"agent-base": "^6.0.2",
"debug": "^4.3.3",
"socks": "^2.6.2"
},
"engines": {
"node": ">= 10"
}
},
"node_modules/pure-rand": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz",
@ -9523,6 +9817,11 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/querystringify": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz",
"integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ=="
},
"node_modules/queue-microtask": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
@ -9554,6 +9853,17 @@
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
"optional": true
},
"node_modules/quick-lru": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/range-parser": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
@ -9709,6 +10019,11 @@
"node": ">=0.10.0"
}
},
"node_modules/requires-port": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
"integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
},
"node_modules/resolve": {
"version": "1.22.8",
"resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz",
@ -9726,6 +10041,11 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/resolve-alpn": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/resolve-alpn/-/resolve-alpn-1.2.1.tgz",
"integrity": "sha512-0a1F4l73/ZFZOakJnQ3FvkJ2+gSTQWz/r2KE5OdDY0TxPm5h4GkqkWWfM47T7HsbnOtcJVEF4epCVy6u7Q3K+g=="
},
"node_modules/resolve-cwd": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz",
@ -9767,6 +10087,17 @@
"node": ">=10"
}
},
"node_modules/responselike": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/responselike/-/responselike-2.0.1.tgz",
"integrity": "sha512-4gl03wn3hj1HP3yzgdI7d3lCkF95F21Pz4BPGvKHinyQzALR5CapwC8yIi0Rh58DEMQ/SguC03wFj2k0M/mHhw==",
"dependencies": {
"lowercase-keys": "^2.0.0"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/retry": {
"version": "0.13.1",
"resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz",
@ -10029,6 +10360,11 @@
"resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
"integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw=="
},
"node_modules/set-cookie-parser": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.6.0.tgz",
"integrity": "sha512-RVnVQxTXuerk653XfuliOxBP81Sf0+qfQE73LIYKcyMYHG94AuH0kgrQpRDuTZnSmjpysHmzxJXKNfa6PjFhyQ=="
},
"node_modules/set-function-length": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
@ -10767,6 +11103,28 @@
"node": ">=0.6"
}
},
"node_modules/tough-cookie": {
"version": "4.1.3",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
"integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
"dependencies": {
"psl": "^1.1.33",
"punycode": "^2.1.1",
"universalify": "^0.2.0",
"url-parse": "^1.5.3"
},
"engines": {
"node": ">=6"
}
},
"node_modules/tough-cookie/node_modules/universalify": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
"integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==",
"engines": {
"node": ">= 4.0.0"
}
},
"node_modules/tr46": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz",
@ -11127,6 +11485,15 @@
"punycode": "^2.1.0"
}
},
"node_modules/url-parse": {
"version": "1.5.10",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz",
"integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
"dependencies": {
"querystringify": "^2.1.1",
"requires-port": "^1.0.0"
}
},
"node_modules/urlpattern-polyfill": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",

View File

@ -49,7 +49,10 @@
"puppeteer": "^22.6.3",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-page-proxy": "^1.3.0",
"set-cookie-parser": "^2.6.0",
"stripe": "^11.11.0",
"tiktoken": "^1.0.10",
"turndown": "^7.1.3",
@ -62,6 +65,7 @@
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^18",
"@types/set-cookie-parser": "^2.4.7",
"@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0",

View File

@ -1,51 +1,25 @@
import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError, ParamValidationError } from 'civkit';
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
HashManager,
AssertionFailureError, ParamValidationError,
} from 'civkit';
import { singleton } from 'tsyringe';
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
import normalizeUrl from "@esm2cjs/normalize-url";
import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { CookieParam } from 'puppeteer';
import { Crawled } from '../db/crawled';
import { tidyMarkdown } from '../utils/markdown';
import { cleanAttribute } from '../utils/misc';
import { randomUUID } from 'crypto';
/**
 * Cleans up markdown produced from HTML conversion.
 *
 * Passes, applied in order:
 *  1. Links whose label or target is spread over several lines are folded onto
 *     one line (whitespace runs in the label become a single space, all
 *     whitespace in the target is removed).
 *  2. Links that wrap an optional image get the same treatment, keeping the
 *     image inside the link label.
 *  3. Ordinary `[label](target)` links are normalized once more.
 *  4. Runs of three or more newlines are reduced to two.
 *  5. Leading spaces and tabs are removed from every line.
 *
 * @param markdown Raw markdown text.
 * @returns The normalized markdown, trimmed at both ends.
 */
function tidyMarkdown(markdown: string): string {
    // Collapse every whitespace run into one space.
    const squeeze = (value: string): string => value.replace(/\s+/g, ' ').trim();
    // Drop all whitespace (used for URLs).
    const strip = (value: string): string => value.replace(/\s+/g, '').trim();
    // Shared replacer for the two plain-link passes.
    const rebuildLink = (_whole: string, label: string, target: string): string => {
        return `[${squeeze(label)}](${strip(target)})`;
    };

    const linksFolded = markdown.replace(/\[\s*([^]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, rebuildLink);

    const imageLinksFolded = linksFolded.replace(
        /\[\s*([^!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (_whole, label, altText, imageSrc, target) => {
            const cleanLabel = squeeze(label);
            const cleanAlt = altText ? squeeze(altText) : '';
            const cleanImageSrc = imageSrc ? strip(imageSrc) : '';
            const cleanTarget = strip(target);

            // Without an image the link degrades to a plain one.
            return cleanImageSrc
                ? `[${cleanLabel} ![${cleanAlt}](${cleanImageSrc})](${cleanTarget})`
                : `[${cleanLabel}](${cleanTarget})`;
        }
    );

    return imageLinksFolded
        .replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, rebuildLink)
        .replace(/\n{3,}/g, '\n\n')
        .replace(/^[ \t]+/gm, '')
        .trim();
}
const md5Hasher = new HashManager('md5', 'hex');
@singleton()
export class CrawlerHost extends RPCHost {
@ -53,12 +27,29 @@ export class CrawlerHost extends RPCHost {
turnDownPlugins = [require('turndown-plugin-gfm').tables];
cacheRetentionMs = 1000 * 3600 * 24 * 7;
cacheValidMs = 1000 * 300;
urlValidMs = 1000 * 3600 * 4;
/**
 * Wires up the host and subscribes to the puppeteer controller's `crawled`
 * event so that eligible snapshots are written to the cache.
 *
 * A snapshot is cached only when it has a non-blank title and the crawl
 * was performed without caller-supplied cookies.
 */
constructor(
    protected globalLogger: Logger,
    protected puppeteerControl: PuppeteerControl,
    protected altTextService: AltTextService,
    protected firebaseObjectStorage: FirebaseStorageBucketControl,
) {
    super(...arguments);

    puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
        // Skip untitled snapshots, and skip anything crawled with cookies:
        // cookie-backed pages may carry private content (potential privacy issue).
        if (!snapshot.title?.trim() || options.cookies?.length) {
            return;
        }

        await this.setToCache(options.url, snapshot);
    });
}
override async init() {
@ -67,16 +58,51 @@ export class CrawlerHost extends RPCHost {
this.emit('ready');
}
async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) {
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
screenshotUrl?: string;
}, nominalUrl?: URL) {
if (mode === 'screenshot') {
if (snapshot.screenshot && !snapshot.screenshotUrl) {
const fid = `instant-screenshots/${randomUUID()}`;
await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
metadata: {
contentType: 'image/png',
}
});
snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
}
return {
screenshotUrl: snapshot.screenshotUrl,
toString() {
return this.screenshotUrl;
}
};
}
if (mode === 'html') {
return {
html: snapshot.html,
toString() {
return this.html;
}
};
}
if (mode === 'text') {
return {
text: snapshot.text,
toString() {
return this.text;
}
};
}
const toBeTurnedToMd = snapshot.parsed?.content;
let turnDownService = new TurndownService();
for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin);
}
let contentText = '';
if (toBeTurnedToMd) {
const urlToAltMap: { [k: string]: string | undefined; } = {};
if (snapshot.imgs?.length) {
const tasks = (snapshot.imgs || []).map(async (x) => {
const r = await this.altTextService.getAltText(x).catch((err: any) => {
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
@ -88,8 +114,8 @@ export class CrawlerHost extends RPCHost {
});
await Promise.all(tasks);
}
let imgIdx = 0;
turnDownService.addRule('img-generated-alt', {
filter: 'img',
replacement: (_content, node) => {
@ -107,6 +133,8 @@ export class CrawlerHost extends RPCHost {
}
});
let contentText = '';
if (toBeTurnedToMd) {
try {
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
} catch (err) {
@ -141,7 +169,7 @@ export class CrawlerHost extends RPCHost {
const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: nominalUrl || snapshot.href?.trim(),
url: nominalUrl?.toString() || snapshot.href?.trim(),
content: cleanText,
publishedTime: snapshot.parsed?.publishedTime || undefined,
@ -171,6 +199,7 @@ ${this.content}
timeoutSeconds: 540,
concurrency: 4,
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
})
@ -181,6 +210,57 @@ ${this.content}
concurrency: 21,
maxInstances: 476,
},
openapi: {
operation: {
parameters: {
'Accept': {
description: `Specifies your preference for the response format. \n\n` +
`Supported formats:\n` +
`- text/event-stream\n` +
`- application/json or text/json\n` +
`- text/plain`
,
in: 'header',
schema: { type: 'string' }
},
'X-No-Cache': {
description: `Ignores internal cache if this header is specified with a value.`,
in: 'header',
schema: { type: 'string' }
},
'X-Respond-With': {
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
`Supported formats:\n` +
`- markdown\n` +
`- html\n` +
`- text\n` +
`- screenshot\n\n` +
`Defaults to: markdown`
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
`Supported protocols:\n` +
`- http\n` +
`- https\n` +
`- socks4\n` +
`- socks5\n\n` +
`For authentication, https://user:pass@host:port`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,
in: 'header',
schema: { type: 'string' }
},
}
}
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
})
@ -207,27 +287,41 @@ ${this.content}
path: 'url'
});
}
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
const noCache = Boolean(ctx.req.headers['x-no-cache']);
const customMode = ctx.req.get('x-respond-with') || 'markdown';
const noCache = Boolean(ctx.req.get('x-no-cache'));
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
if (Array.isArray(setCookieHeaders)) {
for (const setCookie of setCookieHeaders) {
cookies.push({
...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
domain: urlToCrawl.hostname,
});
}
} else if (setCookieHeaders) {
cookies.push({
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
domain: urlToCrawl.hostname,
});
}
const crawlOpts: ScrappingOptions = {
proxyUrl: ctx.req.get('x-proxy-url'),
cookies,
};
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
try {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
if (!scrapped) {
continue;
}
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
if (scrapped.screenshot && screenshotEnabled) {
sseStream.write({
event: 'screenshot',
data: scrapped.screenshot.toString('base64'),
});
}
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
sseStream.write({
event: 'data',
@ -235,7 +329,7 @@ ${this.content}
});
}
} catch (err: any) {
this.logger.error(`Failed to crawl ${urlToCrawl.toString()}`, { err: marshalErrorLike(err) });
this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
sseStream.write({
event: 'error',
data: marshalErrorLike(err),
@ -249,13 +343,13 @@ ${this.content}
let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
lastScrapped = scrapped;
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue;
}
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
return formatted;
}
@ -264,16 +358,22 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
}
return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString());
return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
}
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
lastScrapped = scrapped;
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue;
}
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
}
@ -282,12 +382,111 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
}
return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`;
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
}
/**
 * Derive the cache lookup key for a URL.
 *
 * The fragment is dropped and the remaining URL string is lower-cased
 * before being hashed with the module-level `md5Hasher`, so URLs that
 * differ only by `#fragment` or letter case share one digest.
 *
 * @param urlToCrawl URL to fingerprint; it is copied, not mutated.
 * @returns The digest produced by `md5Hasher.hash`.
 */
getUrlDigest(urlToCrawl: URL) {
    // Work on a copy so the caller's URL keeps its fragment.
    const fragmentless = new URL(urlToCrawl);
    fragmentless.hash = '';

    const cacheKeySource = fragmentless.toString().toLowerCase();

    return md5Hasher.hash(cacheKeySource);
}
/**
 * Look up the most recently created cached crawl for a URL.
 *
 * The newest `Crawled` record whose `urlPathDigest` equals the URL's
 * digest is fetched. The stored snapshot is returned without inline
 * screenshot bytes; when the record is flagged `screenshotAvailable`,
 * a signed download URL for `screenshots/<record id>` is attached.
 *
 * @param urlToCrawl URL whose cached snapshot is wanted.
 * @returns `undefined` when no record exists; otherwise the snapshot plus
 *          `isFresh`, which is true while the record is no older than
 *          `this.cacheValidMs`.
 */
async queryCache(urlToCrawl: URL) {
    const digest = this.getUrlDigest(urlToCrawl);

    const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

    if (!cache) {
        return undefined;
    }

    const age = Date.now() - cache.createdAt.valueOf();
    // A record is stale once it has outlived the validity window. The
    // previous comparison (`createdAt > now - cacheValidMs`) was the
    // freshness test, so fresh and stale entries were swapped.
    const stale = age > this.cacheValidMs;
    this.logger.info(`${stale ? 'Only stale ' : ''}Cache exists for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
        url: urlToCrawl, digest, age, stale
    });

    const r = cache.snapshot;

    return {
        isFresh: !stale,
        snapshot: {
            ...r,
            screenshot: undefined,
            // NOTE(review): second argument is presumably the link's expiry timestamp — confirm against signDownloadUrl.
            screenshotUrl: cache.screenshotAvailable ?
                await this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs) : undefined,
        } as PageSnapshot & { screenshotUrl?: string; }
    };
}
/**
 * Persist a page snapshot as a new cache record.
 *
 * The record is stored with the screenshot field nulled out. When the
 * snapshot has a screenshot, it is first uploaded to object storage under
 * `screenshots/<record id>` and the record is flagged
 * `screenshotAvailable`.
 *
 * @param urlToCrawl URL the snapshot was taken from.
 * @param snapshot   Snapshot to store.
 * @returns Whatever `Crawled.save` resolves to, or `undefined` when the
 *          save failed (the failure is logged, not rethrown). A failing
 *          screenshot upload is not caught here.
 */
async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
    const urlDigest = this.getUrlDigest(urlToCrawl);

    this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest: urlDigest, title: snapshot?.title, href: snapshot?.href });

    const createdAt = new Date();
    const expireAt = new Date(createdAt.valueOf() + this.cacheRetentionMs);
    const record = Crawled.from({
        _id: randomUUID(),
        url: urlToCrawl.toString(),
        createdAt,
        expireAt,
        urlPathDigest: urlDigest,
        // Screenshot bytes are kept out of the stored record.
        snapshot: { ...snapshot, screenshot: null },
    });

    if (snapshot.screenshot) {
        const uploadOptions = { metadata: { contentType: 'image/png' } };
        await this.firebaseObjectStorage.saveFile(`screenshots/${record._id}`, snapshot.screenshot, uploadOptions);
        record.screenshotAvailable = true;
    }

    try {
        return await Crawled.save(record.degradeForFireStore());
    } catch (err) {
        this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });

        return undefined;
    }
}
/**
 * Yield snapshots for a URL, preferring the cache over a live scrape.
 *
 * The cache is consulted unless `noCache` is set or cookies were supplied.
 * A fresh entry is yielded on its own and ends the generator. Otherwise
 * the snapshots of `puppeteerControl.scrap` are forwarded; if that throws
 * and a non-fresh cache entry was found, the entry is yielded as a
 * fallback instead of propagating the error.
 *
 * @param urlToCrawl URL to fetch.
 * @param crawlOpts  Scraping options passed through to the scraper.
 * @param noCache    When true, skip the cache lookup entirely.
 */
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
    const cacheAllowed = !noCache && !crawlOpts.cookies?.length;
    const cached = cacheAllowed ? await this.queryCache(urlToCrawl) : undefined;

    if (cached?.isFresh) {
        yield cached.snapshot;

        return;
    }

    try {
        yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
    } catch (err: any) {
        if (!cached) {
            throw err;
        }

        this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
        yield cached.snapshot;
    }
}
}
/**
 * Collapse every line break, together with the whitespace that follows it,
 * into a single `\n`.
 *
 * @param attribute Raw attribute text; any falsy value yields `''`.
 * @returns The attribute with line-break runs reduced to one newline each.
 */
function cleanAttribute(attribute: string) {
    if (!attribute) {
        return '';
    }

    return attribute.replace(/(\n+\s*)+/g, '\n');
}

View File

@ -1,6 +1,7 @@
import { Also, parseJSONText, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';
import type { PageSnapshot } from '../services/puppeteer';
@Also({
dictOf: Object
@ -21,7 +22,10 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string;
@Prop()
snapshot!: any;
snapshot!: PageSnapshot & { screenshot: never; };
@Prop()
screenshotAvailable?: boolean;
@Prop()
createdAt!: Date;

View File

@ -1,13 +1,17 @@
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
import { container, singleton } from 'tsyringe';
import type { Browser, Page } from 'puppeteer';
import { Logger } from '../shared/services/logger';
import genericPool from 'generic-pool';
import os from 'os';
import fs from 'fs';
import { Crawled } from '../db/crawled';
import { container, singleton } from 'tsyringe';
import genericPool from 'generic-pool';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError } from 'civkit';
import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, Page } from 'puppeteer';
import puppeteer from 'puppeteer-extra';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
export interface ImgBrief {
@ -42,7 +46,12 @@ export interface PageSnapshot {
screenshot?: Buffer;
imgs?: ImgBrief[];
}
const md5Hasher = new HashManager('md5', 'hex');
export interface ScrappingOptions {
proxyUrl?: string;
cookies?: CookieParam[];
}
const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
puppeteer.use(puppeteerStealth());
@ -51,9 +60,13 @@ puppeteer.use(puppeteerStealth());
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
// platform: `Linux`,
// }))
const puppeteerBlockResources = require('puppeteer-extra-plugin-block-resources');
puppeteer.use(puppeteerBlockResources({
blockedTypes: new Set(['media']),
interceptResolutionPriority: 1,
}));
puppeteer.use(puppeteerPageProxy({
interceptResolutionPriority: 1,
}));
@singleton()
@ -74,7 +87,7 @@ export class PuppeteerControl extends AsyncService {
return page.browser().connected && !page.isClosed();
}
}, {
max: Math.max(1 + Math.floor(os.freemem() / (1024 * 1024 * 1024)), 16),
max: Math.max(1 + Math.floor(os.totalmem() / (384 * 1024 * 1024)), 16),
min: 1,
acquireTimeoutMillis: 60_000,
testOnBorrow: true,
@ -88,7 +101,7 @@ export class PuppeteerControl extends AsyncService {
override async init() {
await this.dependencyReady();
this.logger.info(`PuppeteerControl initializing with pool size ${this.pagePool.max}`, { poolSize: this.pagePool.max });
this.pagePool.start();
if (this.browser) {
@ -128,7 +141,10 @@ export class PuppeteerControl extends AsyncService {
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1024, height: 1024 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: PageSnapshot) => {
if (snapshot.href === 'about:blank') {
return;
}
page.emit('snapshot', snapshot);
}));
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
@ -166,40 +182,39 @@ function giveSnapshot() {
const elem = document.createElement('div');
elem.innerHTML = parsed.content;
r.imgs = briefImgs(elem);
} else {
const allImgs = briefImgs();
if (allImgs.length === 1) {
r.imgs = allImgs;
}
}
return r;
}
`));
preparations.push(page.evaluateOnNewDocument(() => {
let aftershot: any;
const handlePageLoad = () => {
// @ts-expect-error
if (document.readyState !== 'complete' && document.readyState !== 'interactive') {
preparations.push(page.evaluateOnNewDocument(`
let aftershot = undefined;
const handlePageLoad = () => {
if (document.readyState !== 'complete') {
return;
}
// @ts-expect-error
const parsed = giveSnapshot();
if (parsed) {
// @ts-expect-error
window.reportSnapshot(parsed);
} else {
if (!parsed.text) {
if (aftershot) {
clearTimeout(aftershot);
}
aftershot = setTimeout(() => {
// @ts-expect-error
window.reportSnapshot(giveSnapshot());
const r = giveSnapshot();
if (r && r.text) {
window.reportSnapshot(r);
}
}, 500);
}
};
// setInterval(handlePageLoad, 1000);
// @ts-expect-error
document.addEventListener('readystatechange', handlePageLoad);
// @ts-expect-error
document.addEventListener('load', handlePageLoad);
}));
};
document.addEventListener('readystatechange', handlePageLoad);
document.addEventListener('load', handlePageLoad);
`));
await Promise.all(preparations);
// TODO: further setup the page;
@ -207,41 +222,23 @@ function giveSnapshot() {
return page;
}
async *scrap(url: string, noCache: string | boolean = false): AsyncGenerator<PageSnapshot | undefined> {
const parsedUrl = new URL(url);
async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
// parsedUrl.search = '';
parsedUrl.hash = '';
const normalizedUrl = parsedUrl.toString().toLowerCase();
const digest = md5Hasher.hash(normalizedUrl);
this.logger.info(`Scraping ${url}, normalized digest: ${digest}`, { url, digest });
const url = parsedUrl.toString();
this.logger.info(`Scraping ${url}`, { url });
let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined;
if (!noCache) {
const cached = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
if (cached && cached.createdAt.valueOf() > (Date.now() - 1000 * 300)) {
const age = Date.now() - cached.createdAt.valueOf();
this.logger.info(`Cache hit for ${url}, normalized digest: ${digest}, ${age}ms old`, { url, digest, age });
snapshot = {
...cached.snapshot
};
if (snapshot) {
delete snapshot.screenshot;
}
screenshot = cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined;
yield {
...cached.snapshot,
screenshot: cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined
};
return;
}
}
const page = await this.pagePool.acquire();
if (options.proxyUrl) {
await page.useProxy(options.proxyUrl);
}
if (options.cookies) {
await page.setCookie(...options.cookies);
}
let nextSnapshotDeferred = Defer();
let finalized = false;
const hdl = (s: any) => {
@ -262,48 +259,43 @@ function giveSnapshot() {
cause: err,
}));
}).finally(async () => {
finalized = true;
if (!snapshot?.html) {
finalized = true;
return;
}
screenshot = await page.screenshot({
type: 'jpeg',
quality: 75,
});
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
screenshot = await page.screenshot();
if (!snapshot.title || !snapshot.parsed?.content) {
const salvaged = await this.salvage(url, page);
if (salvaged) {
screenshot = await page.screenshot({
type: 'jpeg',
quality: 75,
});
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
screenshot = await page.screenshot();
}
}
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
const nowDate = new Date();
Crawled.save(
Crawled.from({
url,
createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + 1000 * 3600 * 24 * 7),
urlPathDigest: digest,
snapshot: { ...snapshot, screenshot: screenshot?.toString('base64') || '' },
}).degradeForFireStore()
).catch((err) => {
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
});
finalized = true;
this.logger.info(`Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
this.emit(
'crawled',
{ ...snapshot, screenshot },
{ ...options, url: parsedUrl }
);
});
try {
let lastHTML = snapshot?.html;
while (true) {
await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
if (finalized) {
yield { ...snapshot, screenshot } as PageSnapshot;
break;
}
yield snapshot;
if (snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot();
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield { ...snapshot, screenshot } as PageSnapshot;
}
}
} finally {
gotoPromise.finally(() => {
@ -333,6 +325,8 @@ function giveSnapshot() {
this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
});
this.logger.info(`Salvation completed.`);
return true;
}
}

View File

@ -0,0 +1,39 @@
/**
 * Clean up machine-generated markdown.
 *
 * - Re-joins links whose text or URL was broken across lines, including
 *   links that wrap a single image.
 * - Collapses runs of three or more newlines into one blank line.
 * - Strips leading spaces and tabs from every line (this also removes the
 *   indentation of nested lists and indented code).
 *
 * Link text is matched up to the first `]` only. Earlier patterns could run
 * past a closing bracket, so a stray `[1]` was matched together with
 * everything up to the next real link and the paragraphs in between were
 * squashed onto one line.
 *
 * @param markdown Markdown text to normalize.
 * @returns The normalized markdown with surrounding whitespace trimmed.
 */
export function tidyMarkdown(markdown: string): string {
    // Collapse internal whitespace runs (including newlines) to one space.
    const squeeze = (s: string) => s.replace(/\s+/g, ' ').trim();
    // URLs cannot contain whitespace, so drop it entirely.
    const stripWhitespace = (s: string) => s.replace(/\s+/g, '').trim();

    // Step 1: Handle complex broken links with text and optional images spread across multiple lines
    let normalizedMarkdown = markdown.replace(
        /\[\s*([^\]]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, url: string) => `[${squeeze(text)}](${stripWhitespace(url)})`
    );

    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^!\]]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, alt: string | undefined, imgUrl: string | undefined, linkUrl: string) => {
            // Normalize by removing excessive spaces and new lines
            const cleanText = squeeze(text);
            const cleanAlt = alt ? squeeze(alt) : '';
            const cleanImgUrl = imgUrl ? stripWhitespace(imgUrl) : '';
            const cleanLinkUrl = stripWhitespace(linkUrl);

            if (cleanImgUrl) {
                return `[${cleanText} ![${cleanAlt}](${cleanImgUrl})](${cleanLinkUrl})`;
            }

            return `[${cleanText}](${cleanLinkUrl})`;
        }
    );

    // Step 2: Normalize regular links that may be broken across lines
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, url: string) => `[${squeeze(text)}](${stripWhitespace(url)})`
    );

    // Step 3: Replace three or more consecutive newlines with two (i.e. a single blank line)
    normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');

    // Step 4: Remove leading spaces from each line
    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');

    return normalizedMarkdown.trim();
}

View File

@ -0,0 +1,3 @@
/**
 * Collapse every line break, together with the whitespace that follows it,
 * into a single `\n`.
 *
 * @param attribute Raw attribute text; any falsy value yields `''`.
 * @returns The attribute with line-break runs reduced to one newline each.
 */
export function cleanAttribute(attribute: string) {
    if (!attribute) {
        return '';
    }

    return attribute.replace(/(\n+\s*)+/g, '\n');
}