diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index 2071418..e735c53 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -14,7 +14,7 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", - "civkit": "^0.6.5-047c0d8", + "civkit": "^0.7.0-0f8889a", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -23,13 +23,13 @@ "firebase-functions": "^4.9.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", - "jsdom": "^24.0.0", "langdetect": "^0.2.1", + "linkedom": "^0.18.4", "maxmind": "^4.3.18", "minio": "^7.1.3", "openai": "^4.20.0", "pdfjs-dist": "^4.2.67", - "puppeteer": "^22.7.1", + "puppeteer": "^23.3.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", "puppeteer-extra-plugin-page-proxy": "^2.0.0", @@ -48,7 +48,7 @@ "@types/bcrypt": "^5.0.0", "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", - "@types/node": "^18", + "@types/node": "^20.14.13", "@types/set-cookie-parser": "^2.4.7", "@typescript-eslint/eslint-plugin": "^5.12.0", "@typescript-eslint/parser": "^5.12.0", @@ -57,7 +57,7 @@ "eslint-plugin-import": "^2.25.4", "firebase-functions-test": "^3.0.0", "replicate": "^0.16.1", - "typescript": "^5.1.6" + "typescript": "^5.5.4" }, "engines": { "node": "20" @@ -1564,10 +1564,9 @@ } }, "node_modules/@mongodb-js/saslprep": { - "version": "1.1.5", - "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.5.tgz", - "integrity": "sha512-XLNOMH66KhJzUJNwT/qlMnS4WsNDWD5ASdyaSH3EtK+F4r/CFGa3jT4GNi4mfOitGvWXtdLgQJkQjxSVrio+jA==", - "optional": true, + "version": "1.1.9", + "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.9.tgz", + "integrity": "sha512-tVkljjeEaAhCqTzajSdgbQ6gE6f3oneVwa3iXR6csiEwXXOFsiC6Uh9iAjAhXPtqa/XMDHWjjeNH/77m/Yq2dw==", "peer": true, "dependencies": { "sparse-bitfield": "^3.0.3" @@ -1977,18 +1976,18 @@ "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==" }, "node_modules/@puppeteer/browsers": { - "version": "2.2.3", - "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.2.3.tgz", - "integrity": "sha512-bJ0UBsk0ESOs6RFcLXOt99a3yTDcOKlzfjad+rhFwdaG1Lu/Wzq58GHYCDTlZ9z6mldf4g+NTb+TXEfe0PpnsQ==", + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.4.0.tgz", + "integrity": "sha512-x8J1csfIygOwf6D6qUAZ0ASk3z63zPb7wkNeHRerCMh82qWKUrOgkuP005AJC8lDL6/evtXETGEJVcwykKT4/g==", "dependencies": { - "debug": "4.3.4", - "extract-zip": "2.0.1", - "progress": "2.0.3", - "proxy-agent": "6.4.0", - "semver": "7.6.0", - "tar-fs": "3.0.5", - "unbzip2-stream": "1.4.3", - "yargs": "17.7.2" + "debug": "^4.3.6", + "extract-zip": "^2.0.1", + "progress": "^2.0.3", + "proxy-agent": "^6.4.0", + "semver": "^7.6.3", + "tar-fs": "^3.0.6", + "unbzip2-stream": "^1.4.3", + "yargs": "^17.7.2" }, "bin": { "browsers": "lib/cjs/main-cli.js" @@ -2299,9 +2298,9 @@ "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==" }, "node_modules/@types/node": { - "version": "18.19.31", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.31.tgz", - "integrity": "sha512-ArgCD39YpyyrtFKIqMDvjz79jto5fcI/SVUs2HwB+f0dAzq68yqOdyaSivLiLugSziTpNXLQrVb7RZFmdZzbhA==", + "version": "20.14.13", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.13.tgz", + "integrity": "sha512-+bHoGiZb8UiQ0+WEtmph2IWQCjIqg8MDZMAV+ppRRhUZnquF5mQkP/9vpSwJClEiSM/C7fZZExPzfU0vJTyp8w==", "dependencies": { "undici-types": "~5.26.4" } @@ -2424,12 +2423,11 @@ "peer": true }, "node_modules/@types/whatwg-url": { - "version": "8.2.2", - "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-8.2.2.tgz", - "integrity": "sha512-FtQu10RWgn3D9U4aazdwIE2yzphmTJREDqNdODHrbrZmmMqI0vMheC/6NE/J1Yveaj8H+ela+YwWTjq5PGmuhA==", + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz", + "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==", "peer": true, "dependencies": { - "@types/node": "*", "@types/webidl-conversions": "*" } }, @@ -3227,31 +3225,41 @@ "optional": true }, "node_modules/bare-fs": { - "version": "2.2.3", - "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz", - "integrity": "sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.3.3.tgz", + "integrity": "sha512-7RYKL+vZVCyAsMLi5SPu7QGauGGT8avnP/HO571ndEuV4MYdGXvLhtW67FuLPeEI8EiIY7zbbRR9x7x7HU0kgw==", "optional": true, "dependencies": { "bare-events": "^2.0.0", "bare-path": "^2.0.0", - "streamx": "^2.13.0" + "bare-stream": "^2.0.0" } }, "node_modules/bare-os": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz", - "integrity": "sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w==", + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.4.2.tgz", + "integrity": "sha512-HZoJwzC+rZ9lqEemTMiO0luOePoGYNBgsLLgegKR/cljiJvcDNhDZQkzC+NC5Oh0aHbdBNSOHpghwMuB5tqhjg==", "optional": true }, "node_modules/bare-path": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz", - "integrity": "sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.3.tgz", + "integrity": "sha512-lh/eITfU8hrj9Ru5quUp0Io1kJWIk1bTjzo7JH1P5dWmQ2EL4hFUlfI8FonAhSlgIfhn63p84CDY/x+PisgcXA==", "optional": true, "dependencies": { "bare-os": "^2.1.0" } }, + "node_modules/bare-stream": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.2.1.tgz", + "integrity": "sha512-YTB47kHwBW9zSG8LD77MIBAAQXjU2WjAkMHeeb7hUplVs6+IoM5I7uEVQNPMB7lj9r8I76UMdoMkGnCodHOLqg==", + "optional": true, + "dependencies": { + "b4a": "^1.6.6", + "streamx": "^2.18.0" + } + }, "node_modules/base32.js": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/base32.js/-/base32.js-0.1.0.tgz", @@ -3374,6 +3382,11 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -3444,12 +3457,12 @@ } }, "node_modules/bson": { - "version": "5.5.1", - "resolved": "https://registry.npmjs.org/bson/-/bson-5.5.1.tgz", - "integrity": "sha512-ix0EwukN2EpC0SRWIj/7B5+A6uQMQy6KMREI9qQqvgpkV2frH63T0UDVd1SYedL6dNCmDBYB3QtXi4ISk9YT+g==", + "version": "6.8.0", + "resolved": "https://registry.npmjs.org/bson/-/bson-6.8.0.tgz", + "integrity": "sha512-iOJg8pr7wq2tg/zSlCCHMi3hMm5JTOxLTagf3zxhcenHsFp+c6uOs6K7W5UE7A4QIJGtqh/ZovFNMP4mOPJynQ==", "peer": true, "engines": { - "node": ">=14.20.1" + "node": ">=16.20.1" } }, "node_modules/buffer": { @@ -3659,13 +3672,13 @@ } }, "node_modules/chromium-bidi": { - "version": "0.5.19", - "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.5.19.tgz", - "integrity": "sha512-UA6zL77b7RYCjJkZBsZ0wlvCTD+jTjllZ8f6wdO4buevXgTZYjV+XLB9CiEa2OuuTGGTLnI7eN9I60YxuALGQg==", + "version": "0.6.5", + "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.6.5.tgz", + "integrity": "sha512-RuLrmzYrxSb0s9SgpB+QN5jJucPduZQ/9SIe76MDxYJuecPW5mxMdacJ1f4EtgiV+R0p3sCkznTMvH0MPGFqjA==", "dependencies": { "mitt": "3.0.1", "urlpattern-polyfill": "10.0.0", - "zod": "3.22.4" + "zod": "3.23.8" }, "peerDependencies": { "devtools-protocol": "*" @@ -3688,9 +3701,9 @@ } }, "node_modules/civkit": { - "version": "0.6.5-047c0d8", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.6.5-047c0d8.tgz", - "integrity": "sha512-4FWHrkJQHbTD3wjNeihxOzm7GSgQa9BUgSvPOLsfKybeEw9Pv+I94uDUP8PczL1TpHO6hIbIE2KJjzSOx6PYqg==", + "version": "0.7.0-0f8889a", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.7.0-0f8889a.tgz", + "integrity": "sha512-T14Jk3loghFqluWUUQmuEn0hO2pwWw+tsfdG4++NPqvS2W/lclZoA1EyBIZ8Uk0MYqEp02O6BwwbAoq+g++hMw==", "dependencies": { "lodash": "^4.17.21", "tslib": "^2.5.0" @@ -3719,11 +3732,13 @@ "pino": "^8.11.0", "reflect-metadata": "^0.1.13", "smtp-server": "^3.11.0", - "tld-extract": "^2.1.0" + "tld-extract": "^2.1.0", + "zod": "*", + "zod-openai": "*" }, "peerDependencies": { - "mongodb": "^5.2.0", - "tsyringe": "^4.7.0" + "mongodb": "^6", + "tsyringe": "^4" } }, "node_modules/cjs-module-lexer": { @@ -4049,17 +4064,37 @@ "node": ">= 8" } }, - "node_modules/cssstyle": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz", - "integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==", + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", "dependencies": { - "rrweb-cssom": "^0.6.0" + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" }, - "engines": { - "node": ">=18" + "funding": { + "url": "https://github.com/sponsors/fb55" } }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/cssom": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz", + "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==" + }, "node_modules/data-uri-to-buffer": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz", @@ -4068,41 +4103,6 @@ "node": ">= 14" } }, - "node_modules/data-urls": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", - "integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==", - "dependencies": { - "whatwg-mimetype": "^4.0.0", - "whatwg-url": "^14.0.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/data-urls/node_modules/tr46": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz", - "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==", - "dependencies": { - "punycode": "^2.3.1" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/data-urls/node_modules/whatwg-url": { - "version": "14.0.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz", - "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==", - "dependencies": { - "tr46": "^5.0.0", - "webidl-conversions": "^7.0.0" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/data-view-buffer": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz", @@ -4160,11 +4160,11 @@ "integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ==" }, "node_modules/debug": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", - "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz", + "integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==", "dependencies": { - "ms": "2.1.2" + "ms": "^2.1.3" }, "engines": { "node": ">=6.0" @@ -4175,11 +4175,6 @@ } } }, - "node_modules/decimal.js": { - "version": "10.4.3", - "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz", - "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA==" - }, "node_modules/decode-uri-component": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz", @@ -4359,9 +4354,9 @@ } }, "node_modules/devtools-protocol": { - "version": "0.0.1273771", - "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1273771.tgz", - "integrity": "sha512-QDbb27xcTVReQQW/GHJsdQqGKwYBE7re7gxehj467kKP2DKuYBUj6i2k5LRiAC66J1yZG/9gsxooz/s9pcm0Og==" + "version": "0.0.1330662", + "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1330662.tgz", + "integrity": "sha512-pzh6YQ8zZfz3iKlCvgzVCu22NdpZ8hNmwU6WnQjNVquh0A9iVosPtNLWDwaWVGyrntQlltPFztTMK5Cg6lfCuw==" }, "node_modules/diff-sequences": { "version": "29.6.3", @@ -5461,14 +5456,6 @@ "@google-cloud/storage": "^7.7.0" } }, - "node_modules/firebase-admin/node_modules/@types/node": { - "version": "20.12.7", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz", - "integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==", - "dependencies": { - "undici-types": "~5.26.4" - } - }, "node_modules/firebase-functions": { "version": "4.9.0", "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-4.9.0.tgz", @@ -5793,15 +5780,33 @@ } }, "node_modules/gcp-metadata": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.0.tgz", - "integrity": "sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==", + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-5.3.0.tgz", + "integrity": "sha512-FNTkdNEnBdlqF2oatizolQqNANMrcqJt6AAYt99B3y1aLLC8Hc5IOBb+ZnnzllodEEf6xMBp6wRcBbc16fa65w==", + "optional": true, + "peer": true, "dependencies": { - "gaxios": "^6.0.0", + "gaxios": "^5.0.0", "json-bigint": "^1.0.0" }, "engines": { - "node": ">=14" + "node": ">=12" + } + }, + "node_modules/gcp-metadata/node_modules/gaxios": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-5.1.3.tgz", + "integrity": "sha512-95hVgBRgEIRQQQHIbnxBXeHbW4TqFk4ZDJW7wmVtvYar72FdhRIo1UGOLS2eRAKCPEdPBWu+M7+A33D9CdX9rA==", + "optional": true, + "peer": true, + "dependencies": { + "extend": "^3.0.2", + "https-proxy-agent": "^5.0.0", + "is-stream": "^2.0.0", + "node-fetch": "^2.6.9" + }, + "engines": { + "node": ">=12" } }, "node_modules/generic-pool": { @@ -6023,6 +6028,18 @@ "node": ">=14" } }, + "node_modules/google-auth-library/node_modules/gcp-metadata": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.0.tgz", + "integrity": "sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==", + "dependencies": { + "gaxios": "^6.0.0", + "json-bigint": "^1.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/google-gax": { "version": "4.3.2", "resolved": "https://registry.npmjs.org/google-gax/-/google-gax-4.3.2.tgz", @@ -6184,17 +6201,6 @@ "node": ">= 0.4" } }, - "node_modules/html-encoding-sniffer": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", - "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", - "dependencies": { - "whatwg-encoding": "^3.1.1" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -6383,6 +6389,7 @@ "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "optional": true, "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, @@ -6780,11 +6787,6 @@ "node": ">=0.10.0" } }, - "node_modules/is-potential-custom-element-name": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", - "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==" - }, "node_modules/is-regex": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", @@ -7653,91 +7655,6 @@ "node": ">=0.1.90" } }, - "node_modules/jsdom": { - "version": "24.0.0", - "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz", - "integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==", - "dependencies": { - "cssstyle": "^4.0.1", - "data-urls": "^5.0.0", - "decimal.js": "^10.4.3", - "form-data": "^4.0.0", - "html-encoding-sniffer": "^4.0.0", - "http-proxy-agent": "^7.0.0", - "https-proxy-agent": "^7.0.2", - "is-potential-custom-element-name": "^1.0.1", - "nwsapi": "^2.2.7", - "parse5": "^7.1.2", - "rrweb-cssom": "^0.6.0", - "saxes": "^6.0.0", - "symbol-tree": "^3.2.4", - "tough-cookie": "^4.1.3", - "w3c-xmlserializer": "^5.0.0", - "webidl-conversions": "^7.0.0", - "whatwg-encoding": "^3.1.1", - "whatwg-mimetype": "^4.0.0", - "whatwg-url": "^14.0.0", - "ws": "^8.16.0", - "xml-name-validator": "^5.0.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "canvas": "^2.11.2" - }, - "peerDependenciesMeta": { - "canvas": { - "optional": true - } - } - }, - "node_modules/jsdom/node_modules/agent-base": { - "version": "7.1.1", - "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz", - "integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==", - "dependencies": { - "debug": "^4.3.4" - }, - "engines": { - "node": ">= 14" - } - }, - "node_modules/jsdom/node_modules/https-proxy-agent": { - "version": "7.0.4", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz", - "integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==", - "dependencies": { - "agent-base": "^7.0.2", - "debug": "4" - }, - "engines": { - "node": ">= 14" - } - }, - "node_modules/jsdom/node_modules/tr46": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz", - "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==", - "dependencies": { - "punycode": "^2.3.1" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/jsdom/node_modules/whatwg-url": { - "version": "14.0.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz", - "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==", - "dependencies": { - "tr46": "^5.0.0", - "webidl-conversions": "^7.0.0" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/jsesc": { "version": "2.5.2", "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz", @@ -8156,6 +8073,23 @@ "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==" }, + "node_modules/linkedom": { + "version": "0.18.4", + "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.4.tgz", + "integrity": "sha512-JhLErxMIEOKByMi3fURXgI1fYOzR87L1Cn0+MI9GlMckFrqFZpV1SUGox1jcKtsKN3y6JgclcQf0FzZT//BuGw==", + "dependencies": { + "css-select": "^5.1.0", + "cssom": "^0.5.0", + "html-escaper": "^3.0.3", + "htmlparser2": "^9.1.0", + "uhyphen": "^0.2.0" + } + }, + "node_modules/linkedom/node_modules/html-escaper": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", + "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==" + }, "node_modules/locate-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", @@ -8335,7 +8269,6 @@ "version": "1.5.0", "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz", "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==", - "optional": true, "peer": true }, "node_modules/merge-deep": { @@ -8564,27 +8497,26 @@ } }, "node_modules/mongodb": { - "version": "5.9.2", - "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz", - "integrity": "sha512-H60HecKO4Bc+7dhOv4sJlgvenK4fQNqqUIlXxZYQNbfEWSALGAwGoyJd/0Qwk4TttFXUOHJ2ZJQe/52ScaUwtQ==", + "version": "6.8.0", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.8.0.tgz", + "integrity": "sha512-HGQ9NWDle5WvwMnrvUxsFYPd3JEbqD3RgABHBQRuoCEND0qzhsd0iH5ypHsf1eJ+sXmvmyKpP+FLOKY8Il7jMw==", "peer": true, "dependencies": { - "bson": "^5.5.0", - "mongodb-connection-string-url": "^2.6.0", - "socks": "^2.7.1" + "@mongodb-js/saslprep": "^1.1.5", + "bson": "^6.7.0", + "mongodb-connection-string-url": "^3.0.0" }, "engines": { - "node": ">=14.20.1" - }, - "optionalDependencies": { - "@mongodb-js/saslprep": "^1.1.0" + "node": ">=16.20.1" }, "peerDependencies": { "@aws-sdk/credential-providers": "^3.188.0", - "@mongodb-js/zstd": "^1.0.0", - "kerberos": "^1.0.0 || ^2.0.0", - "mongodb-client-encryption": ">=2.3.0 <3", - "snappy": "^7.2.2" + "@mongodb-js/zstd": "^1.1.0", + "gcp-metadata": "^5.2.0", + "kerberos": "^2.0.1", + "mongodb-client-encryption": ">=6.0.0 <7", + "snappy": "^7.2.2", + "socks": "^2.7.1" }, "peerDependenciesMeta": { "@aws-sdk/credential-providers": { @@ -8593,6 +8525,9 @@ "@mongodb-js/zstd": { "optional": true }, + "gcp-metadata": { + "optional": true + }, "kerberos": { "optional": true }, @@ -8601,23 +8536,26 @@ }, "snappy": { "optional": true + }, + "socks": { + "optional": true } } }, "node_modules/mongodb-connection-string-url": { - "version": "2.6.0", - "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-2.6.0.tgz", - "integrity": "sha512-WvTZlI9ab0QYtTYnuMLgobULWhokRjtC7db9LtcVfJ+Hsnyr5eo6ZtNAt3Ly24XZScGMelOcGtm7lSn0332tPQ==", + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.1.tgz", + "integrity": "sha512-XqMGwRX0Lgn05TDB4PyG2h2kKO/FfWJyCzYQbIhXUxz7ETt0I/FqHjUeqj37irJ+Dl1ZtU82uYyj14u2XsZKfg==", "peer": true, "dependencies": { - "@types/whatwg-url": "^8.2.1", - "whatwg-url": "^11.0.0" + "@types/whatwg-url": "^11.0.2", + "whatwg-url": "^13.0.0" } }, "node_modules/ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" }, "node_modules/napi-build-utils": { "version": "1.0.2", @@ -8836,10 +8774,16 @@ "set-blocking": "^2.0.0" } }, - "node_modules/nwsapi": { - "version": "2.2.10", - "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz", - "integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ==" + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } }, "node_modules/object-assign": { "version": "4.1.1", @@ -9009,6 +8953,14 @@ "openai": "bin/cli" } }, + "node_modules/openai/node_modules/@types/node": { + "version": "18.19.42", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.42.tgz", + "integrity": "sha512-d2ZFc/3lnK2YCYhos8iaNIYu9Vfhr92nHiyJHRltXWjXUBjEE+A4I58Tdbnw4VhggSW+2j5y5gTrLs4biNnubg==", + "dependencies": { + "undici-types": "~5.26.4" + } + }, "node_modules/optionator": { "version": "0.9.3", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz", @@ -9155,17 +9107,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/parse5": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz", - "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==", - "dependencies": { - "entities": "^4.4.0" - }, - "funding": { - "url": "https://github.com/inikulin/parse5?sponsor=1" - } - }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -9691,33 +9632,36 @@ } }, "node_modules/puppeteer": { - "version": "22.7.1", - "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-22.7.1.tgz", - "integrity": "sha512-JBCBCwQ9+dyPp5haqeecgv0N0vgWFx44woUeKJaPeJT8CU3RXrd8F/tqJQbuAmcWlbMhYJSlTJkIFrwVAs6BNA==", + "version": "23.3.0", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.3.0.tgz", + "integrity": "sha512-e2jY8cdWSUGsrLxqGm3hIbJq/UIk1uOY8XY7SM51leXkH7shrIyE91lK90Q9byX6tte+cyL3HKqlWBEd6TjWTA==", "hasInstallScript": true, "dependencies": { - "@puppeteer/browsers": "2.2.3", - "cosmiconfig": "9.0.0", - "devtools-protocol": "0.0.1273771", - "puppeteer-core": "22.7.1" + "@puppeteer/browsers": "2.4.0", + "chromium-bidi": "0.6.5", + "cosmiconfig": "^9.0.0", + "devtools-protocol": "0.0.1330662", + "puppeteer-core": "23.3.0", + "typed-query-selector": "^2.12.0" }, "bin": { - "puppeteer": "lib/esm/puppeteer/node/cli.js" + "puppeteer": "lib/cjs/puppeteer/node/cli.js" }, "engines": { "node": ">=18" } }, "node_modules/puppeteer-core": { - "version": "22.7.1", - "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-22.7.1.tgz", - "integrity": "sha512-jD7T7yN7PWGuJmNT0TAEboA26s0VVnvbgCxqgQIF+eNQW2u71ENaV2JwzSJiCHO+e72H4Ue6AgKD9USQ8xAcOQ==", + "version": "23.3.0", + "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.3.0.tgz", + "integrity": "sha512-sB2SsVMFs4gKad5OCdv6w5vocvtEUrRl0zQqSyRPbo/cj1Ktbarmhxy02Zyb9R9HrssBcJDZbkrvBnbaesPyYg==", "dependencies": { - "@puppeteer/browsers": "2.2.3", - "chromium-bidi": "0.5.19", - "debug": "4.3.4", - "devtools-protocol": "0.0.1273771", - "ws": "8.16.0" + "@puppeteer/browsers": "2.4.0", + "chromium-bidi": "0.6.5", + "debug": "^4.3.6", + "devtools-protocol": "0.0.1330662", + "typed-query-selector": "^2.12.0", + "ws": "^8.18.0" }, "engines": { "node": ">=18" @@ -10378,11 +10322,6 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/rrweb-cssom": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz", - "integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==" - }, "node_modules/run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -10479,24 +10418,10 @@ "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz", "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==" }, - "node_modules/saxes": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", - "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", - "dependencies": { - "xmlchars": "^2.2.0" - }, - "engines": { - "node": ">=v12.22.7" - } - }, "node_modules/semver": { - "version": "7.6.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz", - "integrity": "sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==", - "dependencies": { - "lru-cache": "^6.0.0" - }, + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", "bin": { "semver": "bin/semver.js" }, @@ -10504,22 +10429,6 @@ "node": ">=10" } }, - "node_modules/semver/node_modules/lru-cache": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", - "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", - "dependencies": { - "yallist": "^4.0.0" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/semver/node_modules/yallist": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", - "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==" - }, "node_modules/send": { "version": "0.18.0", "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", @@ -10567,11 +10476,6 @@ "node": ">=4" } }, - "node_modules/send/node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" - }, "node_modules/serve-static": { "version": "1.15.0", "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz", @@ -10876,7 +10780,6 @@ "version": "3.0.3", "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz", "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==", - "optional": true, "peer": true, "dependencies": { "memory-pager": "^1.0.2" @@ -10958,12 +10861,13 @@ } }, "node_modules/streamx": { - "version": "2.16.1", - "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz", - "integrity": "sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==", + "version": "2.20.0", + "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.20.0.tgz", + "integrity": "sha512-ZGd1LhDeGFucr1CUCTBOS58ZhEendd0ttpGT3usTvosS4ntIwKN9LJFp+OeCSprsCPL14BXVRZlHGRY1V9PVzQ==", "dependencies": { - "fast-fifo": "^1.1.0", - "queue-tick": "^1.0.1" + "fast-fifo": "^1.3.2", + "queue-tick": "^1.0.1", + "text-decoder": "^1.1.0" }, "optionalDependencies": { "bare-events": "^2.2.0" @@ -11150,11 +11054,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/symbol-tree": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", - "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==" - }, "node_modules/tar": { "version": "6.2.1", "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz", @@ -11172,9 +11071,9 @@ } }, "node_modules/tar-fs": { - "version": "3.0.5", - "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz", - "integrity": "sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg==", + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.6.tgz", + "integrity": "sha512-iokBDQQkUyeXhgPYaZxmczGPhnhXZ0CmrqI+MOb/WFGS9DW5wnfrLgtjUJBvz50vQ3qfRwJ62QVoCFu8mPVu5w==", "dependencies": { "pump": "^3.0.0", "tar-stream": "^3.1.5" @@ -11263,6 +11162,14 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/text-decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.1.tgz", + "integrity": "sha512-8zll7REEv4GDD3x4/0pW+ppIxSNs7H1J10IKFZsuOMscumCdM2a+toDGLPA3T+1+fLBql4zbt5z83GEQGGV5VA==", + "dependencies": { + "b4a": "^1.6.4" + } + }, "node_modules/text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", @@ -11369,15 +11276,15 @@ } }, "node_modules/tr46": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz", - "integrity": "sha512-l7FvfAHlcmulp8kr+flpQZmVwtu7nfRV7NZujtN0OqES8EL4O4e0qqzL0DC5gAvx/ZC/9lk6rhcUwYvkBnBnYA==", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz", + "integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==", "peer": true, "dependencies": { - "punycode": "^2.1.1" + "punycode": "^2.3.0" }, "engines": { - "node": ">=12" + "node": ">=14" } }, "node_modules/ts-deepmerge": { @@ -11613,10 +11520,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/typed-query-selector": { + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz", + "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==" + }, "node_modules/typescript": { - "version": "5.4.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz", - "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==", + "version": "5.5.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz", + "integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==", "devOptional": true, "bin": { "tsc": "bin/tsc", @@ -11626,6 +11538,11 @@ "node": ">=14.17" } }, + "node_modules/uhyphen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz", + "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==" + }, "node_modules/unbox-primitive": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz", @@ -11802,17 +11719,6 @@ "node": ">= 0.8" } }, - "node_modules/w3c-xmlserializer": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", - "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", - "dependencies": { - "xml-name-validator": "^5.0.0" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/walker": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz", @@ -11846,6 +11752,7 @@ "version": "7.0.0", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "peer": true, "engines": { "node": ">=12" } @@ -11871,36 +11778,17 @@ "node": ">=0.8.0" } }, - "node_modules/whatwg-encoding": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", - "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", - "dependencies": { - "iconv-lite": "0.6.3" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/whatwg-mimetype": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", - "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", - "engines": { - "node": ">=18" - } - }, "node_modules/whatwg-url": { - "version": "11.0.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz", - "integrity": "sha512-RKT8HExMpoYx4igMiVMY83lN6UeITKJlBQ+vR/8ZJ8OCdSiN3RwCq+9gH0+Xzj0+5IrM6i4j/6LuvzbZIQgEcQ==", + "version": "13.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-13.0.0.tgz", + "integrity": "sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==", "peer": true, "dependencies": { - "tr46": "^3.0.0", + "tr46": "^4.1.1", "webidl-conversions": "^7.0.0" }, "engines": { - "node": ">=12" + "node": ">=16" } }, "node_modules/which": { @@ -11996,9 +11884,9 @@ } }, "node_modules/ws": { - "version": "8.16.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", - "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", "engines": { "node": ">=10.0.0" }, @@ -12020,14 +11908,6 @@ "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz", "integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw==" }, - "node_modules/xml-name-validator": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", - "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", - "engines": { - "node": ">=18" - } - }, "node_modules/xml2js": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz", @@ -12048,11 +11928,6 @@ "node": ">=4.0" } }, - "node_modules/xmlchars": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", - "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" - }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", @@ -12137,9 +12012,9 @@ } }, "node_modules/zod": { - "version": "3.22.4", - "resolved": "https://registry.npmjs.org/zod/-/zod-3.22.4.tgz", - "integrity": "sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==", + "version": "3.23.8", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz", + "integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==", "funding": { "url": "https://github.com/sponsors/colinhacks" } diff --git a/backend/functions/package.json b/backend/functions/package.json index fa5aaa6..743bf8a 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -34,7 +34,7 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", - "civkit": "^0.6.5-047c0d8", + "civkit": "^0.7.0-0f8889a", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -43,13 +43,13 @@ "firebase-functions": "^4.9.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", - "jsdom": "^24.0.0", "langdetect": "^0.2.1", + "linkedom": "^0.18.4", "maxmind": "^4.3.18", "minio": "^7.1.3", "openai": "^4.20.0", "pdfjs-dist": "^4.2.67", - "puppeteer": "^22.7.1", + "puppeteer": "^23.3.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", "puppeteer-extra-plugin-page-proxy": "^2.0.0", @@ -68,7 +68,7 @@ "@types/bcrypt": "^5.0.0", "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", - "@types/node": "^18", + "@types/node": "^20.14.13", "@types/set-cookie-parser": "^2.4.7", "@typescript-eslint/eslint-plugin": "^5.12.0", "@typescript-eslint/parser": "^5.12.0", @@ -77,7 +77,7 @@ "eslint-plugin-import": "^2.25.4", "firebase-functions-test": "^3.0.0", "replicate": "^0.16.1", - "typescript": "^5.1.6" + "typescript": "^5.5.4" }, "private": true, "exports": { diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 6dbc4da..ba3dfb0 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -1,7 +1,6 @@ import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, - HashManager, AssertionFailureError, ParamValidationError, Defer, } from 'civkit'; import { singleton } from 'tsyringe'; @@ -11,22 +10,17 @@ import _ from 'lodash'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { Request, Response } from 'express'; const pNormalizeUrl = import("@esm2cjs/normalize-url"); -import { AltTextService } from '../services/alt-text'; -import TurndownService from 'turndown'; import { Crawled } from '../db/crawled'; -import { cleanAttribute } from '../utils/misc'; import { randomUUID } from 'crypto'; import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; import { countGPTToken as estimateToken } from '../shared/utils/openai'; import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options'; import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; -import { PDFExtractor } from '../services/pdf-extract'; import { DomainBlockade } from '../db/domain-blockade'; import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker'; import { JSDomControl } from '../services/jsdom'; - -const md5Hasher = new HashManager('md5', 'hex'); +import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean; @@ -35,29 +29,6 @@ export interface ExtraScrappingOptions extends ScrappingOptions { keepImgDataUrl?: boolean; } -export interface FormattedPage { - title?: string; - description?: string; - url?: string; - content?: string; - publishedTime?: string; - html?: string; - text?: string; - screenshotUrl?: string; - screenshot?: Buffer; - pageshotUrl?: string; - pageshot?: Buffer; - links?: { [k: string]: string; }; - images?: { [k: string]: string; }; - usage?: { - total_tokens?: number; - totalTokens?: number; - tokens?: number; - }; - - toString: () => string; -} - const indexProto = { toString: function (): string { return _(this) @@ -72,8 +43,6 @@ const indexProto = { export class CrawlerHost extends RPCHost { logger = this.globalLogger.child({ service: this.constructor.name }); - turnDownPlugins = [require('turndown-plugin-gfm').tables]; - cacheRetentionMs = 1000 * 3600 * 24 * 7; cacheValidMs = 1000 * 3600; urlValidMs = 1000 * 3600 * 4; @@ -83,8 +52,7 @@ export class CrawlerHost extends RPCHost { protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, protected jsdomControl: JSDomControl, - protected altTextService: AltTextService, - protected pdfExtractor: PDFExtractor, + protected snapshotFormatter: SnapshotFormatter, protected firebaseObjectStorage: FirebaseStorageBucketControl, protected rateLimitControl: RateLimitControl, protected threadLocal: AsyncContext, @@ -148,448 +116,6 @@ export class CrawlerHost extends RPCHost { return indexObject; } - getTurndown(options?: { - noRules?: boolean | string, - url?: string | URL; - imgDataUrlToObjectUrl?: boolean; - }) { - const turnDownService = new TurndownService({ - codeBlockStyle: 'fenced', - preformattedCode: true, - } as any); - if (!options?.noRules) { - turnDownService.addRule('remove-irrelevant', { - filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'], - replacement: () => '' - }); - turnDownService.addRule('truncate-svg', { - filter: 'svg' as any, - replacement: () => '' - }); - turnDownService.addRule('title-as-h1', { - filter: ['title'], - replacement: (innerText) => `${innerText}\n===============\n` - }); - } - - if (options?.imgDataUrlToObjectUrl) { - turnDownService.addRule('data-url-to-pseudo-object-url', { - filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')), - replacement: (_content, node: any) => { - const src = (node.getAttribute('src') || '').trim(); - const alt = cleanAttribute(node.getAttribute('alt')) || ''; - - if (options.url) { - const refUrl = new URL(options.url); - const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`); - - return `![${alt}](${mappedUrl})`; - } - - return `![${alt}](blob:${md5Hasher.hash(src)})`; - } - }); - } - - turnDownService.addRule('improved-paragraph', { - filter: 'p', - replacement: (innerText) => { - const trimmed = innerText.trim(); - if (!trimmed) { - return ''; - } - - return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`; - } - }); - turnDownService.addRule('improved-inline-link', { - filter: function (node, options) { - return Boolean( - options.linkStyle === 'inlined' && - node.nodeName === 'A' && - node.getAttribute('href') - ); - }, - - replacement: function (content, node: any) { - let href = node.getAttribute('href'); - if (href) href = href.replace(/([()])/g, '\\$1'); - let title = cleanAttribute(node.getAttribute('title')); - if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; - - const fixedContent = content.replace(/\s+/g, ' ').trim(); - let fixedHref = href.replace(/\s+/g, '').trim(); - if (options?.url) { - try { - fixedHref = new URL(fixedHref, options.url).toString(); - } catch (_err) { - void 0; - } - } - - return `[${fixedContent}](${fixedHref}${title || ''})`; - } - }); - turnDownService.addRule('improved-code', { - filter: function (node: any) { - let hasSiblings = node.previousSibling || node.nextSibling; - let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings; - - return node.nodeName === 'CODE' && !isCodeBlock; - }, - - replacement: function (inputContent: any) { - if (!inputContent) return ''; - let content = inputContent; - - let delimiter = '`'; - let matches = content.match(/`+/gm) || []; - while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'; - if (content.includes('\n')) { - delimiter = '```'; - } - - let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''; - - return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter; - } - }); - - return turnDownService; - } - - getGeneralSnapshotMixins(snapshot: PageSnapshot) { - let inferred; - const mixin: any = {}; - if (this.threadLocal.get('withImagesSummary')) { - inferred ??= this.jsdomControl.inferSnapshot(snapshot); - const imageSummary = {} as { [k: string]: string; }; - const imageIdxTrack = new Map(); - - let imgIdx = 0; - - for (const img of inferred.imgs) { - const imgSerial = ++imgIdx; - const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : []; - idxArr.push(imgSerial); - imageIdxTrack.set(img.src, idxArr); - imageSummary[img.src] = img.alt || ''; - } - - mixin.images = - _(imageSummary) - .toPairs() - .map( - ([url, alt], i) => { - return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; - } - ).fromPairs() - .value(); - } - if (this.threadLocal.get('withLinksSummary')) { - inferred ??= this.jsdomControl.inferSnapshot(snapshot); - mixin.links = _.invert(inferred.links || {}); - } - - return mixin; - } - - async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & { - screenshotUrl?: string; - pageshotUrl?: string; - }, nominalUrl?: URL) { - if (mode === 'screenshot') { - if (snapshot.screenshot && !snapshot.screenshotUrl) { - const fid = `instant-screenshots/${randomUUID()}`; - await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, { - metadata: { - contentType: 'image/png', - } - }); - snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs); - } - - return { - ...this.getGeneralSnapshotMixins(snapshot), - // html: snapshot.html, - screenshotUrl: snapshot.screenshotUrl, - toString() { - return this.screenshotUrl; - } - } as FormattedPage; - } - if (mode === 'pageshot') { - if (snapshot.pageshot && !snapshot.pageshotUrl) { - const fid = `instant-screenshots/${randomUUID()}`; - await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, { - metadata: { - contentType: 'image/png', - } - }); - snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs); - } - - return { - ...this.getGeneralSnapshotMixins(snapshot), - html: snapshot.html, - pageshotUrl: snapshot.pageshotUrl, - toString() { - return this.pageshotUrl; - } - } as FormattedPage; - } - if (mode === 'html') { - return { - ...this.getGeneralSnapshotMixins(snapshot), - html: snapshot.html, - toString() { - return this.html; - } - } as FormattedPage; - } - - let pdfMode = false; - if (snapshot.pdfs?.length && !snapshot.title) { - const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0], - this.threadLocal.get('cacheTolerance') - ); - if (pdf) { - pdfMode = true; - snapshot.title = pdf.meta?.Title; - snapshot.text = pdf.text || snapshot.text; - snapshot.parsed = { - content: pdf.content, - textContent: pdf.content, - length: pdf.content?.length, - byline: pdf.meta?.Author, - lang: pdf.meta?.Language || undefined, - title: pdf.meta?.Title, - publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(), - }; - } - } - - if (mode === 'text') { - return { - ...this.getGeneralSnapshotMixins(snapshot), - text: snapshot.text, - toString() { - return this.text; - } - } as FormattedPage; - } - const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl')); - - let contentText = ''; - const imageSummary = {} as { [k: string]: string; }; - const imageIdxTrack = new Map(); - const uid = this.threadLocal.get('uid'); - do { - if (pdfMode) { - contentText = snapshot.parsed?.content || snapshot.text; - break; - } - - if ( - snapshot.maxElemDepth! > 256 || - (!uid && snapshot.elemCount! > 10_000) || - snapshot.elemCount! > 70_000 - ) { - this.logger.warn('Degrading to text to protect the server', { url: snapshot.href }); - contentText = snapshot.text; - break; - } - - const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); - let toBeTurnedToMd = jsDomElementOfHTML; - let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); - if (mode !== 'markdown' && snapshot.parsed?.content) { - const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); - const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); - const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; - - // If Readability did its job - if (par2.length >= 0.3 * par1.length) { - turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); - if (snapshot.parsed.content) { - toBeTurnedToMd = jsDomElementOfParsed; - } - } - } - - for (const plugin of this.turnDownPlugins) { - turnDownService = turnDownService.use(plugin); - } - const urlToAltMap: { [k: string]: string | undefined; } = {}; - if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) { - const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => { - const r = await this.altTextService.getAltText(x).catch((err: any) => { - this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) }); - return undefined; - }); - if (r && x.src) { - urlToAltMap[x.src.trim()] = r; - } - }); - - await Promise.all(tasks); - } - let imgIdx = 0; - turnDownService.addRule('img-generated-alt', { - filter: 'img', - replacement: (_content, node: any) => { - let linkPreferredSrc = (node.getAttribute('src') || '').trim(); - if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) { - const dataSrc = (node.getAttribute('data-src') || '').trim(); - if (dataSrc && !dataSrc.startsWith('data:')) { - linkPreferredSrc = dataSrc; - } - } - - let src; - try { - src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString(); - } catch (_err) { - void 0; - } - const alt = cleanAttribute(node.getAttribute('alt')); - if (!src) { - return ''; - } - const mapped = urlToAltMap[src]; - const imgSerial = ++imgIdx; - const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : []; - idxArr.push(imgSerial); - imageIdxTrack.set(src, idxArr); - - if (mapped) { - imageSummary[src] = mapped || alt; - - if (src?.startsWith('data:') && imgDataUrlToObjectUrl) { - const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`); - mappedUrl.protocol = 'blob:'; - - return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`; - } - - return `![Image ${imgIdx}: ${mapped || alt}](${src})`; - } - - imageSummary[src] = alt || ''; - - if (src?.startsWith('data:') && imgDataUrlToObjectUrl) { - const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`); - mappedUrl.protocol = 'blob:'; - - return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`; - } - - return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`; - } - }); - - if (toBeTurnedToMd) { - try { - contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); - } catch (err) { - this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); - const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); - try { - contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); - } catch (err2) { - this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); - } - } - } - - if ( - !contentText || (contentText.startsWith('<') && contentText.endsWith('>')) - && toBeTurnedToMd !== jsDomElementOfHTML - ) { - try { - contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); - } catch (err) { - this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); - const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); - try { - contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); - } catch (err2) { - this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); - } - } - } - if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) { - contentText = snapshot.text; - } - } while (false); - - const cleanText = (contentText || '').trim(); - - const formatted: FormattedPage = { - title: (snapshot.parsed?.title || snapshot.title || '').trim(), - url: nominalUrl?.toString() || snapshot.href?.trim(), - content: cleanText, - publishedTime: snapshot.parsed?.publishedTime || undefined, - - toString() { - if (mode === 'markdown') { - return this.content as string; - } - - const mixins = []; - if (this.publishedTime) { - mixins.push(`Published Time: ${this.publishedTime}`); - } - const suffixMixins = []; - if (this.images) { - const imageSummaryChunks = ['Images:']; - for (const [k, v] of Object.entries(this.images)) { - imageSummaryChunks.push(`- ![${k}](${v})`); - } - if (imageSummaryChunks.length === 1) { - imageSummaryChunks.push('This page does not seem to contain any images.'); - } - suffixMixins.push(imageSummaryChunks.join('\n')); - } - if (this.links) { - const linkSummaryChunks = ['Links/Buttons:']; - for (const [k, v] of Object.entries(this.links)) { - linkSummaryChunks.push(`- [${k}](${v})`); - } - if (linkSummaryChunks.length === 1) { - linkSummaryChunks.push('This page does not seem to contain any buttons/links.'); - } - suffixMixins.push(linkSummaryChunks.join('\n')); - } - - return `Title: ${this.title} - -URL Source: ${this.url} -${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''} -Markdown Content: -${this.content} -${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; - } - }; - - if (this.threadLocal.get('withImagesSummary')) { - formatted.images = - _(imageSummary) - .toPairs() - .map( - ([url, alt], i) => { - return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; - } - ).fromPairs() - .value(); - } - if (this.threadLocal.get('withLinksSummary')) { - formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {}); - } - - return formatted as FormattedPage; - } - @CloudHTTPv2({ name: 'crawl2', runtime: { @@ -604,7 +130,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; }) @CloudHTTPv2({ runtime: { - memory: '4GiB', + memory: '8GiB', cpu: 4, timeoutSeconds: 300, concurrency: 22, @@ -723,7 +249,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; continue; } - const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl); + const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); sseStream.write({ event: 'data', @@ -754,7 +280,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; continue; } - const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl); + const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.timeout === undefined) { @@ -770,7 +296,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl); + const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); return formatted; @@ -782,24 +308,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; continue; } - const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl); + const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.timeout === undefined) { if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { - return assignTransferProtocolMeta(`${formatted}`, + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } ); } if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { - return assignTransferProtocolMeta(`${formatted}`, + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } ); } - return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); } } @@ -807,22 +333,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl); + const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { - return assignTransferProtocolMeta(`${formatted}`, + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } ); } if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { - return assignTransferProtocolMeta(`${formatted}`, + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } ); } - return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); } async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) { @@ -1181,7 +707,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } catch (err) { if (lastSnapshot) { - return this.formatSnapshot(mode, lastSnapshot, url); + return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } throw err; @@ -1191,6 +717,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; throw new AssertionFailureError(`No content available`); } - return this.formatSnapshot(mode, lastSnapshot, url); + return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } } diff --git a/backend/functions/src/cloud-functions/data-crunching.ts b/backend/functions/src/cloud-functions/data-crunching.ts index c71ff9f..ebd41a5 100644 --- a/backend/functions/src/cloud-functions/data-crunching.ts +++ b/backend/functions/src/cloud-functions/data-crunching.ts @@ -18,6 +18,7 @@ import { appendFile } from 'fs/promises'; import { createGzip } from 'zlib'; import { getFunctions } from 'firebase-admin/functions'; import { GoogleAuth } from 'google-auth-library'; +import { SnapshotFormatter } from '../services/snapshot-formatter'; dayjs.extend(require('dayjs/plugin/utc')); @@ -57,6 +58,7 @@ export class DataCrunchingHost extends RPCHost { protected globalLogger: Logger, protected crawler: CrawlerHost, + protected snapshotFormatter: SnapshotFormatter, protected tempFileManager: TempFileManager, protected firebaseObjectStorage: FirebaseStorageBucketControl, ) { @@ -265,9 +267,9 @@ export class DataCrunchingHost extends RPCHost { try { const snapshot = JSON.parse(snapshotTxt.toString('utf-8')); - let formatted = await this.crawler.formatSnapshot('default', snapshot); + let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot); if (!formatted.content) { - formatted = await this.crawler.formatSnapshot('markdown', snapshot); + formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot); } await nextDrainDeferred.promise; diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index 11e5afe..0af9ef3 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -11,11 +11,12 @@ import _ from 'lodash'; import { Request, Response } from 'express'; import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search'; -import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler'; +import { CrawlerHost, ExtraScrappingOptions } from './crawler'; import { WebSearchQueryParams } from '../shared/3rd-party/brave-search'; import { SearchResult } from '../db/searched'; import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types'; import { CrawlerOptions } from '../dto/scrapping-options'; +import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter'; @singleton() @@ -36,6 +37,7 @@ export class SearcherHost extends RPCHost { protected threadLocal: AsyncContext, protected braveSearchService: BraveSearchService, protected crawler: CrawlerHost, + protected snapshotFormatter: SnapshotFormatter, ) { super(...arguments); } @@ -324,7 +326,7 @@ export class SearcherHost extends RPCHost { if (snapshotMap.has(x)) { return snapshotMap.get(x); } - return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => { + return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => { r.title ??= upstreamSearchResult.title; r.description = upstreamSearchResult.description; snapshotMap.set(x, r); diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 84adcaf..176f7dc 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -2,18 +2,19 @@ import { container, singleton } from 'tsyringe'; import { AsyncService, marshalErrorLike } from 'civkit'; import { Logger } from '../shared/services/logger'; import { ExtendedSnapshot, PageSnapshot } from './puppeteer'; -import { JSDOM, VirtualConsole } from 'jsdom'; import { Readability } from '@mozilla/readability'; import TurndownService from 'turndown'; +import { Threaded } from '../shared/services/threaded'; -const virtualConsole = new VirtualConsole(); -virtualConsole.on('error', () => void 0); +const pLinkedom = import('linkedom'); @singleton() export class JSDomControl extends AsyncService { logger = this.globalLogger.child({ service: this.constructor.name }); + linkedom!: Awaited; + constructor( protected globalLogger: Logger, ) { @@ -22,22 +23,34 @@ export class JSDomControl extends AsyncService { override async init() { await this.dependencyReady(); + this.linkedom = await pLinkedom; this.emit('ready'); } - narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { + async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { targetSelector?: string | string[]; removeSelector?: string | string[]; withIframe?: boolean; - }): PageSnapshot | undefined { + }) { if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) { return snapshot; } if (!snapshot?.html) { return snapshot; } + + return this.actualNarrowSnapshot(snapshot, options); + } + + @Threaded() + async actualNarrowSnapshot(snapshot: PageSnapshot, options?: { + targetSelector?: string | string[]; + removeSelector?: string | string[]; + withIframe?: boolean; + }): Promise { + const t0 = Date.now(); - const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); + const jsdom = this.linkedom.parseHTML(snapshot.html); const allNodes: Node[] = []; jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); if (options?.withIframe) { @@ -90,16 +103,16 @@ export class JSDomControl extends AsyncService { let rootDoc: Document; if (allNodes.length === 1 && allNodes[0].nodeName === '#document') { rootDoc = allNodes[0] as any; - if (rootDoc.body.textContent) { - textChunks.push(rootDoc.body.textContent); + if (rootDoc.body.innerText) { + textChunks.push(rootDoc.body.innerText); } } else { - rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document; + rootDoc = this.linkedom.parseHTML('').window.document; for (const n of allNodes) { rootDoc.body.appendChild(n); rootDoc.body.appendChild(rootDoc.createTextNode('\n\n')); - if (n.textContent) { - textChunks.push(n.textContent); + if ((n as HTMLElement).innerText) { + textChunks.push((n as HTMLElement).innerText); } } } @@ -111,11 +124,6 @@ export class JSDomControl extends AsyncService { this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) }); } - // No innerText in jsdom - // https://github.com/jsdom/jsdom/issues/1245 - const textContent = textChunks.join('\n\n'); - const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n'); - const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]')) .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')]) .flat() @@ -135,7 +143,7 @@ export class JSDomControl extends AsyncService { title: snapshot.title || jsdom.window.document.title, parsed, html: rootDoc.documentElement.outerHTML, - text: cleanedText, + text: textChunks.join('\n'), imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], } as PageSnapshot; @@ -147,11 +155,13 @@ export class JSDomControl extends AsyncService { return r; } + @Threaded() inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot { const t0 = Date.now(); const extendedSnapshot = { ...snapshot } as ExtendedSnapshot; try { - const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); + const jsdom = this.linkedom.parseHTML(snapshot.html); + jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); const links = Array.from(jsdom.window.document.querySelectorAll('a[href]')) .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()]) @@ -207,9 +217,8 @@ export class JSDomControl extends AsyncService { return extendedSnapshot; } - snippetToElement(snippet?: string, url?: string) { - const parsed = new JSDOM(snippet || '', { url, virtualConsole }); + const parsed = this.linkedom.parseHTML(snippet || ''); return parsed.window.document.documentElement; } diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 75229c1..c4598c3 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -1,7 +1,7 @@ import os from 'os'; import fs from 'fs'; import { container, singleton } from 'tsyringe'; -import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred, perNextTick } from 'civkit'; +import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit'; import { Logger } from '../shared/services/logger'; import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer'; @@ -207,7 +207,6 @@ export class PuppeteerControl extends AsyncService { browser!: Browser; logger = this.globalLogger.child({ service: this.constructor.name }); - private __healthCheckInterval?: NodeJS.Timeout; private __reqCapInterval?: NodeJS.Timeout; __loadedPage: Page[] = []; @@ -217,7 +216,7 @@ export class PuppeteerControl extends AsyncService { livePages = new Set(); lastPageCratedAt: number = 0; - rpsCap: number = 300; + rpsCap: number = 500; lastReqSentAt: number = 0; requestDeferredQueue: Deferred[] = []; @@ -235,15 +234,7 @@ export class PuppeteerControl extends AsyncService { }); } - briefPages() { - this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`); - } - override async init() { - if (this.__healthCheckInterval) { - clearInterval(this.__healthCheckInterval); - this.__healthCheckInterval = undefined; - } if (this.__reqCapInterval) { clearInterval(this.__reqCapInterval); this.__reqCapInterval = undefined; @@ -276,40 +267,9 @@ export class PuppeteerControl extends AsyncService { this.emit('ready'); - this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref(); this.newPage().then((r) => this.__loadedPage.push(r)); } - @maxConcurrency(1) - async healthCheck() { - if (Date.now() - this.lastPageCratedAt <= 10_000) { - this.briefPages(); - return; - } - const healthyPage = await this.newPage().catch((err) => { - this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) }); - return null; - }); - - if (healthyPage) { - this.__loadedPage.push(healthyPage); - - if (this.__loadedPage.length > 3) { - this.ditchPage(this.__loadedPage.shift()!); - } - - this.briefPages(); - - return; - } - - this.logger.warn(`Trying to clean up...`); - this.browser.process()?.kill('SIGKILL'); - Reflect.deleteProperty(this, 'browser'); - this.emit('crippled'); - this.logger.warn(`Browser killed`); - } - @perNextTick() reqCapRoutine() { const now = Date.now(); @@ -620,7 +580,7 @@ document.addEventListener('load', handlePageLoad); try { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot(); + screenshot = Buffer.from(await page.screenshot()); if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -643,8 +603,8 @@ document.addEventListener('load', handlePageLoad); if (salvaged) { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot(); - pageshot = await page.screenshot({ fullPage: true }); + screenshot = Buffer.from(await page.screenshot()); + pageshot = Buffer.from(await page.screenshot({ fullPage: true })); if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -678,8 +638,8 @@ document.addEventListener('load', handlePageLoad); .then(async () => { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot(); - pageshot = await page.screenshot({ fullPage: true }); + screenshot = Buffer.from(await page.screenshot()); + pageshot = Buffer.from(await page.screenshot({ fullPage: true })); if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -716,8 +676,8 @@ document.addEventListener('load', handlePageLoad); break; } if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { - screenshot = await page.screenshot(); - pageshot = await page.screenshot({ fullPage: true }); + screenshot = Buffer.from(await page.screenshot()); + pageshot = Buffer.from(await page.screenshot({ fullPage: true })); lastHTML = snapshot.html; } if (snapshot || screenshot) { diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts new file mode 100644 index 0000000..5ab129b --- /dev/null +++ b/backend/functions/src/services/snapshot-formatter.ts @@ -0,0 +1,539 @@ +import { randomUUID } from 'crypto'; +import { container, singleton } from 'tsyringe'; +import { AsyncService, HashManager, marshalErrorLike } from 'civkit'; +import TurndownService from 'turndown'; +import { Logger } from '../shared/services/logger'; +import { PageSnapshot } from './puppeteer'; +import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; +import { AsyncContext } from '../shared/services/async-context'; +import { Threaded } from '../shared/services/threaded'; +import { JSDomControl } from './jsdom'; +import { AltTextService } from './alt-text'; +import { PDFExtractor } from './pdf-extract'; +import { cleanAttribute } from '../utils/misc'; +import _ from 'lodash'; + + +export interface FormattedPage { + title?: string; + description?: string; + url?: string; + content?: string; + publishedTime?: string; + html?: string; + text?: string; + screenshotUrl?: string; + screenshot?: Buffer; + pageshotUrl?: string; + pageshot?: Buffer; + links?: { [k: string]: string; }; + images?: { [k: string]: string; }; + usage?: { + total_tokens?: number; + totalTokens?: number; + tokens?: number; + }; + + textRepresentation?: string; + + [Symbol.dispose]: () => void; +} + +export const md5Hasher = new HashManager('md5', 'hex'); + +@singleton() +export class SnapshotFormatter extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + + turnDownPlugins = [require('turndown-plugin-gfm').tables]; + + constructor( + protected globalLogger: Logger, + protected jsdomControl: JSDomControl, + protected altTextService: AltTextService, + protected pdfExtractor: PDFExtractor, + protected threadLocal: AsyncContext, + protected firebaseObjectStorage: FirebaseStorageBucketControl, + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + this.emit('ready'); + } + + + @Threaded() + async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & { + screenshotUrl?: string; + pageshotUrl?: string; + }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) { + const t0 = Date.now(); + if (mode === 'screenshot') { + if (snapshot.screenshot && !snapshot.screenshotUrl) { + const fid = `instant-screenshots/${randomUUID()}`; + await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, { + metadata: { + contentType: 'image/png', + } + }); + snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs); + } + + const f = { + ...this.getGeneralSnapshotMixins(snapshot), + // html: snapshot.html, + screenshotUrl: snapshot.screenshotUrl, + }; + + Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false }); + + const dt = Date.now() - t0; + this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + + return f as FormattedPage; + } + if (mode === 'pageshot') { + if (snapshot.pageshot && !snapshot.pageshotUrl) { + const fid = `instant-screenshots/${randomUUID()}`; + await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, { + metadata: { + contentType: 'image/png', + } + }); + snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs); + } + + const f = { + ...this.getGeneralSnapshotMixins(snapshot), + html: snapshot.html, + pageshotUrl: snapshot.pageshotUrl, + } as FormattedPage; + + Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false }); + + const dt = Date.now() - t0; + this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + + return f; + } + if (mode === 'html') { + const f = { + ...this.getGeneralSnapshotMixins(snapshot), + html: snapshot.html, + } as FormattedPage; + + Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false }); + + const dt = Date.now() - t0; + this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + + return f; + } + + let pdfMode = false; + if (snapshot.pdfs?.length && !snapshot.title) { + const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0], + this.threadLocal.get('cacheTolerance') + ); + if (pdf) { + pdfMode = true; + snapshot.title = pdf.meta?.Title; + snapshot.text = pdf.text || snapshot.text; + snapshot.parsed = { + content: pdf.content, + textContent: pdf.content, + length: pdf.content?.length, + byline: pdf.meta?.Author, + lang: pdf.meta?.Language || undefined, + title: pdf.meta?.Title, + publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(), + }; + } + } + + if (mode === 'text') { + const f = { + ...this.getGeneralSnapshotMixins(snapshot), + text: snapshot.text, + } as FormattedPage; + + Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false }); + + const dt = Date.now() - t0; + this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + + return f; + } + const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl')); + + let contentText = ''; + const imageSummary = {} as { [k: string]: string; }; + const imageIdxTrack = new Map(); + const uid = this.threadLocal.get('uid'); + do { + if (pdfMode) { + contentText = snapshot.parsed?.content || snapshot.text; + break; + } + + if ( + snapshot.maxElemDepth! > 256 || + (!uid && snapshot.elemCount! > 10_000) || + snapshot.elemCount! > 70_000 + ) { + this.logger.warn('Degrading to text to protect the server', { url: snapshot.href }); + contentText = snapshot.text; + break; + } + + const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); + let toBeTurnedToMd = jsDomElementOfHTML; + let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); + if (mode !== 'markdown' && snapshot.parsed?.content) { + const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); + const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); + const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; + + // If Readability did its job + if (par2.length >= 0.3 * par1.length) { + turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); + if (snapshot.parsed.content) { + toBeTurnedToMd = jsDomElementOfParsed; + } + } + } + + for (const plugin of this.turnDownPlugins) { + turnDownService = turnDownService.use(plugin); + } + const urlToAltMap: { [k: string]: string | undefined; } = {}; + if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) { + const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => { + const r = await this.altTextService.getAltText(x).catch((err: any) => { + this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) }); + return undefined; + }); + if (r && x.src) { + urlToAltMap[x.src.trim()] = r; + } + }); + + await Promise.all(tasks); + } + let imgIdx = 0; + turnDownService.addRule('img-generated-alt', { + filter: 'img', + replacement: (_content, node: any) => { + let linkPreferredSrc = (node.getAttribute('src') || '').trim(); + if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) { + const dataSrc = (node.getAttribute('data-src') || '').trim(); + if (dataSrc && !dataSrc.startsWith('data:')) { + linkPreferredSrc = dataSrc; + } + } + + let src; + try { + src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString(); + } catch (_err) { + void 0; + } + const alt = cleanAttribute(node.getAttribute('alt')); + if (!src) { + return ''; + } + const mapped = urlToAltMap[src]; + const imgSerial = ++imgIdx; + const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : []; + idxArr.push(imgSerial); + imageIdxTrack.set(src, idxArr); + + if (mapped) { + imageSummary[src] = mapped || alt; + + if (src?.startsWith('data:') && imgDataUrlToObjectUrl) { + const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`); + mappedUrl.protocol = 'blob:'; + + return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`; + } + + return `![Image ${imgIdx}: ${mapped || alt}](${src})`; + } + + imageSummary[src] = alt || ''; + + if (src?.startsWith('data:') && imgDataUrlToObjectUrl) { + const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`); + mappedUrl.protocol = 'blob:'; + + return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`; + } + + return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`; + } + }); + + if (toBeTurnedToMd) { + try { + contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); + } catch (err) { + this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); + const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); + try { + contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); + } catch (err2) { + this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); + } + } + } + + if ( + !contentText || (contentText.startsWith('<') && contentText.endsWith('>')) + && toBeTurnedToMd !== jsDomElementOfHTML + ) { + try { + contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); + } catch (err) { + this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); + const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); + try { + contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); + } catch (err2) { + this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); + } + } + } + if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) { + contentText = snapshot.text; + } + } while (false); + + const cleanText = (contentText || '').trim(); + + const formatted: FormattedPage = { + title: (snapshot.parsed?.title || snapshot.title || '').trim(), + url: nominalUrl?.toString() || snapshot.href?.trim(), + content: cleanText, + publishedTime: snapshot.parsed?.publishedTime || undefined, + [Symbol.dispose]: () => { }, + }; + + if (this.threadLocal.get('withImagesSummary')) { + formatted.images = + _(imageSummary) + .toPairs() + .map( + ([url, alt], i) => { + return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; + } + ).fromPairs() + .value(); + } + if (this.threadLocal.get('withLinksSummary')) { + formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {}); + } + + const textRepresentation = (function (this: typeof formatted) { + if (mode === 'markdown') { + return this.content as string; + } + + const mixins = []; + if (this.publishedTime) { + mixins.push(`Published Time: ${this.publishedTime}`); + } + const suffixMixins = []; + if (this.images) { + const imageSummaryChunks = ['Images:']; + for (const [k, v] of Object.entries(this.images)) { + imageSummaryChunks.push(`- ![${k}](${v})`); + } + if (imageSummaryChunks.length === 1) { + imageSummaryChunks.push('This page does not seem to contain any images.'); + } + suffixMixins.push(imageSummaryChunks.join('\n')); + } + if (this.links) { + const linkSummaryChunks = ['Links/Buttons:']; + for (const [k, v] of Object.entries(this.links)) { + linkSummaryChunks.push(`- [${k}](${v})`); + } + if (linkSummaryChunks.length === 1) { + linkSummaryChunks.push('This page does not seem to contain any buttons/links.'); + } + suffixMixins.push(linkSummaryChunks.join('\n')); + } + + return `Title: ${this.title} + +URL Source: ${this.url} +${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''} +Markdown Content: +${this.content} +${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; + }).call(formatted); + + Object.defineProperty(formatted, 'textRepresentation', { value: textRepresentation, enumerable: false }); + + const dt = Date.now() - t0; + this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + + return formatted as FormattedPage; + } + + getGeneralSnapshotMixins(snapshot: PageSnapshot) { + let inferred; + const mixin: any = {}; + if (this.threadLocal.get('withImagesSummary')) { + inferred ??= this.jsdomControl.inferSnapshot(snapshot); + const imageSummary = {} as { [k: string]: string; }; + const imageIdxTrack = new Map(); + + let imgIdx = 0; + + for (const img of inferred.imgs) { + const imgSerial = ++imgIdx; + const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : []; + idxArr.push(imgSerial); + imageIdxTrack.set(img.src, idxArr); + imageSummary[img.src] = img.alt || ''; + } + + mixin.images = + _(imageSummary) + .toPairs() + .map( + ([url, alt], i) => { + return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; + } + ).fromPairs() + .value(); + } + if (this.threadLocal.get('withLinksSummary')) { + inferred ??= this.jsdomControl.inferSnapshot(snapshot); + mixin.links = _.invert(inferred.links || {}); + } + + return mixin; + } + + getTurndown(options?: { + noRules?: boolean | string, + url?: string | URL; + imgDataUrlToObjectUrl?: boolean; + }) { + const turnDownService = new TurndownService({ + codeBlockStyle: 'fenced', + preformattedCode: true, + } as any); + if (!options?.noRules) { + turnDownService.addRule('remove-irrelevant', { + filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'], + replacement: () => '' + }); + turnDownService.addRule('truncate-svg', { + filter: 'svg' as any, + replacement: () => '' + }); + turnDownService.addRule('title-as-h1', { + filter: ['title'], + replacement: (innerText) => `${innerText}\n===============\n` + }); + } + + if (options?.imgDataUrlToObjectUrl) { + turnDownService.addRule('data-url-to-pseudo-object-url', { + filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')), + replacement: (_content, node: any) => { + const src = (node.getAttribute('src') || '').trim(); + const alt = cleanAttribute(node.getAttribute('alt')) || ''; + + if (options.url) { + const refUrl = new URL(options.url); + const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`); + + return `![${alt}](${mappedUrl})`; + } + + return `![${alt}](blob:${md5Hasher.hash(src)})`; + } + }); + } + + turnDownService.addRule('improved-paragraph', { + filter: 'p', + replacement: (innerText) => { + const trimmed = innerText.trim(); + if (!trimmed) { + return ''; + } + + return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`; + } + }); + turnDownService.addRule('improved-inline-link', { + filter: function (node, options) { + return Boolean( + options.linkStyle === 'inlined' && + node.nodeName === 'A' && + node.getAttribute('href') + ); + }, + + replacement: function (content, node: any) { + let href = node.getAttribute('href'); + if (href) href = href.replace(/([()])/g, '\\$1'); + let title = cleanAttribute(node.getAttribute('title')); + if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; + + const fixedContent = content.replace(/\s+/g, ' ').trim(); + let fixedHref = href.replace(/\s+/g, '').trim(); + if (options?.url) { + try { + fixedHref = new URL(fixedHref, options.url).toString(); + } catch (_err) { + void 0; + } + } + + return `[${fixedContent}](${fixedHref}${title || ''})`; + } + }); + turnDownService.addRule('improved-code', { + filter: function (node: any) { + let hasSiblings = node.previousSibling || node.nextSibling; + let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings; + + return node.nodeName === 'CODE' && !isCodeBlock; + }, + + replacement: function (inputContent: any) { + if (!inputContent) return ''; + let content = inputContent; + + let delimiter = '`'; + let matches = content.match(/`+/gm) || []; + while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'; + if (content.includes('\n')) { + delimiter = '```'; + } + + let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''; + + return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter; + } + }); + + return turnDownService; + } + + +} + +const snapshotFormatter = container.resolve(SnapshotFormatter); + +export default snapshotFormatter;