mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 15:55:57 +08:00
feat: control cache tolerance and select target using headers
This commit is contained in:
parent
029f568c38
commit
6f65083f8d
@ -59,7 +59,11 @@ As you have already seen above, one can control the behavior of the Reader API u
|
|||||||
- `x-respond-with: text` returns `document.body.innerText`
|
- `x-respond-with: text` returns `document.body.innerText`
|
||||||
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
||||||
- You can specify a proxy server via the `x-proxy-url` header.
|
- You can specify a proxy server via the `x-proxy-url` header.
|
||||||
- You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
|
- You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds).
|
||||||
|
- You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent to `x-cache-tolerance: 0`).
|
||||||
|
- If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page.
|
||||||
|
- By setting the `x-target-selector` header to a CSS selector, the Reader API returns the content within the matched element, instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target.
|
||||||
|
- By setting the `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you already specified `x-target-selector`, this header can be omitted if you plan to wait for the same element.
|
||||||
|
|
||||||
|
|
||||||
### Streaming mode
|
### Streaming mode
|
||||||
|
235
backend/functions/package-lock.json
generated
235
backend/functions/package-lock.json
generated
@ -23,6 +23,7 @@
|
|||||||
"generic-pool": "^3.9.0",
|
"generic-pool": "^3.9.0",
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
|
"jsdom": "^24.0.0",
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
"maxmind": "^4.3.18",
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
@ -4036,6 +4037,17 @@
|
|||||||
"node": ">= 8"
|
"node": ">= 8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/cssstyle": {
|
||||||
|
"version": "4.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
|
||||||
|
"integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
|
||||||
|
"dependencies": {
|
||||||
|
"rrweb-cssom": "^0.6.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/data-uri-to-buffer": {
|
"node_modules/data-uri-to-buffer": {
|
||||||
"version": "6.0.2",
|
"version": "6.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
||||||
@ -4044,6 +4056,41 @@
|
|||||||
"node": ">= 14"
|
"node": ">= 14"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/data-urls": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==",
|
||||||
|
"dependencies": {
|
||||||
|
"whatwg-mimetype": "^4.0.0",
|
||||||
|
"whatwg-url": "^14.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/data-urls/node_modules/tr46": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
|
||||||
|
"dependencies": {
|
||||||
|
"punycode": "^2.3.1"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/data-urls/node_modules/whatwg-url": {
|
||||||
|
"version": "14.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
|
||||||
|
"integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
|
||||||
|
"dependencies": {
|
||||||
|
"tr46": "^5.0.0",
|
||||||
|
"webidl-conversions": "^7.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/data-view-buffer": {
|
"node_modules/data-view-buffer": {
|
||||||
"version": "1.0.1",
|
"version": "1.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
|
||||||
@ -4116,6 +4163,11 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/decimal.js": {
|
||||||
|
"version": "10.4.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz",
|
||||||
|
"integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
|
||||||
|
},
|
||||||
"node_modules/decode-uri-component": {
|
"node_modules/decode-uri-component": {
|
||||||
"version": "0.2.2",
|
"version": "0.2.2",
|
||||||
"resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
|
"resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
|
||||||
@ -6119,6 +6171,17 @@
|
|||||||
"node": ">= 0.4"
|
"node": ">= 0.4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/html-encoding-sniffer": {
|
||||||
|
"version": "4.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
|
||||||
|
"integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
|
||||||
|
"dependencies": {
|
||||||
|
"whatwg-encoding": "^3.1.1"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/html-escaper": {
|
"node_modules/html-escaper": {
|
||||||
"version": "2.0.2",
|
"version": "2.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
||||||
@ -6307,7 +6370,6 @@
|
|||||||
"version": "0.6.3",
|
"version": "0.6.3",
|
||||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||||
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||||
"optional": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||||
},
|
},
|
||||||
@ -6705,6 +6767,11 @@
|
|||||||
"node": ">=0.10.0"
|
"node": ">=0.10.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/is-potential-custom-element-name": {
|
||||||
|
"version": "1.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
||||||
|
"integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
|
||||||
|
},
|
||||||
"node_modules/is-regex": {
|
"node_modules/is-regex": {
|
||||||
"version": "1.1.4",
|
"version": "1.1.4",
|
||||||
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
|
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
|
||||||
@ -7573,6 +7640,91 @@
|
|||||||
"node": ">=0.1.90"
|
"node": ">=0.1.90"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/jsdom": {
|
||||||
|
"version": "24.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
|
||||||
|
"integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
|
||||||
|
"dependencies": {
|
||||||
|
"cssstyle": "^4.0.1",
|
||||||
|
"data-urls": "^5.0.0",
|
||||||
|
"decimal.js": "^10.4.3",
|
||||||
|
"form-data": "^4.0.0",
|
||||||
|
"html-encoding-sniffer": "^4.0.0",
|
||||||
|
"http-proxy-agent": "^7.0.0",
|
||||||
|
"https-proxy-agent": "^7.0.2",
|
||||||
|
"is-potential-custom-element-name": "^1.0.1",
|
||||||
|
"nwsapi": "^2.2.7",
|
||||||
|
"parse5": "^7.1.2",
|
||||||
|
"rrweb-cssom": "^0.6.0",
|
||||||
|
"saxes": "^6.0.0",
|
||||||
|
"symbol-tree": "^3.2.4",
|
||||||
|
"tough-cookie": "^4.1.3",
|
||||||
|
"w3c-xmlserializer": "^5.0.0",
|
||||||
|
"webidl-conversions": "^7.0.0",
|
||||||
|
"whatwg-encoding": "^3.1.1",
|
||||||
|
"whatwg-mimetype": "^4.0.0",
|
||||||
|
"whatwg-url": "^14.0.0",
|
||||||
|
"ws": "^8.16.0",
|
||||||
|
"xml-name-validator": "^5.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
},
|
||||||
|
"peerDependencies": {
|
||||||
|
"canvas": "^2.11.2"
|
||||||
|
},
|
||||||
|
"peerDependenciesMeta": {
|
||||||
|
"canvas": {
|
||||||
|
"optional": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/jsdom/node_modules/agent-base": {
|
||||||
|
"version": "7.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
|
||||||
|
"integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
|
||||||
|
"dependencies": {
|
||||||
|
"debug": "^4.3.4"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 14"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/jsdom/node_modules/https-proxy-agent": {
|
||||||
|
"version": "7.0.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
|
||||||
|
"integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
|
||||||
|
"dependencies": {
|
||||||
|
"agent-base": "^7.0.2",
|
||||||
|
"debug": "4"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 14"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/jsdom/node_modules/tr46": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
|
||||||
|
"dependencies": {
|
||||||
|
"punycode": "^2.3.1"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/jsdom/node_modules/whatwg-url": {
|
||||||
|
"version": "14.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
|
||||||
|
"integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
|
||||||
|
"dependencies": {
|
||||||
|
"tr46": "^5.0.0",
|
||||||
|
"webidl-conversions": "^7.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/jsesc": {
|
"node_modules/jsesc": {
|
||||||
"version": "2.5.2",
|
"version": "2.5.2",
|
||||||
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
|
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
|
||||||
@ -8671,6 +8823,11 @@
|
|||||||
"set-blocking": "^2.0.0"
|
"set-blocking": "^2.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/nwsapi": {
|
||||||
|
"version": "2.2.10",
|
||||||
|
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz",
|
||||||
|
"integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ=="
|
||||||
|
},
|
||||||
"node_modules/object-assign": {
|
"node_modules/object-assign": {
|
||||||
"version": "4.1.1",
|
"version": "4.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
||||||
@ -8985,6 +9142,17 @@
|
|||||||
"url": "https://github.com/sponsors/sindresorhus"
|
"url": "https://github.com/sponsors/sindresorhus"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/parse5": {
|
||||||
|
"version": "7.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
||||||
|
"integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
|
||||||
|
"dependencies": {
|
||||||
|
"entities": "^4.4.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/parseurl": {
|
"node_modules/parseurl": {
|
||||||
"version": "1.3.3",
|
"version": "1.3.3",
|
||||||
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
||||||
@ -10185,6 +10353,11 @@
|
|||||||
"url": "https://github.com/sponsors/isaacs"
|
"url": "https://github.com/sponsors/isaacs"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/rrweb-cssom": {
|
||||||
|
"version": "0.6.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
|
||||||
|
"integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw=="
|
||||||
|
},
|
||||||
"node_modules/run-parallel": {
|
"node_modules/run-parallel": {
|
||||||
"version": "1.2.0",
|
"version": "1.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
|
||||||
@ -10281,6 +10454,17 @@
|
|||||||
"resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
|
"resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
|
||||||
"integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
|
"integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
|
||||||
},
|
},
|
||||||
|
"node_modules/saxes": {
|
||||||
|
"version": "6.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
|
||||||
|
"integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
|
||||||
|
"dependencies": {
|
||||||
|
"xmlchars": "^2.2.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=v12.22.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/semver": {
|
"node_modules/semver": {
|
||||||
"version": "7.6.0",
|
"version": "7.6.0",
|
||||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
|
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
|
||||||
@ -10941,6 +11125,11 @@
|
|||||||
"url": "https://github.com/sponsors/ljharb"
|
"url": "https://github.com/sponsors/ljharb"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/symbol-tree": {
|
||||||
|
"version": "3.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
|
||||||
|
"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
|
||||||
|
},
|
||||||
"node_modules/tar": {
|
"node_modules/tar": {
|
||||||
"version": "6.2.1",
|
"version": "6.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
|
||||||
@ -11589,6 +11778,17 @@
|
|||||||
"node": ">= 0.8"
|
"node": ">= 0.8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/w3c-xmlserializer": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==",
|
||||||
|
"dependencies": {
|
||||||
|
"xml-name-validator": "^5.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/walker": {
|
"node_modules/walker": {
|
||||||
"version": "1.0.8",
|
"version": "1.0.8",
|
||||||
"resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
|
"resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
|
||||||
@ -11622,7 +11822,6 @@
|
|||||||
"version": "7.0.0",
|
"version": "7.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
|
||||||
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
|
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
|
||||||
"peer": true,
|
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=12"
|
"node": ">=12"
|
||||||
}
|
}
|
||||||
@ -11648,6 +11847,25 @@
|
|||||||
"node": ">=0.8.0"
|
"node": ">=0.8.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/whatwg-encoding": {
|
||||||
|
"version": "3.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
||||||
|
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
||||||
|
"dependencies": {
|
||||||
|
"iconv-lite": "0.6.3"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/whatwg-mimetype": {
|
||||||
|
"version": "4.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
|
||||||
|
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/whatwg-url": {
|
"node_modules/whatwg-url": {
|
||||||
"version": "11.0.0",
|
"version": "11.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
|
||||||
@ -11778,6 +11996,14 @@
|
|||||||
"resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
|
||||||
"integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
|
"integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
|
||||||
},
|
},
|
||||||
|
"node_modules/xml-name-validator": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/xml2js": {
|
"node_modules/xml2js": {
|
||||||
"version": "0.5.0",
|
"version": "0.5.0",
|
||||||
"resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
|
"resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
|
||||||
@ -11798,6 +12024,11 @@
|
|||||||
"node": ">=4.0"
|
"node": ">=4.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/xmlchars": {
|
||||||
|
"version": "2.2.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
|
||||||
|
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
|
||||||
|
},
|
||||||
"node_modules/y18n": {
|
"node_modules/y18n": {
|
||||||
"version": "5.0.8",
|
"version": "5.0.8",
|
||||||
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
||||||
|
@ -43,6 +43,7 @@
|
|||||||
"generic-pool": "^3.9.0",
|
"generic-pool": "^3.9.0",
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
|
"jsdom": "^24.0.0",
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
"maxmind": "^4.3.18",
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
|
@ -24,6 +24,10 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
|||||||
|
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
|
targetSelector?: string;
|
||||||
|
}
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class CrawlerHost extends RPCHost {
|
export class CrawlerHost extends RPCHost {
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
@ -31,7 +35,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
||||||
|
|
||||||
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
||||||
cacheValidMs = 1000 * 300;
|
cacheValidMs = 1000 * 3600;
|
||||||
urlValidMs = 1000 * 3600 * 4;
|
urlValidMs = 1000 * 3600 * 4;
|
||||||
|
|
||||||
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
||||||
@ -299,8 +303,13 @@ ${this.content}
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Cache-Tolerance': {
|
||||||
|
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-No-Cache': {
|
'X-No-Cache': {
|
||||||
description: `Ignores internal cache if this header is specified with a value.`,
|
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
|
||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
@ -315,6 +324,20 @@ ${this.content}
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Wait-For-Selector': {
|
||||||
|
description: `Specifies a CSS selector to wait for the appearance of such an element before returning. \n\n` +
|
||||||
|
'Example: `X-Wait-For-Selector: .content-block`\n'
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Target-Selector': {
|
||||||
|
description: `Specifies a CSS selector for return target instead of the full html. \n\n` +
|
||||||
|
'Implies `X-Wait-For-Selector: (same selector)`'
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-Proxy-Url': {
|
'X-Proxy-Url': {
|
||||||
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
||||||
`Supported protocols:\n` +
|
`Supported protocols:\n` +
|
||||||
@ -426,7 +449,15 @@ ${this.content}
|
|||||||
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
||||||
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
||||||
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
||||||
const cacheTolerance = noCache ? 0 : this.cacheValidMs;
|
let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
||||||
|
if (isNaN(cacheTolerance)) {
|
||||||
|
cacheTolerance = this.cacheValidMs;
|
||||||
|
if (noCache) {
|
||||||
|
cacheTolerance = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const targetSelector = ctx.req.get('x-target-selector') || undefined;
|
||||||
|
const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector;
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
||||||
if (Array.isArray(setCookieHeaders)) {
|
if (Array.isArray(setCookieHeaders)) {
|
||||||
@ -444,10 +475,12 @@ ${this.content}
|
|||||||
}
|
}
|
||||||
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
||||||
|
|
||||||
const crawlOpts: ScrappingOptions = {
|
const crawlOpts: ExtraScrappingOptions = {
|
||||||
proxyUrl: ctx.req.get('x-proxy-url'),
|
proxyUrl: ctx.req.get('x-proxy-url'),
|
||||||
cookies,
|
cookies,
|
||||||
favorScreenshot: customMode === 'screenshot'
|
favorScreenshot: customMode === 'screenshot',
|
||||||
|
waitForSelector,
|
||||||
|
targetSelector,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
@ -484,7 +517,7 @@ ${this.content}
|
|||||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -506,7 +539,7 @@ ${this.content}
|
|||||||
|
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -642,24 +675,32 @@ ${this.content}
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
||||||
let cache;
|
let cache;
|
||||||
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
||||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
||||||
yield cache.snapshot;
|
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
if (crawlOpts?.targetSelector) {
|
||||||
|
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
||||||
|
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
if (cache) {
|
if (cache) {
|
||||||
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
||||||
yield cache.snapshot;
|
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
throw err;
|
throw err;
|
||||||
|
@ -181,7 +181,13 @@ export class SearcherHost extends RPCHost {
|
|||||||
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
||||||
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
||||||
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
||||||
const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs;
|
let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
||||||
|
if (isNaN(pageCacheTolerance)) {
|
||||||
|
pageCacheTolerance = this.pageCacheToleranceMs;
|
||||||
|
if (noCache) {
|
||||||
|
pageCacheTolerance = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
||||||
if (Array.isArray(setCookieHeaders)) {
|
if (Array.isArray(setCookieHeaders)) {
|
||||||
|
@ -4,6 +4,7 @@ import { container, singleton } from 'tsyringe';
|
|||||||
import genericPool from 'generic-pool';
|
import genericPool from 'generic-pool';
|
||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import { JSDOM } from 'jsdom';
|
||||||
|
|
||||||
import type { Browser, CookieParam, Page } from 'puppeteer';
|
import type { Browser, CookieParam, Page } from 'puppeteer';
|
||||||
import puppeteer from 'puppeteer-extra';
|
import puppeteer from 'puppeteer-extra';
|
||||||
@ -11,7 +12,7 @@ import puppeteer from 'puppeteer-extra';
|
|||||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
||||||
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||||
import { ServiceCrashedError } from '../shared/lib/errors';
|
import { ServiceCrashedError } from '../shared/lib/errors';
|
||||||
|
import { Readability } from '@mozilla/readability';
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
|
|
||||||
@ -52,6 +53,7 @@ export interface ScrappingOptions {
|
|||||||
proxyUrl?: string;
|
proxyUrl?: string;
|
||||||
cookies?: CookieParam[];
|
cookies?: CookieParam[];
|
||||||
favorScreenshot?: boolean;
|
favorScreenshot?: boolean;
|
||||||
|
waitForSelector?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -142,7 +144,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
this.browser.once('disconnected', () => {
|
this.browser.once('disconnected', () => {
|
||||||
this.logger.warn(`Browser disconnected`);
|
this.logger.warn(`Browser disconnected`);
|
||||||
this.emit('crippled');
|
this.emit('crippled');
|
||||||
process.nextTick(()=> this.serviceReady());
|
process.nextTick(() => this.serviceReady());
|
||||||
});
|
});
|
||||||
this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
|
this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
|
||||||
|
|
||||||
@ -344,6 +346,18 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
{ ...options, url: parsedUrl }
|
{ ...options, url: parsedUrl }
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
if (options?.waitForSelector) {
|
||||||
|
page.waitForSelector(options.waitForSelector)
|
||||||
|
.then(async () => {
|
||||||
|
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
||||||
|
screenshot = await page.screenshot();
|
||||||
|
finalized = true;
|
||||||
|
nextSnapshotDeferred.resolve(snapshot);
|
||||||
|
})
|
||||||
|
.catch((err) => {
|
||||||
|
this.logger.warn(`Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
let lastHTML = snapshot?.html;
|
let lastHTML = snapshot?.html;
|
||||||
@ -394,6 +408,49 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Narrows a page snapshot to the first element matching a CSS selector.
 *
 * The snapshot HTML is re-parsed with jsdom, the matching element is located,
 * and a copy of the snapshot is produced whose `parsed`, `html`, `text` and
 * `imgs` fields describe only that element. This is a best-effort operation:
 * whenever narrowing is not possible the original snapshot is returned as-is.
 *
 * @param snapshot - Snapshot to narrow. Returned unchanged when it is
 *   `undefined` or carries no `html`.
 * @param targetSelect - CSS selector of the element to keep. Returned
 *   snapshot is unchanged when this is omitted or empty.
 * @returns The narrowed snapshot, or the original snapshot when the selector
 *   is missing, syntactically invalid, or matches nothing.
 */
narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined {
    if (!targetSelect || !snapshot?.html) {
        return snapshot;
    }

    const jsdom = new JSDOM(snapshot.html, { url: snapshot.href });

    let elem: Element | null;
    try {
        elem = jsdom.window.document.querySelector(targetSelect);
    } catch (err: any) {
        // querySelector throws a SyntaxError on a malformed selector. The
        // selector is caller-supplied, so degrade to the full snapshot
        // instead of letting the exception escape.
        this.logger.warn(`Invalid target selector ${targetSelect}`, { err: marshalErrorLike(err) });

        return snapshot;
    }

    if (!elem) {
        return snapshot;
    }

    const selectedHtml = elem.outerHTML;

    // Readability works on a whole document, so the selected element is
    // loaded into a document of its own.
    const selectedJsDom = new JSDOM(selectedHtml, { url: snapshot.href });
    let parsed;
    try {
        parsed = new Readability(selectedJsDom.window.document).parse();
    } catch (err: any) {
        this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
    }

    // No innerText in jsdom
    // https://github.com/jsdom/jsdom/issues/1245
    const cleanedText = elem.textContent
        ?.split('\n')
        .map((line) => line.trimEnd())
        .join('\n')
        .replace(/\n{3,}/g, '\n\n');

    // Collect every image reference found inside the selected element, both
    // verbatim and resolved against the page URL.
    // NOTE(review): snapshot.imgs[].src presumably holds browser-resolved
    // absolute URLs, which a relative `src` attribute would never equal
    // verbatim - confirm against the code that builds the snapshot.
    const imageSet = new Set<string>();
    for (const img of Array.from(elem.querySelectorAll('img[src],img[data-src]'))) {
        for (const raw of [img.getAttribute('src'), img.getAttribute('data-src')]) {
            if (!raw) {
                continue;
            }
            imageSet.add(raw);
            try {
                imageSet.add(new URL(raw, snapshot.href).toString());
            } catch {
                // Reference cannot be resolved; only the raw value can match.
            }
        }
    }

    const r = {
        ...snapshot,
        parsed,
        html: selectedHtml,
        text: cleanedText,
        imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
    } as PageSnapshot;

    return r;
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const puppeteerControl = container.resolve(PuppeteerControl);
|
const puppeteerControl = container.resolve(PuppeteerControl);
|
||||||
|
7
backend/functions/src/types.d.ts
vendored
7
backend/functions/src/types.d.ts
vendored
@ -7,3 +7,10 @@ declare module 'langdetect' {
|
|||||||
export function detect(text: string): DetectionResult[];
|
export function detect(text: string): DetectionResult[];
|
||||||
export function detectOne(text: string): string | null;
|
export function detectOne(text: string): string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Minimal ambient typing for the `jsdom` package, covering only the surface
 * declared here: constructing a `JSDOM` from markup and reading its `window`.
 */
declare module 'jsdom' {
    export class JSDOM {
        /**
         * @param html - Markup to parse into a document.
         * @param options - Constructor options object (for example `{ url }`).
         *   Typed as `object` rather than `any` so that a primitive passed by
         *   mistake is rejected while any options object is still accepted.
         */
        constructor(html: string, options?: object);

        /** Window of the parsed document, typed like the global DOM `window`. */
        window: typeof window;
    }
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user