diff --git a/README.md b/README.md index c165026..1a0c01f 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,11 @@ As you have already seen above, one can control the behavior of the Reader API u - `x-respond-with: text` returns `document.body.innerText` - `x-respond-with: screenshot` returns the URL of the webpage's screenshot - You can specify a proxy server via the `x-proxy-url` header. -- You can bypass the cached page (lifetime 300s) via the `x-no-cache` header. +- You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds). +- You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent to `x-cache-tolerance: 0`). +- If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page. + - By setting the `x-target-selector` header to a CSS selector, the Reader API returns the content within the matched element, instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target. + - By setting the `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you already specified `x-target-selector`, this header can be omitted if you plan to wait for the same element. 
### Streaming mode diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index 83507ea..4f46e9a 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -23,6 +23,7 @@ "generic-pool": "^3.9.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", + "jsdom": "^24.0.0", "langdetect": "^0.2.1", "maxmind": "^4.3.18", "minio": "^7.1.3", @@ -4036,6 +4037,17 @@ "node": ">= 8" } }, + "node_modules/cssstyle": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz", + "integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==", + "dependencies": { + "rrweb-cssom": "^0.6.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/data-uri-to-buffer": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz", @@ -4044,6 +4056,41 @@ "node": ">= 14" } }, + "node_modules/data-urls": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", + "integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==", + "dependencies": { + "whatwg-mimetype": "^4.0.0", + "whatwg-url": "^14.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/data-urls/node_modules/tr46": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz", + "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/data-urls/node_modules/whatwg-url": { + "version": "14.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz", + "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==", + "dependencies": { + "tr46": "^5.0.0", + "webidl-conversions": 
"^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/data-view-buffer": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz", @@ -4116,6 +4163,11 @@ } } }, + "node_modules/decimal.js": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz", + "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA==" + }, "node_modules/decode-uri-component": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz", @@ -6119,6 +6171,17 @@ "node": ">= 0.4" } }, + "node_modules/html-encoding-sniffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", + "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", + "dependencies": { + "whatwg-encoding": "^3.1.1" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -6307,7 +6370,6 @@ "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", - "optional": true, "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, @@ -6705,6 +6767,11 @@ "node": ">=0.10.0" } }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==" + }, "node_modules/is-regex": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", @@ -7573,6 +7640,91 @@ 
"node": ">=0.1.90" } }, + "node_modules/jsdom": { + "version": "24.0.0", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz", + "integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==", + "dependencies": { + "cssstyle": "^4.0.1", + "data-urls": "^5.0.0", + "decimal.js": "^10.4.3", + "form-data": "^4.0.0", + "html-encoding-sniffer": "^4.0.0", + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.2", + "is-potential-custom-element-name": "^1.0.1", + "nwsapi": "^2.2.7", + "parse5": "^7.1.2", + "rrweb-cssom": "^0.6.0", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^4.1.3", + "w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^7.0.0", + "whatwg-encoding": "^3.1.1", + "whatwg-mimetype": "^4.0.0", + "whatwg-url": "^14.0.0", + "ws": "^8.16.0", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "canvas": "^2.11.2" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/agent-base": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz", + "integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==", + "dependencies": { + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/jsdom/node_modules/https-proxy-agent": { + "version": "7.0.4", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz", + "integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==", + "dependencies": { + "agent-base": "^7.0.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/jsdom/node_modules/tr46": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz", + "integrity": 
"sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/jsdom/node_modules/whatwg-url": { + "version": "14.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz", + "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==", + "dependencies": { + "tr46": "^5.0.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/jsesc": { "version": "2.5.2", "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz", @@ -8671,6 +8823,11 @@ "set-blocking": "^2.0.0" } }, + "node_modules/nwsapi": { + "version": "2.2.10", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz", + "integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ==" + }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -8985,6 +9142,17 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse5": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz", + "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==", + "dependencies": { + "entities": "^4.4.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -10185,6 +10353,11 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/rrweb-cssom": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz", + "integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==" + }, 
"node_modules/run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -10281,6 +10454,17 @@ "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz", "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==" }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, "node_modules/semver": { "version": "7.6.0", "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz", @@ -10941,6 +11125,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==" + }, "node_modules/tar": { "version": "6.2.1", "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz", @@ -11589,6 +11778,17 @@ "node": ">= 0.8" } }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/walker": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz", @@ -11622,7 +11822,6 @@ "version": "7.0.0", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", - "peer": true, "engines": 
{ "node": ">=12" } @@ -11648,6 +11847,25 @@ "node": ">=0.8.0" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "engines": { + "node": ">=18" + } + }, "node_modules/whatwg-url": { "version": "11.0.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz", @@ -11778,6 +11996,14 @@ "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz", "integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw==" }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "engines": { + "node": ">=18" + } + }, "node_modules/xml2js": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz", @@ -11798,6 +12024,11 @@ "node": ">=4.0" } }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/backend/functions/package.json b/backend/functions/package.json index fb7418c..800e061 100644 --- 
a/backend/functions/package.json +++ b/backend/functions/package.json @@ -43,6 +43,7 @@ "generic-pool": "^3.9.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", + "jsdom": "^24.0.0", "langdetect": "^0.2.1", "maxmind": "^4.3.18", "minio": "^7.1.3", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 19bdda8..760f4f3 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -24,6 +24,10 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai'; const md5Hasher = new HashManager('md5', 'hex'); +export interface ExtraScrappingOptions extends ScrappingOptions { + targetSelector?: string; +} + @singleton() export class CrawlerHost extends RPCHost { logger = this.globalLogger.child({ service: this.constructor.name }); @@ -31,7 +35,7 @@ export class CrawlerHost extends RPCHost { turnDownPlugins = [require('turndown-plugin-gfm').tables]; cacheRetentionMs = 1000 * 3600 * 24 * 7; - cacheValidMs = 1000 * 300; + cacheValidMs = 1000 * 3600; urlValidMs = 1000 * 3600 * 4; indexText = `[Usage1] https://r.jina.ai/YOUR_URL @@ -299,8 +303,13 @@ ${this.content} in: 'header', schema: { type: 'string' } }, + 'X-Cache-Tolerance': { + description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`, + in: 'header', + schema: { type: 'string' } + }, 'X-No-Cache': { - description: `Ignores internal cache if this header is specified with a value.`, + description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`, in: 'header', schema: { type: 'string' } }, @@ -315,6 +324,20 @@ ${this.content} in: 'header', schema: { type: 'string' } }, + 'X-Wait-For-Selector': { + description: `Specifies a CSS selector to wait for the appearance of such an element before returning. 
\n\n` + + 'Example: `X-Wait-For-Selector: .content-block`\n' + , + in: 'header', + schema: { type: 'string' } + }, + 'X-Target-Selector': { + description: `Specifies a CSS selector for return target instead of the full html. \n\n` + + 'Implies `X-Wait-For-Selector: (same selector)`' + , + in: 'header', + schema: { type: 'string' } + }, 'X-Proxy-Url': { description: `Specifies your custom proxy if you prefer to use one. \n\n` + `Supported protocols:\n` + @@ -426,7 +449,15 @@ ${this.content} const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default'; const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt')); const noCache = Boolean(ctx.req.get('x-no-cache')); - const cacheTolerance = noCache ? 0 : this.cacheValidMs; + let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000; + if (isNaN(cacheTolerance)) { + cacheTolerance = this.cacheValidMs; + if (noCache) { + cacheTolerance = 0; + } + } + const targetSelector = ctx.req.get('x-target-selector') || undefined; + const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector; const cookies: CookieParam[] = []; const setCookieHeaders = ctx.req.headers['x-set-cookie']; if (Array.isArray(setCookieHeaders)) { @@ -444,10 +475,12 @@ ${this.content} } this.threadLocal.set('withGeneratedAlt', withGeneratedAlt); - const crawlOpts: ScrappingOptions = { + const crawlOpts: ExtraScrappingOptions = { proxyUrl: ctx.req.get('x-proxy-url'), cookies, - favorScreenshot: customMode === 'screenshot' + favorScreenshot: customMode === 'screenshot', + waitForSelector, + targetSelector, }; if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { @@ -484,7 +517,7 @@ ${this.content} if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) { lastScrapped = scrapped; - if (!scrapped?.parsed?.content || 
!(scrapped.title?.trim())) { + if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) { continue; } @@ -506,7 +539,7 @@ ${this.content} for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) { lastScrapped = scrapped; - if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { + if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) { continue; } @@ -642,24 +675,32 @@ ${this.content} return r; } - async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) { + async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) { let cache; if (cacheTolerance && !crawlOpts?.cookies?.length) { cache = await this.queryCache(urlToCrawl, cacheTolerance); } if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) { - yield cache.snapshot; + yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector); return; } try { + if (crawlOpts?.targetSelector) { + for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { + yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector); + } + + return; + } + yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts); } catch (err: any) { if (cache) { this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. 
Falling back to cache`, { err: marshalErrorLike(err) }); - yield cache.snapshot; + yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector); return; } throw err; diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index 14c6f79..08b558f 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -181,7 +181,13 @@ export class SearcherHost extends RPCHost { const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default'; const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt')); const noCache = Boolean(ctx.req.get('x-no-cache')); - const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs; + let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000; + if (isNaN(pageCacheTolerance)) { + pageCacheTolerance = this.pageCacheToleranceMs; + if (noCache) { + pageCacheTolerance = 0; + } + } const cookies: CookieParam[] = []; const setCookieHeaders = ctx.req.headers['x-set-cookie']; if (Array.isArray(setCookieHeaders)) { diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index e70cabe..e2f945e 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -4,6 +4,7 @@ import { container, singleton } from 'tsyringe'; import genericPool from 'generic-pool'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; import { Logger } from '../shared/services/logger'; +import { JSDOM } from 'jsdom'; import type { Browser, CookieParam, Page } from 'puppeteer'; import puppeteer from 'puppeteer-extra'; @@ -11,7 +12,7 @@ import puppeteer from 'puppeteer-extra'; import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy'; 
import { ServiceCrashedError } from '../shared/lib/errors'; - +import { Readability } from '@mozilla/readability'; const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); @@ -52,6 +53,7 @@ export interface ScrappingOptions { proxyUrl?: string; cookies?: CookieParam[]; favorScreenshot?: boolean; + waitForSelector?: string; } @@ -142,7 +144,7 @@ export class PuppeteerControl extends AsyncService { this.browser.once('disconnected', () => { this.logger.warn(`Browser disconnected`); this.emit('crippled'); - process.nextTick(()=> this.serviceReady()); + process.nextTick(() => this.serviceReady()); }); this.logger.info(`Browser launched: ${this.browser.process()?.pid}`); @@ -344,6 +346,18 @@ document.addEventListener('load', handlePageLoad); { ...options, url: parsedUrl } ); }); + if (options?.waitForSelector) { + page.waitForSelector(options.waitForSelector) + .then(async () => { + snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; + screenshot = await page.screenshot(); + finalized = true; + nextSnapshotDeferred.resolve(snapshot); + }) + .catch((err) => { + this.logger.warn(`Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) }); + }); + } try { let lastHTML = snapshot?.html; @@ -394,6 +408,49 @@ document.addEventListener('load', handlePageLoad); return true; } + + narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined { + if (!targetSelect) { + return snapshot; + } + if (!snapshot?.html) { + return snapshot; + } + + const jsdom = new JSDOM(snapshot.html, { url: snapshot.href }); + const elem = jsdom.window.document.querySelector(targetSelect); + + if (!elem) { + return snapshot; + } + + const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href }); + let parsed; + try { + parsed = new Readability(selectedJsDom.window.document).parse(); + } catch (err: any) { + this.logger.warn(`Failed to parse selected element`, { err: 
marshalErrorLike(err) }); + } + + // No innerText in jsdom + // https://github.com/jsdom/jsdom/issues/1245 + const textContent = elem.textContent; + const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n'); + + const imageTags = Array.from(elem.querySelectorAll('img[src],img[data-src]')).map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')]).flat().filter(Boolean); + + const imageSet = new Set(imageTags); + + const r = { + ...snapshot, + parsed, + html: elem.outerHTML, + text: cleanedText, + imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], + } as PageSnapshot; + + return r; + } } const puppeteerControl = container.resolve(PuppeteerControl); diff --git a/backend/functions/src/types.d.ts b/backend/functions/src/types.d.ts index e31c720..796e64d 100644 --- a/backend/functions/src/types.d.ts +++ b/backend/functions/src/types.d.ts @@ -7,3 +7,10 @@ declare module 'langdetect' { export function detect(text: string): DetectionResult[]; export function detectOne(text: string): string | null; } + +declare module 'jsdom' { + export class JSDOM { + constructor(html: string, options?: any); + window: typeof window; + } +}