diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index dc18bbb..0173f62 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -27,7 +27,8 @@ import { DomainBlockade } from '../db/domain-blockade'; const md5Hasher = new HashManager('md5', 'hex'); export interface ExtraScrappingOptions extends ScrappingOptions { - targetSelector?: string; + targetSelector?: string | string[]; + removeSelector?: string | string[]; } export interface FormattedPage { @@ -131,12 +132,15 @@ export class CrawlerHost extends RPCHost { return indexObject; } - getTurndown(noRules?: boolean | string) { + getTurndown(options?: { + noRules?: boolean | string, + url?: string | URL; + }) { const turnDownService = new TurndownService({ codeBlockStyle: 'fenced', preformattedCode: true, } as any); - if (!noRules) { + if (!options?.noRules) { turnDownService.addRule('remove-irrelevant', { filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'], replacement: () => '' @@ -177,7 +181,14 @@ export class CrawlerHost extends RPCHost { if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; const fixedContent = content.replace(/\s+/g, ' ').trim(); - const fixedHref = href.replace(/\s+/g, '').trim(); + let fixedHref = href.replace(/\s+/g, '').trim(); + if (options?.url) { + try { + fixedHref = new URL(fixedHref, options.url).toString(); + } catch (_err) { + void 0; + } + } return `[${fixedContent}](${fixedHref}${title || ''})`; } @@ -317,7 +328,7 @@ export class CrawlerHost extends RPCHost { } const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content; - let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule'); + let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href }); for (const plugin of this.turnDownPlugins) { turnDownService = turnDownService.use(plugin); } @@ -380,7 +391,7 @@ export class CrawlerHost extends RPCHost { contentText = turnDownService.turndown(toBeTurnedToMd).trim(); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); - const vanillaTurnDownService = this.getTurndown(); + const vanillaTurnDownService = this.getTurndown({ url: snapshot.href }); try { contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim(); } catch (err2) { @@ -397,7 +408,7 @@ export class CrawlerHost extends RPCHost { contentText = turnDownService.turndown(snapshot.html); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); - const vanillaTurnDownService = this.getTurndown(); + const vanillaTurnDownService = this.getTurndown({ url: snapshot.href }); try { contentText = vanillaTurnDownService.turndown(snapshot.html); } catch (err2) { @@ -799,22 +810,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return r; } - async * cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) { + async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) { let cache; if (cacheTolerance && !crawlOpts?.cookies?.length) { cache = await this.queryCache(urlToCrawl, cacheTolerance); } if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) { - yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector); + yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts); return; } try { - if (crawlOpts?.targetSelector) { + if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { - yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector); + yield this.puppeteerControl.narrowSnapshot(x, crawlOpts); } return; @@ -824,7 +835,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } catch (err: any) { if (cache) { this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) }); - yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector); + yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts); return; } throw err; @@ -853,7 +864,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } - async * scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) { + async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) { const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance)); const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined); @@ -910,8 +921,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; proxyUrl: opts.proxyUrl, cookies: opts.setCookies, favorScreenshot: opts.respondWith === 'screenshot', - waitForSelector: opts.waitForSelector, + removeSelector: opts.removeSelector, targetSelector: opts.targetSelector, + waitForSelector: opts.waitForSelector, overrideUserAgent: opts.userAgent, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, }; diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index 4f59340..d8456aa 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -8,11 +8,10 @@ import { singleton } from 'tsyringe'; import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; import _ from 'lodash'; -import { ScrappingOptions } from '../services/puppeteer'; import { Request, Response } from 'express'; import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; import { BraveSearchService } from '../services/brave-search'; -import { CrawlerHost, FormattedPage } from './crawler'; +import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler'; import { CookieParam } from 'puppeteer'; import { parseString as parseSetCookieString } from 'set-cookie-parser'; @@ -304,7 +303,7 @@ export class SearcherHost extends RPCHost { async *fetchSearchResults( mode: string | 'markdown' | 'html' | 'text' | 'screenshot', searchResults?: WebSearchResult[], - options?: ScrappingOptions, + options?: ExtraScrappingOptions, pageCacheTolerance?: number ) { if (!searchResults) { diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 4618d02..26c1c9c 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -53,6 +53,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser'; in: 'header', schema: { type: 'string' } }, + 'X-Remove-Selector': { + description: `Specifies a CSS selector to remove elements from the full html.\n\n` + + 'Example `X-Remove-Selector: nav`' + , + in: 'header', + schema: { type: 'string' } + }, 'X-Proxy-Url': { description: `Specifies your custom proxy if you prefer to use one.\n\n` + `Supported protocols: \n` + @@ -130,11 +137,14 @@ export class CrawlerOptions extends AutoCastable { @Prop() cacheTolerance?: number; - @Prop() - targetSelector?: string; + @Prop({ arrayOf: String }) + targetSelector?: string | string[]; - @Prop() - waitForSelector?: string; + @Prop({ arrayOf: String }) + waitForSelector?: string | string[]; + + @Prop({ arrayOf: String }) + removeSelector?: string | string[]; @Prop({ arrayOf: String, @@ -193,15 +203,17 @@ export class CrawlerOptions extends AutoCastable { instance.timeout = timeoutSeconds; } - const targetSelector = ctx?.req.get('x-target-selector'); + const removeSelector = ctx?.req.get('x-remove-selector')?.split(', '); + instance.removeSelector ??= removeSelector; + const targetSelector = ctx?.req.get('x-target-selector')?.split(', '); instance.targetSelector ??= targetSelector; - const waitForSelector = ctx?.req.get('x-wait-for-selector'); + const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', '); instance.waitForSelector ??= waitForSelector || instance.targetSelector; const overrideUserAgent = ctx?.req.get('x-user-agent'); instance.userAgent ??= overrideUserAgent; const cookies: CookieParam[] = []; - const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]); + const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]); if (Array.isArray(setCookieHeaders)) { for (const setCookie of setCookieHeaders) { cookies.push({ diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index dd29235..c4cf780 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -63,7 +63,7 @@ export interface ScrappingOptions { proxyUrl?: string; cookies?: CookieParam[]; favorScreenshot?: boolean; - waitForSelector?: string; + waitForSelector?: string | string[]; minIntervalMs?: number; overrideUserAgent?: string; timeoutMs?: number; @@ -483,7 +483,8 @@ document.addEventListener('load', handlePageLoad); ); }); if (options?.waitForSelector) { - page.waitForSelector(options.waitForSelector) + const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector); + waitPromise .then(async () => { snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; screenshot = await page.screenshot(); @@ -547,8 +548,11 @@ document.addEventListener('load', handlePageLoad); return true; } - narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined { - if (!targetSelect) { + narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { + targetSelector?: string | string[]; + removeSelector?: string | string[]; + }): PageSnapshot | undefined { + if (!options?.targetSelector && !options?.removeSelector) { return snapshot; } if (!snapshot?.html) { @@ -556,26 +560,68 @@ document.addEventListener('load', handlePageLoad); } const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); - const elem = jsdom.window.document.querySelector(targetSelect); + const allNodes: Node[] = []; - if (!elem) { - return snapshot; + if (Array.isArray(options.removeSelector)) { + for (const rl of options.removeSelector) { + jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); + } + } else if (options.removeSelector) { + jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove()); + } + + if (Array.isArray(options.targetSelector)) { + for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { + x.forEach((el) => { + if (!allNodes.includes(el)) { + allNodes.push(el); + } + }); + } + } else if (options.targetSelector) { + jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => { + if (!allNodes.includes(el)) { + allNodes.push(el); + } + }); + } else { + allNodes.push(jsdom.window.document); + } + + if (!allNodes.length) { + return snapshot; + } + const textChunks: string[] = []; + let rootDoc: Document; + if (allNodes.length === 1 && allNodes[0].nodeName === '#document') { + rootDoc = allNodes[0] as any; + if (rootDoc.body.textContent) { + textChunks.push(rootDoc.body.textContent); + } + } else { + rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document; + for (const n of allNodes) { + rootDoc.body.appendChild(n); + rootDoc.body.appendChild(rootDoc.createTextNode('\n\n')); + if (n.textContent) { + textChunks.push(n.textContent); + } + } } - const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole }); let parsed; try { - parsed = new Readability(selectedJsDom.window.document).parse(); + parsed = new Readability(rootDoc.cloneNode(true) as any).parse(); } catch (err: any) { this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) }); } // No innerText in jsdom // https://github.com/jsdom/jsdom/issues/1245 - const textContent = elem.textContent; + const textContent = textChunks.join('\n\n'); const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n'); - const imageTags = Array.from(elem.querySelectorAll('img[src],img[data-src]')) + const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]')) .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')]) .flat() .map((x) => { @@ -592,7 +638,7 @@ document.addEventListener('load', handlePageLoad); const r = { ...snapshot, parsed, - html: elem.outerHTML, + html: rootDoc.documentElement.outerHTML, text: cleanedText, imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], } as PageSnapshot; diff --git a/thinapps-shared b/thinapps-shared index 5939c70..b30155d 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 5939c7091985706bebe7d1d83591430426b292c8 +Subproject commit b30155da82ea8e311faab58bb5a360e829547ea0