feat: x-remove-selector (remove elements matching the given CSS selectors from the page HTML)

This commit is contained in:
yanlong.wang 2024-06-18 18:07:38 +08:00
parent ee008ebe10
commit 6f37e5d3b4
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
5 changed files with 106 additions and 37 deletions

View File

@ -27,7 +27,8 @@ import { DomainBlockade } from '../db/domain-blockade';
const md5Hasher = new HashManager('md5', 'hex');
/**
 * Scrapping options extended with the selector fields used to narrow a page snapshot.
 *
 * NOTE(review): this text is a rendered diff, so `targetSelector` is listed twice —
 * the `string`-only line is the pre-change declaration and the `string | string[]`
 * line is its replacement. Only the latter exists after the commit.
 */
export interface ExtraScrappingOptions extends ScrappingOptions {
// Pre-change declaration: a single CSS selector.
targetSelector?: string;
// Post-change declaration: one or several CSS selectors; elements matching them are
// collected as the content to keep when the snapshot is narrowed.
targetSelector?: string | string[];
// One or several CSS selectors; elements matching them are removed from the parsed
// document when the snapshot is narrowed.
removeSelector?: string | string[];
}
export interface FormattedPage {
@ -131,12 +132,15 @@ export class CrawlerHost extends RPCHost {
return indexObject;
}
getTurndown(noRules?: boolean | string) {
getTurndown(options?: {
noRules?: boolean | string,
url?: string | URL;
}) {
const turnDownService = new TurndownService({
codeBlockStyle: 'fenced',
preformattedCode: true,
} as any);
if (!noRules) {
if (!options?.noRules) {
turnDownService.addRule('remove-irrelevant', {
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
replacement: () => ''
@ -177,7 +181,14 @@ export class CrawlerHost extends RPCHost {
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
const fixedContent = content.replace(/\s+/g, ' ').trim();
const fixedHref = href.replace(/\s+/g, '').trim();
let fixedHref = href.replace(/\s+/g, '').trim();
if (options?.url) {
try {
fixedHref = new URL(fixedHref, options.url).toString();
} catch (_err) {
void 0;
}
}
return `[${fixedContent}](${fixedHref}${title || ''})`;
}
@ -317,7 +328,7 @@ export class CrawlerHost extends RPCHost {
}
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href });
for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin);
}
@ -380,7 +391,7 @@ export class CrawlerHost extends RPCHost {
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown();
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
try {
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
} catch (err2) {
@ -397,7 +408,7 @@ export class CrawlerHost extends RPCHost {
contentText = turnDownService.turndown(snapshot.html);
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown();
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
try {
contentText = vanillaTurnDownService.turndown(snapshot.html);
} catch (err2) {
@ -806,15 +817,15 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
}
try {
if (crawlOpts?.targetSelector) {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector);
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
}
return;
@ -824,7 +835,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} catch (err: any) {
if (cache) {
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
}
throw err;
@ -853,7 +864,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
async * scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
@ -910,8 +921,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
proxyUrl: opts.proxyUrl,
cookies: opts.setCookies,
favorScreenshot: opts.respondWith === 'screenshot',
waitForSelector: opts.waitForSelector,
removeSelector: opts.removeSelector,
targetSelector: opts.targetSelector,
waitForSelector: opts.waitForSelector,
overrideUserAgent: opts.userAgent,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
};

View File

@ -8,11 +8,10 @@ import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { BraveSearchService } from '../services/brave-search';
import { CrawlerHost, FormattedPage } from './crawler';
import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
import { CookieParam } from 'puppeteer';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
@ -304,7 +303,7 @@ export class SearcherHost extends RPCHost {
async *fetchSearchResults(
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
searchResults?: WebSearchResult[],
options?: ScrappingOptions,
options?: ExtraScrappingOptions,
pageCacheTolerance?: number
) {
if (!searchResults) {

View File

@ -53,6 +53,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
in: 'header',
schema: { type: 'string' }
},
// Header parameter description for `X-Remove-Selector`.
// NOTE(review): the description speaks of "a CSS selector", while the option parser in
// this commit splits the `x-remove-selector` header value on ', ' into a list of
// selectors — confirm whether the description should mention multiple selectors.
'X-Remove-Selector': {
description: `Specifies a CSS selector to remove elements from the full html.\n\n` +
'Example `X-Remove-Selector: nav`'
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
`Supported protocols: \n` +
@ -130,11 +137,14 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
cacheTolerance?: number;
@Prop()
targetSelector?: string;
@Prop({ arrayOf: String })
targetSelector?: string | string[];
@Prop()
waitForSelector?: string;
@Prop({ arrayOf: String })
waitForSelector?: string | string[];
@Prop({ arrayOf: String })
removeSelector?: string | string[];
@Prop({
arrayOf: String,
@ -193,15 +203,17 @@ export class CrawlerOptions extends AutoCastable {
instance.timeout = timeoutSeconds;
}
const targetSelector = ctx?.req.get('x-target-selector');
const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
instance.removeSelector ??= removeSelector;
const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
instance.targetSelector ??= targetSelector;
const waitForSelector = ctx?.req.get('x-wait-for-selector');
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
const overrideUserAgent = ctx?.req.get('x-user-agent');
instance.userAgent ??= overrideUserAgent;
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
if (Array.isArray(setCookieHeaders)) {
for (const setCookie of setCookieHeaders) {
cookies.push({

View File

@ -63,7 +63,7 @@ export interface ScrappingOptions {
proxyUrl?: string;
cookies?: CookieParam[];
favorScreenshot?: boolean;
waitForSelector?: string;
waitForSelector?: string | string[];
minIntervalMs?: number;
overrideUserAgent?: string;
timeoutMs?: number;
@ -483,7 +483,8 @@ document.addEventListener('load', handlePageLoad);
);
});
if (options?.waitForSelector) {
page.waitForSelector(options.waitForSelector)
const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector);
waitPromise
.then(async () => {
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
@ -547,8 +548,11 @@ document.addEventListener('load', handlePageLoad);
return true;
}
narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined {
if (!targetSelect) {
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
targetSelector?: string | string[];
removeSelector?: string | string[];
}): PageSnapshot | undefined {
if (!options?.targetSelector && !options?.removeSelector) {
return snapshot;
}
if (!snapshot?.html) {
@ -556,26 +560,68 @@ document.addEventListener('load', handlePageLoad);
}
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const elem = jsdom.window.document.querySelector(targetSelect);
const allNodes: Node[] = [];
if (!elem) {
return snapshot;
if (Array.isArray(options.removeSelector)) {
for (const rl of options.removeSelector) {
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
}
} else if (options.removeSelector) {
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
}
if (Array.isArray(options.targetSelector)) {
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
x.forEach((el) => {
if (!allNodes.includes(el)) {
allNodes.push(el);
}
});
}
} else if (options.targetSelector) {
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
if (!allNodes.includes(el)) {
allNodes.push(el);
}
});
} else {
allNodes.push(jsdom.window.document);
}
if (!allNodes.length) {
return snapshot;
}
const textChunks: string[] = [];
let rootDoc: Document;
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
rootDoc = allNodes[0] as any;
if (rootDoc.body.textContent) {
textChunks.push(rootDoc.body.textContent);
}
} else {
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
for (const n of allNodes) {
rootDoc.body.appendChild(n);
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
if (n.textContent) {
textChunks.push(n.textContent);
}
}
}
const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole });
let parsed;
try {
parsed = new Readability(selectedJsDom.window.document).parse();
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
} catch (err: any) {
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
}
// No innerText in jsdom
// https://github.com/jsdom/jsdom/issues/1245
const textContent = elem.textContent;
const textContent = textChunks.join('\n\n');
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
const imageTags = Array.from(elem.querySelectorAll('img[src],img[data-src]'))
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
.flat()
.map((x) => {
@ -592,7 +638,7 @@ document.addEventListener('load', handlePageLoad);
const r = {
...snapshot,
parsed,
html: elem.outerHTML,
html: rootDoc.documentElement.outerHTML,
text: cleanedText,
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
} as PageSnapshot;

@ -1 +1 @@
Subproject commit 5939c7091985706bebe7d1d83591430426b292c8
Subproject commit b30155da82ea8e311faab58bb5a360e829547ea0