mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 06:45:53 +08:00
feat: x-remove-selector
This commit is contained in:
parent
ee008ebe10
commit
6f37e5d3b4
@ -27,7 +27,8 @@ import { DomainBlockade } from '../db/domain-blockade';
|
|||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
targetSelector?: string;
|
targetSelector?: string | string[];
|
||||||
|
removeSelector?: string | string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FormattedPage {
|
export interface FormattedPage {
|
||||||
@ -131,12 +132,15 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return indexObject;
|
return indexObject;
|
||||||
}
|
}
|
||||||
|
|
||||||
getTurndown(noRules?: boolean | string) {
|
getTurndown(options?: {
|
||||||
|
noRules?: boolean | string,
|
||||||
|
url?: string | URL;
|
||||||
|
}) {
|
||||||
const turnDownService = new TurndownService({
|
const turnDownService = new TurndownService({
|
||||||
codeBlockStyle: 'fenced',
|
codeBlockStyle: 'fenced',
|
||||||
preformattedCode: true,
|
preformattedCode: true,
|
||||||
} as any);
|
} as any);
|
||||||
if (!noRules) {
|
if (!options?.noRules) {
|
||||||
turnDownService.addRule('remove-irrelevant', {
|
turnDownService.addRule('remove-irrelevant', {
|
||||||
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
|
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
|
||||||
replacement: () => ''
|
replacement: () => ''
|
||||||
@ -177,7 +181,14 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
||||||
|
|
||||||
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
||||||
const fixedHref = href.replace(/\s+/g, '').trim();
|
let fixedHref = href.replace(/\s+/g, '').trim();
|
||||||
|
if (options?.url) {
|
||||||
|
try {
|
||||||
|
fixedHref = new URL(fixedHref, options.url).toString();
|
||||||
|
} catch (_err) {
|
||||||
|
void 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
||||||
}
|
}
|
||||||
@ -317,7 +328,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
||||||
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
|
let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href });
|
||||||
for (const plugin of this.turnDownPlugins) {
|
for (const plugin of this.turnDownPlugins) {
|
||||||
turnDownService = turnDownService.use(plugin);
|
turnDownService = turnDownService.use(plugin);
|
||||||
}
|
}
|
||||||
@ -380,7 +391,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown();
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -397,7 +408,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = turnDownService.turndown(snapshot.html);
|
contentText = turnDownService.turndown(snapshot.html);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown();
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -799,22 +810,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
async * cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
||||||
let cache;
|
let cache;
|
||||||
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
||||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
||||||
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
|
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (crawlOpts?.targetSelector) {
|
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
|
||||||
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
||||||
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector);
|
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
@ -824,7 +835,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
if (cache) {
|
if (cache) {
|
||||||
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
||||||
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
|
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
throw err;
|
throw err;
|
||||||
@ -853,7 +864,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async * scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
|
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
|
||||||
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
||||||
|
|
||||||
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
||||||
@ -910,8 +921,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
proxyUrl: opts.proxyUrl,
|
proxyUrl: opts.proxyUrl,
|
||||||
cookies: opts.setCookies,
|
cookies: opts.setCookies,
|
||||||
favorScreenshot: opts.respondWith === 'screenshot',
|
favorScreenshot: opts.respondWith === 'screenshot',
|
||||||
waitForSelector: opts.waitForSelector,
|
removeSelector: opts.removeSelector,
|
||||||
targetSelector: opts.targetSelector,
|
targetSelector: opts.targetSelector,
|
||||||
|
waitForSelector: opts.waitForSelector,
|
||||||
overrideUserAgent: opts.userAgent,
|
overrideUserAgent: opts.userAgent,
|
||||||
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
||||||
};
|
};
|
||||||
|
@ -8,11 +8,10 @@ import { singleton } from 'tsyringe';
|
|||||||
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { ScrappingOptions } from '../services/puppeteer';
|
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
import { BraveSearchService } from '../services/brave-search';
|
import { BraveSearchService } from '../services/brave-search';
|
||||||
import { CrawlerHost, FormattedPage } from './crawler';
|
import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
|
||||||
import { CookieParam } from 'puppeteer';
|
import { CookieParam } from 'puppeteer';
|
||||||
|
|
||||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
@ -304,7 +303,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
async *fetchSearchResults(
|
async *fetchSearchResults(
|
||||||
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
||||||
searchResults?: WebSearchResult[],
|
searchResults?: WebSearchResult[],
|
||||||
options?: ScrappingOptions,
|
options?: ExtraScrappingOptions,
|
||||||
pageCacheTolerance?: number
|
pageCacheTolerance?: number
|
||||||
) {
|
) {
|
||||||
if (!searchResults) {
|
if (!searchResults) {
|
||||||
|
@ -53,6 +53,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Remove-Selector': {
|
||||||
|
description: `Specifies a CSS selector to remove elements from the full html.\n\n` +
|
||||||
|
'Example `X-Remove-Selector: nav`'
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-Proxy-Url': {
|
'X-Proxy-Url': {
|
||||||
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
||||||
`Supported protocols: \n` +
|
`Supported protocols: \n` +
|
||||||
@ -130,11 +137,14 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
cacheTolerance?: number;
|
cacheTolerance?: number;
|
||||||
|
|
||||||
@Prop()
|
@Prop({ arrayOf: String })
|
||||||
targetSelector?: string;
|
targetSelector?: string | string[];
|
||||||
|
|
||||||
@Prop()
|
@Prop({ arrayOf: String })
|
||||||
waitForSelector?: string;
|
waitForSelector?: string | string[];
|
||||||
|
|
||||||
|
@Prop({ arrayOf: String })
|
||||||
|
removeSelector?: string | string[];
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
arrayOf: String,
|
arrayOf: String,
|
||||||
@ -193,15 +203,17 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.timeout = timeoutSeconds;
|
instance.timeout = timeoutSeconds;
|
||||||
}
|
}
|
||||||
|
|
||||||
const targetSelector = ctx?.req.get('x-target-selector');
|
const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
|
||||||
|
instance.removeSelector ??= removeSelector;
|
||||||
|
const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
|
||||||
instance.targetSelector ??= targetSelector;
|
instance.targetSelector ??= targetSelector;
|
||||||
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
|
||||||
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||||
instance.userAgent ??= overrideUserAgent;
|
instance.userAgent ??= overrideUserAgent;
|
||||||
|
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
|
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||||
if (Array.isArray(setCookieHeaders)) {
|
if (Array.isArray(setCookieHeaders)) {
|
||||||
for (const setCookie of setCookieHeaders) {
|
for (const setCookie of setCookieHeaders) {
|
||||||
cookies.push({
|
cookies.push({
|
||||||
|
@ -63,7 +63,7 @@ export interface ScrappingOptions {
|
|||||||
proxyUrl?: string;
|
proxyUrl?: string;
|
||||||
cookies?: CookieParam[];
|
cookies?: CookieParam[];
|
||||||
favorScreenshot?: boolean;
|
favorScreenshot?: boolean;
|
||||||
waitForSelector?: string;
|
waitForSelector?: string | string[];
|
||||||
minIntervalMs?: number;
|
minIntervalMs?: number;
|
||||||
overrideUserAgent?: string;
|
overrideUserAgent?: string;
|
||||||
timeoutMs?: number;
|
timeoutMs?: number;
|
||||||
@ -483,7 +483,8 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
if (options?.waitForSelector) {
|
if (options?.waitForSelector) {
|
||||||
page.waitForSelector(options.waitForSelector)
|
const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector);
|
||||||
|
waitPromise
|
||||||
.then(async () => {
|
.then(async () => {
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
@ -547,8 +548,11 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined {
|
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
||||||
if (!targetSelect) {
|
targetSelector?: string | string[];
|
||||||
|
removeSelector?: string | string[];
|
||||||
|
}): PageSnapshot | undefined {
|
||||||
|
if (!options?.targetSelector && !options?.removeSelector) {
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
if (!snapshot?.html) {
|
if (!snapshot?.html) {
|
||||||
@ -556,26 +560,68 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
|
|
||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||||
const elem = jsdom.window.document.querySelector(targetSelect);
|
const allNodes: Node[] = [];
|
||||||
|
|
||||||
if (!elem) {
|
if (Array.isArray(options.removeSelector)) {
|
||||||
return snapshot;
|
for (const rl of options.removeSelector) {
|
||||||
|
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
||||||
|
}
|
||||||
|
} else if (options.removeSelector) {
|
||||||
|
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Array.isArray(options.targetSelector)) {
|
||||||
|
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
||||||
|
x.forEach((el) => {
|
||||||
|
if (!allNodes.includes(el)) {
|
||||||
|
allNodes.push(el);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else if (options.targetSelector) {
|
||||||
|
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
||||||
|
if (!allNodes.includes(el)) {
|
||||||
|
allNodes.push(el);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
allNodes.push(jsdom.window.document);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!allNodes.length) {
|
||||||
|
return snapshot;
|
||||||
|
}
|
||||||
|
const textChunks: string[] = [];
|
||||||
|
let rootDoc: Document;
|
||||||
|
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
||||||
|
rootDoc = allNodes[0] as any;
|
||||||
|
if (rootDoc.body.textContent) {
|
||||||
|
textChunks.push(rootDoc.body.textContent);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
|
||||||
|
for (const n of allNodes) {
|
||||||
|
rootDoc.body.appendChild(n);
|
||||||
|
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
||||||
|
if (n.textContent) {
|
||||||
|
textChunks.push(n.textContent);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole });
|
|
||||||
let parsed;
|
let parsed;
|
||||||
try {
|
try {
|
||||||
parsed = new Readability(selectedJsDom.window.document).parse();
|
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
||||||
}
|
}
|
||||||
|
|
||||||
// No innerText in jsdom
|
// No innerText in jsdom
|
||||||
// https://github.com/jsdom/jsdom/issues/1245
|
// https://github.com/jsdom/jsdom/issues/1245
|
||||||
const textContent = elem.textContent;
|
const textContent = textChunks.join('\n\n');
|
||||||
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
||||||
|
|
||||||
const imageTags = Array.from(elem.querySelectorAll('img[src],img[data-src]'))
|
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
||||||
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
||||||
.flat()
|
.flat()
|
||||||
.map((x) => {
|
.map((x) => {
|
||||||
@ -592,7 +638,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
const r = {
|
const r = {
|
||||||
...snapshot,
|
...snapshot,
|
||||||
parsed,
|
parsed,
|
||||||
html: elem.outerHTML,
|
html: rootDoc.documentElement.outerHTML,
|
||||||
text: cleanedText,
|
text: cleanedText,
|
||||||
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 5939c7091985706bebe7d1d83591430426b292c8
|
Subproject commit b30155da82ea8e311faab58bb5a360e829547ea0
|
Loading…
x
Reference in New Issue
Block a user