feat: bring your own html

This commit is contained in:
Yanlong Wang 2024-07-25 16:54:28 +08:00
parent 78ea13b101
commit 1c4b64fe04
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 34 additions and 14 deletions

View File

@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
rpcReflect.return(sseStream); rpcReflect.return(sseStream);
try { try {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
if (!scrapped) { if (!scrapped) {
continue; continue;
} }
@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let lastScrapped; let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue; continue;
@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return formatted; return formatted;
} }
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue; continue;
@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return r; return r;
} }
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) { async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
if (crawlerOpts?.html) {
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: crawlerOpts.html,
title: '',
text: '',
} as PageSnapshot;
yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
}
let cache; let cache;
const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
if (cacheTolerance && !crawlOpts?.cookies?.length) { if (cacheTolerance && !crawlOpts?.cookies?.length) {
cache = await this.queryCache(urlToCrawl, cacheTolerance); cache = await this.queryCache(urlToCrawl, cacheTolerance);
} }
@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} }
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) { async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance)); const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined); const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

View File

@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost {
}); });
} }
delete crawlerOptions.html;
const crawlOpts = this.crawler.configure(crawlerOptions); const crawlOpts = this.crawler.configure(crawlerOptions);
const cookies: CookieParam[] = []; const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie']; const setCookieHeaders = ctx.req.headers['x-set-cookie'];
@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost {
} }
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts, const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs { ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs }
); );
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost {
mode: string | 'markdown' | 'html' | 'text' | 'screenshot', mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
searchResults?: WebSearchResult[], searchResults?: WebSearchResult[],
options?: ExtraScrappingOptions, options?: ExtraScrappingOptions,
pageCacheTolerance?: number crawlerOptions?: CrawlerOptions,
) { ) {
if (!searchResults) { if (!searchResults) {
return; return;
} }
const urls = searchResults.map((x) => new URL(x.url)); const urls = searchResults.map((x) => new URL(x.url));
for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) { for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
const mapped = scrapped.map((x, i) => { const mapped = scrapped.map((x, i) => {
const upstreamSearchResult = searchResults[i]; const upstreamSearchResult = searchResults[i];
if (!x || (!x.parsed && mode !== 'markdown')) { if (!x || (!x.parsed && mode !== 'markdown')) {

View File

@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop() @Prop()
url?: string; url?: string;
@Prop()
html?: string;
@Prop({ @Prop({
default: 'default', default: 'default',
}) })

View File

@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad);
targetSelector?: string | string[]; targetSelector?: string | string[];
removeSelector?: string | string[]; removeSelector?: string | string[];
}): PageSnapshot | undefined { }): PageSnapshot | undefined {
if (!options?.targetSelector && !options?.removeSelector) { if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
return snapshot; return snapshot;
} }
if (!snapshot?.html) { if (!snapshot?.html) {
@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad);
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const allNodes: Node[] = []; const allNodes: Node[] = [];
if (Array.isArray(options.removeSelector)) { if (Array.isArray(options?.removeSelector)) {
for (const rl of options.removeSelector) { for (const rl of options.removeSelector) {
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
} }
} else if (options.removeSelector) { } else if (options?.removeSelector) {
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove()); jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
} }
if (Array.isArray(options.targetSelector)) { if (Array.isArray(options?.targetSelector)) {
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
x.forEach((el) => { x.forEach((el) => {
if (!allNodes.includes(el)) { if (!allNodes.includes(el)) {
@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad);
} }
}); });
} }
} else if (options.targetSelector) { } else if (options?.targetSelector) {
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => { jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
if (!allNodes.includes(el)) { if (!allNodes.includes(el)) {
allNodes.push(el); allNodes.push(el);
@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad);
const r = { const r = {
...snapshot, ...snapshot,
title: snapshot.title || jsdom.window.document.title,
parsed, parsed,
html: rootDoc.documentElement.outerHTML, html: rootDoc.documentElement.outerHTML,
text: cleanedText, text: cleanedText,