mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 11:55:52 +08:00
feat: bring your own html
This commit is contained in:
parent
78ea13b101
commit
1c4b64fe04
@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
rpcReflect.return(sseStream);
|
rpcReflect.return(sseStream);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
||||||
if (!scrapped) {
|
if (!scrapped) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
|
|
||||||
let lastScrapped;
|
let lastScrapped;
|
||||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
continue;
|
continue;
|
||||||
@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
continue;
|
continue;
|
||||||
@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
||||||
|
if (crawlerOpts?.html) {
|
||||||
|
const fakeSnapshot = {
|
||||||
|
href: urlToCrawl.toString(),
|
||||||
|
html: crawlerOpts.html,
|
||||||
|
title: '',
|
||||||
|
text: '',
|
||||||
|
} as PageSnapshot;
|
||||||
|
|
||||||
|
yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
let cache;
|
let cache;
|
||||||
|
|
||||||
|
const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
|
||||||
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
||||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||||
}
|
}
|
||||||
@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
|
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
||||||
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
|
||||||
|
|
||||||
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
||||||
|
|
||||||
|
@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
delete crawlerOptions.html;
|
||||||
|
|
||||||
const crawlOpts = this.crawler.configure(crawlerOptions);
|
const crawlOpts = this.crawler.configure(crawlerOptions);
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
||||||
@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
||||||
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs }
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost {
|
|||||||
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
||||||
searchResults?: WebSearchResult[],
|
searchResults?: WebSearchResult[],
|
||||||
options?: ExtraScrappingOptions,
|
options?: ExtraScrappingOptions,
|
||||||
pageCacheTolerance?: number
|
crawlerOptions?: CrawlerOptions,
|
||||||
) {
|
) {
|
||||||
if (!searchResults) {
|
if (!searchResults) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const urls = searchResults.map((x) => new URL(x.url));
|
const urls = searchResults.map((x) => new URL(x.url));
|
||||||
for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
|
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||||
const mapped = scrapped.map((x, i) => {
|
const mapped = scrapped.map((x, i) => {
|
||||||
const upstreamSearchResult = searchResults[i];
|
const upstreamSearchResult = searchResults[i];
|
||||||
if (!x || (!x.parsed && mode !== 'markdown')) {
|
if (!x || (!x.parsed && mode !== 'markdown')) {
|
||||||
|
@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
url?: string;
|
url?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
html?: string;
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'default',
|
default: 'default',
|
||||||
})
|
})
|
||||||
|
@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
targetSelector?: string | string[];
|
targetSelector?: string | string[];
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
}): PageSnapshot | undefined {
|
}): PageSnapshot | undefined {
|
||||||
if (!options?.targetSelector && !options?.removeSelector) {
|
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
if (!snapshot?.html) {
|
if (!snapshot?.html) {
|
||||||
@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||||
const allNodes: Node[] = [];
|
const allNodes: Node[] = [];
|
||||||
|
|
||||||
if (Array.isArray(options.removeSelector)) {
|
if (Array.isArray(options?.removeSelector)) {
|
||||||
for (const rl of options.removeSelector) {
|
for (const rl of options.removeSelector) {
|
||||||
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
||||||
}
|
}
|
||||||
} else if (options.removeSelector) {
|
} else if (options?.removeSelector) {
|
||||||
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Array.isArray(options.targetSelector)) {
|
if (Array.isArray(options?.targetSelector)) {
|
||||||
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
||||||
x.forEach((el) => {
|
x.forEach((el) => {
|
||||||
if (!allNodes.includes(el)) {
|
if (!allNodes.includes(el)) {
|
||||||
@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else if (options.targetSelector) {
|
} else if (options?.targetSelector) {
|
||||||
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
||||||
if (!allNodes.includes(el)) {
|
if (!allNodes.includes(el)) {
|
||||||
allNodes.push(el);
|
allNodes.push(el);
|
||||||
@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
|
|
||||||
const r = {
|
const r = {
|
||||||
...snapshot,
|
...snapshot,
|
||||||
|
title: snapshot.title || jsdom.window.document.title,
|
||||||
parsed,
|
parsed,
|
||||||
html: rootDoc.documentElement.outerHTML,
|
html: rootDoc.documentElement.outerHTML,
|
||||||
text: cleanedText,
|
text: cleanedText,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user