mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 22:35:56 +08:00
fix: target selector
This commit is contained in:
parent
e27bcaca77
commit
c36aa730b4
@@ -276,7 +276,7 @@ export class CrawlerHost extends RPCHost {
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -287,12 +287,15 @@ export class CrawlerHost extends RPCHost {
|
||||
return formatted;
|
||||
}
|
||||
|
||||
if (chargeAmount && scrapped.pdfs?.length) {
|
||||
if (chargeAmount && scrapped?.pdfs?.length) {
|
||||
return formatted;
|
||||
}
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
if (crawlOpts.targetSelector) {
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
|
||||
}
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
@@ -304,7 +307,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -330,6 +333,9 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
if (crawlOpts.targetSelector) {
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
|
||||
}
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
|
@@ -78,7 +78,9 @@ export class JSDomControl extends AsyncService {
|
||||
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
||||
}
|
||||
|
||||
let bewareTargetContentDoesNotExist = false;
|
||||
if (Array.isArray(options?.targetSelector)) {
|
||||
bewareTargetContentDoesNotExist = true;
|
||||
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
||||
x.forEach((el) => {
|
||||
if (!allNodes.includes(el)) {
|
||||
@@ -87,6 +89,7 @@ export class JSDomControl extends AsyncService {
|
||||
});
|
||||
}
|
||||
} else if (options?.targetSelector) {
|
||||
bewareTargetContentDoesNotExist = true;
|
||||
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
||||
if (!allNodes.includes(el)) {
|
||||
allNodes.push(el);
|
||||
@@ -97,6 +100,11 @@ export class JSDomControl extends AsyncService {
|
||||
}
|
||||
|
||||
if (!allNodes.length) {
|
||||
|
||||
if (bewareTargetContentDoesNotExist) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
const textChunks: string[] = [];
|
||||
|
@@ -1 +1 @@
|
||||
Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2
|
||||
Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a
|
Loading…
x
Reference in New Issue
Block a user