fix: target selector

This commit is contained in:
Yanlong Wang 2024-09-17 17:47:01 +08:00
parent e27bcaca77
commit c36aa730b4
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 18 additions and 4 deletions

View File

@@ -276,7 +276,7 @@ export class CrawlerHost extends RPCHost {
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
@@ -287,12 +287,15 @@ export class CrawlerHost extends RPCHost {
return formatted;
}
if (chargeAmount && scrapped.pdfs?.length) {
if (chargeAmount && scrapped?.pdfs?.length) {
return formatted;
}
}
if (!lastScrapped) {
if (crawlOpts.targetSelector) {
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
}
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
@@ -304,7 +307,7 @@ export class CrawlerHost extends RPCHost {
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
@@ -330,6 +333,9 @@ export class CrawlerHost extends RPCHost {
}
if (!lastScrapped) {
if (crawlOpts.targetSelector) {
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
}
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}

View File

@@ -78,7 +78,9 @@ export class JSDomControl extends AsyncService {
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
}
let bewareTargetContentDoesNotExist = false;
if (Array.isArray(options?.targetSelector)) {
bewareTargetContentDoesNotExist = true;
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
x.forEach((el) => {
if (!allNodes.includes(el)) {
@@ -87,6 +89,7 @@ export class JSDomControl extends AsyncService {
});
}
} else if (options?.targetSelector) {
bewareTargetContentDoesNotExist = true;
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
if (!allNodes.includes(el)) {
allNodes.push(el);
@@ -97,6 +100,11 @@ export class JSDomControl extends AsyncService {
}
if (!allNodes.length) {
if (bewareTargetContentDoesNotExist) {
return undefined;
}
return snapshot;
}
const textChunks: string[] = [];

@@ -1 +1 @@
Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2
Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a