crawl fix, again

This commit is contained in:
Gergő Móricz 2024-11-20 19:55:35 +01:00
parent b468bb4014
commit ba6f29cdda

View File

@@ -352,7 +352,9 @@ async function processJob(job: Job & { id: string }, token: string) {
if (job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
-const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
+const newURL = new URL(doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
+const useNewURLAsBase = newURL.hostname.split(".").slice(-2).join(".") === new URL(sc.originUrl!).hostname.split(".").slice(-2).join(".");
+const crawler = crawlToCrawler(job.data.crawl_id, sc, useNewURLAsBase ? newURL.href : undefined);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),