From ba6f29cddaf43515c5c25157235b865f53af10af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 20 Nov 2024 19:55:35 +0100 Subject: [PATCH] crawl fix, again --- apps/api/src/services/queue-worker.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index aa7a891f..db976927 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,7 +352,9 @@ async function processJob(job: Job & { id: string }, token: string) { if (job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl); + const newURL = new URL(doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!); + const useNewURLAsBase = newURL.hostname.split(".").slice(-2).join(".") === new URL(sc.originUrl!).hostname.split(".").slice(-2).join("."); + const crawler = crawlToCrawler(job.data.crawl_id, sc, useNewURLAsBase ? newURL.href : undefined); const links = crawler.filterLinks( crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),