diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index aa7a891f..db976927 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,7 +352,9 @@ async function processJob(job: Job & { id: string }, token: string) { if (job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl); + const newURL = new URL(doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!); + const useNewURLAsBase = newURL.hostname.split(".").slice(-2).join(".") === new URL(sc.originUrl!).hostname.split(".").slice(-2).join("."); + const crawler = crawlToCrawler(job.data.crawl_id, sc, useNewURLAsBase ? newURL.href : undefined); const links = crawler.filterLinks( crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),