feat(crawler): handle cross-origin redirects differently than same-origin redirects (#1279)

This commit is contained in:
Gergő Móricz 2025-03-02 13:32:46 +01:00 committed by GitHub
parent fea249c568
commit e8c698d613
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 31 additions and 1 deletions

View File

@ -109,6 +109,9 @@ export class WebCrawler {
// Check if the link exceeds the maximum depth allowed
if (depth > maxDepth) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} DEPTH FAIL`);
}
return false;
}
@ -119,6 +122,9 @@ export class WebCrawler {
new RegExp(excludePattern).test(path),
)
) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} EXCLUDE FAIL`);
}
return false;
}
}
@ -130,6 +136,9 @@ export class WebCrawler {
new RegExp(includePattern).test(path),
)
) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} INCLUDE FAIL`);
}
return false;
}
}
@ -140,6 +149,9 @@ export class WebCrawler {
try {
normalizedLink = new URL(link);
} catch (_) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} URL PARSE FAIL`);
}
return false;
}
const initialHostname = normalizedInitialUrl.hostname.replace(
@ -158,6 +170,9 @@ export class WebCrawler {
if (
!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`);
}
return false;
}
}
@ -171,13 +186,22 @@ export class WebCrawler {
method: "filterLinks",
link,
});
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} ROBOTS FAIL`);
}
return false;
}
if (this.isFile(link)) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} FILE FAIL`);
}
return false;
}
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} OK`);
}
return true;
})
.slice(0, limit);

View File

@ -973,7 +973,13 @@ async function processJob(job: Job & { id: string }, token: string) {
); // TODO: make this its own error type that is ignored by error tracking
}
if (job.data.isCrawlSourceScrape) {
// Only re-set originUrl when the final (post-redirect) URL's hostname differs
// from the source URL's hostname, i.e. only for cross-domain redirects.
// If we also did this for same-domain redirects, e.g. a redirect from
// / -> /introduction (like our docs site does), it would break crawling
// the entire site without allowBackwardsCrawling - mogery
const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL);
if (job.data.isCrawlSourceScrape && isHostnameDifferent) {
// TODO: re-fetch sitemap for redirect target domain
sc.originUrl = doc.metadata.url;
await saveCrawl(job.data.crawl_id, sc);