feat(crawler): handle cross-origin redirects differently than same-origin redirects (#1279)

This commit is contained in:
Gergő Móricz 2025-03-02 13:32:46 +01:00 committed by GitHub
parent fea249c568
commit e8c698d613
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 31 additions and 1 deletion

View File

@ -109,6 +109,9 @@ export class WebCrawler {
// Check if the link exceeds the maximum depth allowed // Check if the link exceeds the maximum depth allowed
if (depth > maxDepth) { if (depth > maxDepth) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} DEPTH FAIL`);
}
return false; return false;
} }
@ -119,6 +122,9 @@ export class WebCrawler {
new RegExp(excludePattern).test(path), new RegExp(excludePattern).test(path),
) )
) { ) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} EXCLUDE FAIL`);
}
return false; return false;
} }
} }
@ -130,6 +136,9 @@ export class WebCrawler {
new RegExp(includePattern).test(path), new RegExp(includePattern).test(path),
) )
) { ) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} INCLUDE FAIL`);
}
return false; return false;
} }
} }
@ -140,6 +149,9 @@ export class WebCrawler {
try { try {
normalizedLink = new URL(link); normalizedLink = new URL(link);
} catch (_) { } catch (_) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} URL PARSE FAIL`);
}
return false; return false;
} }
const initialHostname = normalizedInitialUrl.hostname.replace( const initialHostname = normalizedInitialUrl.hostname.replace(
@ -158,6 +170,9 @@ export class WebCrawler {
if ( if (
!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname) !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
) { ) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`);
}
return false; return false;
} }
} }
@ -171,13 +186,22 @@ export class WebCrawler {
method: "filterLinks", method: "filterLinks",
link, link,
}); });
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} ROBOTS FAIL`);
}
return false; return false;
} }
if (this.isFile(link)) { if (this.isFile(link)) {
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} FILE FAIL`);
}
return false; return false;
} }
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
this.logger.debug(`${link} OK`);
}
return true; return true;
}) })
.slice(0, limit); .slice(0, limit);

View File

@ -973,7 +973,13 @@ async function processJob(job: Job & { id: string }, token: string) {
); // TODO: make this its own error type that is ignored by error tracking ); // TODO: make this its own error type that is ignored by error tracking
} }
if (job.data.isCrawlSourceScrape) { // Only re-set originUrl if it's different from the current hostname
// This is only done under this condition to handle cross-domain redirects.
// If it were done not only for cross-domain redirects but also for
// same-domain ones, e.g. redirecting / -> /introduction (like our docs
// site does), it would break crawling the entire site without
// allowBackwardsCrawling - mogery
const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL);
if (job.data.isCrawlSourceScrape && isHostnameDifferent) {
// TODO: re-fetch sitemap for redirect target domain // TODO: re-fetch sitemap for redirect target domain
sc.originUrl = doc.metadata.url; sc.originUrl = doc.metadata.url;
await saveCrawl(job.data.crawl_id, sc); await saveCrawl(job.data.crawl_id, sc);