mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-04-18 12:09:42 +08:00
feat(crawler): handle cross-origin redirects differently than same-origin redirects (#1279)
This commit is contained in:
parent
fea249c568
commit
e8c698d613
@ -109,6 +109,9 @@ export class WebCrawler {
|
||||
|
||||
// Check if the link exceeds the maximum depth allowed
|
||||
if (depth > maxDepth) {
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} DEPTH FAIL`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -119,6 +122,9 @@ export class WebCrawler {
|
||||
new RegExp(excludePattern).test(path),
|
||||
)
|
||||
) {
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} EXCLUDE FAIL`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -130,6 +136,9 @@ export class WebCrawler {
|
||||
new RegExp(includePattern).test(path),
|
||||
)
|
||||
) {
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} INCLUDE FAIL`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -140,6 +149,9 @@ export class WebCrawler {
|
||||
try {
|
||||
normalizedLink = new URL(link);
|
||||
} catch (_) {
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} URL PARSE FAIL`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const initialHostname = normalizedInitialUrl.hostname.replace(
|
||||
@ -158,6 +170,9 @@ export class WebCrawler {
|
||||
if (
|
||||
!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
||||
) {
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -171,13 +186,22 @@ export class WebCrawler {
|
||||
method: "filterLinks",
|
||||
link,
|
||||
});
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} ROBOTS FAIL`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (this.isFile(link)) {
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} FILE FAIL`);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||
this.logger.debug(`${link} OK`);
|
||||
}
|
||||
return true;
|
||||
})
|
||||
.slice(0, limit);
|
||||
|
@ -973,7 +973,13 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
); // TODO: make this its own error type that is ignored by error tracking
|
||||
}
|
||||
|
||||
if (job.data.isCrawlSourceScrape) {
|
||||
// Only re-set originUrl if the hostname of doc.metadata.url differs from that of doc.metadata.sourceURL
|
||||
// This is only done on this condition to handle cross-domain redirects
|
||||
// If this were also done for same-domain redirects, e.g.
|
||||
// redirecting / -> /introduction (like our docs site does), it would
|
||||
// break crawling the entire site without allowBackwardsCrawling - mogery
|
||||
const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL);
|
||||
if (job.data.isCrawlSourceScrape && isHostnameDifferent) {
|
||||
// TODO: re-fetch sitemap for redirect target domain
|
||||
sc.originUrl = doc.metadata.url;
|
||||
await saveCrawl(job.data.crawl_id, sc);
|
||||
|
Loading…
x
Reference in New Issue
Block a user