mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 00:12:02 +08:00
feat(crawler): handle cross-origin redirects differently than same-origin redirects (#1279)
This commit is contained in:
parent
fea249c568
commit
e8c698d613
@ -109,6 +109,9 @@ export class WebCrawler {
|
|||||||
|
|
||||||
// Check if the link exceeds the maximum depth allowed
|
// Check if the link exceeds the maximum depth allowed
|
||||||
if (depth > maxDepth) {
|
if (depth > maxDepth) {
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} DEPTH FAIL`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -119,6 +122,9 @@ export class WebCrawler {
|
|||||||
new RegExp(excludePattern).test(path),
|
new RegExp(excludePattern).test(path),
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} EXCLUDE FAIL`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,6 +136,9 @@ export class WebCrawler {
|
|||||||
new RegExp(includePattern).test(path),
|
new RegExp(includePattern).test(path),
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} INCLUDE FAIL`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -140,6 +149,9 @@ export class WebCrawler {
|
|||||||
try {
|
try {
|
||||||
normalizedLink = new URL(link);
|
normalizedLink = new URL(link);
|
||||||
} catch (_) {
|
} catch (_) {
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} URL PARSE FAIL`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const initialHostname = normalizedInitialUrl.hostname.replace(
|
const initialHostname = normalizedInitialUrl.hostname.replace(
|
||||||
@ -158,6 +170,9 @@ export class WebCrawler {
|
|||||||
if (
|
if (
|
||||||
!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
||||||
) {
|
) {
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -171,13 +186,22 @@ export class WebCrawler {
|
|||||||
method: "filterLinks",
|
method: "filterLinks",
|
||||||
link,
|
link,
|
||||||
});
|
});
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} ROBOTS FAIL`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.isFile(link)) {
|
if (this.isFile(link)) {
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} FILE FAIL`);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
|
||||||
|
this.logger.debug(`${link} OK`);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
})
|
})
|
||||||
.slice(0, limit);
|
.slice(0, limit);
|
||||||
|
@ -973,7 +973,13 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
); // TODO: make this its own error type that is ignored by error tracking
|
); // TODO: make this its own error type that is ignored by error tracking
|
||||||
}
|
}
|
||||||
|
|
||||||
if (job.data.isCrawlSourceScrape) {
|
// Only re-set originUrl if the scraped URL's hostname differs from the source URL's hostname
|
||||||
|
// This is only done on this condition to handle cross-domain redirects
|
||||||
|
// If this were also done for same-origin redirects, e.g.
|
||||||
|
// redirecting / -> /introduction (like our docs site does), it would
|
||||||
|
// break crawling the entire site without allowBackwardsCrawling - mogery
|
||||||
|
const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL);
|
||||||
|
if (job.data.isCrawlSourceScrape && isHostnameDifferent) {
|
||||||
// TODO: re-fetch sitemap for redirect target domain
|
// TODO: re-fetch sitemap for redirect target domain
|
||||||
sc.originUrl = doc.metadata.url;
|
sc.originUrl = doc.metadata.url;
|
||||||
await saveCrawl(job.data.crawl_id, sc);
|
await saveCrawl(job.data.crawl_id, sc);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user