mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-22 10:25:50 +08:00
Nick: fixed the crawl + n - not respecting limit
This commit is contained in:
parent
39ff49a8f3
commit
51a6b83f45
@ -423,19 +423,10 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
||||
|
||||
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
|
||||
// console.log("Original URL: ", doc.metadata.sourceURL);
|
||||
// console.log("New URL: ", doc.metadata.url);
|
||||
// console.log("Normalized original URL: ", normalizeURL(doc.metadata.sourceURL, sc));
|
||||
// console.log("Normalized new URL: ", normalizeURL(doc.metadata.url, sc));
|
||||
// logger.debug("Was redirected, removing old URL and locking new URL...");
|
||||
// // Remove the old URL from visited sets
|
||||
// await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc));
|
||||
// if (sc.crawlerOptions?.deduplicateSimilarURLs) {
|
||||
// const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href);
|
||||
// await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations);
|
||||
// }
|
||||
// await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc));
|
||||
|
||||
logger.debug("Was redirected, removing old URL and locking new URL...");
|
||||
// Remove the old URL from the :visited_unique set, because that set is used when checking the crawl limit
|
||||
// Do not remove the old URL from :visited; otherwise the crawler will keep crawling the original URL (sourceURL)
|
||||
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc));
|
||||
// Lock the new URL
|
||||
await lockURL(job.data.crawl_id, sc, doc.metadata.url);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user