mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 21:05:55 +08:00
Merge pull request #941 from mendableai/nsc/crawl-n--1-fixes
Crawl fixes: fixed the n-1 bug
This commit is contained in:
commit
3d5704b73e
@ -20,6 +20,7 @@ import {
|
|||||||
addCrawlJobDone,
|
addCrawlJobDone,
|
||||||
crawlToCrawler,
|
crawlToCrawler,
|
||||||
finishCrawl,
|
finishCrawl,
|
||||||
|
generateURLPermutations,
|
||||||
getCrawl,
|
getCrawl,
|
||||||
getCrawlJobs,
|
getCrawlJobs,
|
||||||
lockURL,
|
lockURL,
|
||||||
@ -422,7 +423,16 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
||||||
|
|
||||||
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
|
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
|
||||||
logger.debug("Was redirected, locking new URL...");
|
logger.debug("Was redirected, removing old URL and locking new URL...");
|
||||||
|
// Remove the old URL from visited sets
|
||||||
|
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc));
|
||||||
|
if (sc.crawlerOptions?.deduplicateSimilarURLs) {
|
||||||
|
const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href);
|
||||||
|
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations);
|
||||||
|
}
|
||||||
|
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc));
|
||||||
|
|
||||||
|
// Lock the new URL
|
||||||
await lockURL(job.data.crawl_id, sc, doc.metadata.url);
|
await lockURL(job.data.crawl_id, sc, doc.metadata.url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user