Merge pull request #941 from mendableai/nsc/crawl-n--1-fixes

Crawl fixes: fixed the n-1 bug
This commit is contained in:
Nicolas 2024-12-03 17:55:43 -03:00 committed by GitHub
commit 3d5704b73e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -20,6 +20,7 @@ import {
addCrawlJobDone,
crawlToCrawler,
finishCrawl,
generateURLPermutations,
getCrawl,
getCrawlJobs,
lockURL,
@ -422,7 +423,16 @@ async function processJob(job: Job & { id: string }, token: string) {
// Load the stored crawl for this job; it is passed to normalizeURL and lockURL below.
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
// Redirect handling: runs only when both doc.metadata.url and
// doc.metadata.sourceURL are defined and they differ after normalization.
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
logger.debug("Was redirected, locking new URL...");
// NOTE(review): the debug call above looks like the pre-change version of the
// one below (this text appears to be a diff rendered without +/- markers) --
// confirm that only one of the two is meant to remain.
logger.debug("Was redirected, removing old URL and locking new URL...");
// Remove the old URL from visited sets
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc));
// When deduplicateSimilarURLs is set, also remove every generated permutation
// of the source URL from the same "visited" set.
if (sc.crawlerOptions?.deduplicateSimilarURLs) {
// NOTE(review): permutations are built from the raw sourceURL, not its
// normalized form -- confirm this matches how the entries were added.
// NOTE(review): presumably generateURLPermutations never returns an empty
// list; srem would then be called with no members -- verify.
const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href);
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations);
}
// The normalized source URL is also removed from the separate "visited_unique" set.
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc));
// Lock the new URL
await lockURL(job.data.crawl_id, sc, doc.metadata.url);
}