fix(crawl): allow execution time longer than 24h

This commit is contained in:
Gergő Móricz 2025-03-17 18:04:05 +01:00
parent 7e7b7e10fe
commit d12feaea52
2 changed files with 11 additions and 13 deletions

View File

@@ -27,7 +27,7 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) {
plan: crawl.plan, plan: crawl.plan,
}); });
await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id, 24 * 60 * 60);
} }
export async function getCrawl(id: string): Promise<StoredCrawl | null> { export async function getCrawl(id: string): Promise<StoredCrawl | null> {
@@ -37,6 +37,7 @@ export async function getCrawl(id: string): Promise<StoredCrawl | null> {
return null; return null;
} }
await redisConnection.expire("crawl:" + id, 24 * 60 * 60);
return JSON.parse(x); return JSON.parse(x);
} }
@@ -56,7 +57,7 @@ export async function addCrawlJob(id: string, job_id: string) {
crawlId: id, crawlId: id,
}); });
await redisConnection.sadd("crawl:" + id + ":jobs", job_id); await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
} }
export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobs(id: string, job_ids: string[]) {
@@ -69,7 +70,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
crawlId: id, crawlId: id,
}); });
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids); await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
} }
export async function addCrawlJobDone( export async function addCrawlJobDone(
@@ -87,7 +88,6 @@ export async function addCrawlJobDone(
await redisConnection.expire( await redisConnection.expire(
"crawl:" + id + ":jobs_done", "crawl:" + id + ":jobs_done",
24 * 60 * 60, 24 * 60 * 60,
"NX",
); );
if (success) { if (success) {
@@ -104,11 +104,11 @@ export async function addCrawlJobDone(
await redisConnection.expire( await redisConnection.expire(
"crawl:" + id + ":jobs_done_ordered", "crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60, 24 * 60 * 60,
"NX",
); );
} }
export async function getDoneJobsOrderedLength(id: string): Promise<number> { export async function getDoneJobsOrderedLength(id: string): Promise<number> {
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
} }
@@ -117,6 +117,7 @@ export async function getDoneJobsOrdered(
start = 0, start = 0,
end = -1, end = -1,
): Promise<string[]> { ): Promise<string[]> {
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
return await redisConnection.lrange( return await redisConnection.lrange(
"crawl:" + id + ":jobs_done_ordered", "crawl:" + id + ":jobs_done_ordered",
start, start,
@@ -125,6 +126,7 @@ export async function getDoneJobsOrdered(
} }
export async function isCrawlFinished(id: string) { export async function isCrawlFinished(id: string) {
await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
return ( return (
(await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
(await redisConnection.scard("crawl:" + id + ":jobs")) && (await redisConnection.scard("crawl:" + id + ":jobs")) &&
@@ -133,6 +135,7 @@ export async function isCrawlFinished(id: string) {
} }
export async function isCrawlKickoffFinished(id: string) { export async function isCrawlKickoffFinished(id: string) {
await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
return ( return (
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
); );
@@ -159,9 +162,7 @@ export async function finishCrawl(id: string) {
crawlId: id, crawlId: id,
}); });
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
if (set === 1) { await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
}
return set === 1; return set === 1;
} else { } else {
_logger.debug("Crawl can not be finished yet, not marking as finished.", { _logger.debug("Crawl can not be finished yet, not marking as finished.", {
@@ -294,14 +295,13 @@ export async function lockURL(
res = x === permutations.length; res = x === permutations.length;
} }
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
if (res) { if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", url); await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire( await redisConnection.expire(
"crawl:" + id + ":visited_unique", "crawl:" + id + ":visited_unique",
24 * 60 * 60, 24 * 60 * 60,
"NX",
); );
} }
@@ -334,7 +334,6 @@ export async function lockURLs(
await redisConnection.expire( await redisConnection.expire(
"crawl:" + id + ":visited_unique", "crawl:" + id + ":visited_unique",
24 * 60 * 60, 24 * 60 * 60,
"NX",
); );
let res: boolean; let res: boolean;
@@ -353,7 +352,7 @@ export async function lockURLs(
res = x === allPermutations.length; res = x === allPermutations.length;
} }
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
logger.debug("lockURLs final result: " + res, { res }); logger.debug("lockURLs final result: " + res, { res });
return res; return res;

View File

@@ -384,7 +384,6 @@ export class WebCrawler {
await redisConnection.expire( await redisConnection.expire(
"crawl:" + this.jobId + ":robots_blocked", "crawl:" + this.jobId + ":robots_blocked",
24 * 60 * 60, 24 * 60 * 60,
"NX",
); );
})(); })();
} }