feat(queue-worker): fix redirect slipping

Gergő Móricz 2024-12-15 20:16:29 +01:00
parent 126b46ee2c
commit 30fa78cd9e
4 changed files with 76 additions and 21 deletions


@@ -481,33 +481,30 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.url, sc) !==
           normalizeURL(doc.metadata.sourceURL, sc)
       ) {
-        logger.debug(
-          "Was redirected, removing old URL and locking new URL...",
-          { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
-        );
-        // Remove the old URL from visited unique due to checking for limit
-        // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
-        await redisConnection.srem(
-          "crawl:" + job.data.crawl_id + ":visited_unique",
-          normalizeURL(doc.metadata.sourceURL, sc),
-        );
         const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
         const p2 = generateURLPermutations(
           normalizeURL(doc.metadata.sourceURL, sc),
         );
-        // In crawls, we should only crawl a redirected page once, no matter how many times it is redirected to, or if it's been discovered by the crawler before.
-        // This can prevent flakiness with race conditions.
-        // Lock the new URL
-        const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
-        if (
-          job.data.crawlerOptions !== null &&
-          !lockRes &&
-          JSON.stringify(p1) !== JSON.stringify(p2)
-        ) {
-          throw new RacedRedirectError();
-        }
+        if (JSON.stringify(p1) !== JSON.stringify(p2)) {
+          logger.debug(
+            "Was redirected, removing old URL and locking new URL...",
+            { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
+          );
+          // Prevent redirect target from being visited in the crawl again
+          // See lockURL
+          const x = await redisConnection.sadd(
+            "crawl:" + job.data.crawl_id + ":visited",
+            ...p1.map(x => x.href),
+          );
+          const lockRes = x === p1.length;
+          if (job.data.crawlerOptions !== null && !lockRes) {
+            throw new RacedRedirectError();
+          }
+        }
       }
       logger.debug("Logging job to DB...");
@@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       logger.debug("Declaring job as done...");
       await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
       logger.debug("Logging job to DB...");
       await logJob(
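
Read together, the two hunks above change how a redirect target is deduplicated. Previously the worker removed the source URL from the crawl's :visited_unique set right away and relied on lockURL plus a permutation comparison to catch races, which evidently still let redirected pages slip through and be crawled again. Now, when the permutation sets of the source and target differ, the worker SADDs every permutation of the target into the crawl's :visited set and only proceeds if every permutation was newly added; the :visited_unique cleanup instead happens once the job is declared done (second hunk). A minimal sketch of that SADD-as-lock trick, using an illustrative helper name that does not exist in the codebase:

// Sketch only: SADD returns how many members were newly inserted, so adding
// all permutations of the redirect target and comparing the count against the
// number of permutations is a single-command "have we seen this URL?" check.
async function tryLockPermutations(redisConnection, crawlId, permutations) {
  const added = await redisConnection.sadd(
    "crawl:" + crawlId + ":visited",
    ...permutations.map((u) => u.href),
  );
  // false if any permutation was already in the set (another worker got there first)
  return added === permutations.length;
}

If two workers race on the same redirect target, only the one whose SADD inserts all permutations continues; the other sees a lower count and, on crawls, throws RacedRedirectError.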


@@ -0,0 +1,14 @@
require("dotenv").config();
const Redis = require("ioredis");
const crawlId = process.argv[2];
const redisConnection = new Redis(process.env.REDIS_URL, {
maxRetriesPerRequest: null,
});
(async () => {
const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
process.exit(0);
})();
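
This first new helper (its path is not shown in this extract) dumps a crawl's :visited_unique set to <crawlId>-visited.txt, stripping the protocol and sorting the entries so the file can be compared against the API dump produced by urldump.js below. One caveat: SSCAN is cursor-based and COUNT is only a hint, so a single call may not return every member on a large crawl. A complete dump would loop the cursor, roughly like this (sketch only, not part of the commit):

async function scanAll(redisConnection, key) {
  // SSCAN returns [nextCursor, batch]; keep scanning until the cursor wraps back to "0".
  const members = [];
  let cursor = "0";
  do {
    const [next, batch] = await redisConnection.sscan(key, cursor, "COUNT", 1000);
    members.push(...batch);
    cursor = next;
  } while (cursor !== "0");
  return members;
}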

apps/api/utils/urldump.js

@@ -0,0 +1,43 @@
require("dotenv").config();
//const baseUrl = "https://api.firecrawl.dev";
const baseUrl = "http://localhost:3002";
const crawlId = process.argv[2];
(async () => {
let url = baseUrl + "/v1/crawl/" + crawlId;
let urls = [];
while (url) {
let res;
while (true) {
try {
res = (await (await fetch(url, {
headers: {
"Authorization": "Bearer " + process.env.TEST_API_KEY
}
})).json());
break;
} catch (e) {
console.error(e);
}
}
console.log(res.data.length);
if (res.data.length === 0) {
break;
}
urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
url = res.next;
if (url !== undefined) {
const o = new URL(url)
o.protocol = new URL(baseUrl).protocol;
url = o.href;
}
}
await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
})();
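
urldump.js walks the paginated /v1/crawl/:id endpoint (retrying failed fetches and following res.next, rewritten to the local protocol) and writes the URLs the API reports to <crawlId>-urls.txt in the same protocol-stripped, sorted format. Since both dumps share a format, a simple set difference is enough to spot URLs the API returned but that were never marked visited, or vice versa. A comparison sketch along those lines (hypothetical script, not part of the commit):

// compare.js <crawlId>: run after both dump scripts have written their files.
const fs = require("fs");
const crawlId = process.argv[2];
const readSet = (suffix) =>
  new Set(fs.readFileSync(crawlId + suffix, "utf8").split("\n").filter(Boolean));
const visited = readSet("-visited.txt");
const returned = readSet("-urls.txt");
for (const url of returned) {
  if (!visited.has(url)) console.log("returned but not in visited_unique:", url);
}
for (const url of visited) {
  if (!returned.has(url)) console.log("in visited_unique but not returned:", url);
}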