feat(queue-worker): fix redirect slipping
parent 126b46ee2c
commit 30fa78cd9e
@@ -481,33 +481,30 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.url, sc) !==
           normalizeURL(doc.metadata.sourceURL, sc)
       ) {
-        logger.debug(
-          "Was redirected, removing old URL and locking new URL...",
-          { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
-        );
-        // Remove the old URL from visited unique due to checking for limit
-        // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
-        await redisConnection.srem(
-          "crawl:" + job.data.crawl_id + ":visited_unique",
-          normalizeURL(doc.metadata.sourceURL, sc),
-        );
-
         const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
         const p2 = generateURLPermutations(
           normalizeURL(doc.metadata.sourceURL, sc),
         );
 
-        // In crawls, we should only crawl a redirected page once, no matter how many times it is redirected to, or if it's been discovered by the crawler before.
-        // This can prevent flakiness with race conditions.
-        // Lock the new URL
-        const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
-        if (
-          job.data.crawlerOptions !== null &&
-          !lockRes &&
-          JSON.stringify(p1) !== JSON.stringify(p2)
-        ) {
-          throw new RacedRedirectError();
+        if (JSON.stringify(p1) !== JSON.stringify(p2)) {
+          logger.debug(
+            "Was redirected, removing old URL and locking new URL...",
+            { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
+          );
+
+          // Prevent redirect target from being visited in the crawl again
+          // See lockURL
+          const x = await redisConnection.sadd(
+            "crawl:" + job.data.crawl_id + ":visited",
+            ...p1.map(x => x.href),
+          );
+          const lockRes = x === p1.length;
+
+          if (job.data.crawlerOptions !== null && !lockRes) {
+            throw new RacedRedirectError();
+          }
         }
       }
 
       logger.debug("Logging job to DB...");
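The heart of the fix is the SADD-based lock on the redirect target: every URL permutation of the redirect target is written to the crawl's :visited set in a single SADD, and the lock counts as won only if Redis reports that all of the permutations were newly inserted. Below is a minimal sketch of that check, assuming an ioredis connection and a generateURLPermutations-style helper that yields URL objects; the helper name and function signature are illustrative, not the exact firecrawl API.

    import Redis from "ioredis";

    const redis = new Redis(process.env.REDIS_URL!);

    // Returns true if this worker was the first to mark every permutation of the
    // redirect target as visited for this crawl; false means another worker won the race.
    async function lockRedirectTarget(
      crawlId: string,
      permutations: URL[],
    ): Promise<boolean> {
      // SADD returns how many members were newly added; a smaller count means some
      // permutations were already present, i.e. the target was locked earlier.
      const added = await redis.sadd(
        "crawl:" + crawlId + ":visited",
        ...permutations.map((u) => u.href),
      );
      return added === permutations.length;
    }

In the hunk above, losing this race while crawlerOptions is set results in a RacedRedirectError, so a redirected page is processed at most once per crawl.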
@@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) {
 
       logger.debug("Declaring job as done...");
       await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
 
       logger.debug("Logging job to DB...");
       await logJob(
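The single added line appears to take over the role of the srem deleted in the previous hunk: per the comment removed there, :visited_unique feeds the crawl's limit accounting, and the job's URL is now dropped from that set only once the job is declared done rather than at redirect-detection time. A purely hypothetical illustration of the kind of limit check such a set supports (not the actual firecrawl implementation):

    import Redis from "ioredis";

    const redis = new Redis(process.env.REDIS_URL!);

    // Hypothetical: compare the number of unique visited pages against a crawl's limit.
    async function underCrawlLimit(crawlId: string, limit: number): Promise<boolean> {
      const visited = await redis.scard("crawl:" + crawlId + ":visited_unique");
      return visited < limit;
    }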
apps/api/utils/urldump-redis.js (new file, 14 lines)
@@ -0,0 +1,14 @@
require("dotenv").config();
const Redis = require("ioredis");

const crawlId = process.argv[2];

const redisConnection = new Redis(process.env.REDIS_URL, {
  maxRetriesPerRequest: null,
});

(async () => {
  const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
  await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
  process.exit(0);
})();
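This debug utility dumps a crawl's :visited_unique set to disk. Presumably it is invoked with a crawl ID, something like node apps/api/utils/urldump-redis.js <crawl-id> with REDIS_URL available via .env; it runs one SSCAN over crawl:<crawl-id>:visited_unique and writes the members, scheme-stripped and sorted, to <crawl-id>-visited.txt.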
apps/api/utils/urldump.js (new file, 43 lines)
@@ -0,0 +1,43 @@
require("dotenv").config();

//const baseUrl = "https://api.firecrawl.dev";
const baseUrl = "http://localhost:3002";
const crawlId = process.argv[2];

(async () => {
  let url = baseUrl + "/v1/crawl/" + crawlId;
  let urls = [];

  while (url) {
    let res;

    while (true) {
      try {
        res = (await (await fetch(url, {
          headers: {
            "Authorization": "Bearer " + process.env.TEST_API_KEY
          }
        })).json());
        break;
      } catch (e) {
        console.error(e);
      }
    }

    console.log(res.data.length);
    if (res.data.length === 0) {
      break;
    }

    urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));

    url = res.next;
    if (url !== undefined) {
      const o = new URL(url)
      o.protocol = new URL(baseUrl).protocol;
      url = o.href;
    }
  }

  await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
})();
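The companion script pages through the /v1/crawl/:id endpoint instead of Redis: it follows the next cursor (rewriting its protocol to match baseUrl so it also works against a local instance), collects metadata.url with metadata.sourceURL as a fallback for every returned document, and writes the scheme-stripped, sorted list to <crawl-id>-urls.txt. Since both utilities emit the same normalized, sorted format, the API-side and Redis-side views of a crawl can presumably be diffed against each other when hunting redirect slips like the one fixed above.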