Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-13 05:09:03 +08:00)

feat(queue-worker): fix redirect slipping

parent 126b46ee2c
commit 30fa78cd9e
@@ -481,35 +481,32 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.url, sc) !==
           normalizeURL(doc.metadata.sourceURL, sc)
       ) {
-        logger.debug(
-          "Was redirected, removing old URL and locking new URL...",
-          { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
-        );
-        // Remove the old URL from visited unique due to checking for limit
-        // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
-        await redisConnection.srem(
-          "crawl:" + job.data.crawl_id + ":visited_unique",
-          normalizeURL(doc.metadata.sourceURL, sc),
-        );
 
         const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
         const p2 = generateURLPermutations(
           normalizeURL(doc.metadata.sourceURL, sc),
         );
 
-        // In crawls, we should only crawl a redirected page once, no matter how many times it is redirected to, or if it's been discovered by the crawler before.
-        // This can prevent flakiness with race conditions.
-        // Lock the new URL
-        const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
-        if (
-          job.data.crawlerOptions !== null &&
-          !lockRes &&
-          JSON.stringify(p1) !== JSON.stringify(p2)
-        ) {
-          throw new RacedRedirectError();
+        if (JSON.stringify(p1) !== JSON.stringify(p2)) {
+          logger.debug(
+            "Was redirected, removing old URL and locking new URL...",
+            { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
+          );
+
+          // Prevent redirect target from being visited in the crawl again
+          // See lockURL
+          const x = await redisConnection.sadd(
+            "crawl:" + job.data.crawl_id + ":visited",
+            ...p1.map(x => x.href),
+          );
+          const lockRes = x === p1.length;
+
+          if (job.data.crawlerOptions !== null && !lockRes) {
+            throw new RacedRedirectError();
+          }
         }
       }
 
       logger.debug("Logging job to DB...");
       await logJob(
         {
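The new path above replaces the lockURL call plus the visited_unique SREM with a single SADD into the crawl's :visited set: every permutation of the redirect target is added in one command, and the lock counts as acquired only if all of them were newly inserted (SADD returns the number of members actually added). Below is a minimal standalone sketch of that pattern in TypeScript, assuming ioredis; the permutations helper and the REDIS_URL fallback are simplified stand-ins, not the repo's generateURLPermutations.

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null,
});

// Simplified stand-in for generateURLPermutations: http/https variants only.
function permutations(url: string): URL[] {
  return ["http:", "https:"].map((protocol) => {
    const u = new URL(url);
    u.protocol = protocol;
    return u;
  });
}

// SADD reports how many members were newly added, so the lock is held only if
// no other worker has already marked any permutation of this URL as visited.
async function lockRedirectTarget(crawlId: string, url: string): Promise<boolean> {
  const perms = permutations(url);
  const added = await redis.sadd(
    "crawl:" + crawlId + ":visited",
    ...perms.map((p) => p.href),
  );
  return added === perms.length;
}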
@@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) {
 
       logger.debug("Declaring job as done...");
       await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
 
       logger.debug("Logging job to DB...");
       await logJob(
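The single added line mirrors the bookkeeping seen in the previous hunk: crawl:<id>:visited_unique is the set checked against the crawl limit (per the comment removed above), and here the finished job's normalized URL is taken back out of it. A minimal sketch of that call, assuming ioredis; the normalize helper is a simplified stand-in for normalizeURL, and the exact success/failure semantics of the surrounding path are not shown in the diff.

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null,
});

// Simplified stand-in for normalizeURL: drop the fragment and a trailing slash.
function normalize(url: string): string {
  const u = new URL(url);
  u.hash = "";
  return u.href.endsWith("/") ? u.href.slice(0, -1) : u.href;
}

// Remove a job's URL from the unique-visited set once the job has been
// declared done, so it no longer occupies a slot in the crawl's limit count.
async function releaseUniqueVisit(crawlId: string, url: string): Promise<void> {
  await redis.srem("crawl:" + crawlId + ":visited_unique", normalize(url));
}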
apps/api/utils/urldump-redis.js (new file, 14 lines)

@@ -0,0 +1,14 @@
+require("dotenv").config();
+const Redis = require("ioredis");
+
+const crawlId = process.argv[2];
+
+const redisConnection = new Redis(process.env.REDIS_URL, {
+  maxRetriesPerRequest: null,
+});
+
+(async () => {
+  const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
+  await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
+  process.exit(0);
+})();
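urldump-redis.js takes a single SSCAN pass with COUNT 999, which is fine for small crawls, but SSCAN is cursor-based and COUNT is only a hint, so one call is not guaranteed to return the whole set. A sketch of the full cursor loop in TypeScript, assuming ioredis, in case the set is larger:

import Redis from "ioredis";

async function dumpVisitedUnique(redis: Redis, crawlId: string): Promise<string[]> {
  const key = "crawl:" + crawlId + ":visited_unique";
  const members: string[] = [];
  let cursor = "0";
  do {
    // SSCAN returns [nextCursor, batch]; iterate until the cursor comes back as "0".
    const [next, batch] = await redis.sscan(key, cursor, "COUNT", 999);
    cursor = next;
    members.push(...batch);
  } while (cursor !== "0");
  // Strip the protocol and sort, mirroring the dump format used by the script above.
  return members.map((m) => m.split("://").slice(1).join("://")).sort();
}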
apps/api/utils/urldump.js (new file, 43 lines)

@@ -0,0 +1,43 @@
+require("dotenv").config();
+
+//const baseUrl = "https://api.firecrawl.dev";
+const baseUrl = "http://localhost:3002";
+const crawlId = process.argv[2];
+
+(async () => {
+  let url = baseUrl + "/v1/crawl/" + crawlId;
+  let urls = [];
+
+  while (url) {
+    let res;
+
+    while (true) {
+      try {
+        res = (await (await fetch(url, {
+          headers: {
+            "Authorization": "Bearer " + process.env.TEST_API_KEY
+          }
+        })).json());
+        break;
+      } catch (e) {
+        console.error(e);
+      }
+    }
+
+    console.log(res.data.length);
+    if (res.data.length === 0) {
+      break;
+    }
+
+    urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
+
+    url = res.next;
+    if (url !== undefined) {
+      const o = new URL(url)
+      o.protocol = new URL(baseUrl).protocol;
+      url = o.href;
+    }
+  }
+
+  await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
+})();
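The two utilities write the same format (protocol-stripped, sorted URLs, one per line), which suggests they are meant to be compared for the same crawl id to spot pages that appear in the /v1/crawl results but slipped past the visited_unique bookkeeping, or vice versa. That workflow is an inference, not stated in the commit; a minimal comparison sketch in TypeScript, assuming both dumps already exist in the working directory:

import { readFile } from "fs/promises";

(async () => {
  const crawlId = process.argv[2];
  // Outputs of urldump.js and urldump-redis.js for the same crawl id.
  const apiUrls = new Set((await readFile(crawlId + "-urls.txt", "utf8")).split("\n").filter(Boolean));
  const redisUrls = new Set((await readFile(crawlId + "-visited.txt", "utf8")).split("\n").filter(Boolean));
  const onlyInApi = [...apiUrls].filter((u) => !redisUrls.has(u));
  const onlyInRedis = [...redisUrls].filter((u) => !apiUrls.has(u));
  console.log("in API results but not in visited_unique:", onlyInApi.length);
  console.log("in visited_unique but not in API results:", onlyInRedis.length);
})();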