mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 02:29:00 +08:00
fix(queue-worker): do not follow redirect URLs if they are not allowed by the crawl options
This commit is contained in:
parent
4d1f92f4c8
commit
9005757de3
@@ -42,6 +42,11 @@ export const logger = winston.createLogger({
|
||||
},
|
||||
}),
|
||||
transports: [
|
||||
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
|
||||
new winston.transports.File({
|
||||
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
|
||||
})
|
||||
] : []),
|
||||
new winston.transports.Console({
|
||||
format: winston.format.combine(
|
||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||
|
@@ -46,6 +46,8 @@ import {
|
||||
removeConcurrencyLimitActiveJob,
|
||||
takeConcurrencyLimitedJob,
|
||||
} from "../lib/concurrency-limit";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||
configDotenv();
|
||||
|
||||
class RacedRedirectError extends Error {
|
||||
@@ -620,6 +622,15 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
normalizeURL(doc.metadata.url, sc) !==
|
||||
normalizeURL(doc.metadata.sourceURL, sc)
|
||||
) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
|
||||
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
|
||||
}
|
||||
|
||||
if (isUrlBlocked(doc.metadata.url)) {
|
||||
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
|
||||
}
|
||||
|
||||
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
|
||||
const p2 = generateURLPermutations(
|
||||
normalizeURL(doc.metadata.sourceURL, sc),
|
||||
|
Loading…
x
Reference in New Issue
Block a user