fix(queue-worker): do not follow redirect URLs if they are not allowed by the crawl options

This commit is contained in:
Gergő Móricz 2024-12-30 14:41:31 +01:00
parent 4d1f92f4c8
commit 9005757de3
2 changed files with 16 additions and 0 deletions

View File

@@ -42,6 +42,11 @@ export const logger = winston.createLogger({
},
}),
transports: [
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
new winston.transports.File({
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
})
] : []),
new winston.transports.Console({
format: winston.format.combine(
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),

View File

@@ -46,6 +46,8 @@ import {
removeConcurrencyLimitActiveJob,
takeConcurrencyLimitedJob,
} from "../lib/concurrency-limit";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
configDotenv();
class RacedRedirectError extends Error {
@@ -620,6 +622,15 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc)
) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
}
if (isUrlBlocked(doc.metadata.url)) {
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
}
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
const p2 = generateURLPermutations(
normalizeURL(doc.metadata.sourceURL, sc),