From 9005757de347edb2a771f94d6da92955ad9670e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 30 Dec 2024 14:41:31 +0100 Subject: [PATCH] fix(queue-worker): do not follow redirect URLs if they are not allowed by the crawl options --- apps/api/src/lib/logger.ts | 5 +++++ apps/api/src/services/queue-worker.ts | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index 3cc04a11..bf57f63f 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -42,6 +42,11 @@ export const logger = winston.createLogger({ }, }), transports: [ + ...(process.env.FIRECRAWL_LOG_TO_FILE ? [ + new winston.transports.File({ + filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log", + }) + ] : []), new winston.transports.Console({ format: winston.format.combine( winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e8c8bdf3..48e6f7fd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -46,6 +46,8 @@ import { removeConcurrencyLimitActiveJob, takeConcurrencyLimitedJob, } from "../lib/concurrency-limit"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; configDotenv(); class RacedRedirectError extends Error { @@ -620,6 +622,15 @@ async function processJob(job: Job & { id: string }, token: string) { normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc) ) { + const crawler = crawlToCrawler(job.data.crawl_id, sc); + if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) { + throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking + } + + if (isUrlBlocked(doc.metadata.url)) { + throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking + } + const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p2 = generateURLPermutations( normalizeURL(doc.metadata.sourceURL, sc),