fix(queue-worker): do not follow redirect URLs if they are not allowed by the crawl options

This commit is contained in:
Gergő Móricz 2024-12-30 14:41:31 +01:00
parent 4d1f92f4c8
commit 9005757de3
2 changed files with 16 additions and 0 deletions

View File

@@ -42,6 +42,11 @@ export const logger = winston.createLogger({
}, },
}), }),
transports: [ transports: [
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
new winston.transports.File({
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
})
] : []),
new winston.transports.Console({ new winston.transports.Console({
format: winston.format.combine( format: winston.format.combine(
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),

View File

@@ -46,6 +46,8 @@ import {
removeConcurrencyLimitActiveJob, removeConcurrencyLimitActiveJob,
takeConcurrencyLimitedJob, takeConcurrencyLimitedJob,
} from "../lib/concurrency-limit"; } from "../lib/concurrency-limit";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
configDotenv(); configDotenv();
class RacedRedirectError extends Error { class RacedRedirectError extends Error {
@@ -620,6 +622,15 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc) normalizeURL(doc.metadata.sourceURL, sc)
) { ) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
}
if (isUrlBlocked(doc.metadata.url)) {
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
}
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
const p2 = generateURLPermutations( const p2 = generateURLPermutations(
normalizeURL(doc.metadata.sourceURL, sc), normalizeURL(doc.metadata.sourceURL, sc),