mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 12:55:57 +08:00
fix(queue-worker): do not follow redirect URLs if they are not allowed by the crawl options
This commit is contained in:
parent
4d1f92f4c8
commit
9005757de3
@ -42,6 +42,11 @@ export const logger = winston.createLogger({
|
|||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
transports: [
|
transports: [
|
||||||
|
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
|
||||||
|
new winston.transports.File({
|
||||||
|
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
|
||||||
|
})
|
||||||
|
] : []),
|
||||||
new winston.transports.Console({
|
new winston.transports.Console({
|
||||||
format: winston.format.combine(
|
format: winston.format.combine(
|
||||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||||
|
@ -46,6 +46,8 @@ import {
|
|||||||
removeConcurrencyLimitActiveJob,
|
removeConcurrencyLimitActiveJob,
|
||||||
takeConcurrencyLimitedJob,
|
takeConcurrencyLimitedJob,
|
||||||
} from "../lib/concurrency-limit";
|
} from "../lib/concurrency-limit";
|
||||||
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||||
|
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
class RacedRedirectError extends Error {
|
class RacedRedirectError extends Error {
|
||||||
@ -620,6 +622,15 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
normalizeURL(doc.metadata.url, sc) !==
|
normalizeURL(doc.metadata.url, sc) !==
|
||||||
normalizeURL(doc.metadata.sourceURL, sc)
|
normalizeURL(doc.metadata.sourceURL, sc)
|
||||||
) {
|
) {
|
||||||
|
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||||
|
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
|
||||||
|
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isUrlBlocked(doc.metadata.url)) {
|
||||||
|
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
|
||||||
|
}
|
||||||
|
|
||||||
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
|
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
|
||||||
const p2 = generateURLPermutations(
|
const p2 = generateURLPermutations(
|
||||||
normalizeURL(doc.metadata.sourceURL, sc),
|
normalizeURL(doc.metadata.sourceURL, sc),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user