mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 19:06:01 +08:00
fix(crawl): never invalidate the first crawl scrape if it redirects
This commit is contained in:
parent
b96b97ed72
commit
7d73ebdbf1
@ -558,6 +558,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
||||
crawl_id: job.data.crawl_id,
|
||||
webhook: job.data.webhook,
|
||||
v1: job.data.v1,
|
||||
isCrawlSourceScrape: true,
|
||||
},
|
||||
{
|
||||
priority: 15,
|
||||
@ -721,10 +722,14 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
if (
|
||||
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
|
||||
) {
|
||||
if (job.data.isCrawlSourceScrape) {
|
||||
// TODO: re-fetch sitemap for redirect target domain
|
||||
} else {
|
||||
throw new Error(
|
||||
"Redirected target URL is not allowed by crawlOptions",
|
||||
); // TODO: make this its own error type that is ignored by error tracking
|
||||
}
|
||||
}
|
||||
|
||||
if (isUrlBlocked(doc.metadata.url)) {
|
||||
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
|
||||
|
@ -44,6 +44,7 @@ export interface WebScraperOptions {
|
||||
webhook?: z.infer<typeof webhookSchema>;
|
||||
v1?: boolean;
|
||||
is_scrape?: boolean;
|
||||
isCrawlSourceScrape?: boolean;
|
||||
}
|
||||
|
||||
export interface RunWebScraperParams {
|
||||
|
Loading…
x
Reference in New Issue
Block a user