mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 00:45:52 +08:00
fix(crawl): never invalidate the first crawl scrape if it redirects
This commit is contained in:
parent
b96b97ed72
commit
7d73ebdbf1
@ -558,6 +558,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
|
|||||||
crawl_id: job.data.crawl_id,
|
crawl_id: job.data.crawl_id,
|
||||||
webhook: job.data.webhook,
|
webhook: job.data.webhook,
|
||||||
v1: job.data.v1,
|
v1: job.data.v1,
|
||||||
|
isCrawlSourceScrape: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
priority: 15,
|
priority: 15,
|
||||||
@ -721,9 +722,13 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
if (
|
if (
|
||||||
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
|
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
|
||||||
) {
|
) {
|
||||||
throw new Error(
|
if (job.data.isCrawlSourceScrape) {
|
||||||
"Redirected target URL is not allowed by crawlOptions",
|
// TODO: re-fetch sitemap for redirect target domain
|
||||||
); // TODO: make this its own error type that is ignored by error tracking
|
} else {
|
||||||
|
throw new Error(
|
||||||
|
"Redirected target URL is not allowed by crawlOptions",
|
||||||
|
); // TODO: make this its own error type that is ignored by error tracking
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isUrlBlocked(doc.metadata.url)) {
|
if (isUrlBlocked(doc.metadata.url)) {
|
||||||
|
@ -44,6 +44,7 @@ export interface WebScraperOptions {
|
|||||||
webhook?: z.infer<typeof webhookSchema>;
|
webhook?: z.infer<typeof webhookSchema>;
|
||||||
v1?: boolean;
|
v1?: boolean;
|
||||||
is_scrape?: boolean;
|
is_scrape?: boolean;
|
||||||
|
isCrawlSourceScrape?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RunWebScraperParams {
|
export interface RunWebScraperParams {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user