fix(crawl): never invalidate the first crawl scrape if it redirects

This commit is contained in:
Móricz Gergő 2025-01-07 13:56:41 +01:00
parent b96b97ed72
commit 7d73ebdbf1
2 changed files with 9 additions and 3 deletions

View File

@@ -558,6 +558,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
crawl_id: job.data.crawl_id, crawl_id: job.data.crawl_id,
webhook: job.data.webhook, webhook: job.data.webhook,
v1: job.data.v1, v1: job.data.v1,
isCrawlSourceScrape: true,
}, },
{ {
priority: 15, priority: 15,
@@ -721,9 +722,13 @@ async function processJob(job: Job & { id: string }, token: string) {
if ( if (
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
) { ) {
throw new Error( if (job.data.isCrawlSourceScrape) {
"Redirected target URL is not allowed by crawlOptions", // TODO: re-fetch sitemap for redirect target domain
); // TODO: make this its own error type that is ignored by error tracking } else {
throw new Error(
"Redirected target URL is not allowed by crawlOptions",
); // TODO: make this its own error type that is ignored by error tracking
}
} }
if (isUrlBlocked(doc.metadata.url)) { if (isUrlBlocked(doc.metadata.url)) {

View File

@@ -44,6 +44,7 @@ export interface WebScraperOptions {
webhook?: z.infer<typeof webhookSchema>; webhook?: z.infer<typeof webhookSchema>;
v1?: boolean; v1?: boolean;
is_scrape?: boolean; is_scrape?: boolean;
isCrawlSourceScrape?: boolean;
} }
export interface RunWebScraperParams { export interface RunWebScraperParams {