fix(crawl): never invalidate the first crawl scrape if it redirects

This commit is contained in:
Móricz Gergő 2025-01-07 13:56:41 +01:00
parent b96b97ed72
commit 7d73ebdbf1
2 changed files with 9 additions and 3 deletions

View File

@@ -558,6 +558,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
crawl_id: job.data.crawl_id,
webhook: job.data.webhook,
v1: job.data.v1,
isCrawlSourceScrape: true,
},
{
priority: 15,
@@ -721,10 +722,14 @@ async function processJob(job: Job & { id: string }, token: string) {
if (
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
) {
if (job.data.isCrawlSourceScrape) {
// TODO: re-fetch sitemap for redirect target domain
} else {
throw new Error(
"Redirected target URL is not allowed by crawlOptions",
); // TODO: make this its own error type that is ignored by error tracking
}
}
if (isUrlBlocked(doc.metadata.url)) {
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking

View File

@@ -44,6 +44,7 @@ export interface WebScraperOptions {
webhook?: z.infer<typeof webhookSchema>;
v1?: boolean;
is_scrape?: boolean;
isCrawlSourceScrape?: boolean;
}
export interface RunWebScraperParams {