From 5030fea634b1362ffa0ab126a682ddf4f43e2cf2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 20 Jan 2025 13:28:59 -0300 Subject: [PATCH] Update document-scraper.ts --- apps/api/src/lib/extract/document-scraper.ts | 29 ++++++++++++++------ 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 91d515df..727d5da4 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -12,6 +12,7 @@ interface ScrapeDocumentOptions { plan: PlanType; origin: string; timeout: number; + isSingleUrl?: boolean; } export async function scrapeDocument( @@ -24,14 +25,14 @@ export async function scrapeDocument( trace.timing.scrapedAt = new Date().toISOString(); } - const jobId = crypto.randomUUID(); - const jobPriority = await getJobPriority({ - plan: options.plan, - team_id: options.teamId, - basePriority: 10, - }); + async function attemptScrape(timeout: number) { + const jobId = crypto.randomUUID(); + const jobPriority = await getJobPriority({ + plan: options.plan, + team_id: options.teamId, + basePriority: 10, + }); - try { await addScrapeJob( { url: options.url, @@ -50,7 +51,7 @@ export async function scrapeDocument( jobPriority, ); - const doc = await waitForJob(jobId, options.timeout); + const doc = await waitForJob(jobId, timeout); await getScrapeQueue().remove(jobId); if (trace) { @@ -63,6 +64,18 @@ export async function scrapeDocument( } return doc; + } + + try { + try { + return await attemptScrape(options.timeout); + } catch (timeoutError) { + if (options.isSingleUrl) { + // For single URLs, try again with double timeout + return await attemptScrape(options.timeout * 2); + } + throw timeoutError; + } } catch (error) { logger.error(`Error in scrapeDocument: ${error}`); if (trace) {