Update document-scraper.ts

This commit is contained in:
Nicolas 2025-01-20 13:28:59 -03:00
parent 2d4f4de0ab
commit 5030fea634

View File

@@ -12,6 +12,7 @@ interface ScrapeDocumentOptions {
  plan: PlanType;
  origin: string;
  timeout: number;
isSingleUrl?: boolean;
}

export async function scrapeDocument(
@@ -24,14 +25,14 @@ export async function scrapeDocument(
    trace.timing.scrapedAt = new Date().toISOString();
  }
const jobId = crypto.randomUUID(); async function attemptScrape(timeout: number) {
const jobPriority = await getJobPriority({ const jobId = crypto.randomUUID();
plan: options.plan, const jobPriority = await getJobPriority({
team_id: options.teamId, plan: options.plan,
basePriority: 10, team_id: options.teamId,
}); basePriority: 10,
});
try {
    await addScrapeJob(
      {
        url: options.url,
@@ -50,7 +51,7 @@ export async function scrapeDocument(
      jobPriority,
    );
const doc = await waitForJob<Document>(jobId, options.timeout); const doc = await waitForJob<Document>(jobId, timeout);
    await getScrapeQueue().remove(jobId);

    if (trace) {
@@ -63,6 +64,18 @@ export async function scrapeDocument(
    }

    return doc;
}
try {
try {
return await attemptScrape(options.timeout);
} catch (timeoutError) {
if (options.isSingleUrl) {
// For single URLs, try again with double timeout
return await attemptScrape(options.timeout * 2);
}
throw timeoutError;
}
  } catch (error) {
    logger.error(`Error in scrapeDocument: ${error}`);
    if (trace) {