Nick: scrape timeout + warnings

This commit is contained in:
Nicolas 2024-11-24 19:44:51 -08:00
parent b693c6c23b
commit 30def84c0a
2 changed files with 7 additions and 5 deletions

View File

@@ -128,7 +128,7 @@ export async function extractController(
// Scrape all links in parallel // Scrape all links in parallel
const scrapePromises = links.map(async (url) => { const scrapePromises = links.map(async (url) => {
const origin = req.body.origin || "api"; const origin = req.body.origin || "api";
const timeout = req.body.timeout ?? 30000; const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
const jobId = crypto.randomUUID(); const jobId = crypto.randomUUID();
const jobPriority = await getJobPriority({ const jobPriority = await getJobPriority({
@@ -153,10 +153,8 @@ export async function extractController(
jobPriority jobPriority
); );
const totalWait = 0;
try { try {
const doc = await waitForJob<Document>(jobId, timeout + totalWait); const doc = await waitForJob<Document>(jobId, timeout);
await getScrapeQueue().remove(jobId); await getScrapeQueue().remove(jobId);
if (earlyReturn) { if (earlyReturn) {
return null; return null;
@@ -216,10 +214,13 @@ export async function extractController(
// console.log("completions.extract", completions.extract); // console.log("completions.extract", completions.extract);
let data: any; let data: any;
let warning = completions.warning ?? "";
try { try {
data = JSON.parse(completions.extract); data = JSON.parse(completions.extract);
} catch (e) { } catch (e) {
logger.warn(`ExtractController: Error parsing JSON: ${e}`);
data = completions.extract; data = completions.extract;
warning = "JSON could not be parsed correctly. Returning raw LLM output...";
} }
logJob({ logJob({
@@ -241,5 +242,6 @@ export async function extractController(
success: true, success: true,
data: data, data: data,
scrape_id: id, scrape_id: id,
warning: warning
}); });
} }

View File

@@ -163,7 +163,7 @@ export const extractV1Options = z.object({
includeSubdomains: z.boolean().default(true), includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false), allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000), timeout: z.number().int().positive().finite().safe().default(60000)
}).strict(strictMessage) }).strict(strictMessage)
export type ExtractV1Options = z.infer<typeof extractV1Options>; export type ExtractV1Options = z.infer<typeof extractV1Options>;