Nick: scrape timeout + warnings

This commit is contained in:
Nicolas 2024-11-24 19:44:51 -08:00
parent b693c6c23b
commit 30def84c0a
2 changed files with 7 additions and 5 deletions

View File

@ -128,7 +128,7 @@ export async function extractController(
// Scrape all links in parallel
const scrapePromises = links.map(async (url) => {
const origin = req.body.origin || "api";
const timeout = req.body.timeout ?? 30000;
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
const jobId = crypto.randomUUID();
const jobPriority = await getJobPriority({
@ -153,10 +153,8 @@ export async function extractController(
jobPriority
);
const totalWait = 0;
try {
const doc = await waitForJob<Document>(jobId, timeout + totalWait);
const doc = await waitForJob<Document>(jobId, timeout);
await getScrapeQueue().remove(jobId);
if (earlyReturn) {
return null;
@ -216,10 +214,13 @@ export async function extractController(
// console.log("completions.extract", completions.extract);
let data: any;
let warning = completions.warning ?? "";
try {
data = JSON.parse(completions.extract);
} catch (e) {
logger.warn(`ExtractController: Error parsing JSON: ${e}`);
data = completions.extract;
warning = "JSON could not be parsed correctly. Returning raw LLM output...";
}
logJob({
@ -241,5 +242,6 @@ export async function extractController(
success: true,
data: data,
scrape_id: id,
warning: warning
});
}

View File

@ -163,7 +163,7 @@ export const extractV1Options = z.object({
includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
timeout: z.number().int().positive().finite().safe().default(60000)
}).strict(strictMessage)
/** Static TypeScript type inferred from the `extractV1Options` zod schema via `z.infer`. */
export type ExtractV1Options = z.infer<typeof extractV1Options>;