mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-04 23:00:40 +08:00
Nick: scrape timeout + warnings
This commit is contained in:
parent
b693c6c23b
commit
30def84c0a
@ -128,7 +128,7 @@ export async function extractController(
|
|||||||
// Scrape all links in parallel
|
// Scrape all links in parallel
|
||||||
const scrapePromises = links.map(async (url) => {
|
const scrapePromises = links.map(async (url) => {
|
||||||
const origin = req.body.origin || "api";
|
const origin = req.body.origin || "api";
|
||||||
const timeout = req.body.timeout ?? 30000;
|
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
|
||||||
const jobId = crypto.randomUUID();
|
const jobId = crypto.randomUUID();
|
||||||
|
|
||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
@ -153,10 +153,8 @@ export async function extractController(
|
|||||||
jobPriority
|
jobPriority
|
||||||
);
|
);
|
||||||
|
|
||||||
const totalWait = 0;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const doc = await waitForJob<Document>(jobId, timeout + totalWait);
|
const doc = await waitForJob<Document>(jobId, timeout);
|
||||||
await getScrapeQueue().remove(jobId);
|
await getScrapeQueue().remove(jobId);
|
||||||
if (earlyReturn) {
|
if (earlyReturn) {
|
||||||
return null;
|
return null;
|
||||||
@ -216,10 +214,13 @@ export async function extractController(
|
|||||||
// console.log("completions.extract", completions.extract);
|
// console.log("completions.extract", completions.extract);
|
||||||
|
|
||||||
let data: any;
|
let data: any;
|
||||||
|
let warning = completions.warning ?? "";
|
||||||
try {
|
try {
|
||||||
data = JSON.parse(completions.extract);
|
data = JSON.parse(completions.extract);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
logger.warn(`ExtractController: Error parsing JSON: ${e}`);
|
||||||
data = completions.extract;
|
data = completions.extract;
|
||||||
|
warning = "JSON could not be parsed correctly. Returning raw LLM output...";
|
||||||
}
|
}
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
@ -241,5 +242,6 @@ export async function extractController(
|
|||||||
success: true,
|
success: true,
|
||||||
data: data,
|
data: data,
|
||||||
scrape_id: id,
|
scrape_id: id,
|
||||||
|
warning: warning
|
||||||
});
|
});
|
||||||
}
|
}
|
@ -163,7 +163,7 @@ export const extractV1Options = z.object({
|
|||||||
includeSubdomains: z.boolean().default(true),
|
includeSubdomains: z.boolean().default(true),
|
||||||
allowExternalLinks: z.boolean().default(false),
|
allowExternalLinks: z.boolean().default(false),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
timeout: z.number().int().positive().finite().safe().default(60000)
|
||||||
}).strict(strictMessage)
|
}).strict(strictMessage)
|
||||||
|
|
||||||
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user