mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-20 12:39:07 +08:00
Merge pull request #978 from mendableai/nsc/timeout-fixes
Timeout fixes on user defined timeouts
This commit is contained in:
commit
20f89c3478
@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
|
||||
export async function scrapeURLWithFetch(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = 20000;
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
const response = await Promise.race([
|
||||
fetch(meta.url, {
|
||||
|
@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node";
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
|
||||
export const defaultTimeout = 10000;
|
||||
|
||||
// This function does not take `Meta` on purpose. It may not access any
|
||||
// meta values to construct the request -- that must be done by the
|
||||
// `scrapeURLWithFireEngine*` functions.
|
||||
@ -31,7 +29,7 @@ async function performFireEngineScrape<
|
||||
>(
|
||||
logger: Logger,
|
||||
request: FireEngineScrapeRequestCommon & Engine,
|
||||
timeout = defaultTimeout,
|
||||
timeout: number,
|
||||
): Promise<FireEngineCheckStatusSuccess> {
|
||||
const scrape = await fireEngineScrape(
|
||||
logger.child({ method: "fireEngineScrape" }),
|
||||
@ -94,6 +92,7 @@ async function performFireEngineScrape<
|
||||
|
||||
export async function scrapeURLWithFireEngineChromeCDP(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> {
|
||||
const actions: Action[] = [
|
||||
// Transform waitFor option into an action (unsupported by chrome-cdp)
|
||||
@ -121,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
...(meta.options.actions ?? []),
|
||||
];
|
||||
|
||||
const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3));
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
const request: FireEngineScrapeRequestCommon &
|
||||
FireEngineScrapeRequestChromeCDP = {
|
||||
@ -208,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
|
||||
export async function scrapeURLWithFireEnginePlaywright(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3);
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
const request: FireEngineScrapeRequestCommon &
|
||||
FireEngineScrapeRequestPlaywright = {
|
||||
@ -267,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright(
|
||||
|
||||
export async function scrapeURLWithFireEngineTLSClient(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3);
|
||||
const timeout = timeToRun ?? 30000;
|
||||
|
||||
const request: FireEngineScrapeRequestCommon &
|
||||
FireEngineScrapeRequestTLSClient = {
|
||||
|
@ -105,7 +105,7 @@ export type EngineScrapeResult = {
|
||||
};
|
||||
|
||||
const engineHandlers: {
|
||||
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
|
||||
[E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
|
||||
} = {
|
||||
cache: scrapeCache,
|
||||
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||
@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): {
|
||||
export async function scrapeURLWithEngine(
|
||||
meta: Meta,
|
||||
engine: Engine,
|
||||
timeToRun: number | undefined
|
||||
): Promise<EngineScrapeResult> {
|
||||
const fn = engineHandlers[engine];
|
||||
const logger = meta.logger.child({
|
||||
@ -383,5 +384,5 @@ export async function scrapeURLWithEngine(
|
||||
logger,
|
||||
};
|
||||
|
||||
return await fn(_meta);
|
||||
return await fn(_meta, timeToRun);
|
||||
}
|
||||
|
@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
|
||||
async function scrapePDFWithLlamaParse(
|
||||
meta: Meta,
|
||||
tempFilePath: string,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<PDFProcessorResult> {
|
||||
meta.logger.debug("Processing PDF document with LlamaIndex", {
|
||||
tempFilePath,
|
||||
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
|
||||
|
||||
// TODO: timeout, retries
|
||||
const startedAt = Date.now();
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
|
||||
while (Date.now() <= startedAt + timeout) {
|
||||
try {
|
||||
const result = await robustFetch({
|
||||
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
|
||||
@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF(
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
|
||||
if (!meta.options.parsePDF) {
|
||||
const file = await fetchFileToBuffer(meta.url);
|
||||
const content = file.buffer.toString("base64");
|
||||
@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
}),
|
||||
},
|
||||
tempFilePath,
|
||||
timeToRun,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
||||
|
@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
|
||||
|
||||
export async function scrapeURLWithPlaywright(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = 20000 + meta.options.waitFor;
|
||||
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
|
||||
|
||||
const response = await Promise.race([
|
||||
await robustFetch({
|
||||
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
|
||||
}),
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||
throw new TimeoutError(
|
||||
"Playwright was unable to scrape the page before timing out",
|
||||
{ cause: { timeout } },
|
||||
|
@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
||||
|
||||
export function scrapeURLWithScrapingBee(
|
||||
wait_browser: "domcontentloaded" | "networkidle2",
|
||||
): (meta: Meta) => Promise<EngineScrapeResult> {
|
||||
return async (meta: Meta): Promise<EngineScrapeResult> => {
|
||||
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
|
||||
return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
|
||||
let response: AxiosResponse<any>;
|
||||
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
|
||||
try {
|
||||
response = await client.get({
|
||||
url: meta.url,
|
||||
params: {
|
||||
timeout: 15000, // TODO: dynamic timeout based on request timeout
|
||||
timeout,
|
||||
wait_browser: wait_browser,
|
||||
wait: Math.min(meta.options.waitFor, 35000),
|
||||
wait: meta.options.waitFor,
|
||||
transparent_status_code: true,
|
||||
json_response: true,
|
||||
screenshot: meta.options.formats.includes("screenshot"),
|
||||
|
@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
const results: EngineResultsTracker = {};
|
||||
let result: EngineScrapeResultWithContext | null = null;
|
||||
|
||||
const timeToRun = meta.options.timeout !== undefined
|
||||
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
|
||||
: undefined
|
||||
|
||||
for (const { engine, unsupportedFeatures } of fallbackList) {
|
||||
const startedAt = Date.now();
|
||||
try {
|
||||
meta.logger.info("Scraping via " + engine + "...");
|
||||
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
||||
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
|
||||
if (_engineResult.markdown === undefined) {
|
||||
// Some engines emit Markdown directly.
|
||||
_engineResult.markdown = await parseMarkdown(_engineResult.html);
|
||||
|
Loading…
x
Reference in New Issue
Block a user