diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index af6f57c0..168d9b8f 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler"; export async function scrapeURLWithFetch( meta: Meta, + timeToRun: number | undefined ): Promise { - const timeout = 20000; + const timeout = timeToRun ?? 300000; const response = await Promise.race([ fetch(meta.url, { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 2b67c4d6..ef0b41fc 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; -export const defaultTimeout = 10000; - // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the // `scrapeURLWithFireEngine*` functions. @@ -31,7 +29,7 @@ async function performFireEngineScrape< >( logger: Logger, request: FireEngineScrapeRequestCommon & Engine, - timeout = defaultTimeout, + timeout: number, ): Promise { const scrape = await fireEngineScrape( logger.child({ method: "fireEngineScrape" }), @@ -94,6 +92,7 @@ async function performFireEngineScrape< export async function scrapeURLWithFireEngineChromeCDP( meta: Meta, + timeToRun: number | undefined, ): Promise { const actions: Action[] = [ // Transform waitFor option into an action (unsupported by chrome-cdp) @@ -121,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP( ...(meta.options.actions ?? []), ]; - const timeout = (meta.options.timeout === undefined ? 
300000 : Math.round(meta.options.timeout / 3)); + const timeout = timeToRun ?? 300000; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { @@ -208,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEnginePlaywright( meta: Meta, + timeToRun: number | undefined, ): Promise<EngineScrapeResult> { - const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3); + const timeout = timeToRun ?? 300000; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { @@ -267,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEngineTLSClient( meta: Meta, + timeToRun: number | undefined, ): Promise<EngineScrapeResult> { - const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3); + const timeout = timeToRun ?? 30000; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = { diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 01ac0be9..14f263f3 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -105,7 +105,7 @@ export type EngineScrapeResult = { }; const engineHandlers: { - [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>; + [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>; } = { cache: scrapeCache, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, @@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): { export async function scrapeURLWithEngine( meta: Meta, engine: Engine, + timeToRun: number | undefined ): Promise<EngineScrapeResult> { const fn = engineHandlers[engine]; const logger = meta.logger.child({ @@ -383,5 +384,5 @@ export async function scrapeURLWithEngine( logger, }; - return await fn(_meta); + return await fn(_meta, timeToRun); } diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts 
b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 341a4f1a..24d5f002 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string }; async function scrapePDFWithLlamaParse( meta: Meta, tempFilePath: string, + timeToRun: number | undefined, ): Promise<PDFProcessorResult> { meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath, @@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse( // TODO: timeout, retries const startedAt = Date.now(); + const timeout = timeToRun ?? 300000; - while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) { + while (Date.now() <= startedAt + timeout) { try { const result = await robustFetch({ url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, @@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF( }; } -export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> { +export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> { if (!meta.options.parsePDF) { const file = await fetchFileToBuffer(meta.url); const content = file.buffer.toString("base64"); @@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> { }), }, tempFilePath, + timeToRun, ); } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index c92b1d90..edcd50c0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch"; export async function scrapeURLWithPlaywright( meta: Meta, + timeToRun: number | undefined, ): Promise<EngineScrapeResult> { - const timeout = 20000 + meta.options.waitFor; + const timeout = (timeToRun ?? 
300000) + meta.options.waitFor; const response = await Promise.race([ await robustFetch({ @@ -30,7 +31,7 @@ }), }), (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); + await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); throw new TimeoutError( "Playwright was unable to scrape the page before timing out", { cause: { timeout } }, diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts index 50ac502b..db702a44 100644 --- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts @@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); export function scrapeURLWithScrapingBee( wait_browser: "domcontentloaded" | "networkidle2", -): (meta: Meta) => Promise<EngineScrapeResult> { - return async (meta: Meta): Promise<EngineScrapeResult> => { +): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> { + return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => { let response: AxiosResponse; + const timeout = (timeToRun ?? 
300000) + meta.options.waitFor; try { response = await client.get({ url: meta.url, params: { - timeout: 15000, // TODO: dynamic timeout based on request timeout + timeout, wait_browser: wait_browser, - wait: Math.min(meta.options.waitFor, 35000), + wait: meta.options.waitFor, transparent_status_code: true, json_response: true, screenshot: meta.options.formats.includes("screenshot"), diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index d3b33418..c0b6d4e5 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; + const timeToRun = meta.options.timeout !== undefined + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) + : undefined + for (const { engine, unsupportedFeatures } of fallbackList) { const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. _engineResult.markdown = await parseMarkdown(_engineResult.html);