Merge pull request #978 from mendableai/nsc/timeout-fixes

Timeout fixes on user-defined timeouts
This commit is contained in:
Nicolas 2024-12-15 15:02:46 -03:00 committed by GitHub
commit 20f89c3478
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 30 additions and 18 deletions

View File

@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch(
meta: Meta,
timeToRun: number | undefined
): Promise<EngineScrapeResult> {
const timeout = 20000;
const timeout = timeToRun ?? 300000;
const response = await Promise.race([
fetch(meta.url, {

View File

@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export const defaultTimeout = 10000;
// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions.
@ -31,7 +29,7 @@ async function performFireEngineScrape<
>(
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout,
timeout: number,
): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }),
@ -94,6 +92,7 @@ async function performFireEngineScrape<
export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp)
@ -121,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []),
];
const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3));
const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
@ -208,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
export async function scrapeURLWithFireEnginePlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3);
const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
@ -267,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright(
export async function scrapeURLWithFireEngineTLSClient(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3);
const timeout = timeToRun ?? 30000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = {

View File

@ -105,7 +105,7 @@ export type EngineScrapeResult = {
};
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
[E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
} = {
cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine(
meta: Meta,
engine: Engine,
timeToRun: number | undefined
): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine];
const logger = meta.logger.child({
@ -383,5 +384,5 @@ export async function scrapeURLWithEngine(
logger,
};
return await fn(_meta);
return await fn(_meta, timeToRun);
}

View File

@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse(
meta: Meta,
tempFilePath: string,
timeToRun: number | undefined,
): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath,
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
// TODO: timeout, retries
const startedAt = Date.now();
const timeout = timeToRun ?? 300000;
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
while (Date.now() <= startedAt + timeout) {
try {
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF(
};
}
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url);
const content = file.buffer.toString("base64");
@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}),
},
tempFilePath,
timeToRun,
);
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {

View File

@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = 20000 + meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
const response = await Promise.race([
await robustFetch({
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
}),
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError(
"Playwright was unable to scrape the page before timing out",
{ cause: { timeout } },

View File

@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta) => Promise<EngineScrapeResult> {
return async (meta: Meta): Promise<EngineScrapeResult> => {
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try {
response = await client.get({
url: meta.url,
params: {
timeout: 15000, // TODO: dynamic timeout based on request timeout
timeout,
wait_browser: wait_browser,
wait: Math.min(meta.options.waitFor, 35000),
wait: meta.options.waitFor,
transparent_status_code: true,
json_response: true,
screenshot: meta.options.formats.includes("screenshot"),

View File

@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
: undefined
for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now();
try {
meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine);
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html);