feat(scrape): add error tallying instead of empty response

2025-08-05 10:00:38 +08:00 · 2024-09-19 19:21:13 +02:00 · 2024-09-19 19:21:13 +02:00 · 01f42b980d
commit 01f42b980d
parent 712ca31615
2 changed files with 21 additions and 8 deletions
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -64,22 +64,21 @@ export async function scrapeController(
        success: false,
        error: "Request timed out",
      });
-    } else {
+    } else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) {
      return res.status(500).json({
        success: false,
-        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
-          extractorOptions && extractorOptions.mode !== "markdown"
-            ? " - Could be due to LLM parsing issues"
-            : ""
-        }`,
+        error: "All scraping methods failed for URL: " + req.body.url,
+        details: JSON.parse(e).errors as string[],
      });
+    } else {
+      throw e;
    }
  }

  await job.remove();

  if (!doc) {
-    console.error("!!! PANIC DOC IS", doc, job);
+    // console.error("!!! PANIC DOC IS", doc, job);
    return res.status(200).json({
      success: true,
      warning: "No page found",
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -351,6 +351,9 @@ export async function scrapSingleUrl(
    pageStatusCode: 200,
    pageError: undefined,
  };
+
+  const errors: Record<string, string> = {};
+
  try {
    let urlKey = urlToScrap;
    try {
@@ -392,6 +395,12 @@ export async function scrapSingleUrl(
        pageError = undefined;
      }

+      if (attempt.pageError) {
+        errors[scraper] = attempt.pageError;
+      } else {
+        errors[scraper] = null;
+      }
+
      if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
        break;
@@ -443,12 +452,17 @@ export async function scrapSingleUrl(

    return document;
  } catch (error) {
-    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
+    Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
    ScrapeEvents.insert(jobId, {
      type: "error",
      message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
      stack: error.stack,
    });
+
+    if (error instanceof Error && error.message.startsWith("All scraping methods failed")) {
+      throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)}));
+    }
+
    return {
      content: "",
      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,