From 01f42b980dba6d631731ce68ac09f6d009a3dd62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 19 Sep 2024 19:21:13 +0200
Subject: [PATCH] feat(scrape): add error tallying instead of empty response

---
 apps/api/src/controllers/v1/scrape.ts         | 13 ++++++-------
 apps/api/src/scraper/WebScraper/single_url.ts | 16 +++++++++++++++-
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index f0744c22..6d006bce 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -64,22 +64,21 @@ export async function scrapeController(
         success: false,
         error: "Request timed out",
       });
-    } else {
+    } else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) {
       return res.status(500).json({
         success: false,
-        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
-          extractorOptions && extractorOptions.mode !== "markdown"
-            ? " - Could be due to LLM parsing issues"
-            : ""
-        }`,
+        error: "All scraping methods failed for URL: " + req.body.url,
+        details: JSON.parse(e).errors as string[],
       });
+    } else {
+      throw e;
     }
   }
 
   await job.remove();
 
   if (!doc) {
-    console.error("!!! PANIC DOC IS", doc, job);
+    // console.error("!!! PANIC DOC IS", doc, job);
     return res.status(200).json({
       success: true,
       warning: "No page found",
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index d61fb828..0a3adf5c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -351,6 +351,9 @@ export async function scrapSingleUrl(
     pageStatusCode: 200,
     pageError: undefined,
   };
+
+  const errors: Record<string, string | null> = {}; // scraper -> error message (null = no error)
+
   try {
     let urlKey = urlToScrap;
     try {
@@ -392,6 +395,12 @@
       pageError = undefined;
     }
 
+    if (attempt.pageError) {
+      errors[scraper] = attempt.pageError;
+    } else {
+      errors[scraper] = null;
+    }
+
     if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
       Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
       break;
@@ -443,12 +452,17 @@
 
     return document;
   } catch (error) {
-    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
+    Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
     ScrapeEvents.insert(jobId, {
       type: "error",
       message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
       stack: error.stack,
     });
+
+    if (error instanceof Error && error.message.startsWith("All scraping methods failed")) {
+      throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)}));
+    }
+
     return {
       content: "",
       markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
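
---

Note (not part of the diff): with this change, a total scrape failure surfaces the per-engine errors tallied in `single_url.ts` instead of an empty document. `scrapSingleUrl` rethrows the tally as a stringified JSON sentinel (`{"type":"all","errors":[...]}`), which `scrapeController` detects via the `startsWith` check and parses back into the 500 response's `details` field. Below is a minimal client-side sketch of consuming the new payload; only the `success`/`error`/`details` fields come from this patch, while the `ScrapeErrorResponse` type name, the `scrapeWithDiagnostics` helper, and the endpoint shape are illustrative assumptions:

```ts
// Hypothetical consumer of the new 500 payload (sketch, not part of this patch).
// The controller casts `details` to string[], but the tally in single_url.ts
// stores null for scrapers that produced no pageError, so (string | null)[]
// reflects what can actually arrive at runtime.
type ScrapeErrorResponse = {
  success: false;
  error: string;                // "All scraping methods failed for URL: ..."
  details?: (string | null)[];  // per-scraper pageErrors, in attempt order
};

async function scrapeWithDiagnostics(apiUrl: string, url: string): Promise<void> {
  const res = await fetch(`${apiUrl}/v1/scrape`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ url }),
  });
  if (res.status === 500) {
    const body = (await res.json()) as ScrapeErrorResponse;
    // Instead of a silent empty document, the caller now sees why each attempt failed.
    console.error(body.error);
    (body.details ?? []).forEach((detail, i) => {
      if (detail) console.error(`  attempt ${i + 1}: ${detail}`);
    });
  }
}
```

Encoding the tally as JSON inside the Error message lets it survive the queue boundary between the scrape worker and the controller, at the cost of the string-sniffing `else if` in the catch block.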