From 01f42b980dba6d631731ce68ac09f6d009a3dd62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 19 Sep 2024 19:21:13 +0200
Subject: [PATCH] feat(scrape): add error tallying instead of empty response

---
 apps/api/src/controllers/v1/scrape.ts         | 13 ++++++-------
 apps/api/src/scraper/WebScraper/single_url.ts | 16 +++++++++++++++-
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index f0744c22..6d006bce 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -64,22 +64,21 @@ export async function scrapeController(
         success: false,
         error: "Request timed out",
       });
-    } else {
+    } else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) {
       return res.status(500).json({
         success: false,
-        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
-          extractorOptions && extractorOptions.mode !== "markdown"
-            ? " - Could be due to LLM parsing issues"
-            : ""
-        }`,
+        error: "All scraping methods failed for URL: " + req.body.url,
+        details: JSON.parse(e).errors as string[],
       });
+    } else {
+      throw e;
     }
   }
 
   await job.remove();
 
   if (!doc) {
-    console.error("!!! PANIC DOC IS", doc, job);
+    // console.error("!!! PANIC DOC IS", doc, job);
     return res.status(200).json({
       success: true,
       warning: "No page found",
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index d61fb828..0a3adf5c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -351,6 +351,9 @@ export async function scrapSingleUrl(
     pageStatusCode: 200,
     pageError: undefined,
   };
+
+  const errors: Record<string, string | null> = {}; // scraper -> error message (null = no error)
+
   try {
     let urlKey = urlToScrap;
     try {
@@ -392,6 +395,12 @@
       pageError = undefined;
     }
 
+    if (attempt.pageError) {
+      errors[scraper] = attempt.pageError;
+    } else {
+      errors[scraper] = null;
+    }
+
     if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
       Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
       break;
@@ -443,12 +452,17 @@
 
     return document;
   } catch (error) {
-    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
+    Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
     ScrapeEvents.insert(jobId, {
       type: "error",
       message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
       stack: error.stack,
     });
+
+    if (error instanceof Error && error.message.startsWith("All scraping methods failed")) {
+      throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)}));
+    }
+
     return {
       content: "",
       markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
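
---

Note (not part of the diff): with this change, a total scrape failure surfaces the per-engine errors tallied in `single_url.ts` instead of an empty document. `scrapSingleUrl` rethrows the tally as a stringified JSON sentinel (`{"type":"all","errors":[...]}`), which `scrapeController` detects via the `startsWith` check and parses back into the 500 response's `details` field. Below is a minimal client-side sketch of consuming the new payload; only the `success`/`error`/`details` fields come from this patch, while the `ScrapeErrorResponse` type name, the `scrapeWithDiagnostics` helper, and the endpoint shape are illustrative assumptions:

```ts
// Hypothetical consumer of the new 500 payload (sketch, not part of this patch).
// The controller casts `details` to string[], but the tally in single_url.ts
// stores null for scrapers that produced no pageError, so (string | null)[]
// reflects what can actually arrive at runtime.
type ScrapeErrorResponse = {
  success: false;
  error: string;                // "All scraping methods failed for URL: ..."
  details?: (string | null)[];  // per-scraper pageErrors, in attempt order
};

async function scrapeWithDiagnostics(apiUrl: string, url: string): Promise<void> {
  const res = await fetch(`${apiUrl}/v1/scrape`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ url }),
  });
  if (res.status === 500) {
    const body = (await res.json()) as ScrapeErrorResponse;
    // Instead of a silent empty document, the caller now sees why each attempt failed.
    console.error(body.error);
    (body.details ?? []).forEach((detail, i) => {
      if (detail) console.error(`  attempt ${i + 1}: ${detail}`);
    });
  }
}
```

Encoding the tally as JSON inside the Error message lets it survive the queue boundary between the scrape worker and the controller, at the cost of the string-sniffing `else if` in the catch block.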