From a59b5836d5d46ed9e032214dd5e58a0fcf603095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 24 Sep 2024 10:27:49 +0200 Subject: [PATCH] Revert error tallying --- apps/api/src/controllers/v0/scrape.ts | 18 +++++------------ apps/api/src/controllers/v0/search.ts | 4 +--- apps/api/src/controllers/v1/scrape.ts | 13 ++++++------ apps/api/src/scraper/WebScraper/single_url.ts | 20 ++----------------- apps/api/src/services/queue-worker.ts | 2 +- 5 files changed, 16 insertions(+), 41 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 22c694d0..c46ebc62 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -283,21 +283,13 @@ export async function scrapeController(req: Request, res: Response) { return res.status(result.returnCode).json(result); } catch (error) { - if (typeof error === "string" && error.startsWith("{\"type\":\"all\",")) { - return res.status(500).json({ - success: false, - error: "All scraping methods failed for URL: " + req.body.url, - details: JSON.parse(error).errors as string[], - }); - } else { - Sentry.captureException(error); - Logger.error(error); - return res.status(500).json({ - error: + Sentry.captureException(error); + Logger.error(error); + return res.status(500).json({ + error: typeof error === "string" ? error : error?.message ?? "Internal Server Error", - }); - } + }); } } diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index b42f82d8..5ef2b767 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -196,9 +196,7 @@ export async function searchController(req: Request, res: Response) { return res.status(408).json({ error: "Request timed out" }); } - if (!(error instanceof Error && error.message.startsWith('{"type":"all",'))) { - Sentry.captureException(error); - } + Sentry.captureException(error); Logger.error(error); return res.status(500).json({ error: error.message }); } diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 6d006bce..f0744c22 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -64,21 +64,22 @@ export async function scrapeController( success: false, error: "Request timed out", }); - } else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) { + } else { return res.status(500).json({ success: false, - error: "All scraping methods failed for URL: " + req.body.url, - details: JSON.parse(e).errors as string[], + error: `(Internal server error) - ${e && e?.message ? e.message : e} ${ + extractorOptions && extractorOptions.mode !== "markdown" + ? " - Could be due to LLM parsing issues" + : "" + }`, }); - } else { - throw e; } } await job.remove(); if (!doc) { - // console.error("!!! PANIC DOC IS", doc, job); + console.error("!!! PANIC DOC IS", doc, job); return res.status(200).json({ success: true, warning: "No page found", diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 75c5da2c..80491f3c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -357,9 +357,6 @@ export async function scrapSingleUrl( pageStatusCode: 200, pageError: undefined, }; - - const errors: Record = {}; - try { let urlKey = urlToScrap; try { @@ -401,12 +398,6 @@ export async function scrapSingleUrl( pageError = undefined; } - if (attempt.pageError) { - errors[scraper] = attempt.pageError; - } else { - errors[scraper] = null; - } - if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; @@ -421,9 +412,7 @@ export async function scrapSingleUrl( // } } - // NOTE: This exception for status codes may only work with fire-engine. In lieu of better error management, - // it's the best we can do. - mogery - if (!text && !Object.values(errors).some(x => x.startsWith("Request failed with status code ") || x === "NOT FOUND")) { + if (!text) { throw new Error(`All scraping methods failed for URL: ${urlToScrap}`); } @@ -460,17 +449,12 @@ export async function scrapSingleUrl( return document; } catch (error) { - Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); + Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); ScrapeEvents.insert(jobId, { type: "error", message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error), stack: error.stack, }); - - if (error instanceof Error && error.message.startsWith("All scraping methods failed")) { - throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)})); - } - return { content: "", markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a4b1c74d..37e14baf 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -448,7 +448,7 @@ async function processJob(job: Job, token: string) { } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); - if (!(error instanceof Error && (error.message.includes("JSON parsing error(s): ") || error.message.startsWith('{"type":"all",')))) { + if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) { Sentry.captureException(error, { data: { job: job.id,