From 1368f9a87f0a078b84c9f2b8ec82f8fe93265f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 20 Aug 2024 22:24:18 +0200 Subject: [PATCH] fix: treat existing screenshot as a scraper success condition --- apps/api/src/controllers/v1/types.ts | 10 ++++++++++ apps/api/src/scraper/WebScraper/single_url.ts | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 4dcb32fc..ffaf6c19 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -279,6 +279,16 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { } export function legacyDocumentConverter(doc: any): Document { + if (doc.metadata.screenshot) { + doc.screenshot = doc.metadata.screenshot; + delete doc.metadata.screenshot; + } + + if (doc.metadata.fullPageScreenshot) { + doc.fullPageScreenshot = doc.metadata.fullPageScreenshot; + delete doc.metadata.fullPageScreenshot; + } + return { markdown: doc.markdown, links: doc.linksOnPage, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c2bbbc7b..408f9838 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -340,8 +340,8 @@ export async function scrapSingleUrl( pageError = undefined; } - if (text && text.trim().length >= 100) { - Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); + if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; } if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {