From f5b2fbd7e830829e9a826125c0aeaf6b1eebc881 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 3 Jul 2024 18:06:53 -0300
Subject: [PATCH] Nick: revision

---
 .../src/scraper/WebScraper/scrapers/fetch.ts  | 14 +++++++-------
 .../scraper/WebScraper/scrapers/fireEngine.ts | 19 +++++++------------
 .../scraper/WebScraper/scrapers/playwright.ts |  9 ++++++---
 .../WebScraper/scrapers/scrapingBee.ts        |  7 ++++---
 4 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
index 9badfd91..4c31438c 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
@@ -3,7 +3,6 @@ import { logScrape } from "../../../services/logging/scrape_log";
 import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 
-
 /**
  * Scrapes a URL with Axios
  * @param url The URL to scrape
@@ -50,15 +49,16 @@ export async function scrapWithFetch(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       logParams.success = true;
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
+      return { content, pageStatusCode, pageError };
     } else {
       const text = response.data;
-      const result = { content: text, pageStatusCode: 200 };
       logParams.success = true;
       logParams.html = text;
-      logParams.response_code = 200;
-      logParams.error_message = null;
-      return result;
+      logParams.response_code = response.status;
+      return { content: text, pageStatusCode: response.status, pageError: null };
     }
   } catch (error) {
     if (error.code === "ECONNABORTED") {
@@ -68,7 +68,7 @@ export async function scrapWithFetch(
       logParams.error_message = error.message || error;
       console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
     }
-    return { content: "" };
+    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
index ce3cd2da..50388dea 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -87,7 +87,8 @@ export async function scrapWithFireEngine({
         pageOptions?.parsePDF
       );
       logParams.success = true;
-      // We shouldnt care about the pdf logging here I believe
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
       return { html: content, screenshot: "", pageStatusCode, pageError };
     } else {
       const data = response.data;
@@ -112,18 +113,12 @@ export async function scrapWithFireEngine({
       console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
       logParams.error_message = error.message || error;
     }
-    return { html: "", screenshot: "" };
+    return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
   } finally {
     const endTime = Date.now();
-    const time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape({
-      url: logParams.url,
-      scraper: logParams.scraper,
-      success: logParams.success,
-      response_code: logParams.response_code,
-      time_taken_seconds,
-      error_message: logParams.error_message,
-      html: logParams.html,
-    });
+    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
+    await logScrape(logParams);
   }
 }
+
+
diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
index 03a6728d..11c3c5ad 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
@@ -66,7 +66,10 @@ export async function scrapWithPlaywright(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       logParams.success = true;
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
+      return { content, pageStatusCode, pageError };
     } else {
       const textData = response.data;
       try {
@@ -86,7 +89,7 @@ export async function scrapWithPlaywright(
         console.error(
           `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
         );
-        return { content: "" };
+        return { content: "", pageStatusCode: null, pageError: logParams.error_message };
       }
     }
   } catch (error) {
@@ -97,7 +100,7 @@ export async function scrapWithPlaywright(
       logParams.error_message = error.message || error;
       console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
     }
-    return { content: "" };
+    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
index 63e8a082..9a1f0b35 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
@@ -46,6 +46,8 @@ export async function scrapWithScrapingBee(
     if (contentType && contentType.includes("application/pdf")) {
       logParams.success = true;
       const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
       return { content, pageStatusCode, pageError };
     } else {
       let text = "";
@@ -62,12 +64,11 @@ export async function scrapWithScrapingBee(
       logParams.response_code = response.status;
       logParams.html = text;
       logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
-      logParams.error_message = response.statusText != "OK" ? response.statusText : undefined;
+      logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
       return {
         content: text,
         pageStatusCode: response.status,
-        pageError:
-          response.statusText != "OK" ? response.statusText : undefined,
+        pageError: response.statusText !== "OK" ? response.statusText : undefined,
       };
     }
   } catch (error) {