From b36406e4653484f70529eaa39c376303913ad69f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 17:28:53 -0300 Subject: [PATCH 1/6] Nick: log scrpaers --- apps/api/src/scraper/WebScraper/single_url.ts | 123 ++++++++++++++++-- apps/api/src/services/logging/scrape_log.ts | 37 ++++++ apps/api/src/types.ts | 17 ++- 3 files changed, 168 insertions(+), 9 deletions(-) create mode 100644 apps/api/src/services/logging/scrape_log.ts diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index b7d6fc12..1d5df69b 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -45,6 +45,8 @@ export async function generateRequestParams( return defaultParams; } } +import { logScrape } from "../../services/logging/scrape_log"; + export async function scrapWithFireEngine({ url, waitFor = 0, @@ -60,9 +62,21 @@ export async function scrapWithFireEngine({ headers?: Record; options?: any; }): Promise { + + const logParams = { + url, + scraper: "fire-engine", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: "", + html: "", + startTime: Date.now(), + }; + + try { const reqParams = await generateRequestParams(url); - // If the user has passed a wait parameter in the request, use that const waitParam = reqParams["params"]?.wait ?? waitFor; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; console.log( @@ -90,6 +104,8 @@ export async function scrapWithFireEngine({ console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; return { html: "", screenshot: "", @@ -104,14 +120,20 @@ export async function scrapWithFireEngine({ url, pageOptions?.parsePDF ); + logParams.success = true; + logParams.html = content; + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; return { html: content, screenshot: "", pageStatusCode, pageError }; } else { const data = response.data; - const html = data.content; - const screenshot = data.screenshot; + logParams.success = true; + logParams.html = data.content ?? ""; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; return { - html: html ?? "", - screenshot: screenshot ?? "", + html: data.content ?? "", + screenshot: data.screenshot ?? 
"", pageStatusCode: data.pageStatusCode, pageError: data.pageError, }; @@ -119,10 +141,24 @@ export async function scrapWithFireEngine({ } catch (error) { if (error.code === "ECONNABORTED") { console.log(`[Fire-Engine] Request timed out for ${url}`); + logParams.error_message = "Request timed out"; } else { console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message; } return { html: "", screenshot: "" }; + } finally { + const endTime = Date.now(); + const time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape({ + url: logParams.url, + scraper: logParams.scraper, + success: logParams.success, + response_code: logParams.response_code, + time_taken_seconds, + error_message: logParams.error_message, + html: logParams.html, + }); } } @@ -132,6 +168,16 @@ export async function scrapWithScrapingBee( timeout: number = universalTimeout, pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: "", + html: "", + startTime: Date.now(), + }; try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const clientParams = await generateRequestParams( @@ -148,17 +194,22 @@ export async function scrapWithScrapingBee( }); const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; return await fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { let text = ""; try { const decoder = new TextDecoder(); text = decoder.decode(response.data); + logParams.success = true; } catch (decodeError) { console.error( `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` ); + logParams.error_message = decodeError.message; } + logParams.response_code = response.status; + logParams.html = text; return { content: text, pageStatusCode: response.status, @@ -168,11 +219,17 @@ export async function scrapWithScrapingBee( } } catch (error) { console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message; + logParams.response_code = error.response?.status; return { content: "", - pageStatusCode: error.response.status, - pageError: error.response.statusText, + pageStatusCode: error.response?.status, + pageError: error.response?.statusText, }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); } } @@ -182,6 +239,18 @@ export async function scrapWithPlaywright( headers?: Record, pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "playwright", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: "", + html: "", + startTime: Date.now(), + }; + + try { const reqParams = await generateRequestParams(url); // If the user has passed a wait parameter in the request, use that @@ -207,6 +276,8 @@ export async function scrapWithPlaywright( console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; return { content: "", 
pageStatusCode: response.data?.pageStatusCode, @@ -216,18 +287,24 @@ export async function scrapWithPlaywright( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; return await fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const textData = response.data; try { const data = JSON.parse(textData); const html = data.content; + logParams.success = true; + logParams.html = html; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError, }; } catch (jsonError) { + logParams.error_message = jsonError.message; console.error( `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` ); @@ -236,11 +313,17 @@ export async function scrapWithPlaywright( } } catch (error) { if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; console.log(`[Playwright] Request timed out for ${url}`); } else { + logParams.error_message = error.message; console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); } return { content: "" }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); } } @@ -248,6 +331,18 @@ export async function scrapWithFetch( url: string, pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "fetch", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: "", + html: "", + startTime: Date.now(), + }; + + try { const response = await axios.get(url, { headers: { @@ -261,6 +356,8 @@ export async function scrapWithFetch( console.error( `[Axios] Error fetching url: ${url} with status: ${response.status}` ); + logParams.error_message = response.statusText; + logParams.response_code = response.status; return { content: "", pageStatusCode: response.status, @@ -270,18 +367,28 @@ export async function scrapWithFetch( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; return await fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const text = response.data; - return { content: text, pageStatusCode: 200 }; + const result = { content: text, pageStatusCode: 200 }; + logParams.success = true; + logParams.html = text; + return result; } } catch (error) { if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; console.log(`[Axios] Request timed out for ${url}`); } else { + logParams.error_message = error.message; console.error(`[Axios] Error fetching url: ${url} -> ${error}`); } return { content: "" }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); } } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts new file mode 100644 index 00000000..bb568242 --- /dev/null +++ b/apps/api/src/services/logging/scrape_log.ts @@ -0,0 +1,37 @@ +import "dotenv/config"; +import { ScrapeLog } from "../../types"; +import { supabase_service } from "../supabase"; + +export async function logScrape(scrapeLog: ScrapeLog) { + try { + // Only log jobs in production + // if (process.env.ENV !== "production") { + // return; + // } + + const 
{ data, error } = await supabase_service + .from("scrape_logs") + .insert([ + { + url: scrapeLog.url, + scraper: scrapeLog.scraper, + success: scrapeLog.success, + response_code: scrapeLog.response_code, + time_taken_seconds: scrapeLog.time_taken_seconds, + proxy: scrapeLog.proxy, + retried: scrapeLog.retried, + error_message: scrapeLog.error_message, + date_added: new Date().toISOString(), + html: scrapeLog.html, + ipv4_support: scrapeLog.ipv4_support, + ipv6_support: scrapeLog.ipv6_support, + }, + ]); + + if (error) { + console.error("Error logging proxy:\n", error); + } + } catch (error) { + console.error("Error logging proxy:\n", error); + } +} diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index e69353b6..7c3aacad 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -113,4 +113,19 @@ export enum NotificationType { APPROACHING_LIMIT = "approachingLimit", LIMIT_REACHED = "limitReached", RATE_LIMIT_REACHED = "rateLimitReached", -} \ No newline at end of file +} + +export type ScrapeLog = { + url: string; + scraper: string; + success?: boolean; + response_code?: number; + time_taken_seconds?: number; + proxy?: string; + retried?: boolean; + error_message?: string; + date_added?: string; // ISO 8601 format + html?: string; + ipv4_support?: boolean | null; + ipv6_support?: boolean | null; +}; \ No newline at end of file From 90cf799a3c2de1adb20afd32d35809322427e824 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 17:56:21 -0300 Subject: [PATCH 2/6] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 1d5df69b..c7a74552 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -121,13 +121,11 @@ export async function scrapWithFireEngine({ pageOptions?.parsePDF ); logParams.success = true; - logParams.html = content; - logParams.response_code = pageStatusCode; - logParams.error_message = pageError; + // We shouldnt care about the pdf logging here I believe return { html: content, screenshot: "", pageStatusCode, pageError }; } else { const data = response.data; - logParams.success = true; + logParams.success = data.pageStatusCode >= 200 && data.pageStatusCode < 300 || data.pageStatusCode === 404; logParams.html = data.content ?? 
""; logParams.response_code = data.pageStatusCode; logParams.error_message = data.pageError; @@ -144,7 +142,7 @@ export async function scrapWithFireEngine({ logParams.error_message = "Request timed out"; } else { console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); - logParams.error_message = error.message; + logParams.error_message = error.message || error; } return { html: "", screenshot: "" }; } finally { @@ -195,7 +193,8 @@ export async function scrapWithScrapingBee( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + return { content, pageStatusCode, pageError }; } else { let text = ""; try { @@ -206,10 +205,12 @@ export async function scrapWithScrapingBee( console.error( `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` ); - logParams.error_message = decodeError.message; + logParams.error_message = decodeError.message || decodeError; } logParams.response_code = response.status; logParams.html = text; + logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; + logParams.error_message = response.statusText != "OK" ? response.statusText : undefined; return { content: text, pageStatusCode: response.status, @@ -219,7 +220,7 @@ export async function scrapWithScrapingBee( } } catch (error) { console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - logParams.error_message = error.message; + logParams.error_message = error.message || error; logParams.response_code = error.response?.status; return { content: "", @@ -304,7 +305,7 @@ export async function scrapWithPlaywright( pageError: data.pageError, }; } catch (jsonError) { - logParams.error_message = jsonError.message; + logParams.error_message = jsonError.message || jsonError; console.error( `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` ); @@ -316,7 +317,7 @@ export async function scrapWithPlaywright( logParams.error_message = "Request timed out"; console.log(`[Playwright] Request timed out for ${url}`); } else { - logParams.error_message = error.message; + logParams.error_message = error.message || error; console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); } return { content: "" }; @@ -374,6 +375,8 @@ export async function scrapWithFetch( const result = { content: text, pageStatusCode: 200 }; logParams.success = true; logParams.html = text; + logParams.response_code = 200; + logParams.error_message = null; return result; } } catch (error) { @@ -381,7 +384,7 @@ export async function scrapWithFetch( logParams.error_message = "Request timed out"; console.log(`[Axios] Request timed out for ${url}`); } else { - logParams.error_message = error.message; + logParams.error_message = error.message || error; console.error(`[Axios] Error fetching url: ${url} -> ${error}`); } return { content: "" }; From 90c54c32fdef78f37169a6a9b82db2baf676d7f8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 18:01:17 -0300 Subject: [PATCH 3/6] Nick: refactor --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/global.ts | 1 + .../src/scraper/WebScraper/scrapers/fetch.ts | 70 ++++ .../scraper/WebScraper/scrapers/fireEngine.ts | 119 ++++++ .../scraper/WebScraper/scrapers/playwright.ts | 98 +++++ 
.../WebScraper/scrapers/scrapingBee.ts | 80 ++++ apps/api/src/scraper/WebScraper/single_url.ts | 380 +----------------- 7 files changed, 391 insertions(+), 359 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/global.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/fetch.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/playwright.ts create mode 100644 apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 831970ea..99fff9e4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,7 +4,7 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; -import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; +import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; diff --git a/apps/api/src/scraper/WebScraper/global.ts b/apps/api/src/scraper/WebScraper/global.ts new file mode 100644 index 00000000..7233fe78 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/global.ts @@ -0,0 +1 @@ +export const universalTimeout = 15000; \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts new file mode 100644 index 00000000..562fa6e7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -0,0 +1,70 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +export async function scrapWithFetch( + url: string, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "fetch", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const response = await axios.get(url, { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout, + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + }); + + if (response.status !== 200) { + console.error( + `[Axios] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.statusText; + logParams.response_code = response.status; + return { + content: "", + pageStatusCode: response.status, + pageError: response.statusText, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + } else { + const text = response.data; + const result = { content: text, pageStatusCode: 200 }; + logParams.success = true; + logParams.html = text; + logParams.response_code = 200; + logParams.error_message = null; + return result; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Axios] Request timed out for ${url}`); + } else { + logParams.error_message = 
error.message || error; + console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + } + return { content: "" }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts new file mode 100644 index 00000000..f6121861 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -0,0 +1,119 @@ +import axios from "axios"; +import { FireEngineResponse } from "../../../lib/entities"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +export async function scrapWithFireEngine({ + url, + waitFor = 0, + screenshot = false, + pageOptions = { parsePDF: true }, + headers, + options, +}: { + url: string; + waitFor?: number; + screenshot?: boolean; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; + headers?: Record; + options?: any; +}): Promise { + const logParams = { + url, + scraper: "fire-engine", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + const waitParam = reqParams["params"]?.wait ?? waitFor; + const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + console.log( + `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` + ); + + const response = await axios.post( + process.env.FIRE_ENGINE_BETA_URL + "/scrape", + { + url: url, + wait: waitParam, + screenshot: screenshotParam, + headers: headers, + pageOptions: pageOptions, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, + } + ); + + if (response.status !== 200) { + console.error( + `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + html: "", + screenshot: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( + url, + pageOptions?.parsePDF + ); + logParams.success = true; + // We shouldnt care about the pdf logging here I believe + return { html: content, screenshot: "", pageStatusCode, pageError }; + } else { + const data = response.data; + logParams.success = + (data.pageStatusCode >= 200 && data.pageStatusCode < 300) || + data.pageStatusCode === 404; + logParams.html = data.content ?? ""; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + html: data.content ?? "", + screenshot: data.screenshot ?? 
"", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + console.log(`[Fire-Engine] Request timed out for ${url}`); + logParams.error_message = "Request timed out"; + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + } + return { html: "", screenshot: "" }; + } finally { + const endTime = Date.now(); + const time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape({ + url: logParams.url, + scraper: logParams.scraper, + success: logParams.success, + response_code: logParams.response_code, + time_taken_seconds, + error_message: logParams.error_message, + html: logParams.html, + }); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts new file mode 100644 index 00000000..fd1aef53 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -0,0 +1,98 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +export async function scrapWithPlaywright( + url: string, + waitFor: number = 0, + headers?: Record, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "playwright", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + // If the user has passed a wait parameter in the request, use that + const waitParam = reqParams["params"]?.wait ?? waitFor; + + const response = await axios.post( + process.env.PLAYWRIGHT_MICROSERVICE_URL, + { + url: url, + wait_after_load: waitParam, + headers: headers, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + } + ); + + if (response.status !== 200) { + console.error( + `[Playwright] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + content: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + } else { + const textData = response.data; + try { + const data = JSON.parse(textData); + const html = data.content; + logParams.success = true; + logParams.html = html; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + content: html ?? 
"", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } catch (jsonError) { + logParams.error_message = jsonError.message || jsonError; + console.error( + `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + ); + return { content: "" }; + } + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Playwright] Request timed out for ${url}`); + } else { + logParams.error_message = error.message || error; + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } + return { content: "" }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts new file mode 100644 index 00000000..5ab0e061 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -0,0 +1,80 @@ +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; +import { ScrapingBeeClient } from "scrapingbee"; + + +export async function scrapWithScrapingBee( + url: string, + wait_browser: string = "domcontentloaded", + timeout: number = universalTimeout, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } + ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + try { + const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); + const clientParams = await generateRequestParams( + url, + wait_browser, + timeout + ); + const response = await client.get({ + ...clientParams, + params: { + ...clientParams.params, + transparent_status_code: "True", + }, + }); + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + return { content, pageStatusCode, pageError }; + } else { + let text = ""; + try { + const decoder = new TextDecoder(); + text = decoder.decode(response.data); + logParams.success = true; + } catch (decodeError) { + console.error( + `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + ); + logParams.error_message = decodeError.message || decodeError; + } + logParams.response_code = response.status; + logParams.html = text; + logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; + logParams.error_message = response.statusText != "OK" ? response.statusText : undefined; + return { + content: text, + pageStatusCode: response.status, + pageError: + response.statusText != "OK" ? 
response.statusText : undefined, + }; + } + } catch (error) { + console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + logParams.response_code = error.response?.status; + return { + content: "", + pageStatusCode: error.response?.status, + pageError: error.response?.statusText, + }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } + } \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c7a74552..cc162456 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -1,14 +1,21 @@ import * as cheerio from "cheerio"; -import { ScrapingBeeClient } from "scrapingbee"; import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; -import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities"; +import { + Document, + PageOptions, + FireEngineResponse, + ExtractorOptions, +} from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; import { removeUnwantedElements } from "./utils/removeUnwantedElements"; -import axios from "axios"; +import { scrapWithFetch } from "./scrapers/fetch"; +import { scrapWithFireEngine } from "./scrapers/fireEngine"; +import { scrapWithPlaywright } from "./scrapers/playwright"; +import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; dotenv.config(); @@ -20,8 +27,6 @@ const baseScrapers = [ "fetch", ] as const; -const universalTimeout = 15000; - export async function generateRequestParams( url: string, wait_browser: string = "domcontentloaded", @@ -45,355 +50,6 @@ export async function generateRequestParams( return defaultParams; } } -import { logScrape } from "../../services/logging/scrape_log"; - -export async function scrapWithFireEngine({ - url, - waitFor = 0, - screenshot = false, - pageOptions = { parsePDF: true }, - headers, - options, -}: { - url: string; - waitFor?: number; - screenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; - headers?: Record; - options?: any; -}): Promise { - - const logParams = { - url, - scraper: "fire-engine", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - - - try { - const reqParams = await generateRequestParams(url); - const waitParam = reqParams["params"]?.wait ?? waitFor; - const screenshotParam = reqParams["params"]?.screenshot ?? 
screenshot; - console.log( - `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` - ); - - const response = await axios.post( - process.env.FIRE_ENGINE_BETA_URL + "/scrape", - { - url: url, - wait: waitParam, - screenshot: screenshotParam, - headers: headers, - pageOptions: pageOptions, - }, - { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam, - } - ); - - if (response.status !== 200) { - console.error( - `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.data?.pageError; - logParams.response_code = response.data?.pageStatusCode; - return { - html: "", - screenshot: "", - pageStatusCode: response.data?.pageStatusCode, - pageError: response.data?.pageError, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( - url, - pageOptions?.parsePDF - ); - logParams.success = true; - // We shouldnt care about the pdf logging here I believe - return { html: content, screenshot: "", pageStatusCode, pageError }; - } else { - const data = response.data; - logParams.success = data.pageStatusCode >= 200 && data.pageStatusCode < 300 || data.pageStatusCode === 404; - logParams.html = data.content ?? ""; - logParams.response_code = data.pageStatusCode; - logParams.error_message = data.pageError; - return { - html: data.content ?? "", - screenshot: data.screenshot ?? "", - pageStatusCode: data.pageStatusCode, - pageError: data.pageError, - }; - } - } catch (error) { - if (error.code === "ECONNABORTED") { - console.log(`[Fire-Engine] Request timed out for ${url}`); - logParams.error_message = "Request timed out"; - } else { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); - logParams.error_message = error.message || error; - } - return { html: "", screenshot: "" }; - } finally { - const endTime = Date.now(); - const time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape({ - url: logParams.url, - scraper: logParams.scraper, - success: logParams.success, - response_code: logParams.response_code, - time_taken_seconds, - error_message: logParams.error_message, - html: logParams.html, - }); - } -} - -export async function scrapWithScrapingBee( - url: string, - wait_browser: string = "domcontentloaded", - timeout: number = universalTimeout, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: wait_browser === "networkidle2" ? 
"scrapingBeeLoad" : "scrapingBee", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - try { - const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); - const clientParams = await generateRequestParams( - url, - wait_browser, - timeout - ); - const response = await client.get({ - ...clientParams, - params: { - ...clientParams.params, - transparent_status_code: "True", - }, - }); - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); - return { content, pageStatusCode, pageError }; - } else { - let text = ""; - try { - const decoder = new TextDecoder(); - text = decoder.decode(response.data); - logParams.success = true; - } catch (decodeError) { - console.error( - `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` - ); - logParams.error_message = decodeError.message || decodeError; - } - logParams.response_code = response.status; - logParams.html = text; - logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; - logParams.error_message = response.statusText != "OK" ? response.statusText : undefined; - return { - content: text, - pageStatusCode: response.status, - pageError: - response.statusText != "OK" ? response.statusText : undefined, - }; - } - } catch (error) { - console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - logParams.error_message = error.message || error; - logParams.response_code = error.response?.status; - return { - content: "", - pageStatusCode: error.response?.status, - pageError: error.response?.statusText, - }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} - -export async function scrapWithPlaywright( - url: string, - waitFor: number = 0, - headers?: Record, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: "playwright", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - - - try { - const reqParams = await generateRequestParams(url); - // If the user has passed a wait parameter in the request, use that - const waitParam = reqParams["params"]?.wait ?? 
waitFor; - - const response = await axios.post( - process.env.PLAYWRIGHT_MICROSERVICE_URL, - { - url: url, - wait_after_load: waitParam, - headers: headers, - }, - { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time - transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically - } - ); - - if (response.status !== 200) { - console.error( - `[Playwright] Error fetching url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.data?.pageError; - logParams.response_code = response.data?.pageStatusCode; - return { - content: "", - pageStatusCode: response.data?.pageStatusCode, - pageError: response.data?.pageError, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - } else { - const textData = response.data; - try { - const data = JSON.parse(textData); - const html = data.content; - logParams.success = true; - logParams.html = html; - logParams.response_code = data.pageStatusCode; - logParams.error_message = data.pageError; - return { - content: html ?? "", - pageStatusCode: data.pageStatusCode, - pageError: data.pageError, - }; - } catch (jsonError) { - logParams.error_message = jsonError.message || jsonError; - console.error( - `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` - ); - return { content: "" }; - } - } - } catch (error) { - if (error.code === "ECONNABORTED") { - logParams.error_message = "Request timed out"; - console.log(`[Playwright] Request timed out for ${url}`); - } else { - logParams.error_message = error.message || error; - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); - } - return { content: "" }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} - -export async function scrapWithFetch( - url: string, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: "fetch", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: "", - html: "", - startTime: Date.now(), - }; - - - try { - const response = await axios.get(url, { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout, - transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically - }); - - if (response.status !== 200) { - console.error( - `[Axios] Error fetching url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.statusText; - logParams.response_code = response.status; - return { - content: "", - pageStatusCode: response.status, - pageError: response.statusText, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - } else { - const text = response.data; - const result = { content: text, pageStatusCode: 200 }; - logParams.success = true; - logParams.html = text; - logParams.response_code = 200; - logParams.error_message = null; - return result; - } - } catch (error) { - if (error.code === "ECONNABORTED") { - logParams.error_message = 
"Request timed out"; - console.log(`[Axios] Request timed out for ${url}`); - } else { - logParams.error_message = error.message || error; - console.error(`[Axios] Error fetching url: ${url} -> ${error}`); - } - return { content: "" }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} /** * Get the order of scrapers to be used for scraping a URL @@ -464,7 +120,7 @@ export async function scrapSingleUrl( headers: undefined, }, extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown" + mode: "llm-extraction-from-markdown", }, existingHtml: string = "" ): Promise { @@ -628,7 +284,7 @@ export async function scrapSingleUrl( html = attempt.html ?? ""; rawHtml = attempt.rawHtml ?? ""; screenshot = attempt.screenshot ?? ""; - + if (attempt.pageStatusCode) { pageStatusCode = attempt.pageStatusCode; } @@ -659,7 +315,11 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, - rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, + rawHtml: + pageOptions.includeRawHtml || + extractorOptions.mode === "llm-extraction-from-raw-html" + ? rawHtml + : undefined, metadata: { ...metadata, screenshot: screenshot, @@ -673,7 +333,11 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, - rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, + rawHtml: + pageOptions.includeRawHtml || + extractorOptions.mode === "llm-extraction-from-raw-html" + ? rawHtml + : undefined, metadata: { ...metadata, sourceURL: urlToScrap, From 2d30cc6117b993aa37f12cf29d9563f9f1dc8b0b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 18:01:54 -0300 Subject: [PATCH 4/6] Nick: comments --- apps/api/src/scraper/WebScraper/scrapers/fetch.ts | 7 +++++++ apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 10 ++++++++++ apps/api/src/scraper/WebScraper/scrapers/playwright.ts | 8 ++++++++ .../api/src/scraper/WebScraper/scrapers/scrapingBee.ts | 9 ++++++++- 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts index 562fa6e7..9badfd91 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -3,6 +3,13 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; + +/** + * Scrapes a URL with Axios + * @param url The URL to scrape + * @param pageOptions The options for the page + * @returns The scraped content + */ export async function scrapWithFetch( url: string, pageOptions: { parsePDF?: boolean } = { parsePDF: true } diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index f6121861..ce3cd2da 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -5,6 +5,16 @@ import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +/** + * Scrapes a URL with Fire-Engine + * @param url The URL to scrape + * @param waitFor The time 
to wait for the page to load + * @param screenshot Whether to take a screenshot + * @param pageOptions The options for the page + * @param headers The headers to send with the request + * @param options The options for the request + * @returns The scraped content + */ export async function scrapWithFireEngine({ url, waitFor = 0, diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts index fd1aef53..03a6728d 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -4,6 +4,14 @@ import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +/** + * Scrapes a URL with Playwright + * @param url The URL to scrape + * @param waitFor The time to wait for the page to load + * @param headers The headers to send with the request + * @param pageOptions The options for the page + * @returns The scraped content + */ export async function scrapWithPlaywright( url: string, waitFor: number = 0, diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts index 5ab0e061..63e8a082 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -4,7 +4,14 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; import { ScrapingBeeClient } from "scrapingbee"; - +/** + * Scrapes a URL with ScrapingBee + * @param url The URL to scrape + * @param wait_browser The browser event to wait for + * @param timeout The timeout for the scrape + * @param pageOptions The options for the page + * @returns The scraped content + */ export async function scrapWithScrapingBee( url: string, wait_browser: string = "domcontentloaded", From f5b2fbd7e830829e9a826125c0aeaf6b1eebc881 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 18:06:53 -0300 Subject: [PATCH 5/6] Nick: revision --- .../src/scraper/WebScraper/scrapers/fetch.ts | 14 +++++++------- .../scraper/WebScraper/scrapers/fireEngine.ts | 19 +++++++------------ .../scraper/WebScraper/scrapers/playwright.ts | 9 ++++++--- .../WebScraper/scrapers/scrapingBee.ts | 7 ++++--- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts index 9badfd91..4c31438c 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -3,7 +3,6 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; - /** * Scrapes a URL with Axios * @param url The URL to scrape @@ -50,15 +49,16 @@ export async function scrapWithFetch( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; } else { const text = response.data; - const result = { content: text, pageStatusCode: 200 }; logParams.success = true; 
logParams.html = text; - logParams.response_code = 200; - logParams.error_message = null; - return result; + logParams.response_code = response.status; + return { content: text, pageStatusCode: response.status, pageError: null }; } } catch (error) { if (error.code === "ECONNABORTED") { @@ -68,7 +68,7 @@ export async function scrapWithFetch( logParams.error_message = error.message || error; console.error(`[Axios] Error fetching url: ${url} -> ${error}`); } - return { content: "" }; + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { const endTime = Date.now(); logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index ce3cd2da..50388dea 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -87,7 +87,8 @@ export async function scrapWithFireEngine({ pageOptions?.parsePDF ); logParams.success = true; - // We shouldnt care about the pdf logging here I believe + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; return { html: content, screenshot: "", pageStatusCode, pageError }; } else { const data = response.data; @@ -112,18 +113,12 @@ export async function scrapWithFireEngine({ console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); logParams.error_message = error.message || error; } - return { html: "", screenshot: "" }; + return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { const endTime = Date.now(); - const time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape({ - url: logParams.url, - scraper: logParams.scraper, - success: logParams.success, - response_code: logParams.response_code, - time_taken_seconds, - error_message: logParams.error_message, - html: logParams.html, - }); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); } } + + diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts index 03a6728d..11c3c5ad 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -66,7 +66,10 @@ export async function scrapWithPlaywright( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { logParams.success = true; - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; } else { const textData = response.data; try { @@ -86,7 +89,7 @@ export async function scrapWithPlaywright( console.error( `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` ); - return { content: "" }; + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } } } catch (error) { @@ -97,7 +100,7 @@ export async function scrapWithPlaywright( logParams.error_message = error.message || error; console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); } - return { content: "" }; + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { const endTime = Date.now(); 
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts index 63e8a082..9a1f0b35 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -46,6 +46,8 @@ export async function scrapWithScrapingBee( if (contentType && contentType.includes("application/pdf")) { logParams.success = true; const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; return { content, pageStatusCode, pageError }; } else { let text = ""; @@ -62,12 +64,11 @@ export async function scrapWithScrapingBee( logParams.response_code = response.status; logParams.html = text; logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; - logParams.error_message = response.statusText != "OK" ? response.statusText : undefined; + logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined; return { content: text, pageStatusCode: response.status, - pageError: - response.statusText != "OK" ? response.statusText : undefined, + pageError: response.statusText !== "OK" ? response.statusText : undefined, }; } } catch (error) { From 066d92f643a69a7fe19c26b63a1dd030a3c13c9e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 3 Jul 2024 18:38:17 -0300 Subject: [PATCH 6/6] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index cc162456..d24e5c2e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -290,7 +290,7 @@ export async function scrapSingleUrl( } if (attempt.pageError && attempt.pageStatusCode >= 400) { pageError = attempt.pageError; - } else if (attempt.pageStatusCode < 400) { + } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) { pageError = undefined; }
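
A possible consumer of the new scrape_logs data, sketched as an illustration under stated assumptions rather than as part of the patches above: given the scrape_logs table that logScrape() inserts into and the existing supabase_service client, per-scraper success rates could be aggregated along these lines. The helper name, its location (and therefore the "../supabase" import path), and any column not shown being inserted by logScrape() are assumptions.

import { supabase_service } from "../supabase";

// Sketch: count successes vs. total attempts per scraper since a given ISO date.
// Assumes the columns written by logScrape(): scraper, success, date_added.
async function scraperSuccessRates(sinceIso: string) {
  const { data, error } = await supabase_service
    .from("scrape_logs")
    .select("scraper, success")
    .gte("date_added", sinceIso);

  if (error) {
    console.error("Error reading scrape_logs:\n", error);
    return {};
  }

  const totals: Record<string, { ok: number; total: number }> = {};
  for (const row of data ?? []) {
    const entry = totals[row.scraper] ?? { ok: 0, total: 0 };
    entry.total += 1;
    if (row.success) entry.ok += 1;
    totals[row.scraper] = entry;
  }
  return totals;
}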