diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 831970ea..99fff9e4 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -4,7 +4,7 @@ import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
 import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
-import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
+import { scrapSingleUrl } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
diff --git a/apps/api/src/scraper/WebScraper/global.ts b/apps/api/src/scraper/WebScraper/global.ts
new file mode 100644
index 00000000..7233fe78
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/global.ts
@@ -0,0 +1 @@
+export const universalTimeout = 15000;
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
new file mode 100644
index 00000000..4c31438c
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
@@ -0,0 +1,77 @@
+import axios from "axios";
+import { logScrape } from "../../../services/logging/scrape_log";
+import { fetchAndProcessPdf } from "../utils/pdfProcessor";
+import { universalTimeout } from "../global";
+
+/**
+ * Scrapes a URL with Axios
+ * @param url The URL to scrape
+ * @param pageOptions The options for the page
+ * @returns The scraped content
+ */
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
+  const logParams = {
+    url,
+    scraper: "fetch",
+    success: false,
+    response_code: null,
+    time_taken_seconds: null,
+    error_message: null,
+    html: "",
+    startTime: Date.now(),
+  };
+
+  try {
+    const response = await axios.get(url, {
+      headers: {
+        "Content-Type": "application/json",
+      },
+      timeout: universalTimeout,
+      transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
+    });
+
+    if (response.status !== 200) {
+      console.error(
+        `[Axios] Error fetching url: ${url} with status: ${response.status}`
+      );
+      logParams.error_message = response.statusText;
+      logParams.response_code = response.status;
+      return {
+        content: "",
+        pageStatusCode: response.status,
+        pageError: response.statusText,
+      };
+    }
+
+    const contentType = response.headers["content-type"];
+    if (contentType && contentType.includes("application/pdf")) {
+      logParams.success = true;
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
+      return { content, pageStatusCode, pageError };
+    } else {
+      const text = response.data;
+      logParams.success = true;
+      logParams.html = text;
+      logParams.response_code = response.status;
+      return { content: text, pageStatusCode: response.status, pageError: null };
+    }
+  } catch (error) {
+    if (error.code === "ECONNABORTED") {
+      logParams.error_message = "Request timed out";
+      console.log(`[Axios] Request timed out for ${url}`);
+    } else {
+      logParams.error_message = error.message || error;
+      console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
+    }
+    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
+  } finally {
+    const endTime = Date.now();
+    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
+    await logScrape(logParams);
+  }
+}
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
new file mode 100644
index 00000000..50388dea
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -0,0 +1,124 @@
+import axios from "axios";
+import { FireEngineResponse } from "../../../lib/entities";
+import { logScrape } from "../../../services/logging/scrape_log";
+import { generateRequestParams } from "../single_url";
+import { fetchAndProcessPdf } from "../utils/pdfProcessor";
+import { universalTimeout } from "../global";
+
+/**
+ * Scrapes a URL with Fire-Engine
+ * @param url The URL to scrape
+ * @param waitFor The time to wait for the page to load
+ * @param screenshot Whether to take a screenshot
+ * @param pageOptions The options for the page
+ * @param headers The headers to send with the request
+ * @param options The options for the request
+ * @returns The scraped content
+ */
+export async function scrapWithFireEngine({
+  url,
+  waitFor = 0,
+  screenshot = false,
+  pageOptions = { parsePDF: true },
+  headers,
+  options,
+}: {
+  url: string;
+  waitFor?: number;
+  screenshot?: boolean;
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
+  headers?: Record<string, string>;
+  options?: any;
+}): Promise<FireEngineResponse> {
+  const logParams = {
+    url,
+    scraper: "fire-engine",
+    success: false,
+    response_code: null,
+    time_taken_seconds: null,
+    error_message: null,
+    html: "",
+    startTime: Date.now(),
+  };
+
+  try {
+    const reqParams = await generateRequestParams(url);
+    const waitParam = reqParams["params"]?.wait ?? waitFor;
+    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
+    console.log(
+      `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
+    );
+
+    const response = await axios.post(
+      process.env.FIRE_ENGINE_BETA_URL + "/scrape",
+      {
+        url: url,
+        wait: waitParam,
+        screenshot: screenshotParam,
+        headers: headers,
+        pageOptions: pageOptions,
+      },
+      {
+        headers: {
+          "Content-Type": "application/json",
+        },
+        timeout: universalTimeout + waitParam,
+      }
+    );
+
+    if (response.status !== 200) {
+      console.error(
+        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
+      );
+      logParams.error_message = response.data?.pageError;
+      logParams.response_code = response.data?.pageStatusCode;
+      return {
+        html: "",
+        screenshot: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
+    }
+
+    const contentType = response.headers["content-type"];
+    if (contentType && contentType.includes("application/pdf")) {
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+        url,
+        pageOptions?.parsePDF
+      );
+      logParams.success = true;
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
+      return { html: content, screenshot: "", pageStatusCode, pageError };
+    } else {
+      const data = response.data;
+      logParams.success =
+        (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
+        data.pageStatusCode === 404;
+      logParams.html = data.content ?? "";
+      logParams.response_code = data.pageStatusCode;
+      logParams.error_message = data.pageError;
+      return {
+        html: data.content ?? "",
+        screenshot: data.screenshot ?? "",
"", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + console.log(`[Fire-Engine] Request timed out for ${url}`); + logParams.error_message = "Request timed out"; + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + } + return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} + + diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts new file mode 100644 index 00000000..11c3c5ad --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -0,0 +1,109 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +/** + * Scrapes a URL with Playwright + * @param url The URL to scrape + * @param waitFor The time to wait for the page to load + * @param headers The headers to send with the request + * @param pageOptions The options for the page + * @returns The scraped content + */ +export async function scrapWithPlaywright( + url: string, + waitFor: number = 0, + headers?: Record, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "playwright", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + // If the user has passed a wait parameter in the request, use that + const waitParam = reqParams["params"]?.wait ?? waitFor; + + const response = await axios.post( + process.env.PLAYWRIGHT_MICROSERVICE_URL, + { + url: url, + wait_after_load: waitParam, + headers: headers, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + } + ); + + if (response.status !== 200) { + console.error( + `[Playwright] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + content: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; + } else { + const textData = response.data; + try { + const data = JSON.parse(textData); + const html = data.content; + logParams.success = true; + logParams.html = html; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + content: html ?? 
"", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } catch (jsonError) { + logParams.error_message = jsonError.message || jsonError; + console.error( + `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + ); + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; + } + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Playwright] Request timed out for ${url}`); + } else { + logParams.error_message = error.message || error; + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts new file mode 100644 index 00000000..9a1f0b35 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -0,0 +1,88 @@ +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; +import { ScrapingBeeClient } from "scrapingbee"; + +/** + * Scrapes a URL with ScrapingBee + * @param url The URL to scrape + * @param wait_browser The browser event to wait for + * @param timeout The timeout for the scrape + * @param pageOptions The options for the page + * @returns The scraped content + */ +export async function scrapWithScrapingBee( + url: string, + wait_browser: string = "domcontentloaded", + timeout: number = universalTimeout, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } + ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + try { + const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); + const clientParams = await generateRequestParams( + url, + wait_browser, + timeout + ); + const response = await client.get({ + ...clientParams, + params: { + ...clientParams.params, + transparent_status_code: "True", + }, + }); + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; + } else { + let text = ""; + try { + const decoder = new TextDecoder(); + text = decoder.decode(response.data); + logParams.success = true; + } catch (decodeError) { + console.error( + `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + ); + logParams.error_message = decodeError.message || decodeError; + } + logParams.response_code = response.status; + logParams.html = text; + logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; + logParams.error_message = response.statusText !== "OK" ? 
+      return {
+        content: text,
+        pageStatusCode: response.status,
+        pageError: response.statusText !== "OK" ? response.statusText : undefined,
+      };
+    }
+  } catch (error) {
+    console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
+    logParams.error_message = error.message || error;
+    logParams.response_code = error.response?.status;
+    return {
+      content: "",
+      pageStatusCode: error.response?.status,
+      pageError: error.response?.statusText,
+    };
+  } finally {
+    const endTime = Date.now();
+    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
+    await logScrape(logParams);
+  }
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index b7d6fc12..d24e5c2e 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -1,14 +1,21 @@
 import * as cheerio from "cheerio";
-import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
+import {
+  Document,
+  PageOptions,
+  FireEngineResponse,
+  ExtractorOptions,
+} from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { urlSpecificParams } from "./utils/custom/website_params";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { handleCustomScraping } from "./custom/handleCustomScraping";
 import { removeUnwantedElements } from "./utils/removeUnwantedElements";
-import axios from "axios";
+import { scrapWithFetch } from "./scrapers/fetch";
+import { scrapWithFireEngine } from "./scrapers/fireEngine";
+import { scrapWithPlaywright } from "./scrapers/playwright";
+import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
 
 dotenv.config();
 
@@ -20,8 +27,6 @@ const baseScrapers = [
   "fetch",
 ] as const;
 
-const universalTimeout = 15000;
-
 export async function generateRequestParams(
   url: string,
   wait_browser: string = "domcontentloaded",
@@ -45,245 +50,6 @@
     return defaultParams;
   }
 }
-export async function scrapWithFireEngine({
-  url,
-  waitFor = 0,
-  screenshot = false,
-  pageOptions = { parsePDF: true },
-  headers,
-  options,
-}: {
-  url: string;
-  waitFor?: number;
-  screenshot?: boolean;
-  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
-  headers?: Record<string, string>;
-  options?: any;
-}): Promise<FireEngineResponse> {
-  try {
-    const reqParams = await generateRequestParams(url);
-    // If the user has passed a wait parameter in the request, use that
-    const waitParam = reqParams["params"]?.wait ?? waitFor;
-    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
-    console.log(
-      `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
-    );
-
-    const response = await axios.post(
-      process.env.FIRE_ENGINE_BETA_URL + "/scrape",
-      {
-        url: url,
-        wait: waitParam,
-        screenshot: screenshotParam,
-        headers: headers,
-        pageOptions: pageOptions,
-      },
-      {
-        headers: {
-          "Content-Type": "application/json",
-        },
-        timeout: universalTimeout + waitParam,
-      }
-    );
-
-    if (response.status !== 200) {
-      console.error(
-        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
-      );
-      return {
-        html: "",
-        screenshot: "",
-        pageStatusCode: response.data?.pageStatusCode,
-        pageError: response.data?.pageError,
-      };
-    }
-
-    const contentType = response.headers["content-type"];
-    if (contentType && contentType.includes("application/pdf")) {
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
-        url,
-        pageOptions?.parsePDF
-      );
-      return { html: content, screenshot: "", pageStatusCode, pageError };
-    } else {
-      const data = response.data;
-      const html = data.content;
-      const screenshot = data.screenshot;
-      return {
-        html: html ?? "",
-        screenshot: screenshot ?? "",
-        pageStatusCode: data.pageStatusCode,
-        pageError: data.pageError,
-      };
-    }
-  } catch (error) {
-    if (error.code === "ECONNABORTED") {
-      console.log(`[Fire-Engine] Request timed out for ${url}`);
-    } else {
-      console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
-    }
-    return { html: "", screenshot: "" };
-  }
-}
-
-export async function scrapWithScrapingBee(
-  url: string,
-  wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout,
-  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
-  try {
-    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-    const clientParams = await generateRequestParams(
-      url,
-      wait_browser,
-      timeout
-    );
-    const response = await client.get({
-      ...clientParams,
-      params: {
-        ...clientParams.params,
-        transparent_status_code: "True",
-      },
-    });
-    const contentType = response.headers["content-type"];
-    if (contentType && contentType.includes("application/pdf")) {
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
-    } else {
-      let text = "";
-      try {
-        const decoder = new TextDecoder();
-        text = decoder.decode(response.data);
-      } catch (decodeError) {
-        console.error(
-          `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
-        );
-      }
-      return {
-        content: text,
-        pageStatusCode: response.status,
-        pageError:
-          response.statusText != "OK" ? response.statusText : undefined,
-      };
-    }
-  } catch (error) {
-    console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-    return {
-      content: "",
-      pageStatusCode: error.response.status,
-      pageError: error.response.statusText,
-    };
-  }
-}
-
-export async function scrapWithPlaywright(
-  url: string,
-  waitFor: number = 0,
-  headers?: Record<string, string>,
-  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
-  try {
-    const reqParams = await generateRequestParams(url);
-    // If the user has passed a wait parameter in the request, use that
-    const waitParam = reqParams["params"]?.wait ?? waitFor;
-
-    const response = await axios.post(
-      process.env.PLAYWRIGHT_MICROSERVICE_URL,
-      {
-        url: url,
-        wait_after_load: waitParam,
-        headers: headers,
-      },
-      {
-        headers: {
-          "Content-Type": "application/json",
-        },
-        timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
-        transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
-      }
-    );
-
-    if (response.status !== 200) {
-      console.error(
-        `[Playwright] Error fetching url: ${url} with status: ${response.status}`
-      );
-      return {
-        content: "",
-        pageStatusCode: response.data?.pageStatusCode,
-        pageError: response.data?.pageError,
-      };
-    }
-
-    const contentType = response.headers["content-type"];
-    if (contentType && contentType.includes("application/pdf")) {
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
-    } else {
-      const textData = response.data;
-      try {
-        const data = JSON.parse(textData);
-        const html = data.content;
-        return {
-          content: html ?? "",
-          pageStatusCode: data.pageStatusCode,
-          pageError: data.pageError,
-        };
-      } catch (jsonError) {
-        console.error(
-          `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
-        );
-        return { content: "" };
-      }
-    }
-  } catch (error) {
-    if (error.code === "ECONNABORTED") {
-      console.log(`[Playwright] Request timed out for ${url}`);
-    } else {
-      console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
-    }
-    return { content: "" };
-  }
-}
-
-export async function scrapWithFetch(
-  url: string,
-  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
-  try {
-    const response = await axios.get(url, {
-      headers: {
-        "Content-Type": "application/json",
-      },
-      timeout: universalTimeout,
-      transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
-    });
-
-    if (response.status !== 200) {
-      console.error(
-        `[Axios] Error fetching url: ${url} with status: ${response.status}`
-      );
-      return {
-        content: "",
-        pageStatusCode: response.status,
-        pageError: response.statusText,
-      };
-    }
-
-    const contentType = response.headers["content-type"];
-    if (contentType && contentType.includes("application/pdf")) {
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
-    } else {
-      const text = response.data;
-      return { content: text, pageStatusCode: 200 };
-    }
-  } catch (error) {
-    if (error.code === "ECONNABORTED") {
-      console.log(`[Axios] Request timed out for ${url}`);
-    } else {
-      console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
-    }
-    return { content: "" };
-  }
-}
 
 /**
  * Get the order of scrapers to be used for scraping a URL
@@ -354,7 +120,7 @@ export async function scrapSingleUrl(
     headers: undefined,
   },
   extractorOptions: ExtractorOptions = {
-    mode: "llm-extraction-from-markdown"
+    mode: "llm-extraction-from-markdown",
   },
   existingHtml: string = ""
 ): Promise<Document> {
@@ -518,13 +284,13 @@
       html = attempt.html ?? "";
       rawHtml = attempt.rawHtml ?? "";
       screenshot = attempt.screenshot ?? "";
-      
+
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
       }
       if (attempt.pageError && attempt.pageStatusCode >= 400) {
         pageError = attempt.pageError;
-      } else if (attempt.pageStatusCode < 400) {
+      } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
         pageError = undefined;
       }
 
@@ -549,7 +315,11 @@
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
-      rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
+      rawHtml:
+        pageOptions.includeRawHtml ||
+        extractorOptions.mode === "llm-extraction-from-raw-html"
+          ? rawHtml
+          : undefined,
       metadata: {
         ...metadata,
         screenshot: screenshot,
@@ -563,7 +333,11 @@
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
-      rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
+      rawHtml:
+        pageOptions.includeRawHtml ||
+        extractorOptions.mode === "llm-extraction-from-raw-html"
+          ? rawHtml
+          : undefined,
       metadata: {
         ...metadata,
         sourceURL: urlToScrap,
diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts
new file mode 100644
index 00000000..bb568242
--- /dev/null
+++ b/apps/api/src/services/logging/scrape_log.ts
@@ -0,0 +1,37 @@
+import "dotenv/config";
+import { ScrapeLog } from "../../types";
+import { supabase_service } from "../supabase";
+
+export async function logScrape(scrapeLog: ScrapeLog) {
+  try {
+    // Only log jobs in production
+    // if (process.env.ENV !== "production") {
+    //   return;
+    // }
+
+    const { data, error } = await supabase_service
+      .from("scrape_logs")
+      .insert([
+        {
+          url: scrapeLog.url,
+          scraper: scrapeLog.scraper,
+          success: scrapeLog.success,
+          response_code: scrapeLog.response_code,
+          time_taken_seconds: scrapeLog.time_taken_seconds,
+          proxy: scrapeLog.proxy,
+          retried: scrapeLog.retried,
+          error_message: scrapeLog.error_message,
+          date_added: new Date().toISOString(),
+          html: scrapeLog.html,
+          ipv4_support: scrapeLog.ipv4_support,
+          ipv6_support: scrapeLog.ipv6_support,
+        },
+      ]);
+
+    if (error) {
+      console.error("Error logging proxy:\n", error);
+    }
+  } catch (error) {
+    console.error("Error logging proxy:\n", error);
+  }
+}
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index e69353b6..7c3aacad 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -113,4 +113,19 @@ export enum NotificationType {
   APPROACHING_LIMIT = "approachingLimit",
   LIMIT_REACHED = "limitReached",
   RATE_LIMIT_REACHED = "rateLimitReached",
-}
\ No newline at end of file
+}
+
+export type ScrapeLog = {
+  url: string;
+  scraper: string;
+  success?: boolean;
+  response_code?: number;
+  time_taken_seconds?: number;
+  proxy?: string;
+  retried?: boolean;
+  error_message?: string;
+  date_added?: string; // ISO 8601 format
+  html?: string;
+  ipv4_support?: boolean | null;
+  ipv6_support?: boolean | null;
+};
\ No newline at end of file