From 2d30cc6117b993aa37f12cf29d9563f9f1dc8b0b Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 3 Jul 2024 18:01:54 -0300
Subject: [PATCH] Nick: comments

---
 apps/api/src/scraper/WebScraper/scrapers/fetch.ts      |  7 +++++++
 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 10 ++++++++++
 apps/api/src/scraper/WebScraper/scrapers/playwright.ts |  8 ++++++++
 .../api/src/scraper/WebScraper/scrapers/scrapingBee.ts |  9 ++++++++-
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
index 562fa6e7..9badfd91 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
@@ -3,6 +3,13 @@ import { logScrape } from "../../../services/logging/scrape_log";
 import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 
+
+/**
+ * Scrapes a URL with Axios
+ * @param url The URL to scrape
+ * @param pageOptions The options for the page
+ * @returns The scraped content
+ */
 export async function scrapWithFetch(
   url: string,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
index f6121861..ce3cd2da 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -5,6 +5,16 @@ import { generateRequestParams } from "../single_url";
 import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 
+/**
+ * Scrapes a URL with Fire-Engine
+ * @param url The URL to scrape
+ * @param waitFor The time to wait for the page to load
+ * @param screenshot Whether to take a screenshot
+ * @param pageOptions The options for the page
+ * @param headers The headers to send with the request
+ * @param options The options for the request
+ * @returns The scraped content
+ */
 export async function scrapWithFireEngine({
   url,
   waitFor = 0,
diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
index fd1aef53..03a6728d 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
@@ -4,6 +4,14 @@ import { generateRequestParams } from "../single_url";
 import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 
+/**
+ * Scrapes a URL with Playwright
+ * @param url The URL to scrape
+ * @param waitFor The time to wait for the page to load
+ * @param headers The headers to send with the request
+ * @param pageOptions The options for the page
+ * @returns The scraped content
+ */
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
index 5ab0e061..63e8a082 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
@@ -4,7 +4,14 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 import { ScrapingBeeClient } from "scrapingbee";
 
-
+/**
+ * Scrapes a URL with ScrapingBee
+ * @param url The URL to scrape
+ * @param wait_browser The browser event to wait for
+ * @param timeout The timeout for the scrape
+ * @param pageOptions The options for the page
+ * @returns The scraped content
+ */
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",