Nick: comments

Nicolas 2024-07-03 18:01:54 -03:00
parent 90c54c32fd
commit 2d30cc6117
4 changed files with 33 additions and 1 deletion

@@ -3,6 +3,13 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
/**
* Scrapes a URL with Axios
* @param url The URL to scrape
* @param pageOptions The options for the page
* @returns The scraped content
*/
export async function scrapWithFetch(
url: string,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }

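For reference, a minimal call-site sketch for scrapWithFetch based only on the documented parameters; the diff truncates the signature, so the return type and any remaining parameters are assumptions:

    // Hypothetical usage (inside an async function); assumed to resolve to the scraped content.
    const content = await scrapWithFetch("https://example.com", { parsePDF: true });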

@@ -5,6 +5,16 @@ import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
/**
* Scrapes a URL with Fire-Engine
* @param url The URL to scrape
* @param waitFor The time to wait for the page to load
* @param screenshot Whether to take a screenshot
* @param pageOptions The options for the page
* @param headers The headers to send with the request
* @param options The options for the request
* @returns The scraped content
*/
export async function scrapWithFireEngine({
url,
waitFor = 0,

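A hedged usage sketch for scrapWithFireEngine, built only from the @param list above; the shapes of pageOptions and options are not shown in this diff, so the values below are illustrative assumptions:

    // Hypothetical usage (inside an async function); values are illustrative.
    const content = await scrapWithFireEngine({
      url: "https://example.com",
      waitFor: 2000,                            // ms to wait for the page to load
      screenshot: false,                        // whether to take a screenshot
      headers: { "User-Agent": "firecrawl" },   // headers sent with the request
      pageOptions: {},                          // exact shape not shown in this diff
    });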

@@ -4,6 +4,14 @@ import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
/**
* Scrapes a URL with Playwright
* @param url The URL to scrape
* @param waitFor The time to wait for the page to load
* @param headers The headers to send with the request
* @param pageOptions The options for the page
* @returns The scraped content
*/
export async function scrapWithPlaywright(
url: string,
waitFor: number = 0,

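A hedged call-site sketch for scrapWithPlaywright, assuming the positional parameters follow the JSDoc order (url, waitFor, headers, pageOptions); the signature is truncated in this diff:

    // Hypothetical usage (inside an async function); values are illustrative.
    const content = await scrapWithPlaywright(
      "https://example.com",
      2000,                            // waitFor: ms to wait for the page to load
      { "User-Agent": "firecrawl" },   // headers sent with the request
      {},                              // pageOptions; exact shape not shown here
    );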

@@ -4,7 +4,14 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { ScrapingBeeClient } from "scrapingbee";
/**
* Scrapes a URL with ScrapingBee
* @param url The URL to scrape
* @param wait_browser The browser event to wait for
* @param timeout The timeout for the scrape
* @param pageOptions The options for the page
* @returns The scraped content
*/
export async function scrapWithScrapingBee(
url: string,
wait_browser: string = "domcontentloaded",
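
A hedged call-site sketch for scrapWithScrapingBee, assuming the positional parameters follow the JSDoc order (url, wait_browser, timeout, pageOptions); the timeout unit and pageOptions shape are assumptions, since the signature is truncated here:

    // Hypothetical usage (inside an async function); values are illustrative.
    const content = await scrapWithScrapingBee(
      "https://example.com",
      "domcontentloaded",   // wait_browser: browser event to wait for
      15000,                // timeout (assumed to be in ms)
      {},                   // pageOptions; exact shape not shown here
    );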