This commit is contained in:
Nicolas 2024-07-12 22:02:08 -04:00
parent bfc7f5882e
commit e098e88ea7
4 changed files with 13 additions and 3 deletions

View File

@ -129,3 +129,11 @@ export interface FireEngineResponse {
pageError?: string; pageError?: string;
} }
/**
 * Optional engine-level settings for a fire-engine scrape.
 *
 * `scrapWithFireEngine` accepts a value of this shape as its
 * `fireEngineOptions` argument and spreads every provided key into the
 * object it sends with the request, next to `screenshot`, `headers` and
 * `pageOptions`. Keys that are left out are simply not sent.
 *
 * NOTE(review): the meaning of each flag is defined by the fire-engine
 * service, not by this codebase — confirm against its API before relying
 * on the per-field notes below.
 */
export interface FireEngineOptions {
  /** Presumably routes the request through a mobile proxy — TODO confirm. */
  mobileProxy?: boolean;

  /** Request method name; the sitemap fetcher passes `"get"`. */
  method?: string;

  /** Name of the engine to use; the sitemap fetcher passes `"request"`. */
  engine?: string;

  /** Presumably asks the engine to skip media resources — TODO confirm. */
  blockMedia?: boolean;

  /** Presumably asks the engine to skip ad resources — TODO confirm. */
  blockAds?: boolean;
}

View File

@ -8,7 +8,6 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils"; import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout"; import { axiosTimeout } from "../../../src/lib/timeout";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export class WebCrawler { export class WebCrawler {
private initialUrl: string; private initialUrl: string;

View File

@ -1,5 +1,5 @@
import axios from "axios"; import axios from "axios";
import { FireEngineResponse } from "../../../lib/entities"; import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log"; import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url"; import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
@ -20,6 +20,7 @@ export async function scrapWithFireEngine({
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
pageOptions = { parsePDF: true }, pageOptions = { parsePDF: true },
fireEngineOptions = {},
headers, headers,
options, options,
}: { }: {
@ -27,6 +28,7 @@ export async function scrapWithFireEngine({
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
}): Promise<FireEngineResponse> { }): Promise<FireEngineResponse> {
@ -57,6 +59,7 @@ export async function scrapWithFireEngine({
screenshot: screenshotParam, screenshot: screenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions, pageOptions: pageOptions,
...fireEngineOptions,
}, },
{ {
headers: { headers: {

View File

@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') { } else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl }); const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
content = response.html; content = response.html;
} }
} catch (error) { } catch (error) {