feat(fire-engine): port waitFor and screenshot to use actions

This commit is contained in:
Gergő Móricz 2024-09-18 20:04:54 +02:00
parent c28e1e2959
commit 42d677fe3c
3 changed files with 38 additions and 21 deletions

View File

@ -10,6 +10,17 @@ export interface Progress {
currentDocument?: Document;
}
/**
 * A single browser step forwarded to fire-engine in the `actions` array of a
 * scrape request. Discriminated on `type`:
 *
 * - `"wait"` carries a duration in `milliseconds`; the scraper adds up all
 *   wait durations to extend how long it polls for the job result.
 * - `"click"` carries a `selector` string (presumably a CSS selector; the
 *   exact syntax is defined by fire-engine — confirm there).
 * - `"screenshot"` optionally sets `fullPage`; callers derive it from the
 *   `fullPageScreenshot` page option.
 */
export type Action =
  | {
      type: "wait";
      milliseconds: number;
    }
  | {
      type: "click";
      selector: string;
    }
  | {
      type: "screenshot";
      fullPage?: boolean;
    };
export type PageOptions = {
includeMarkdown?: boolean;
includeExtract?: boolean;
@ -29,7 +40,8 @@ export type PageOptions = {
includeLinks?: boolean;
useFastMode?: boolean; // beta
disableJsDom?: boolean; // beta
atsv?: boolean; // beta
atsv?: boolean; // anti-bot solver, beta
actions?: Action[]; // beta
};
export type ExtractorOptions = {

View File

@ -1,5 +1,5 @@
import axios from "axios";
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
@ -20,9 +20,7 @@ import * as Sentry from "@sentry/node";
*/
export async function scrapWithFireEngine({
url,
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
actions,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
fireEngineOptions = {},
headers,
@ -31,9 +29,7 @@ export async function scrapWithFireEngine({
teamId,
}: {
url: string;
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
actions?: Action[];
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
@ -54,10 +50,7 @@ export async function scrapWithFireEngine({
try {
const reqParams = await generateRequestParams(url);
let waitParam = reqParams["params"]?.wait ?? waitFor;
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@ -75,7 +68,7 @@ export async function scrapWithFireEngine({
}
Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);
// atsv is only available for beta customers
@ -101,9 +94,6 @@ export async function scrapWithFireEngine({
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{
url: url,
wait: waitParam,
screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority,
@ -112,6 +102,7 @@ export async function scrapWithFireEngine({
...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
actions: actions,
},
{
headers: {
@ -125,8 +116,10 @@ export async function scrapWithFireEngine({
);
});
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => x.milliseconds + a, 0);
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) {
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
}

View File

@ -195,9 +195,17 @@ export async function scrapSingleUrl(
if (process.env.FIRE_ENGINE_BETA_URL) {
const response = await scrapWithFireEngine({
url,
waitFor: pageOptions.waitFor,
screenshot: pageOptions.screenshot,
fullPageScreenshot: pageOptions.fullPageScreenshot,
actions: [
...(pageOptions.waitFor ? [{
type: "wait" as const,
milliseconds: pageOptions.waitFor,
}] : []),
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
type: "screenshot" as const,
fullPage: !!pageOptions.fullPageScreenshot,
}] : []),
...(pageOptions.actions ?? []),
],
pageOptions: pageOptions,
headers: pageOptions.headers,
fireEngineOptions: {
@ -267,8 +275,12 @@ export async function scrapSingleUrl(
case "fire-engine":
customScrapedContent = await scrapWithFireEngine({
url: customScraperResult.url,
waitFor: customScraperResult.waitAfterLoad,
screenshot: false,
actions: customScraperResult.waitAfterLoad ? ([
{
type: "wait",
milliseconds: customScraperResult.waitAfterLoad,
}
]) : ([]),
pageOptions: customScraperResult.pageOptions,
});
if (screenshot) {