mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-06 02:06:05 +08:00
feat(fire-engine): port waitFor and screenshot to use actions
This commit is contained in:
parent
c28e1e2959
commit
42d677fe3c
@ -10,6 +10,17 @@ export interface Progress {
|
||||
currentDocument?: Document;
|
||||
}
|
||||
|
||||
/**
 * A single browser action passed to the scraper via `PageOptions.actions`.
 *
 * Modeled as a discriminated union on the `type` tag, so consumers can narrow
 * with `action.type === "..."` before reading variant-specific fields.
 */
export type Action =
  | {
      /** Pause before continuing with the next action. */
      type: "wait";
      /** Length of the pause, in milliseconds. */
      milliseconds: number;
    }
  | {
      /** Click an element on the page. */
      type: "click";
      /** Selector identifying the element to click (presumably a CSS selector; confirm against the engine). */
      selector: string;
    }
  | {
      /** Capture a screenshot of the page. */
      type: "screenshot";
      /** Whether to capture the full page; optional, and the default when omitted is decided by the consumer. */
      fullPage?: boolean;
    };
|
||||
|
||||
export type PageOptions = {
|
||||
includeMarkdown?: boolean;
|
||||
includeExtract?: boolean;
|
||||
@ -29,7 +40,8 @@ export type PageOptions = {
|
||||
includeLinks?: boolean;
|
||||
useFastMode?: boolean; // beta
|
||||
disableJsDom?: boolean; // beta
|
||||
atsv?: boolean; // beta
|
||||
atsv?: boolean; // anti-bot solver, beta
|
||||
actions?: Action[]; // beta
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
|
@ -1,5 +1,5 @@
|
||||
import axios from "axios";
|
||||
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
@ -20,9 +20,7 @@ import * as Sentry from "@sentry/node";
|
||||
*/
|
||||
export async function scrapWithFireEngine({
|
||||
url,
|
||||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
actions,
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
@ -31,9 +29,7 @@ export async function scrapWithFireEngine({
|
||||
teamId,
|
||||
}: {
|
||||
url: string;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
actions?: Action[];
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
@ -54,10 +50,7 @@ export async function scrapWithFireEngine({
|
||||
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
let waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
|
||||
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
@ -75,7 +68,7 @@ export async function scrapWithFireEngine({
|
||||
}
|
||||
|
||||
Logger.info(
|
||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
);
|
||||
|
||||
// atsv is only available for beta customers
|
||||
@ -101,9 +94,6 @@ export async function scrapWithFireEngine({
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
headers: headers,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
@ -112,6 +102,7 @@ export async function scrapWithFireEngine({
|
||||
...fireEngineOptionsParam,
|
||||
atsv: pageOptions?.atsv ?? false,
|
||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||
actions: actions,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
@ -125,8 +116,10 @@ export async function scrapWithFireEngine({
|
||||
);
|
||||
});
|
||||
|
||||
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => x.milliseconds + a, 0);
|
||||
|
||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
|
||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) {
|
||||
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
}
|
||||
|
@ -195,9 +195,17 @@ export async function scrapSingleUrl(
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
const response = await scrapWithFireEngine({
|
||||
url,
|
||||
waitFor: pageOptions.waitFor,
|
||||
screenshot: pageOptions.screenshot,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||
actions: [
|
||||
...(pageOptions.waitFor ? [{
|
||||
type: "wait" as const,
|
||||
milliseconds: pageOptions.waitFor,
|
||||
}] : []),
|
||||
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
||||
type: "screenshot" as const,
|
||||
fullPage: !!pageOptions.fullPageScreenshot,
|
||||
}] : []),
|
||||
...(pageOptions.actions ?? []),
|
||||
],
|
||||
pageOptions: pageOptions,
|
||||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
@ -267,8 +275,12 @@ export async function scrapSingleUrl(
|
||||
case "fire-engine":
|
||||
customScrapedContent = await scrapWithFireEngine({
|
||||
url: customScraperResult.url,
|
||||
waitFor: customScraperResult.waitAfterLoad,
|
||||
screenshot: false,
|
||||
actions: customScraperResult.waitAfterLoad ? ([
|
||||
{
|
||||
type: "wait",
|
||||
milliseconds: customScraperResult.waitAfterLoad,
|
||||
}
|
||||
]) : ([]),
|
||||
pageOptions: customScraperResult.pageOptions,
|
||||
});
|
||||
if (screenshot) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user