mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-08 06:51:48 +08:00
Nick:
This commit is contained in:
parent
bfc7f5882e
commit
e098e88ea7
@ -129,3 +129,11 @@ export interface FireEngineResponse {
|
|||||||
pageError?: string;
|
pageError?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export interface FireEngineOptions{
|
||||||
|
mobileProxy?: boolean;
|
||||||
|
method?: string;
|
||||||
|
engine?: string;
|
||||||
|
blockMedia?: boolean;
|
||||||
|
blockAds?: boolean;
|
||||||
|
}
|
||||||
|
@ -8,7 +8,6 @@ import { scrapSingleUrl } from "./single_url";
|
|||||||
import robotsParser from "robots-parser";
|
import robotsParser from "robots-parser";
|
||||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
|
||||||
|
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
private initialUrl: string;
|
private initialUrl: string;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { FireEngineResponse } from "../../../lib/entities";
|
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
import { logScrape } from "../../../services/logging/scrape_log";
|
||||||
import { generateRequestParams } from "../single_url";
|
import { generateRequestParams } from "../single_url";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
@ -20,6 +20,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
pageOptions = { parsePDF: true },
|
pageOptions = { parsePDF: true },
|
||||||
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
options,
|
options,
|
||||||
}: {
|
}: {
|
||||||
@ -27,6 +28,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||||
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
options?: any;
|
options?: any;
|
||||||
}): Promise<FireEngineResponse> {
|
}): Promise<FireEngineResponse> {
|
||||||
@ -57,6 +59,7 @@ export async function scrapWithFireEngine({
|
|||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
|
...fireEngineOptions,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
|
@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
|
|||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
content = response.data;
|
content = response.data;
|
||||||
} else if (mode === 'fire-engine') {
|
} else if (mode === 'fire-engine') {
|
||||||
const response = await scrapWithFireEngine({ url: sitemapUrl });
|
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
|
||||||
content = response.html;
|
content = response.html;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user