mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 16:19:00 +08:00
added variables to beta customers
This commit is contained in:
parent
5a44191344
commit
ecd472356b
@ -24,6 +24,9 @@ export type PageOptions = {
|
|||||||
parsePDF?: boolean;
|
parsePDF?: boolean;
|
||||||
removeTags?: string | string[];
|
removeTags?: string | string[];
|
||||||
onlyIncludeTags?: string | string[];
|
onlyIncludeTags?: string | string[];
|
||||||
|
useFastMode?: boolean; // beta
|
||||||
|
disableJSDom?: boolean; // beta
|
||||||
|
atsv?: boolean; // beta
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
@ -66,6 +69,7 @@ export type WebScraperOptions = {
|
|||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
bullJobId?: string;
|
bullJobId?: string;
|
||||||
priority?: number;
|
priority?: number;
|
||||||
|
teamId?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export interface DocumentUrl {
|
export interface DocumentUrl {
|
||||||
@ -142,4 +146,5 @@ export interface FireEngineOptions{
|
|||||||
blockMedia?: boolean;
|
blockMedia?: boolean;
|
||||||
blockAds?: boolean;
|
blockAds?: boolean;
|
||||||
disableJsDom?: boolean;
|
disableJsDom?: boolean;
|
||||||
|
atsv?: boolean; // beta
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,7 @@ export class WebScraperDataProvider {
|
|||||||
private allowBackwardCrawling: boolean = false;
|
private allowBackwardCrawling: boolean = false;
|
||||||
private allowExternalContentLinks: boolean = false;
|
private allowExternalContentLinks: boolean = false;
|
||||||
private priority?: number;
|
private priority?: number;
|
||||||
|
private teamId?: string;
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
@ -596,6 +597,7 @@ export class WebScraperDataProvider {
|
|||||||
this.allowExternalContentLinks =
|
this.allowExternalContentLinks =
|
||||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||||
this.priority = options.priority;
|
this.priority = options.priority;
|
||||||
|
this.teamId = options.teamId ?? null;
|
||||||
|
|
||||||
// make sure all urls start with https://
|
// make sure all urls start with https://
|
||||||
this.urls = this.urls.map((url) => {
|
this.urls = this.urls.map((url) => {
|
||||||
|
@ -22,21 +22,23 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
fullPageScreenshot = false,
|
fullPageScreenshot = false,
|
||||||
pageOptions = { parsePDF: true },
|
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
||||||
fireEngineOptions = {},
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
options,
|
options,
|
||||||
priority,
|
priority,
|
||||||
|
teamId,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
fullPageScreenshot?: boolean;
|
fullPageScreenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
options?: any;
|
options?: any;
|
||||||
priority?: number;
|
priority?: number;
|
||||||
|
teamId?: string;
|
||||||
}): Promise<FireEngineResponse> {
|
}): Promise<FireEngineResponse> {
|
||||||
const logParams = {
|
const logParams = {
|
||||||
url,
|
url,
|
||||||
@ -51,11 +53,11 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const reqParams = await generateRequestParams(url);
|
const reqParams = await generateRequestParams(url);
|
||||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
let waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||||
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||||
|
|
||||||
|
|
||||||
let endpoint = "/scrape";
|
let endpoint = "/scrape";
|
||||||
@ -70,6 +72,20 @@ export async function scrapWithFireEngine({
|
|||||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (pageOptions?.useFastMode) {
|
||||||
|
console.log('using tlsclient')
|
||||||
|
fireEngineOptionsParam.engine = "tlsclient";
|
||||||
|
engine = "tlsclient";
|
||||||
|
}
|
||||||
|
|
||||||
|
// atsv is only available for beta customers
|
||||||
|
const betaCustomersString = process.env.BETA_CUSTOMERS;
|
||||||
|
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
|
||||||
|
if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
|
||||||
|
fireEngineOptionsParam.atsv = true;
|
||||||
|
} else {
|
||||||
|
pageOptions.atsv = false;
|
||||||
|
}
|
||||||
|
|
||||||
const response = await axios.post(
|
const response = await axios.post(
|
||||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||||
@ -80,7 +96,9 @@ export async function scrapWithFireEngine({
|
|||||||
fullPageScreenshot: fullPageScreenshotParam,
|
fullPageScreenshot: fullPageScreenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
|
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||||
priority,
|
priority,
|
||||||
|
engine,
|
||||||
...fireEngineOptionsParam,
|
...fireEngineOptionsParam,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -136,6 +136,7 @@ export async function scrapSingleUrl(
|
|||||||
},
|
},
|
||||||
existingHtml: string = "",
|
existingHtml: string = "",
|
||||||
priority?: number,
|
priority?: number,
|
||||||
|
teamId?: string
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
@ -164,7 +165,7 @@ export async function scrapSingleUrl(
|
|||||||
case "fire-engine;chrome-cdp":
|
case "fire-engine;chrome-cdp":
|
||||||
|
|
||||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||||
if(method === "fire-engine;chrome-cdp"){
|
if (method === "fire-engine;chrome-cdp") {
|
||||||
engine = "chrome-cdp";
|
engine = "chrome-cdp";
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,8 +179,10 @@ export async function scrapSingleUrl(
|
|||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
fireEngineOptions: {
|
fireEngineOptions: {
|
||||||
engine: engine,
|
engine: engine,
|
||||||
|
atsv: pageOptions.atsv,
|
||||||
},
|
},
|
||||||
priority,
|
priority,
|
||||||
|
teamId,
|
||||||
});
|
});
|
||||||
scraperResponse.text = response.html;
|
scraperResponse.text = response.html;
|
||||||
scraperResponse.screenshot = response.screenshot;
|
scraperResponse.screenshot = response.screenshot;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user