added variables to beta customers

This commit is contained in:
rafaelsideguide 2024-08-19 16:41:54 -03:00
parent 5a44191344
commit ecd472356b
4 changed files with 36 additions and 8 deletions

View File

@ -24,6 +24,9 @@ export type PageOptions = {
parsePDF?: boolean; parsePDF?: boolean;
removeTags?: string | string[]; removeTags?: string | string[];
onlyIncludeTags?: string | string[]; onlyIncludeTags?: string | string[];
useFastMode?: boolean; // beta
disableJSDom?: boolean; // beta
atsv?: boolean; // beta
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
@ -66,6 +69,7 @@ export type WebScraperOptions = {
concurrentRequests?: number; concurrentRequests?: number;
bullJobId?: string; bullJobId?: string;
priority?: number; priority?: number;
teamId?: string;
}; };
export interface DocumentUrl { export interface DocumentUrl {
@ -142,4 +146,5 @@ export interface FireEngineOptions{
blockMedia?: boolean; blockMedia?: boolean;
blockAds?: boolean; blockAds?: boolean;
disableJsDom?: boolean; disableJsDom?: boolean;
atsv?: boolean; // beta
} }

View File

@ -45,6 +45,7 @@ export class WebScraperDataProvider {
private allowBackwardCrawling: boolean = false; private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false; private allowExternalContentLinks: boolean = false;
private priority?: number; private priority?: number;
private teamId?: string;
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -596,6 +597,7 @@ export class WebScraperDataProvider {
this.allowExternalContentLinks = this.allowExternalContentLinks =
options.crawlerOptions?.allowExternalContentLinks ?? false; options.crawlerOptions?.allowExternalContentLinks ?? false;
this.priority = options.priority; this.priority = options.priority;
this.teamId = options.teamId ?? null;
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {

View File

@ -22,21 +22,23 @@ export async function scrapWithFireEngine({
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false, fullPageScreenshot = false,
pageOptions = { parsePDF: true }, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
options, options,
priority, priority,
teamId,
}: { }: {
url: string; url: string;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean; fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
priority?: number; priority?: number;
teamId?: string;
}): Promise<FireEngineResponse> { }): Promise<FireEngineResponse> {
const logParams = { const logParams = {
url, url,
@ -51,11 +53,11 @@ export async function scrapWithFireEngine({
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor; let waitParam = reqParams["params"]?.wait ?? waitFor;
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let endpoint = "/scrape"; let endpoint = "/scrape";
@ -70,6 +72,20 @@ export async function scrapWithFireEngine({
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
if (pageOptions?.useFastMode) {
console.log('using tlsclient')
fireEngineOptionsParam.engine = "tlsclient";
engine = "tlsclient";
}
// atsv is only available for beta customers
const betaCustomersString = process.env.BETA_CUSTOMERS;
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
fireEngineOptionsParam.atsv = true;
} else {
pageOptions.atsv = false;
}
const response = await axios.post( const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
@ -80,7 +96,9 @@ export async function scrapWithFireEngine({
fullPageScreenshot: fullPageScreenshotParam, fullPageScreenshot: fullPageScreenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions, pageOptions: pageOptions,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority, priority,
engine,
...fireEngineOptionsParam, ...fireEngineOptionsParam,
}, },
{ {

View File

@ -136,6 +136,7 @@ export async function scrapSingleUrl(
}, },
existingHtml: string = "", existingHtml: string = "",
priority?: number, priority?: number,
teamId?: string
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -164,7 +165,7 @@ export async function scrapSingleUrl(
case "fire-engine;chrome-cdp": case "fire-engine;chrome-cdp":
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright"; let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
if(method === "fire-engine;chrome-cdp"){ if (method === "fire-engine;chrome-cdp") {
engine = "chrome-cdp"; engine = "chrome-cdp";
} }
@ -178,8 +179,10 @@ export async function scrapSingleUrl(
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: { fireEngineOptions: {
engine: engine, engine: engine,
atsv: pageOptions.atsv,
}, },
priority, priority,
teamId,
}); });
scraperResponse.text = response.html; scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot; scraperResponse.screenshot = response.screenshot;