diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 922e4b6a..92dd4c7c 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -65,6 +65,7 @@ export type WebScraperOptions = { extractorOptions?: ExtractorOptions; concurrentRequests?: number; bullJobId?: string; + priority?: number; }; export interface DocumentUrl { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 81c06e9f..77d392f5 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -49,6 +49,7 @@ export async function startWebScraperPipeline({ }, team_id: job.data.team_id, bull_job_id: job.id.toString(), + priority: job.opts.priority, })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ @@ -62,6 +63,7 @@ export async function runWebScraper({ onError, team_id, bull_job_id, + priority, }: RunWebScraperParams): Promise { try { const provider = new WebScraperDataProvider(); @@ -74,6 +76,7 @@ export async function runWebScraper({ crawlerOptions: crawlerOptions, pageOptions: pageOptions, bullJobId: bull_job_id, + priority, }); } else { await provider.setOptions({ @@ -83,6 +86,7 @@ export async function runWebScraper({ extractorOptions, crawlerOptions: crawlerOptions, pageOptions: pageOptions, + priority, }); } const docs = (await provider.getDocuments(false, (progress: Progress) => { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index a03553bc..98f07ae5 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -44,6 +44,7 @@ export class WebScraperDataProvider { private crawlerMode: string = "default"; private allowBackwardCrawling: boolean = false; private allowExternalContentLinks: boolean = false; + private priority?: number; authorize(): void { throw new Error("Method not implemented."); @@ -72,7 +73,8 @@ export class WebScraperDataProvider { url, this.pageOptions, this.extractorOptions, - existingHTML + existingHTML, + this.priority, ); processedUrls++; if (inProgress) { @@ -593,6 +595,7 @@ export class WebScraperDataProvider { options.crawlerOptions?.allowBackwardCrawling ?? false; this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false; + this.priority = options.priority; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 0bb9986f..77697411 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -26,6 +26,7 @@ export async function scrapWithFireEngine({ fireEngineOptions = {}, headers, options, + priority, }: { url: string; waitFor?: number; @@ -35,6 +36,7 @@ export async function scrapWithFireEngine({ fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; + priority?: number; }): Promise { const logParams = { url, @@ -78,6 +80,7 @@ export async function scrapWithFireEngine({ fullPageScreenshot: fullPageScreenshotParam, headers: headers, pageOptions: pageOptions, + priority, ...fireEngineOptionsParam, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fee0d644..df9d04ab 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -134,7 +134,8 @@ export async function scrapSingleUrl( extractorOptions: ExtractorOptions = { mode: "llm-extraction-from-markdown", }, - existingHtml: string = "" + existingHtml: string = "", + priority?: number, ): Promise { urlToScrap = urlToScrap.trim(); @@ -177,7 +178,8 @@ export async function scrapSingleUrl( headers: pageOptions.headers, fireEngineOptions: { engine: engine, - } + }, + priority, }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index c02aba1c..b092d310 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -43,6 +43,7 @@ export interface RunWebScraperParams { onError: (error: Error) => void; team_id: string; bull_job_id: string; + priority?: number; } export interface RunWebScraperResult {