propagate priority to fire-engine

Gergő Móricz 2024-08-15 19:04:46 +02:00
parent b79d3d1754
commit 29f0d9ec94
6 changed files with 17 additions and 3 deletions
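
In outline, the change threads an optional numeric priority from the queue job all the way down to the request sent to fire-engine: job.opts.priority → runWebScraper → WebScraperDataProvider.setOptions (stored as this.priority) → scrapSingleUrl → scrapWithFireEngine, which puts it in the request body. The sketch below illustrates that hand-off only; the types and signatures are heavily simplified stand-ins, not the real Firecrawl code (see the diff below for the actual signatures).

// Illustrative sketch: how `priority` is handed down. Signatures are simplified;
// the real functions take many more options.
interface SketchJob {
  data: { url: string };
  opts: { priority?: number }; // Bull exposes the job's priority here
}

async function startWebScraperPipeline(job: SketchJob) {
  // 1. Read the priority off the queue job and forward it.
  return runWebScraper({ url: job.data.url, priority: job.opts.priority });
}

async function runWebScraper(opts: { url: string; priority?: number }) {
  // 2. In the real code this passes through WebScraperDataProvider.setOptions
  //    and scrapSingleUrl; collapsed here to keep the sketch short.
  return scrapWithFireEngine(opts);
}

async function scrapWithFireEngine({ url, priority }: { url: string; priority?: number }) {
  // 3. Include the priority in the body POSTed to fire-engine,
  //    alongside pageOptions, headers, etc.
  const body = { url, priority };
  // await axios.post(fireEngineUrl, body); // endpoint omitted in this sketch
  return body;
}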

@@ -65,6 +65,7 @@ export type WebScraperOptions = {
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
   bullJobId?: string;
+  priority?: number;
 };
 export interface DocumentUrl {

@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
     },
     team_id: job.data.team_id,
     bull_job_id: job.id.toString(),
+    priority: job.opts.priority,
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -62,6 +63,7 @@ export async function runWebScraper({
   onError,
   team_id,
   bull_job_id,
+  priority,
 }: RunWebScraperParams): Promise<RunWebScraperResult> {
   try {
     const provider = new WebScraperDataProvider();
@@ -74,6 +76,7 @@ export async function runWebScraper({
         crawlerOptions: crawlerOptions,
         pageOptions: pageOptions,
         bullJobId: bull_job_id,
+        priority,
       });
     } else {
       await provider.setOptions({
@@ -83,6 +86,7 @@ export async function runWebScraper({
         extractorOptions,
         crawlerOptions: crawlerOptions,
         pageOptions: pageOptions,
+        priority,
       });
     }
     const docs = (await provider.getDocuments(false, (progress: Progress) => {

@@ -44,6 +44,7 @@ export class WebScraperDataProvider {
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
   private allowExternalContentLinks: boolean = false;
+  private priority?: number;
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -72,7 +73,8 @@ export class WebScraperDataProvider {
             url,
             this.pageOptions,
             this.extractorOptions,
-            existingHTML
+            existingHTML,
+            this.priority,
           );
           processedUrls++;
           if (inProgress) {
@@ -593,6 +595,7 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.allowBackwardCrawling ?? false;
     this.allowExternalContentLinks =
       options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.priority = options.priority;
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {

@@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions = {},
   headers,
   options,
+  priority,
 }: {
   url: string;
   waitFor?: number;
@@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;
+  priority?: number;
 }): Promise<FireEngineResponse> {
   const logParams = {
     url,
@@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
         fullPageScreenshot: fullPageScreenshotParam,
         headers: headers,
         pageOptions: pageOptions,
+        priority,
         ...fireEngineOptionsParam,
       },
       {

@@ -134,7 +134,8 @@ export async function scrapSingleUrl(
   extractorOptions: ExtractorOptions = {
     mode: "llm-extraction-from-markdown",
   },
-  existingHtml: string = ""
+  existingHtml: string = "",
+  priority?: number,
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -177,7 +178,8 @@ export async function scrapSingleUrl(
           headers: pageOptions.headers,
           fireEngineOptions: {
             engine: engine,
-          }
+          },
+          priority,
         });
         scraperResponse.text = response.html;
         scraperResponse.screenshot = response.screenshot;

@@ -43,6 +43,7 @@ export interface RunWebScraperParams {
   onError: (error: Error) => void;
   team_id: string;
   bull_job_id: string;
+  priority?: number;
 }
 export interface RunWebScraperResult {
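
Nothing in this commit sets the priority itself; it only forwards whatever the queue job was created with. Assuming the crawl jobs are added through a Bull queue (where priority is an optional job option and lower numbers are served first), a producer could opt in roughly as below. The queue name, Redis URL, and payload fields are placeholders for illustration, not values taken from this commit.

import Queue from "bull";

// Hypothetical producer-side sketch: enqueue a crawl with an explicit priority.
const webScraperQueue = new Queue("web-scraper", "redis://localhost:6379");

async function enqueueCrawl() {
  await webScraperQueue.add(
    { url: "https://example.com", team_id: "team_123" }, // job.data
    { priority: 1 },                                     // surfaces as job.opts.priority
  );
}

With the changes above, that value reaches startWebScraperPipeline as job.opts.priority and ends up in the body scrapWithFireEngine sends to fire-engine.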