propagate priority to fire-engine

Gergő Móricz 2024-08-15 19:04:46 +02:00
parent b79d3d1754
commit 29f0d9ec94
6 changed files with 17 additions and 3 deletions

View File

@@ -65,6 +65,7 @@ export type WebScraperOptions = {
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
bullJobId?: string;
priority?: number;
};
export interface DocumentUrl {
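With the new field in place, a caller can attach a numeric priority alongside the other optional fields of WebScraperOptions. A minimal sketch, assuming only the fields visible in this hunk (the values are illustrative):

const opts: Partial<WebScraperOptions> = {
  concurrentRequests: 5,
  bullJobId: "job-123",
  priority: 10, // carried through to the fire-engine request below
};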

View File

@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
},
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
})) as { success: boolean; message: string; docs: Document[] };
}
export async function runWebScraper({
@@ -62,6 +63,7 @@ export async function runWebScraper({
onError,
team_id,
bull_job_id,
priority,
}: RunWebScraperParams): Promise<RunWebScraperResult> {
try {
const provider = new WebScraperDataProvider();
@@ -74,6 +76,7 @@ export async function runWebScraper({
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
bullJobId: bull_job_id,
priority,
});
} else {
await provider.setOptions({
@@ -83,6 +86,7 @@ export async function runWebScraper({
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
priority,
});
}
const docs = (await provider.getDocuments(false, (progress: Progress) => {
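The value originates in the Bull job's options: startWebScraperPipeline reads job.opts.priority and runWebScraper forwards it into provider.setOptions. A hedged sketch of where that priority would be set when the job is enqueued (the queue name and job payload are assumptions, not part of this commit):

import Queue from "bull";

const webScraperQueue = new Queue("web-scraper"); // queue name is illustrative

// A job enqueued with a priority exposes it later as job.opts.priority,
// which the pipeline above now passes through to the scraper.
await webScraperQueue.add(
  { url: "https://example.com", team_id: "team-123" }, // payload shape is illustrative
  { priority: 10 } // in Bull, lower numbers are processed first
);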

View File

@@ -44,6 +44,7 @@ export class WebScraperDataProvider {
private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;
private priority?: number;
authorize(): void {
throw new Error("Method not implemented.");
@@ -72,7 +73,8 @@ export class WebScraperDataProvider {
url,
this.pageOptions,
this.extractorOptions,
existingHTML
existingHTML,
this.priority,
);
processedUrls++;
if (inProgress) {
@@ -593,6 +595,7 @@ export class WebScraperDataProvider {
options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks =
options.crawlerOptions?.allowExternalContentLinks ?? false;
this.priority = options.priority;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {

View File

@@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
fireEngineOptions = {},
headers,
options,
priority,
}: {
url: string;
waitFor?: number;
@@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
priority?: number;
}): Promise<FireEngineResponse> {
const logParams = {
url,
@@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
pageOptions: pageOptions,
priority,
...fireEngineOptionsParam,
},
{
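Here the priority simply becomes one more key in the JSON body posted to fire-engine, next to pageOptions and the spread fire-engine options. A sketch of calling the updated scraper (the values, and the engine name in particular, are assumptions):

const response = await scrapWithFireEngine({
  url: "https://example.com",
  headers: { "User-Agent": "firecrawl" },
  fireEngineOptions: { engine: "playwright" }, // engine value is an assumption
  priority: 10, // forwarded verbatim in the request body
});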

View File

@@ -134,7 +134,8 @@ export async function scrapSingleUrl(
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
},
existingHtml: string = ""
existingHtml: string = "",
priority?: number,
): Promise<Document> {
urlToScrap = urlToScrap.trim();
@@ -177,7 +178,8 @@ export async function scrapSingleUrl(
headers: pageOptions.headers,
fireEngineOptions: {
engine: engine,
}
},
priority,
});
scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot;

View File

@@ -43,6 +43,7 @@ export interface RunWebScraperParams {
onError: (error: Error) => void;
team_id: string;
bull_job_id: string;
priority?: number;
}
export interface RunWebScraperResult {
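Taken together, the commit threads one optional number through the whole chain; roughly (a summary of the hunks above, not code from the commit):

job.opts.priority                          (Bull job options)
  -> runWebScraper({ ..., priority })      (RunWebScraperParams)
  -> provider.setOptions({ ..., priority })    (WebScraperOptions)
  -> scrapSingleUrl(..., priority)
  -> scrapWithFireEngine({ ..., priority })    (fire-engine request body)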