From f92217e3b66a730beb88b565b92e72252556a6cd Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 16 Apr 2025 00:38:35 -0700 Subject: [PATCH] wip --- apps/api/src/controllers/v1/extract-status.ts | 1 + .../lib/extract/completions/batchExtract.ts | 23 ++++++++++++++++--- apps/api/src/lib/extract/extract-redis.ts | 6 ++++- .../api/src/lib/extract/extraction-service.ts | 18 ++++++++++++++- .../scrapeURL/lib/extractSmartScrape.ts | 5 +++- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index 47e02f7b..76a611f8 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -73,5 +73,6 @@ export async function extractStatusController( llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined, sources: extract.showSources ? extract.sources : undefined, costTracking: extract.showCostTracking ? extract.costTracking : undefined, + sessionIds: extract.sessionIds ? extract.sessionIds : undefined, }); } diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index 1bffeaa2..7e254a0f 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -23,6 +23,7 @@ type BatchExtractOptions = { doc: Document; useAgent: boolean; extractId?: string; + sessionId?: string; }; /** @@ -44,9 +45,17 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: otherCost: number; smartScrapeCallCount: number; otherCallCount: number; + sessionId?: string; }> { - const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options; - + const { + multiEntitySchema, + links, + prompt, + systemPrompt, + doc, + useAgent, + extractId, + sessionId } = options; const generationOptions: GenerateCompletionsOptions = { logger: logger.child({ @@ -71,11 +80,19 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger: let warning: string | undefined; let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0; try { - const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({ + const { + extractedDataArray: e, + warning: w, + smartScrapeCost, + otherCost, + smartScrapeCallCount, + otherCallCount + } = await extractData({ extractOptions: generationOptions, urls: [doc.metadata.sourceURL || doc.metadata.url || ""], useAgent, extractId, + sessionId }); extractedDataArray = e; warning = w; diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index e560f18d..d256c582 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -8,6 +8,7 @@ export enum ExtractStep { MAP_RERANK = "map-rerank", MULTI_ENTITY = "multi-entity", MULTI_ENTITY_SCRAPE = "multi-entity-scrape", + MULTI_ENTITY_AGENT_SCRAPE = "multi-entity-agent-scrape", MULTI_ENTITY_EXTRACT = "multi-entity-extract", SCRAPE = "scrape", EXTRACT = "extract", @@ -17,7 +18,7 @@ export enum ExtractStep { export type ExtractedStep = { step: ExtractStep; startedAt: number; - finishedAt: number; + finishedAt: number | null; error?: any; discoveredLinks?: string[]; }; @@ -38,6 +39,7 @@ export type StoredExtract = { sources?: { [key: string]: string[]; }; + sessionIds?: string[]; }; // Reduce TTL to 6 hours instead of 24 @@ -107,6 +109,8 @@ export async function updateExtract( })) }; + console.log(minimalExtract.sessionIds) + await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract)); await redisConnection.expire("extract:" + id, EXTRACT_TTL); } diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 1a6a4262..dd828f4f 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -413,8 +413,23 @@ export async function performExtraction( chunks.push(multyEntityDocs.slice(i, i + chunkSize)); } + const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID()); + await updateExtract(extractId, { + status: "processing", + steps: [ + { + step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE, + startedAt: Date.now(), + finishedAt: null + }, + ], + sessionIds + }); + // Process chunks sequentially with timeout - for (const chunk of chunks) { + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + const sessionId = sessionIds[i]; const chunkPromises = chunk.map(async (doc) => { try { ajv.compile(multiEntitySchema); @@ -432,6 +447,7 @@ export async function performExtraction( doc, useAgent: isAgentExtractModelValid(request.agent?.model), extractId, + sessionId }, logger); // Race between timeout and completion diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 7380f380..f642383a 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -10,6 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown"; import { getModel } from "../../../lib/generic-ai"; import { TokenUsage } from "../../../controllers/v1/types"; import type { SmartScrapeResult } from "./smartScrape"; +import { ExtractStep } from "src/lib/extract/extract-redis"; const commonSmartScrapeProperties = { shouldUseSmartscrape: { @@ -185,11 +186,13 @@ export async function extractData({ urls, useAgent, extractId, + sessionId }: { extractOptions: GenerateCompletionsOptions; urls: string[]; useAgent: boolean; extractId?: string; + sessionId?: string; }): Promise<{ extractedDataArray: any[]; warning: any; @@ -275,7 +278,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, extractId), + await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++;