This commit is contained in:
rafaelmmiller 2025-04-16 00:38:35 -07:00
parent 80b507e64e
commit f92217e3b6
5 changed files with 47 additions and 6 deletions

View File

@ -73,5 +73,6 @@ export async function extractStatusController(
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined, llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
sources: extract.showSources ? extract.sources : undefined, sources: extract.showSources ? extract.sources : undefined,
costTracking: extract.showCostTracking ? extract.costTracking : undefined, costTracking: extract.showCostTracking ? extract.costTracking : undefined,
sessionIds: extract.sessionIds ? extract.sessionIds : undefined,
}); });
} }

View File

@ -23,6 +23,7 @@ type BatchExtractOptions = {
doc: Document; doc: Document;
useAgent: boolean; useAgent: boolean;
extractId?: string; extractId?: string;
sessionId?: string;
}; };
/** /**
@ -44,9 +45,17 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
otherCost: number; otherCost: number;
smartScrapeCallCount: number; smartScrapeCallCount: number;
otherCallCount: number; otherCallCount: number;
sessionId?: string;
}> { }> {
const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options; const {
multiEntitySchema,
links,
prompt,
systemPrompt,
doc,
useAgent,
extractId,
sessionId } = options;
const generationOptions: GenerateCompletionsOptions = { const generationOptions: GenerateCompletionsOptions = {
logger: logger.child({ logger: logger.child({
@ -71,11 +80,19 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
let warning: string | undefined; let warning: string | undefined;
let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0; let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;
try { try {
const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({ const {
extractedDataArray: e,
warning: w,
smartScrapeCost,
otherCost,
smartScrapeCallCount,
otherCallCount
} = await extractData({
extractOptions: generationOptions, extractOptions: generationOptions,
urls: [doc.metadata.sourceURL || doc.metadata.url || ""], urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
useAgent, useAgent,
extractId, extractId,
sessionId
}); });
extractedDataArray = e; extractedDataArray = e;
warning = w; warning = w;

View File

@ -8,6 +8,7 @@ export enum ExtractStep {
MAP_RERANK = "map-rerank", MAP_RERANK = "map-rerank",
MULTI_ENTITY = "multi-entity", MULTI_ENTITY = "multi-entity",
MULTI_ENTITY_SCRAPE = "multi-entity-scrape", MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
MULTI_ENTITY_AGENT_SCRAPE = "multi-entity-agent-scrape",
MULTI_ENTITY_EXTRACT = "multi-entity-extract", MULTI_ENTITY_EXTRACT = "multi-entity-extract",
SCRAPE = "scrape", SCRAPE = "scrape",
EXTRACT = "extract", EXTRACT = "extract",
@ -17,7 +18,7 @@ export enum ExtractStep {
export type ExtractedStep = { export type ExtractedStep = {
step: ExtractStep; step: ExtractStep;
startedAt: number; startedAt: number;
finishedAt: number; finishedAt: number | null;
error?: any; error?: any;
discoveredLinks?: string[]; discoveredLinks?: string[];
}; };
@ -38,6 +39,7 @@ export type StoredExtract = {
sources?: { sources?: {
[key: string]: string[]; [key: string]: string[];
}; };
sessionIds?: string[];
}; };
// Reduce TTL to 6 hours instead of 24 // Reduce TTL to 6 hours instead of 24
@ -107,6 +109,8 @@ export async function updateExtract(
})) }))
}; };
console.log(minimalExtract.sessionIds)
await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract)); await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
await redisConnection.expire("extract:" + id, EXTRACT_TTL); await redisConnection.expire("extract:" + id, EXTRACT_TTL);
} }

View File

@ -413,8 +413,23 @@ export async function performExtraction(
chunks.push(multyEntityDocs.slice(i, i + chunkSize)); chunks.push(multyEntityDocs.slice(i, i + chunkSize));
} }
const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID());
await updateExtract(extractId, {
status: "processing",
steps: [
{
step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE,
startedAt: Date.now(),
finishedAt: null
},
],
sessionIds
});
// Process chunks sequentially with timeout // Process chunks sequentially with timeout
for (const chunk of chunks) { for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const sessionId = sessionIds[i];
const chunkPromises = chunk.map(async (doc) => { const chunkPromises = chunk.map(async (doc) => {
try { try {
ajv.compile(multiEntitySchema); ajv.compile(multiEntitySchema);
@ -432,6 +447,7 @@ export async function performExtraction(
doc, doc,
useAgent: isAgentExtractModelValid(request.agent?.model), useAgent: isAgentExtractModelValid(request.agent?.model),
extractId, extractId,
sessionId
}, logger); }, logger);
// Race between timeout and completion // Race between timeout and completion

View File

@ -10,6 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown";
import { getModel } from "../../../lib/generic-ai"; import { getModel } from "../../../lib/generic-ai";
import { TokenUsage } from "../../../controllers/v1/types"; import { TokenUsage } from "../../../controllers/v1/types";
import type { SmartScrapeResult } from "./smartScrape"; import type { SmartScrapeResult } from "./smartScrape";
import { ExtractStep } from "src/lib/extract/extract-redis";
const commonSmartScrapeProperties = { const commonSmartScrapeProperties = {
shouldUseSmartscrape: { shouldUseSmartscrape: {
@ -185,11 +186,13 @@ export async function extractData({
urls, urls,
useAgent, useAgent,
extractId, extractId,
sessionId
}: { }: {
extractOptions: GenerateCompletionsOptions; extractOptions: GenerateCompletionsOptions;
urls: string[]; urls: string[];
useAgent: boolean; useAgent: boolean;
extractId?: string; extractId?: string;
sessionId?: string;
}): Promise<{ }): Promise<{
extractedDataArray: any[]; extractedDataArray: any[];
warning: any; warning: any;
@ -275,7 +278,7 @@ export async function extractData({
let smartscrapeResults: SmartScrapeResult[]; let smartscrapeResults: SmartScrapeResult[];
if (isSingleUrl) { if (isSingleUrl) {
smartscrapeResults = [ smartscrapeResults = [
await smartScrape(urls[0], extract?.smartscrape_prompt, extractId), await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
]; ];
smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCost += smartscrapeResults[0].tokenUsage;
smartScrapeCallCount++; smartScrapeCallCount++;