This commit is contained in:
rafaelmmiller 2025-04-16 00:38:35 -07:00
parent 80b507e64e
commit f92217e3b6
5 changed files with 47 additions and 6 deletions

View File

@ -73,5 +73,6 @@ export async function extractStatusController(
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
sources: extract.showSources ? extract.sources : undefined,
costTracking: extract.showCostTracking ? extract.costTracking : undefined,
sessionIds: extract.sessionIds ? extract.sessionIds : undefined,
});
}

View File

@ -23,6 +23,7 @@ type BatchExtractOptions = {
doc: Document;
useAgent: boolean;
extractId?: string;
sessionId?: string;
};
/**
@ -44,9 +45,17 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
otherCost: number;
smartScrapeCallCount: number;
otherCallCount: number;
sessionId?: string;
}> {
const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options;
const {
multiEntitySchema,
links,
prompt,
systemPrompt,
doc,
useAgent,
extractId,
sessionId } = options;
const generationOptions: GenerateCompletionsOptions = {
logger: logger.child({
@ -71,11 +80,19 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
let warning: string | undefined;
let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;
try {
const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
const {
extractedDataArray: e,
warning: w,
smartScrapeCost,
otherCost,
smartScrapeCallCount,
otherCallCount
} = await extractData({
extractOptions: generationOptions,
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
useAgent,
extractId,
sessionId
});
extractedDataArray = e;
warning = w;

View File

@ -8,6 +8,7 @@ export enum ExtractStep {
MAP_RERANK = "map-rerank",
MULTI_ENTITY = "multi-entity",
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
MULTI_ENTITY_AGENT_SCRAPE = "multi-entity-agent-scrape",
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
SCRAPE = "scrape",
EXTRACT = "extract",
@ -17,7 +18,7 @@ export enum ExtractStep {
export type ExtractedStep = {
step: ExtractStep;
startedAt: number;
finishedAt: number;
finishedAt: number | null;
error?: any;
discoveredLinks?: string[];
};
@ -38,6 +39,7 @@ export type StoredExtract = {
sources?: {
[key: string]: string[];
};
sessionIds?: string[];
};
// Reduce TTL to 6 hours instead of 24
@ -107,6 +109,8 @@ export async function updateExtract(
}))
};
console.log(minimalExtract.sessionIds)
await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
await redisConnection.expire("extract:" + id, EXTRACT_TTL);
}

View File

@ -413,8 +413,23 @@ export async function performExtraction(
chunks.push(multyEntityDocs.slice(i, i + chunkSize));
}
const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID());
await updateExtract(extractId, {
status: "processing",
steps: [
{
step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE,
startedAt: Date.now(),
finishedAt: null
},
],
sessionIds
});
// Process chunks sequentially with timeout
for (const chunk of chunks) {
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const sessionId = sessionIds[i];
const chunkPromises = chunk.map(async (doc) => {
try {
ajv.compile(multiEntitySchema);
@ -432,6 +447,7 @@ export async function performExtraction(
doc,
useAgent: isAgentExtractModelValid(request.agent?.model),
extractId,
sessionId
}, logger);
// Race between timeout and completion

View File

@ -10,6 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown";
import { getModel } from "../../../lib/generic-ai";
import { TokenUsage } from "../../../controllers/v1/types";
import type { SmartScrapeResult } from "./smartScrape";
import { ExtractStep } from "src/lib/extract/extract-redis";
const commonSmartScrapeProperties = {
shouldUseSmartscrape: {
@ -185,11 +186,13 @@ export async function extractData({
urls,
useAgent,
extractId,
sessionId
}: {
extractOptions: GenerateCompletionsOptions;
urls: string[];
useAgent: boolean;
extractId?: string;
sessionId?: string;
}): Promise<{
extractedDataArray: any[];
warning: any;
@ -275,7 +278,7 @@ export async function extractData({
let smartscrapeResults: SmartScrapeResult[];
if (isSingleUrl) {
smartscrapeResults = [
await smartScrape(urls[0], extract?.smartscrape_prompt, extractId),
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
];
smartScrapeCost += smartscrapeResults[0].tokenUsage;
smartScrapeCallCount++;