From 522c5b35da7d5cd997aa5ebe2002a38ede7ace93 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 26 Jan 2025 21:00:40 -0300 Subject: [PATCH] Nick: extract api reference --- apps/api/src/controllers/v1/extract-status.ts | 2 + apps/api/src/controllers/v1/types.ts | 4 + apps/api/src/lib/extract/extract-redis.ts | 3 + .../api/src/lib/extract/extraction-service.ts | 78 +++++++++++++++++++ apps/api/src/services/rate-limiter.ts | 1 + 5 files changed, 88 insertions(+) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index 3e166e5f..da3c08f1 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -30,6 +30,7 @@ export async function extractStatusController( data = jobData[0].docs; } + console.log(extract.sources); return res.status(200).json({ success: extract.status === "failed" ? false : true, data: data, @@ -38,5 +39,6 @@ export async function extractStatusController( expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(), steps: extract.showSteps ? extract.steps : undefined, llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined, + sources: extract.sources, }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 13b14116..e86f209e 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -537,6 +537,7 @@ export interface URLTrace { }; relevanceScore?: number; usedInCompletion?: boolean; + extractedFields?: string[]; } export interface ExtractResponse { @@ -547,6 +548,9 @@ export interface ExtractResponse { id?: string; warning?: string; urlTrace?: URLTrace[]; + sources?: { + [key: string]: string[]; + }; } export interface ExtractResponseRequestTest { diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index a6a3d6d3..39cf2364 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -32,6 +32,9 @@ export type StoredExtract = { steps?: ExtractedStep[]; showLLMUsage?: boolean; llmUsage?: number; + sources?: { + [key: string]: string[]; + }; }; // Reduce TTL to 6 hours instead of 24 diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 0ac30245..577daaa5 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -56,6 +56,9 @@ interface ExtractResult { tokenUsageBreakdown?: TokenUsage[]; llmUsage?: number; totalUrlsScraped?: number; + sources?: { + [key: string]: string[]; + }; } async function analyzeSchemaAndPrompt( @@ -179,6 +182,45 @@ function getRootDomain(url: string): string { } } +// Add helper function to track sources +function trackFieldSources(data: any, url: string, parentPath: string = ''): string[] { + const extractedFields: string[] = []; + + if (data && typeof data === 'object') { + Object.entries(data).forEach(([key, value]) => { + const currentPath = parentPath ? `${parentPath}.${key}` : key; + + if (value !== null && value !== undefined) { + extractedFields.push(currentPath); + + if (typeof value === 'object') { + extractedFields.push(...trackFieldSources(value, url, currentPath)); + } + } + }); + } + + return extractedFields; +} + +// Add helper to merge sources from multiple extractions +function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: string[] } { + const mergedSources: { [key: string]: string[] } = {}; + + sources.forEach(sourceMap => { + Object.entries(sourceMap).forEach(([field, urls]) => { + if (!mergedSources[field]) { + mergedSources[field] = []; + } + mergedSources[field].push(...urls); + // Deduplicate URLs + mergedSources[field] = [...new Set(mergedSources[field])]; + }); + }); + + return mergedSources; +} + export async function performExtraction( extractId: string, options: ExtractServiceOptions, @@ -191,6 +233,7 @@ export async function performExtraction( let multiEntityResult: any = {}; let singleAnswerResult: any = {}; let totalUrlsScraped = 0; + let extractionSources: { [key: string]: string[] } = {}; const logger = _logger.child({ module: "extract", @@ -551,6 +594,24 @@ export async function performExtraction( // return null; // } + if (multiEntityCompletion?.extract) { + const extractedFields = trackFieldSources(multiEntityCompletion.extract, doc.metadata.url || doc.metadata.sourceURL!); + + // Update URL trace with extracted fields + const trace = urlTraces.find(t => t.url === (doc.metadata.url || doc.metadata.sourceURL!)); + if (trace) { + trace.extractedFields = extractedFields; + } + + // Track sources for each field + extractedFields.forEach(field => { + if (!extractionSources[field]) { + extractionSources[field] = []; + } + extractionSources[field].push(doc.metadata.url || doc.metadata.sourceURL!); + }); + } + return multiEntityCompletion.extract; } catch (error) { logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! }); @@ -727,6 +788,21 @@ export async function performExtraction( // } // }); // } + + if (singleAnswerCompletions?.extract) { + const singleAnswerSources: { [key: string]: string[] } = {}; + const usedUrls = Array.from(docsMap.values()) + .map(doc => doc.metadata.url || doc.metadata.sourceURL!) + .filter(Boolean); + + const extractedFields = trackFieldSources(singleAnswerCompletions.extract, ''); + extractedFields.forEach(field => { + singleAnswerSources[field] = usedUrls; + }); + + // Merge with multi-entity sources + extractionSources = mergeSources([extractionSources, singleAnswerSources]); + } } let finalResult = reqSchema @@ -817,6 +893,7 @@ export async function performExtraction( updateExtract(extractId, { status: "completed", llmUsage, + sources: extractionSources }).catch((error) => { logger.error( `Failed to update extract ${extractId} status to completed: ${error}`, @@ -834,5 +911,6 @@ export async function performExtraction( urlTrace: request.urlTrace ? urlTraces : undefined, llmUsage, totalUrlsScraped, + sources: extractionSources }; } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 68cea8fa..f54019e1 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -227,6 +227,7 @@ export function getRateLimiterPoints( const points: number = rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5 + return points; }