Nick: extract api reference

This commit is contained in:
Nicolas 2025-01-26 21:00:40 -03:00
parent ce3c54d7c7
commit 522c5b35da
5 changed files with 88 additions and 0 deletions

View File

@ -30,6 +30,7 @@ export async function extractStatusController(
data = jobData[0].docs;
}
console.log(extract.sources);
return res.status(200).json({
success: extract.status === "failed" ? false : true,
data: data,
@ -38,5 +39,6 @@ export async function extractStatusController(
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
steps: extract.showSteps ? extract.steps : undefined,
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
sources: extract.sources,
});
}

View File

@ -537,6 +537,7 @@ export interface URLTrace {
};
relevanceScore?: number;
usedInCompletion?: boolean;
extractedFields?: string[];
}
export interface ExtractResponse {
@ -547,6 +548,9 @@ export interface ExtractResponse {
id?: string;
warning?: string;
urlTrace?: URLTrace[];
sources?: {
[key: string]: string[];
};
}
export interface ExtractResponseRequestTest {

View File

@ -32,6 +32,9 @@ export type StoredExtract = {
steps?: ExtractedStep[];
showLLMUsage?: boolean;
llmUsage?: number;
sources?: {
[key: string]: string[];
};
};
// Reduce TTL to 6 hours instead of 24

View File

@ -56,6 +56,9 @@ interface ExtractResult {
tokenUsageBreakdown?: TokenUsage[];
llmUsage?: number;
totalUrlsScraped?: number;
sources?: {
[key: string]: string[];
};
}
async function analyzeSchemaAndPrompt(
@ -179,6 +182,45 @@ function getRootDomain(url: string): string {
}
}
// Add helper function to track sources
function trackFieldSources(data: any, url: string, parentPath: string = ''): string[] {
const extractedFields: string[] = [];
if (data && typeof data === 'object') {
Object.entries(data).forEach(([key, value]) => {
const currentPath = parentPath ? `${parentPath}.${key}` : key;
if (value !== null && value !== undefined) {
extractedFields.push(currentPath);
if (typeof value === 'object') {
extractedFields.push(...trackFieldSources(value, url, currentPath));
}
}
});
}
return extractedFields;
}
// Add helper to merge sources from multiple extractions
function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: string[] } {
const mergedSources: { [key: string]: string[] } = {};
sources.forEach(sourceMap => {
Object.entries(sourceMap).forEach(([field, urls]) => {
if (!mergedSources[field]) {
mergedSources[field] = [];
}
mergedSources[field].push(...urls);
// Deduplicate URLs
mergedSources[field] = [...new Set(mergedSources[field])];
});
});
return mergedSources;
}
export async function performExtraction(
extractId: string,
options: ExtractServiceOptions,
@ -191,6 +233,7 @@ export async function performExtraction(
let multiEntityResult: any = {};
let singleAnswerResult: any = {};
let totalUrlsScraped = 0;
let extractionSources: { [key: string]: string[] } = {};
const logger = _logger.child({
module: "extract",
@ -551,6 +594,24 @@ export async function performExtraction(
// return null;
// }
if (multiEntityCompletion?.extract) {
const extractedFields = trackFieldSources(multiEntityCompletion.extract, doc.metadata.url || doc.metadata.sourceURL!);
// Update URL trace with extracted fields
const trace = urlTraces.find(t => t.url === (doc.metadata.url || doc.metadata.sourceURL!));
if (trace) {
trace.extractedFields = extractedFields;
}
// Track sources for each field
extractedFields.forEach(field => {
if (!extractionSources[field]) {
extractionSources[field] = [];
}
extractionSources[field].push(doc.metadata.url || doc.metadata.sourceURL!);
});
}
return multiEntityCompletion.extract;
} catch (error) {
logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! });
@ -727,6 +788,21 @@ export async function performExtraction(
// }
// });
// }
if (singleAnswerCompletions?.extract) {
const singleAnswerSources: { [key: string]: string[] } = {};
const usedUrls = Array.from(docsMap.values())
.map(doc => doc.metadata.url || doc.metadata.sourceURL!)
.filter(Boolean);
const extractedFields = trackFieldSources(singleAnswerCompletions.extract, '');
extractedFields.forEach(field => {
singleAnswerSources[field] = usedUrls;
});
// Merge with multi-entity sources
extractionSources = mergeSources([extractionSources, singleAnswerSources]);
}
}
let finalResult = reqSchema
@ -817,6 +893,7 @@ export async function performExtraction(
updateExtract(extractId, {
status: "completed",
llmUsage,
sources: extractionSources
}).catch((error) => {
logger.error(
`Failed to update extract ${extractId} status to completed: ${error}`,
@ -834,5 +911,6 @@ export async function performExtraction(
urlTrace: request.urlTrace ? urlTraces : undefined,
llmUsage,
totalUrlsScraped,
sources: extractionSources
};
}

View File

@ -227,6 +227,7 @@ export function getRateLimiterPoints(
const points: number =
rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5
return points;
}