Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 07:29:04 +08:00)
Nick: extract api reference

This commit is contained in:
  parent ce3c54d7c7
  commit 522c5b35da
@@ -30,6 +30,7 @@ export async function extractStatusController(
    data = jobData[0].docs;
  }

  console.log(extract.sources);
  return res.status(200).json({
    success: extract.status === "failed" ? false : true,
    data: data,
@@ -38,5 +39,6 @@ export async function extractStatusController(
    expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
    steps: extract.showSteps ? extract.steps : undefined,
    llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
    sources: extract.sources,
  });
}
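The status controller now returns the per-field source map alongside the extract data. A minimal client-side sketch of reading it follows; the endpoint path, base URL, and auth header are assumptions about the public API, not part of this diff:

// Sketch: fetch an extract job's status and read the new `sources` map.
// `sources` maps a dotted field path (e.g. "company.name") to the URLs it was extracted from.
async function getExtractSources(
  jobId: string,
  apiKey: string,
): Promise<{ [key: string]: string[] }> {
  // Assumed endpoint shape; adjust to your deployment.
  const res = await fetch(`https://api.firecrawl.dev/v1/extract/${jobId}`, {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  const body = await res.json();
  return body.sources ?? {};
}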
@@ -537,6 +537,7 @@ export interface URLTrace {
  };
  relevanceScore?: number;
  usedInCompletion?: boolean;
  extractedFields?: string[];
}

export interface ExtractResponse {
@@ -547,6 +548,9 @@ export interface ExtractResponse {
  id?: string;
  warning?: string;
  urlTrace?: URLTrace[];
  sources?: {
    [key: string]: string[];
  };
}

export interface ExtractResponseRequestTest {
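For illustration, a sources value conforming to this interface could look like the following; the field paths and URLs are invented examples, not taken from the codebase:

// Hypothetical value for ExtractResponse["sources"]
const exampleSources: { [key: string]: string[] } = {
  "company.name": ["https://example.com/about"],
  "company.founders": ["https://example.com/about", "https://example.com/team"],
  "pricing.plans": ["https://example.com/pricing"],
};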
@@ -32,6 +32,9 @@ export type StoredExtract = {
  steps?: ExtractedStep[];
  showLLMUsage?: boolean;
  llmUsage?: number;
  sources?: {
    [key: string]: string[];
  };
};

// Reduce TTL to 6 hours instead of 24
@@ -56,6 +56,9 @@ interface ExtractResult {
  tokenUsageBreakdown?: TokenUsage[];
  llmUsage?: number;
  totalUrlsScraped?: number;
  sources?: {
    [key: string]: string[];
  };
}

async function analyzeSchemaAndPrompt(
@@ -179,6 +182,45 @@ function getRootDomain(url: string): string {
  }
}

// Add helper function to track sources
function trackFieldSources(data: any, url: string, parentPath: string = ''): string[] {
  const extractedFields: string[] = [];

  if (data && typeof data === 'object') {
    Object.entries(data).forEach(([key, value]) => {
      const currentPath = parentPath ? `${parentPath}.${key}` : key;

      if (value !== null && value !== undefined) {
        extractedFields.push(currentPath);

        if (typeof value === 'object') {
          extractedFields.push(...trackFieldSources(value, url, currentPath));
        }
      }
    });
  }

  return extractedFields;
}

// Add helper to merge sources from multiple extractions
function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: string[] } {
  const mergedSources: { [key: string]: string[] } = {};

  sources.forEach(sourceMap => {
    Object.entries(sourceMap).forEach(([field, urls]) => {
      if (!mergedSources[field]) {
        mergedSources[field] = [];
      }
      mergedSources[field].push(...urls);
      // Deduplicate URLs
      mergedSources[field] = [...new Set(mergedSources[field])];
    });
  });

  return mergedSources;
}

export async function performExtraction(
  extractId: string,
  options: ExtractServiceOptions,
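A quick usage sketch of the two helpers, assuming they are in scope as defined in the hunk above; the documents and URLs are invented:

// trackFieldSources walks an extracted object and returns a dotted path for every
// non-null field; mergeSources unions per-field URL lists and de-duplicates them.
const docAExtract = { company: { name: "Acme", founded: 2015 }, ceo: "Jane Doe" };
const docBExtract = { company: { name: "Acme" }, headquarters: "Berlin" };

const sourcesA: { [key: string]: string[] } = {};
for (const field of trackFieldSources(docAExtract, "https://example.com/about")) {
  sourcesA[field] = ["https://example.com/about"];
}

const sourcesB: { [key: string]: string[] } = {};
for (const field of trackFieldSources(docBExtract, "https://example.com/company")) {
  sourcesB[field] = ["https://example.com/company"];
}

// "company" and "company.name" end up attributed to both URLs; the other fields to one each.
console.log(mergeSources([sourcesA, sourcesB]));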
@@ -191,6 +233,7 @@ export async function performExtraction(
  let multiEntityResult: any = {};
  let singleAnswerResult: any = {};
  let totalUrlsScraped = 0;
  let extractionSources: { [key: string]: string[] } = {};

  const logger = _logger.child({
    module: "extract",
@@ -551,6 +594,24 @@ export async function performExtraction(
        // return null;
        // }

        if (multiEntityCompletion?.extract) {
          const extractedFields = trackFieldSources(multiEntityCompletion.extract, doc.metadata.url || doc.metadata.sourceURL!);

          // Update URL trace with extracted fields
          const trace = urlTraces.find(t => t.url === (doc.metadata.url || doc.metadata.sourceURL!));
          if (trace) {
            trace.extractedFields = extractedFields;
          }

          // Track sources for each field
          extractedFields.forEach(field => {
            if (!extractionSources[field]) {
              extractionSources[field] = [];
            }
            extractionSources[field].push(doc.metadata.url || doc.metadata.sourceURL!);
          });
        }

        return multiEntityCompletion.extract;
      } catch (error) {
        logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! });
@@ -727,6 +788,21 @@ export async function performExtraction(
    // }
    // });
    // }

    if (singleAnswerCompletions?.extract) {
      const singleAnswerSources: { [key: string]: string[] } = {};
      const usedUrls = Array.from(docsMap.values())
        .map(doc => doc.metadata.url || doc.metadata.sourceURL!)
        .filter(Boolean);

      const extractedFields = trackFieldSources(singleAnswerCompletions.extract, '');
      extractedFields.forEach(field => {
        singleAnswerSources[field] = usedUrls;
      });

      // Merge with multi-entity sources
      extractionSources = mergeSources([extractionSources, singleAnswerSources]);
    }
  }

  let finalResult = reqSchema
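Note the coarser attribution in this branch: single-answer extraction maps every extracted field to the full set of scraped URLs, whereas the multi-entity path above records the specific source document per field. A hypothetical illustration of the resulting merge, assuming mergeSources from the earlier hunk is in scope:

// Multi-entity sources are per-document; single-answer sources list all used URLs.
const merged = mergeSources([
  { "products.0.name": ["https://example.com/p/1"] },
  { "company_mission": ["https://example.com/about", "https://example.com/p/1"] },
]);
// => { "products.0.name": ["https://example.com/p/1"],
//      "company_mission": ["https://example.com/about", "https://example.com/p/1"] }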
@@ -817,6 +893,7 @@ export async function performExtraction(
  updateExtract(extractId, {
    status: "completed",
    llmUsage,
    sources: extractionSources
  }).catch((error) => {
    logger.error(
      `Failed to update extract ${extractId} status to completed: ${error}`,
@@ -834,5 +911,6 @@ export async function performExtraction(
    urlTrace: request.urlTrace ? urlTraces : undefined,
    llmUsage,
    totalUrlsScraped,
    sources: extractionSources
  };
}
@@ -227,6 +227,7 @@ export function getRateLimiterPoints(

  const points: number =
    rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5

  return points;
}