mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 14:28:59 +08:00
Nick: extract api reference
This commit is contained in:
parent
ce3c54d7c7
commit
522c5b35da
@ -30,6 +30,7 @@ export async function extractStatusController(
|
|||||||
data = jobData[0].docs;
|
data = jobData[0].docs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log(extract.sources);
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: extract.status === "failed" ? false : true,
|
success: extract.status === "failed" ? false : true,
|
||||||
data: data,
|
data: data,
|
||||||
@ -38,5 +39,6 @@ export async function extractStatusController(
|
|||||||
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
|
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
|
||||||
steps: extract.showSteps ? extract.steps : undefined,
|
steps: extract.showSteps ? extract.steps : undefined,
|
||||||
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
|
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
|
||||||
|
sources: extract.sources,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -537,6 +537,7 @@ export interface URLTrace {
|
|||||||
};
|
};
|
||||||
relevanceScore?: number;
|
relevanceScore?: number;
|
||||||
usedInCompletion?: boolean;
|
usedInCompletion?: boolean;
|
||||||
|
extractedFields?: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtractResponse {
|
export interface ExtractResponse {
|
||||||
@ -547,6 +548,9 @@ export interface ExtractResponse {
|
|||||||
id?: string;
|
id?: string;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
urlTrace?: URLTrace[];
|
urlTrace?: URLTrace[];
|
||||||
|
sources?: {
|
||||||
|
[key: string]: string[];
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtractResponseRequestTest {
|
export interface ExtractResponseRequestTest {
|
||||||
|
@ -32,6 +32,9 @@ export type StoredExtract = {
|
|||||||
steps?: ExtractedStep[];
|
steps?: ExtractedStep[];
|
||||||
showLLMUsage?: boolean;
|
showLLMUsage?: boolean;
|
||||||
llmUsage?: number;
|
llmUsage?: number;
|
||||||
|
sources?: {
|
||||||
|
[key: string]: string[];
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// Reduce TTL to 6 hours instead of 24
|
// Reduce TTL to 6 hours instead of 24
|
||||||
|
@ -56,6 +56,9 @@ interface ExtractResult {
|
|||||||
tokenUsageBreakdown?: TokenUsage[];
|
tokenUsageBreakdown?: TokenUsage[];
|
||||||
llmUsage?: number;
|
llmUsage?: number;
|
||||||
totalUrlsScraped?: number;
|
totalUrlsScraped?: number;
|
||||||
|
sources?: {
|
||||||
|
[key: string]: string[];
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
async function analyzeSchemaAndPrompt(
|
async function analyzeSchemaAndPrompt(
|
||||||
@ -179,6 +182,45 @@ function getRootDomain(url: string): string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add helper function to track sources
|
||||||
|
/**
 * Collects the dotted paths of every non-null, non-undefined value in `data`.
 *
 * Each populated key contributes its own path, and nested objects/arrays are
 * walked depth-first so that a parent path is always listed immediately before
 * the paths of its children. Array elements are addressed by their index
 * (for example `items.0.name`).
 *
 * @param data - Value to inspect; anything that is not a non-null object yields no paths.
 * @param url - Not read by this function; retained so existing call sites keep working.
 * @param parentPath - Path prefix prepended (with a `.` separator) to every key found.
 * @returns The populated field paths in depth-first, pre-order sequence.
 */
function trackFieldSources(data: any, url: string, parentPath: string = ''): string[] {
  const collected: string[] = [];

  // Walks one level of the structure, appending to the shared accumulator.
  const visit = (node: any, prefix: string): void => {
    if (!node || typeof node !== 'object') {
      return;
    }

    for (const [name, child] of Object.entries(node)) {
      // Null/undefined values count as "not extracted" and are skipped entirely.
      if (child === null || child === undefined) {
        continue;
      }

      const path = prefix ? `${prefix}.${name}` : name;
      collected.push(path);

      if (typeof child === 'object') {
        visit(child, path);
      }
    }
  };

  visit(data, parentPath);
  return collected;
}
|
||||||
|
|
||||||
|
// Add helper to merge sources from multiple extractions
|
||||||
|
/**
 * Merges several field-to-URL source maps into a single map.
 *
 * For every field, the URLs from all input maps are concatenated in the order
 * they are encountered and de-duplicated, keeping the first occurrence of each
 * URL. Fields appear in the result in the order they are first seen. The input
 * maps and their arrays are not modified.
 *
 * Accumulation happens in a `Map` rather than a plain object so that field
 * names which collide with `Object.prototype` members (e.g. `constructor`,
 * `toString`, `__proto__`) are treated as ordinary keys instead of resolving
 * to inherited values.
 *
 * @param sources - Source maps to combine; each maps a field path to the URLs it came from.
 * @returns A new map from field path to its unique source URLs.
 */
function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: string[] } {
  const urlsByField = new Map<string, Set<string>>();

  for (const sourceMap of sources) {
    for (const [field, urls] of Object.entries(sourceMap)) {
      let seen = urlsByField.get(field);
      if (seen === undefined) {
        seen = new Set<string>();
        urlsByField.set(field, seen);
      }

      // A Set preserves insertion order, so first-seen URL order is retained.
      for (const url of urls) {
        seen.add(url);
      }
    }
  }

  // Object.fromEntries defines own data properties, so even a `__proto__`
  // field ends up as a regular key on the returned object.
  return Object.fromEntries(
    Array.from(urlsByField, ([field, urls]): [string, string[]] => [field, [...urls]]),
  );
}
|
||||||
|
|
||||||
export async function performExtraction(
|
export async function performExtraction(
|
||||||
extractId: string,
|
extractId: string,
|
||||||
options: ExtractServiceOptions,
|
options: ExtractServiceOptions,
|
||||||
@ -191,6 +233,7 @@ export async function performExtraction(
|
|||||||
let multiEntityResult: any = {};
|
let multiEntityResult: any = {};
|
||||||
let singleAnswerResult: any = {};
|
let singleAnswerResult: any = {};
|
||||||
let totalUrlsScraped = 0;
|
let totalUrlsScraped = 0;
|
||||||
|
let extractionSources: { [key: string]: string[] } = {};
|
||||||
|
|
||||||
const logger = _logger.child({
|
const logger = _logger.child({
|
||||||
module: "extract",
|
module: "extract",
|
||||||
@ -551,6 +594,24 @@ export async function performExtraction(
|
|||||||
// return null;
|
// return null;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
if (multiEntityCompletion?.extract) {
|
||||||
|
const extractedFields = trackFieldSources(multiEntityCompletion.extract, doc.metadata.url || doc.metadata.sourceURL!);
|
||||||
|
|
||||||
|
// Update URL trace with extracted fields
|
||||||
|
const trace = urlTraces.find(t => t.url === (doc.metadata.url || doc.metadata.sourceURL!));
|
||||||
|
if (trace) {
|
||||||
|
trace.extractedFields = extractedFields;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track sources for each field
|
||||||
|
extractedFields.forEach(field => {
|
||||||
|
if (!extractionSources[field]) {
|
||||||
|
extractionSources[field] = [];
|
||||||
|
}
|
||||||
|
extractionSources[field].push(doc.metadata.url || doc.metadata.sourceURL!);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
return multiEntityCompletion.extract;
|
return multiEntityCompletion.extract;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! });
|
logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! });
|
||||||
@ -727,6 +788,21 @@ export async function performExtraction(
|
|||||||
// }
|
// }
|
||||||
// });
|
// });
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
if (singleAnswerCompletions?.extract) {
|
||||||
|
const singleAnswerSources: { [key: string]: string[] } = {};
|
||||||
|
const usedUrls = Array.from(docsMap.values())
|
||||||
|
.map(doc => doc.metadata.url || doc.metadata.sourceURL!)
|
||||||
|
.filter(Boolean);
|
||||||
|
|
||||||
|
const extractedFields = trackFieldSources(singleAnswerCompletions.extract, '');
|
||||||
|
extractedFields.forEach(field => {
|
||||||
|
singleAnswerSources[field] = usedUrls;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Merge with multi-entity sources
|
||||||
|
extractionSources = mergeSources([extractionSources, singleAnswerSources]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let finalResult = reqSchema
|
let finalResult = reqSchema
|
||||||
@ -817,6 +893,7 @@ export async function performExtraction(
|
|||||||
updateExtract(extractId, {
|
updateExtract(extractId, {
|
||||||
status: "completed",
|
status: "completed",
|
||||||
llmUsage,
|
llmUsage,
|
||||||
|
sources: extractionSources
|
||||||
}).catch((error) => {
|
}).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to update extract ${extractId} status to completed: ${error}`,
|
`Failed to update extract ${extractId} status to completed: ${error}`,
|
||||||
@ -834,5 +911,6 @@ export async function performExtraction(
|
|||||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||||
llmUsage,
|
llmUsage,
|
||||||
totalUrlsScraped,
|
totalUrlsScraped,
|
||||||
|
sources: extractionSources
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -227,6 +227,7 @@ export function getRateLimiterPoints(
|
|||||||
|
|
||||||
const points: number =
|
const points: number =
|
||||||
rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5
|
rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5
|
||||||
|
|
||||||
return points;
|
return points;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user