mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 05:52:00 +08:00
wip
This commit is contained in:
parent
80b507e64e
commit
f92217e3b6
@ -73,5 +73,6 @@ export async function extractStatusController(
|
||||
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
|
||||
sources: extract.showSources ? extract.sources : undefined,
|
||||
costTracking: extract.showCostTracking ? extract.costTracking : undefined,
|
||||
sessionIds: extract.sessionIds ? extract.sessionIds : undefined,
|
||||
});
|
||||
}
|
||||
|
@ -23,6 +23,7 @@ type BatchExtractOptions = {
|
||||
doc: Document;
|
||||
useAgent: boolean;
|
||||
extractId?: string;
|
||||
sessionId?: string;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -44,9 +45,17 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
|
||||
otherCost: number;
|
||||
smartScrapeCallCount: number;
|
||||
otherCallCount: number;
|
||||
sessionId?: string;
|
||||
}> {
|
||||
const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options;
|
||||
|
||||
const {
|
||||
multiEntitySchema,
|
||||
links,
|
||||
prompt,
|
||||
systemPrompt,
|
||||
doc,
|
||||
useAgent,
|
||||
extractId,
|
||||
sessionId } = options;
|
||||
|
||||
const generationOptions: GenerateCompletionsOptions = {
|
||||
logger: logger.child({
|
||||
@ -71,11 +80,19 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
|
||||
let warning: string | undefined;
|
||||
let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;
|
||||
try {
|
||||
const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
|
||||
const {
|
||||
extractedDataArray: e,
|
||||
warning: w,
|
||||
smartScrapeCost,
|
||||
otherCost,
|
||||
smartScrapeCallCount,
|
||||
otherCallCount
|
||||
} = await extractData({
|
||||
extractOptions: generationOptions,
|
||||
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
|
||||
useAgent,
|
||||
extractId,
|
||||
sessionId
|
||||
});
|
||||
extractedDataArray = e;
|
||||
warning = w;
|
||||
|
@ -8,6 +8,7 @@ export enum ExtractStep {
|
||||
MAP_RERANK = "map-rerank",
|
||||
MULTI_ENTITY = "multi-entity",
|
||||
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
|
||||
MULTI_ENTITY_AGENT_SCRAPE = "multi-entity-agent-scrape",
|
||||
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
|
||||
SCRAPE = "scrape",
|
||||
EXTRACT = "extract",
|
||||
@ -17,7 +18,7 @@ export enum ExtractStep {
|
||||
export type ExtractedStep = {
|
||||
step: ExtractStep;
|
||||
startedAt: number;
|
||||
finishedAt: number;
|
||||
finishedAt: number | null;
|
||||
error?: any;
|
||||
discoveredLinks?: string[];
|
||||
};
|
||||
@ -38,6 +39,7 @@ export type StoredExtract = {
|
||||
sources?: {
|
||||
[key: string]: string[];
|
||||
};
|
||||
sessionIds?: string[];
|
||||
};
|
||||
|
||||
// Reduce TTL to 6 hours instead of 24
|
||||
@ -107,6 +109,8 @@ export async function updateExtract(
|
||||
}))
|
||||
};
|
||||
|
||||
console.log(minimalExtract.sessionIds)
|
||||
|
||||
await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
|
||||
await redisConnection.expire("extract:" + id, EXTRACT_TTL);
|
||||
}
|
||||
|
@ -413,8 +413,23 @@ export async function performExtraction(
|
||||
chunks.push(multyEntityDocs.slice(i, i + chunkSize));
|
||||
}
|
||||
|
||||
const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID());
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE,
|
||||
startedAt: Date.now(),
|
||||
finishedAt: null
|
||||
},
|
||||
],
|
||||
sessionIds
|
||||
});
|
||||
|
||||
// Process chunks sequentially with timeout
|
||||
for (const chunk of chunks) {
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
const sessionId = sessionIds[i];
|
||||
const chunkPromises = chunk.map(async (doc) => {
|
||||
try {
|
||||
ajv.compile(multiEntitySchema);
|
||||
@ -432,6 +447,7 @@ export async function performExtraction(
|
||||
doc,
|
||||
useAgent: isAgentExtractModelValid(request.agent?.model),
|
||||
extractId,
|
||||
sessionId
|
||||
}, logger);
|
||||
|
||||
// Race between timeout and completion
|
||||
|
@ -10,6 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown";
|
||||
import { getModel } from "../../../lib/generic-ai";
|
||||
import { TokenUsage } from "../../../controllers/v1/types";
|
||||
import type { SmartScrapeResult } from "./smartScrape";
|
||||
import { ExtractStep } from "src/lib/extract/extract-redis";
|
||||
|
||||
const commonSmartScrapeProperties = {
|
||||
shouldUseSmartscrape: {
|
||||
@ -185,11 +186,13 @@ export async function extractData({
|
||||
urls,
|
||||
useAgent,
|
||||
extractId,
|
||||
sessionId
|
||||
}: {
|
||||
extractOptions: GenerateCompletionsOptions;
|
||||
urls: string[];
|
||||
useAgent: boolean;
|
||||
extractId?: string;
|
||||
sessionId?: string;
|
||||
}): Promise<{
|
||||
extractedDataArray: any[];
|
||||
warning: any;
|
||||
@ -275,7 +278,7 @@ export async function extractData({
|
||||
let smartscrapeResults: SmartScrapeResult[];
|
||||
if (isSingleUrl) {
|
||||
smartscrapeResults = [
|
||||
await smartScrape(urls[0], extract?.smartscrape_prompt, extractId),
|
||||
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
|
||||
];
|
||||
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
||||
smartScrapeCallCount++;
|
||||
|
Loading…
x
Reference in New Issue
Block a user