mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-01 04:22:03 +08:00
wip
This commit is contained in:
parent
80b507e64e
commit
f92217e3b6
@ -73,5 +73,6 @@ export async function extractStatusController(
|
|||||||
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
|
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
|
||||||
sources: extract.showSources ? extract.sources : undefined,
|
sources: extract.showSources ? extract.sources : undefined,
|
||||||
costTracking: extract.showCostTracking ? extract.costTracking : undefined,
|
costTracking: extract.showCostTracking ? extract.costTracking : undefined,
|
||||||
|
sessionIds: extract.sessionIds ? extract.sessionIds : undefined,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -23,6 +23,7 @@ type BatchExtractOptions = {
|
|||||||
doc: Document;
|
doc: Document;
|
||||||
useAgent: boolean;
|
useAgent: boolean;
|
||||||
extractId?: string;
|
extractId?: string;
|
||||||
|
sessionId?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -44,9 +45,17 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
|
|||||||
otherCost: number;
|
otherCost: number;
|
||||||
smartScrapeCallCount: number;
|
smartScrapeCallCount: number;
|
||||||
otherCallCount: number;
|
otherCallCount: number;
|
||||||
|
sessionId?: string;
|
||||||
}> {
|
}> {
|
||||||
const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent, extractId } = options;
|
const {
|
||||||
|
multiEntitySchema,
|
||||||
|
links,
|
||||||
|
prompt,
|
||||||
|
systemPrompt,
|
||||||
|
doc,
|
||||||
|
useAgent,
|
||||||
|
extractId,
|
||||||
|
sessionId } = options;
|
||||||
|
|
||||||
const generationOptions: GenerateCompletionsOptions = {
|
const generationOptions: GenerateCompletionsOptions = {
|
||||||
logger: logger.child({
|
logger: logger.child({
|
||||||
@ -71,11 +80,19 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
|
|||||||
let warning: string | undefined;
|
let warning: string | undefined;
|
||||||
let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;
|
let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;
|
||||||
try {
|
try {
|
||||||
const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
|
const {
|
||||||
|
extractedDataArray: e,
|
||||||
|
warning: w,
|
||||||
|
smartScrapeCost,
|
||||||
|
otherCost,
|
||||||
|
smartScrapeCallCount,
|
||||||
|
otherCallCount
|
||||||
|
} = await extractData({
|
||||||
extractOptions: generationOptions,
|
extractOptions: generationOptions,
|
||||||
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
|
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
|
||||||
useAgent,
|
useAgent,
|
||||||
extractId,
|
extractId,
|
||||||
|
sessionId
|
||||||
});
|
});
|
||||||
extractedDataArray = e;
|
extractedDataArray = e;
|
||||||
warning = w;
|
warning = w;
|
||||||
|
@ -8,6 +8,7 @@ export enum ExtractStep {
|
|||||||
MAP_RERANK = "map-rerank",
|
MAP_RERANK = "map-rerank",
|
||||||
MULTI_ENTITY = "multi-entity",
|
MULTI_ENTITY = "multi-entity",
|
||||||
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
|
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
|
||||||
|
MULTI_ENTITY_AGENT_SCRAPE = "multi-entity-agent-scrape",
|
||||||
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
|
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
|
||||||
SCRAPE = "scrape",
|
SCRAPE = "scrape",
|
||||||
EXTRACT = "extract",
|
EXTRACT = "extract",
|
||||||
@ -17,7 +18,7 @@ export enum ExtractStep {
|
|||||||
export type ExtractedStep = {
|
export type ExtractedStep = {
|
||||||
step: ExtractStep;
|
step: ExtractStep;
|
||||||
startedAt: number;
|
startedAt: number;
|
||||||
finishedAt: number;
|
finishedAt: number | null;
|
||||||
error?: any;
|
error?: any;
|
||||||
discoveredLinks?: string[];
|
discoveredLinks?: string[];
|
||||||
};
|
};
|
||||||
@ -38,6 +39,7 @@ export type StoredExtract = {
|
|||||||
sources?: {
|
sources?: {
|
||||||
[key: string]: string[];
|
[key: string]: string[];
|
||||||
};
|
};
|
||||||
|
sessionIds?: string[];
|
||||||
};
|
};
|
||||||
|
|
||||||
// Reduce TTL to 6 hours instead of 24
|
// Reduce TTL to 6 hours instead of 24
|
||||||
@ -107,6 +109,8 @@ export async function updateExtract(
|
|||||||
}))
|
}))
|
||||||
};
|
};
|
||||||
|
|
||||||
|
console.log(minimalExtract.sessionIds)
|
||||||
|
|
||||||
await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
|
await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
|
||||||
await redisConnection.expire("extract:" + id, EXTRACT_TTL);
|
await redisConnection.expire("extract:" + id, EXTRACT_TTL);
|
||||||
}
|
}
|
||||||
|
@ -413,8 +413,23 @@ export async function performExtraction(
|
|||||||
chunks.push(multyEntityDocs.slice(i, i + chunkSize));
|
chunks.push(multyEntityDocs.slice(i, i + chunkSize));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const sessionIds = chunks.map(() => 'fc-' + crypto.randomUUID());
|
||||||
|
await updateExtract(extractId, {
|
||||||
|
status: "processing",
|
||||||
|
steps: [
|
||||||
|
{
|
||||||
|
step: ExtractStep.MULTI_ENTITY_AGENT_SCRAPE,
|
||||||
|
startedAt: Date.now(),
|
||||||
|
finishedAt: null
|
||||||
|
},
|
||||||
|
],
|
||||||
|
sessionIds
|
||||||
|
});
|
||||||
|
|
||||||
// Process chunks sequentially with timeout
|
// Process chunks sequentially with timeout
|
||||||
for (const chunk of chunks) {
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
|
const chunk = chunks[i];
|
||||||
|
const sessionId = sessionIds[i];
|
||||||
const chunkPromises = chunk.map(async (doc) => {
|
const chunkPromises = chunk.map(async (doc) => {
|
||||||
try {
|
try {
|
||||||
ajv.compile(multiEntitySchema);
|
ajv.compile(multiEntitySchema);
|
||||||
@ -432,6 +447,7 @@ export async function performExtraction(
|
|||||||
doc,
|
doc,
|
||||||
useAgent: isAgentExtractModelValid(request.agent?.model),
|
useAgent: isAgentExtractModelValid(request.agent?.model),
|
||||||
extractId,
|
extractId,
|
||||||
|
sessionId
|
||||||
}, logger);
|
}, logger);
|
||||||
|
|
||||||
// Race between timeout and completion
|
// Race between timeout and completion
|
||||||
|
@ -10,6 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown";
|
|||||||
import { getModel } from "../../../lib/generic-ai";
|
import { getModel } from "../../../lib/generic-ai";
|
||||||
import { TokenUsage } from "../../../controllers/v1/types";
|
import { TokenUsage } from "../../../controllers/v1/types";
|
||||||
import type { SmartScrapeResult } from "./smartScrape";
|
import type { SmartScrapeResult } from "./smartScrape";
|
||||||
|
import { ExtractStep } from "src/lib/extract/extract-redis";
|
||||||
|
|
||||||
const commonSmartScrapeProperties = {
|
const commonSmartScrapeProperties = {
|
||||||
shouldUseSmartscrape: {
|
shouldUseSmartscrape: {
|
||||||
@ -185,11 +186,13 @@ export async function extractData({
|
|||||||
urls,
|
urls,
|
||||||
useAgent,
|
useAgent,
|
||||||
extractId,
|
extractId,
|
||||||
|
sessionId
|
||||||
}: {
|
}: {
|
||||||
extractOptions: GenerateCompletionsOptions;
|
extractOptions: GenerateCompletionsOptions;
|
||||||
urls: string[];
|
urls: string[];
|
||||||
useAgent: boolean;
|
useAgent: boolean;
|
||||||
extractId?: string;
|
extractId?: string;
|
||||||
|
sessionId?: string;
|
||||||
}): Promise<{
|
}): Promise<{
|
||||||
extractedDataArray: any[];
|
extractedDataArray: any[];
|
||||||
warning: any;
|
warning: any;
|
||||||
@ -275,7 +278,7 @@ export async function extractData({
|
|||||||
let smartscrapeResults: SmartScrapeResult[];
|
let smartscrapeResults: SmartScrapeResult[];
|
||||||
if (isSingleUrl) {
|
if (isSingleUrl) {
|
||||||
smartscrapeResults = [
|
smartscrapeResults = [
|
||||||
await smartScrape(urls[0], extract?.smartscrape_prompt, extractId),
|
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
|
||||||
];
|
];
|
||||||
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
||||||
smartScrapeCallCount++;
|
smartScrapeCallCount++;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user