mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-01 11:02:03 +08:00

* wip * integrating smart-scrape * integrate smartscrape into llmExtract * wip * smart scrape multiple links * fixes * fix * wip * it worked! * wip. there's a bug on the batchExtract TypeError: Converting circular structure to JSON * wip * retry model * retry models * feat/scrape+json+extract interfaces ready * vertex -> googleapi * fix/transformArrayToObject. required params on schema is still a bug * change model * o3-mini -> gemini * Update extractSmartScrape.ts * sessionId * sessionId * Nick: f-0 start * Update extraction-service-f0.ts * Update types.ts * Nick: * Update queue-worker.ts * Nick: new interface * rename analyzeSchemaAndPrompt -> F0 * refactor: rename agent ID to model in types and extract logic * agent * id->model * id->model * refactor: standardize agent model handling and validation across extraction logic * livecast agent * (feat/f1) sdks (#1459) * feat: add FIRE-1 agent support to Python and JavaScript SDKs Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev> * feat: add FIRE-1 agent support to scrape methods in both SDKs Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev> * feat: add prompt and sessionId to AgentOptions interface Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev> * Update index.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev <hello@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com> * feat(v1): rate limits * Update types.ts * Update llmExtract.ts * add cost tracking * remove * Update requests.http * fix smart scrape cost calc * log sm cost * fix counts * fix * expose cost tracking * models fix * temp: skipLibcheck * get rid of it * fix ts * dont skip lib check * Update extractSmartScrape.ts * Update queue-worker.ts * Update smartScrape.ts * Update requests.http * fix(rate-limiter): * types: fire-1 refine * bill 150 * fix credits used on crawl * ban from crawl * route cost limit warning * Update 
generic-ai.ts * genres * Update llmExtract.ts * test server diff * cletu --------- Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Co-authored-by: Thomas Kosmas <thomas510111@gmail.com> Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com> Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev <hello@sideguide.dev> Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
95 lines
2.2 KiB
TypeScript
import {
|
|
generateCompletions,
|
|
generateSchemaFromPrompt,
|
|
} from "../../../scraper/scrapeURL/transformers/llmExtract";
|
|
import { TokenUsage } from "../../../controllers/v1/types";
|
|
import { z } from "zod";
|
|
import {
|
|
buildAnalyzeSchemaPrompt,
|
|
buildAnalyzeSchemaUserPrompt,
|
|
} from "../build-prompts";
|
|
import { logger } from "../../../lib/logger";
|
|
import { jsonSchema } from "ai";
|
|
import { getModel } from "../../../lib/generic-ai";
|
|
|
|
export async function analyzeSchemaAndPrompt(
|
|
urls: string[],
|
|
schema: any,
|
|
prompt: string,
|
|
): Promise<{
|
|
isMultiEntity: boolean;
|
|
multiEntityKeys: string[];
|
|
reasoning: string;
|
|
keyIndicators: string[];
|
|
tokenUsage: TokenUsage;
|
|
cost: number;
|
|
}> {
|
|
let cost = 0;
|
|
if (!schema) {
|
|
const genRes = await generateSchemaFromPrompt(prompt);
|
|
schema = genRes.extract;
|
|
cost = genRes.cost;
|
|
}
|
|
|
|
const schemaString = JSON.stringify(schema);
|
|
|
|
const model = getModel("gpt-4o");
|
|
|
|
const checkSchema = z
|
|
.object({
|
|
isMultiEntity: z.boolean(),
|
|
multiEntityKeys: z.array(z.string()).optional().default([]),
|
|
reasoning: z.string(),
|
|
keyIndicators: z.array(z.string()),
|
|
})
|
|
.refine(
|
|
(x) => !x.isMultiEntity || x.multiEntityKeys.length > 0,
|
|
"isMultiEntity was true, but no multiEntityKeys",
|
|
);
|
|
|
|
try {
|
|
const { extract: result, totalUsage, cost: cost2 } = await generateCompletions({
|
|
logger,
|
|
options: {
|
|
mode: "llm",
|
|
schema: checkSchema,
|
|
prompt: buildAnalyzeSchemaUserPrompt(schemaString, prompt, urls),
|
|
systemPrompt: buildAnalyzeSchemaPrompt(),
|
|
},
|
|
markdown: "",
|
|
model,
|
|
});
|
|
cost += cost2;
|
|
|
|
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
|
|
checkSchema.parse(result);
|
|
|
|
return {
|
|
isMultiEntity,
|
|
multiEntityKeys,
|
|
reasoning,
|
|
keyIndicators,
|
|
tokenUsage: totalUsage,
|
|
cost,
|
|
};
|
|
} catch (e) {
|
|
logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", {
|
|
error: e,
|
|
});
|
|
}
|
|
|
|
return {
|
|
isMultiEntity: false,
|
|
multiEntityKeys: [],
|
|
reasoning: "",
|
|
keyIndicators: [],
|
|
tokenUsage: {
|
|
promptTokens: 0,
|
|
completionTokens: 0,
|
|
totalTokens: 0,
|
|
model: model.modelId,
|
|
},
|
|
cost: 0,
|
|
};
|
|
}
|