mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 02:12:01 +08:00
Merge branch 'main' into rafa/sessionIdsExtract
This commit is contained in:
commit
51967c7c3d
@ -8,14 +8,15 @@ import {
|
||||
buildAnalyzeSchemaPrompt,
|
||||
buildAnalyzeSchemaUserPrompt,
|
||||
} from "../build-prompts";
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { jsonSchema } from "ai";
|
||||
import { getModel } from "../../../lib/generic-ai";
|
||||
import { Logger } from "winston";
|
||||
|
||||
export async function analyzeSchemaAndPrompt(
|
||||
urls: string[],
|
||||
schema: any,
|
||||
prompt: string,
|
||||
logger: Logger,
|
||||
): Promise<{
|
||||
isMultiEntity: boolean;
|
||||
multiEntityKeys: string[];
|
||||
@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt(
|
||||
}> {
|
||||
let cost = 0;
|
||||
if (!schema) {
|
||||
const genRes = await generateSchemaFromPrompt(prompt);
|
||||
const genRes = await generateSchemaFromPrompt(prompt, logger);
|
||||
schema = genRes.extract;
|
||||
cost = genRes.cost;
|
||||
}
|
||||
|
@ -23,7 +23,7 @@ export async function singleAnswerCompletion({
|
||||
prompt: string;
|
||||
systemPrompt: string;
|
||||
useAgent: boolean;
|
||||
extractId?: string;
|
||||
extractId: string;
|
||||
}): Promise<{
|
||||
extract: any;
|
||||
tokenUsage: TokenUsage;
|
||||
@ -35,7 +35,11 @@ export async function singleAnswerCompletion({
|
||||
}> {
|
||||
const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
|
||||
const generationOptions: GenerateCompletionsOptions = {
|
||||
logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
||||
logger: logger.child({
|
||||
module: "extract",
|
||||
method: "generateCompletions",
|
||||
extractId,
|
||||
}),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
|
@ -178,7 +178,7 @@ export async function performExtraction(
|
||||
|
||||
let reqSchema = request.schema;
|
||||
if (!reqSchema && request.prompt) {
|
||||
const schemaGenRes = await generateSchemaFromPrompt(request.prompt);
|
||||
const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger);
|
||||
reqSchema = schemaGenRes.extract;
|
||||
costTracking.otherCallCount++;
|
||||
costTracking.otherCost += schemaGenRes.cost;
|
||||
@ -214,7 +214,7 @@ export async function performExtraction(
|
||||
keyIndicators,
|
||||
tokenUsage: schemaAnalysisTokenUsage,
|
||||
cost: schemaAnalysisCost,
|
||||
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "");
|
||||
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger);
|
||||
|
||||
logger.debug("Analyzed schema.", {
|
||||
isMultiEntity,
|
||||
|
@ -186,13 +186,15 @@ export async function extractData({
|
||||
urls,
|
||||
useAgent,
|
||||
extractId,
|
||||
sessionId
|
||||
sessionId,
|
||||
scrapeId,
|
||||
}: {
|
||||
extractOptions: GenerateCompletionsOptions;
|
||||
urls: string[];
|
||||
useAgent: boolean;
|
||||
extractId?: string;
|
||||
sessionId?: string;
|
||||
scrapeId?: string;
|
||||
}): Promise<{
|
||||
extractedDataArray: any[];
|
||||
warning: any;
|
||||
@ -214,7 +216,7 @@ export async function extractData({
|
||||
|
||||
if (!schema && extractOptions.options.prompt) {
|
||||
logger.info("Generating schema from prompt");
|
||||
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt);
|
||||
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger);
|
||||
otherCallCount++;
|
||||
otherCost += genRes.cost;
|
||||
schema = genRes.extract;
|
||||
@ -252,7 +254,7 @@ export async function extractData({
|
||||
} catch (error) {
|
||||
logger.error(
|
||||
"failed during extractSmartScrape.ts:generateCompletions",
|
||||
error,
|
||||
{ error },
|
||||
);
|
||||
// console.log("failed during extractSmartScrape.ts:generateCompletions", error);
|
||||
}
|
||||
@ -263,34 +265,41 @@ export async function extractData({
|
||||
// console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
|
||||
// console.log("smartscrape_prompt", extract?.smartscrape_prompt);
|
||||
try {
|
||||
console.log("=========================================");
|
||||
console.log(
|
||||
"useAgent:",
|
||||
logger.info("Smart schema resolved", {
|
||||
useAgent,
|
||||
"shouldUseSmartscrape:",
|
||||
extract?.shouldUseSmartscrape,
|
||||
);
|
||||
console.log("url:", urls);
|
||||
console.log("prompt:", extract?.smartscrape_prompt);
|
||||
console.log("=========================================");
|
||||
shouldUseSmartscrape: extract?.shouldUseSmartscrape,
|
||||
url: urls,
|
||||
prompt: extract?.smartscrape_prompt,
|
||||
providedExtractId: extractId,
|
||||
})
|
||||
|
||||
if (useAgent && extract?.shouldUseSmartscrape) {
|
||||
let smartscrapeResults: SmartScrapeResult[];
|
||||
if (isSingleUrl) {
|
||||
smartscrapeResults = [
|
||||
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
|
||||
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId),
|
||||
];
|
||||
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
||||
smartScrapeCallCount++;
|
||||
} else {
|
||||
const pages = extract?.smartscrapePages;
|
||||
const pages = extract?.smartscrapePages ?? [];
|
||||
//do it async promiseall instead
|
||||
if (pages.length > 100) {
|
||||
logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", {
|
||||
pagesLength: pages.length,
|
||||
extractId,
|
||||
scrapeId,
|
||||
});
|
||||
}
|
||||
|
||||
smartscrapeResults = await Promise.all(
|
||||
pages.map(async (page) => {
|
||||
pages.slice(0, 100).map(async (page) => {
|
||||
return await smartScrape(
|
||||
urls[page.page_index],
|
||||
page.smartscrape_prompt,
|
||||
undefined,
|
||||
extractId,
|
||||
scrapeId,
|
||||
);
|
||||
}),
|
||||
);
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { z } from "zod";
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { logger as _logger } from "../../../lib/logger";
|
||||
import { robustFetch } from "./fetch";
|
||||
import fs from "fs/promises";
|
||||
import { configDotenv } from "dotenv";
|
||||
@ -50,9 +50,19 @@ export async function smartScrape(
|
||||
prompt: string,
|
||||
sessionId?: string,
|
||||
extractId?: string,
|
||||
scrapeId?: string,
|
||||
): Promise<SmartScrapeResult> {
|
||||
let logger = _logger.child({
|
||||
method: "smartScrape",
|
||||
module: "smartScrape",
|
||||
extractId,
|
||||
url,
|
||||
prompt,
|
||||
sessionId,
|
||||
scrapeId,
|
||||
});
|
||||
try {
|
||||
logger.info("Initiating smart scrape request", { url, prompt, sessionId });
|
||||
logger.info("Initiating smart scrape request");
|
||||
|
||||
// Pass schema type as generic parameter to robustFetch
|
||||
const response = await robustFetch<typeof smartScrapeResultSchema>({
|
||||
@ -62,6 +72,8 @@ export async function smartScrape(
|
||||
url,
|
||||
prompt,
|
||||
userProvidedId: sessionId ?? undefined,
|
||||
extractId,
|
||||
scrapeId,
|
||||
models: {
|
||||
thinkingModel: {
|
||||
model: "gemini-2.5-pro-preview-03-25",
|
||||
@ -115,8 +127,6 @@ export async function smartScrape(
|
||||
}
|
||||
|
||||
logger.info("Smart scrape successful", {
|
||||
url,
|
||||
prompt,
|
||||
sessionId: response.sessionId,
|
||||
});
|
||||
|
||||
@ -154,8 +164,6 @@ export async function smartScrape(
|
||||
};
|
||||
|
||||
logger.error("Smart scrape request failed", {
|
||||
url,
|
||||
prompt,
|
||||
error: JSON.stringify(errorInfo),
|
||||
});
|
||||
|
||||
|
@ -25,7 +25,7 @@ export async function performAgent(
|
||||
|
||||
let smartscrapeResults: SmartScrapeResult;
|
||||
try {
|
||||
smartscrapeResults = await smartScrape(url, prompt, sessionId)
|
||||
smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id)
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message === "Cost limit exceeded") {
|
||||
logger.error("Cost limit exceeded", { error })
|
||||
|
@ -259,23 +259,6 @@ export async function generateCompletions({
|
||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||
}
|
||||
|
||||
const { maxInputTokens, maxOutputTokens } = getModelLimits(
|
||||
currentModel.modelId,
|
||||
);
|
||||
// Calculate 80% of max input tokens (for content)
|
||||
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
|
||||
|
||||
// Use the new trimming function
|
||||
const {
|
||||
text: trimmedMarkdown,
|
||||
numTokens,
|
||||
warning: trimWarning,
|
||||
} = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning);
|
||||
|
||||
// WE USE BIG MODELS NOW
|
||||
// markdown = trimmedMarkdown;
|
||||
// warning = trimWarning;
|
||||
|
||||
try {
|
||||
const prompt =
|
||||
options.prompt !== undefined
|
||||
@ -300,16 +283,16 @@ export async function generateCompletions({
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
numTokens: result.usage?.promptTokens ?? 0,
|
||||
totalUsage: {
|
||||
promptTokens: numTokens,
|
||||
promptTokens: result.usage?.promptTokens ?? 0,
|
||||
completionTokens: result.usage?.completionTokens ?? 0,
|
||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
||||
totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
|
||||
},
|
||||
model: currentModel.modelId,
|
||||
cost: calculateCost(
|
||||
currentModel.modelId,
|
||||
numTokens,
|
||||
result.usage?.promptTokens ?? 0,
|
||||
result.usage?.completionTokens ?? 0,
|
||||
),
|
||||
};
|
||||
@ -341,16 +324,16 @@ export async function generateCompletions({
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
numTokens: result.usage?.promptTokens ?? 0,
|
||||
totalUsage: {
|
||||
promptTokens: numTokens,
|
||||
promptTokens: result.usage?.promptTokens ?? 0,
|
||||
completionTokens: result.usage?.completionTokens ?? 0,
|
||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
||||
totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
|
||||
},
|
||||
model: currentModel.modelId,
|
||||
cost: calculateCost(
|
||||
currentModel.modelId,
|
||||
numTokens,
|
||||
result.usage?.promptTokens ?? 0,
|
||||
result.usage?.completionTokens ?? 0,
|
||||
),
|
||||
};
|
||||
@ -541,13 +524,13 @@ export async function generateCompletions({
|
||||
}
|
||||
|
||||
// Since generateObject doesn't provide token usage, we'll estimate it
|
||||
const promptTokens = numTokens;
|
||||
const completionTokens = result?.usage?.completionTokens ?? 0;
|
||||
const promptTokens = result.usage?.promptTokens ?? 0;
|
||||
const completionTokens = result.usage?.completionTokens ?? 0;
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
numTokens: promptTokens,
|
||||
totalUsage: {
|
||||
promptTokens,
|
||||
completionTokens,
|
||||
@ -601,6 +584,7 @@ export async function performLLMExtract(
|
||||
extractOptions: generationOptions,
|
||||
urls: [meta.url],
|
||||
useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model),
|
||||
scrapeId: meta.id,
|
||||
});
|
||||
|
||||
if (warning) {
|
||||
@ -761,6 +745,7 @@ export function removeDefaultProperty(schema: any): any {
|
||||
|
||||
export async function generateSchemaFromPrompt(
|
||||
prompt: string,
|
||||
logger: Logger,
|
||||
): Promise<{ extract: any; cost: number }> {
|
||||
const model = getModel("gpt-4o", "openai");
|
||||
const retryModel = getModel("gpt-4o-mini", "openai");
|
||||
|
Loading…
x
Reference in New Issue
Block a user