mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 02:12:01 +08:00
Merge branch 'main' into rafa/sessionIdsExtract
This commit is contained in:
commit
51967c7c3d
@ -8,14 +8,15 @@ import {
|
|||||||
buildAnalyzeSchemaPrompt,
|
buildAnalyzeSchemaPrompt,
|
||||||
buildAnalyzeSchemaUserPrompt,
|
buildAnalyzeSchemaUserPrompt,
|
||||||
} from "../build-prompts";
|
} from "../build-prompts";
|
||||||
import { logger } from "../../../lib/logger";
|
|
||||||
import { jsonSchema } from "ai";
|
import { jsonSchema } from "ai";
|
||||||
import { getModel } from "../../../lib/generic-ai";
|
import { getModel } from "../../../lib/generic-ai";
|
||||||
|
import { Logger } from "winston";
|
||||||
|
|
||||||
export async function analyzeSchemaAndPrompt(
|
export async function analyzeSchemaAndPrompt(
|
||||||
urls: string[],
|
urls: string[],
|
||||||
schema: any,
|
schema: any,
|
||||||
prompt: string,
|
prompt: string,
|
||||||
|
logger: Logger,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
isMultiEntity: boolean;
|
isMultiEntity: boolean;
|
||||||
multiEntityKeys: string[];
|
multiEntityKeys: string[];
|
||||||
@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt(
|
|||||||
}> {
|
}> {
|
||||||
let cost = 0;
|
let cost = 0;
|
||||||
if (!schema) {
|
if (!schema) {
|
||||||
const genRes = await generateSchemaFromPrompt(prompt);
|
const genRes = await generateSchemaFromPrompt(prompt, logger);
|
||||||
schema = genRes.extract;
|
schema = genRes.extract;
|
||||||
cost = genRes.cost;
|
cost = genRes.cost;
|
||||||
}
|
}
|
||||||
|
@ -23,7 +23,7 @@ export async function singleAnswerCompletion({
|
|||||||
prompt: string;
|
prompt: string;
|
||||||
systemPrompt: string;
|
systemPrompt: string;
|
||||||
useAgent: boolean;
|
useAgent: boolean;
|
||||||
extractId?: string;
|
extractId: string;
|
||||||
}): Promise<{
|
}): Promise<{
|
||||||
extract: any;
|
extract: any;
|
||||||
tokenUsage: TokenUsage;
|
tokenUsage: TokenUsage;
|
||||||
@ -35,7 +35,11 @@ export async function singleAnswerCompletion({
|
|||||||
}> {
|
}> {
|
||||||
const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
|
const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
|
||||||
const generationOptions: GenerateCompletionsOptions = {
|
const generationOptions: GenerateCompletionsOptions = {
|
||||||
logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
logger: logger.child({
|
||||||
|
module: "extract",
|
||||||
|
method: "generateCompletions",
|
||||||
|
extractId,
|
||||||
|
}),
|
||||||
options: {
|
options: {
|
||||||
mode: "llm",
|
mode: "llm",
|
||||||
systemPrompt:
|
systemPrompt:
|
||||||
|
@ -178,7 +178,7 @@ export async function performExtraction(
|
|||||||
|
|
||||||
let reqSchema = request.schema;
|
let reqSchema = request.schema;
|
||||||
if (!reqSchema && request.prompt) {
|
if (!reqSchema && request.prompt) {
|
||||||
const schemaGenRes = await generateSchemaFromPrompt(request.prompt);
|
const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger);
|
||||||
reqSchema = schemaGenRes.extract;
|
reqSchema = schemaGenRes.extract;
|
||||||
costTracking.otherCallCount++;
|
costTracking.otherCallCount++;
|
||||||
costTracking.otherCost += schemaGenRes.cost;
|
costTracking.otherCost += schemaGenRes.cost;
|
||||||
@ -214,7 +214,7 @@ export async function performExtraction(
|
|||||||
keyIndicators,
|
keyIndicators,
|
||||||
tokenUsage: schemaAnalysisTokenUsage,
|
tokenUsage: schemaAnalysisTokenUsage,
|
||||||
cost: schemaAnalysisCost,
|
cost: schemaAnalysisCost,
|
||||||
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "");
|
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger);
|
||||||
|
|
||||||
logger.debug("Analyzed schema.", {
|
logger.debug("Analyzed schema.", {
|
||||||
isMultiEntity,
|
isMultiEntity,
|
||||||
|
@ -186,13 +186,15 @@ export async function extractData({
|
|||||||
urls,
|
urls,
|
||||||
useAgent,
|
useAgent,
|
||||||
extractId,
|
extractId,
|
||||||
sessionId
|
sessionId,
|
||||||
|
scrapeId,
|
||||||
}: {
|
}: {
|
||||||
extractOptions: GenerateCompletionsOptions;
|
extractOptions: GenerateCompletionsOptions;
|
||||||
urls: string[];
|
urls: string[];
|
||||||
useAgent: boolean;
|
useAgent: boolean;
|
||||||
extractId?: string;
|
extractId?: string;
|
||||||
sessionId?: string;
|
sessionId?: string;
|
||||||
|
scrapeId?: string;
|
||||||
}): Promise<{
|
}): Promise<{
|
||||||
extractedDataArray: any[];
|
extractedDataArray: any[];
|
||||||
warning: any;
|
warning: any;
|
||||||
@ -214,7 +216,7 @@ export async function extractData({
|
|||||||
|
|
||||||
if (!schema && extractOptions.options.prompt) {
|
if (!schema && extractOptions.options.prompt) {
|
||||||
logger.info("Generating schema from prompt");
|
logger.info("Generating schema from prompt");
|
||||||
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt);
|
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger);
|
||||||
otherCallCount++;
|
otherCallCount++;
|
||||||
otherCost += genRes.cost;
|
otherCost += genRes.cost;
|
||||||
schema = genRes.extract;
|
schema = genRes.extract;
|
||||||
@ -252,7 +254,7 @@ export async function extractData({
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(
|
logger.error(
|
||||||
"failed during extractSmartScrape.ts:generateCompletions",
|
"failed during extractSmartScrape.ts:generateCompletions",
|
||||||
error,
|
{ error },
|
||||||
);
|
);
|
||||||
// console.log("failed during extractSmartScrape.ts:generateCompletions", error);
|
// console.log("failed during extractSmartScrape.ts:generateCompletions", error);
|
||||||
}
|
}
|
||||||
@ -263,34 +265,41 @@ export async function extractData({
|
|||||||
// console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
|
// console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
|
||||||
// console.log("smartscrape_prompt", extract?.smartscrape_prompt);
|
// console.log("smartscrape_prompt", extract?.smartscrape_prompt);
|
||||||
try {
|
try {
|
||||||
console.log("=========================================");
|
logger.info("Smart schema resolved", {
|
||||||
console.log(
|
|
||||||
"useAgent:",
|
|
||||||
useAgent,
|
useAgent,
|
||||||
"shouldUseSmartscrape:",
|
shouldUseSmartscrape: extract?.shouldUseSmartscrape,
|
||||||
extract?.shouldUseSmartscrape,
|
url: urls,
|
||||||
);
|
prompt: extract?.smartscrape_prompt,
|
||||||
console.log("url:", urls);
|
providedExtractId: extractId,
|
||||||
console.log("prompt:", extract?.smartscrape_prompt);
|
})
|
||||||
console.log("=========================================");
|
|
||||||
|
|
||||||
if (useAgent && extract?.shouldUseSmartscrape) {
|
if (useAgent && extract?.shouldUseSmartscrape) {
|
||||||
let smartscrapeResults: SmartScrapeResult[];
|
let smartscrapeResults: SmartScrapeResult[];
|
||||||
if (isSingleUrl) {
|
if (isSingleUrl) {
|
||||||
smartscrapeResults = [
|
smartscrapeResults = [
|
||||||
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
|
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId),
|
||||||
];
|
];
|
||||||
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
||||||
smartScrapeCallCount++;
|
smartScrapeCallCount++;
|
||||||
} else {
|
} else {
|
||||||
const pages = extract?.smartscrapePages;
|
const pages = extract?.smartscrapePages ?? [];
|
||||||
//do it async promiseall instead
|
//do it async promiseall instead
|
||||||
|
if (pages.length > 100) {
|
||||||
|
logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", {
|
||||||
|
pagesLength: pages.length,
|
||||||
|
extractId,
|
||||||
|
scrapeId,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
smartscrapeResults = await Promise.all(
|
smartscrapeResults = await Promise.all(
|
||||||
pages.map(async (page) => {
|
pages.slice(0, 100).map(async (page) => {
|
||||||
return await smartScrape(
|
return await smartScrape(
|
||||||
urls[page.page_index],
|
urls[page.page_index],
|
||||||
page.smartscrape_prompt,
|
page.smartscrape_prompt,
|
||||||
|
undefined,
|
||||||
extractId,
|
extractId,
|
||||||
|
scrapeId,
|
||||||
);
|
);
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { logger } from "../../../lib/logger";
|
import { logger as _logger } from "../../../lib/logger";
|
||||||
import { robustFetch } from "./fetch";
|
import { robustFetch } from "./fetch";
|
||||||
import fs from "fs/promises";
|
import fs from "fs/promises";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
@ -50,9 +50,19 @@ export async function smartScrape(
|
|||||||
prompt: string,
|
prompt: string,
|
||||||
sessionId?: string,
|
sessionId?: string,
|
||||||
extractId?: string,
|
extractId?: string,
|
||||||
|
scrapeId?: string,
|
||||||
): Promise<SmartScrapeResult> {
|
): Promise<SmartScrapeResult> {
|
||||||
|
let logger = _logger.child({
|
||||||
|
method: "smartScrape",
|
||||||
|
module: "smartScrape",
|
||||||
|
extractId,
|
||||||
|
url,
|
||||||
|
prompt,
|
||||||
|
sessionId,
|
||||||
|
scrapeId,
|
||||||
|
});
|
||||||
try {
|
try {
|
||||||
logger.info("Initiating smart scrape request", { url, prompt, sessionId });
|
logger.info("Initiating smart scrape request");
|
||||||
|
|
||||||
// Pass schema type as generic parameter to robustFetch
|
// Pass schema type as generic parameter to robustFetch
|
||||||
const response = await robustFetch<typeof smartScrapeResultSchema>({
|
const response = await robustFetch<typeof smartScrapeResultSchema>({
|
||||||
@ -62,6 +72,8 @@ export async function smartScrape(
|
|||||||
url,
|
url,
|
||||||
prompt,
|
prompt,
|
||||||
userProvidedId: sessionId ?? undefined,
|
userProvidedId: sessionId ?? undefined,
|
||||||
|
extractId,
|
||||||
|
scrapeId,
|
||||||
models: {
|
models: {
|
||||||
thinkingModel: {
|
thinkingModel: {
|
||||||
model: "gemini-2.5-pro-preview-03-25",
|
model: "gemini-2.5-pro-preview-03-25",
|
||||||
@ -115,8 +127,6 @@ export async function smartScrape(
|
|||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Smart scrape successful", {
|
logger.info("Smart scrape successful", {
|
||||||
url,
|
|
||||||
prompt,
|
|
||||||
sessionId: response.sessionId,
|
sessionId: response.sessionId,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -154,8 +164,6 @@ export async function smartScrape(
|
|||||||
};
|
};
|
||||||
|
|
||||||
logger.error("Smart scrape request failed", {
|
logger.error("Smart scrape request failed", {
|
||||||
url,
|
|
||||||
prompt,
|
|
||||||
error: JSON.stringify(errorInfo),
|
error: JSON.stringify(errorInfo),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ export async function performAgent(
|
|||||||
|
|
||||||
let smartscrapeResults: SmartScrapeResult;
|
let smartscrapeResults: SmartScrapeResult;
|
||||||
try {
|
try {
|
||||||
smartscrapeResults = await smartScrape(url, prompt, sessionId)
|
smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id)
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && error.message === "Cost limit exceeded") {
|
if (error instanceof Error && error.message === "Cost limit exceeded") {
|
||||||
logger.error("Cost limit exceeded", { error })
|
logger.error("Cost limit exceeded", { error })
|
||||||
|
@ -259,23 +259,6 @@ export async function generateCompletions({
|
|||||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||||
}
|
}
|
||||||
|
|
||||||
const { maxInputTokens, maxOutputTokens } = getModelLimits(
|
|
||||||
currentModel.modelId,
|
|
||||||
);
|
|
||||||
// Calculate 80% of max input tokens (for content)
|
|
||||||
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
|
|
||||||
|
|
||||||
// Use the new trimming function
|
|
||||||
const {
|
|
||||||
text: trimmedMarkdown,
|
|
||||||
numTokens,
|
|
||||||
warning: trimWarning,
|
|
||||||
} = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning);
|
|
||||||
|
|
||||||
// WE USE BIG MODELS NOW
|
|
||||||
// markdown = trimmedMarkdown;
|
|
||||||
// warning = trimWarning;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const prompt =
|
const prompt =
|
||||||
options.prompt !== undefined
|
options.prompt !== undefined
|
||||||
@ -300,16 +283,16 @@ export async function generateCompletions({
|
|||||||
return {
|
return {
|
||||||
extract,
|
extract,
|
||||||
warning,
|
warning,
|
||||||
numTokens,
|
numTokens: result.usage?.promptTokens ?? 0,
|
||||||
totalUsage: {
|
totalUsage: {
|
||||||
promptTokens: numTokens,
|
promptTokens: result.usage?.promptTokens ?? 0,
|
||||||
completionTokens: result.usage?.completionTokens ?? 0,
|
completionTokens: result.usage?.completionTokens ?? 0,
|
||||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
|
||||||
},
|
},
|
||||||
model: currentModel.modelId,
|
model: currentModel.modelId,
|
||||||
cost: calculateCost(
|
cost: calculateCost(
|
||||||
currentModel.modelId,
|
currentModel.modelId,
|
||||||
numTokens,
|
result.usage?.promptTokens ?? 0,
|
||||||
result.usage?.completionTokens ?? 0,
|
result.usage?.completionTokens ?? 0,
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
@ -341,16 +324,16 @@ export async function generateCompletions({
|
|||||||
return {
|
return {
|
||||||
extract,
|
extract,
|
||||||
warning,
|
warning,
|
||||||
numTokens,
|
numTokens: result.usage?.promptTokens ?? 0,
|
||||||
totalUsage: {
|
totalUsage: {
|
||||||
promptTokens: numTokens,
|
promptTokens: result.usage?.promptTokens ?? 0,
|
||||||
completionTokens: result.usage?.completionTokens ?? 0,
|
completionTokens: result.usage?.completionTokens ?? 0,
|
||||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
|
||||||
},
|
},
|
||||||
model: currentModel.modelId,
|
model: currentModel.modelId,
|
||||||
cost: calculateCost(
|
cost: calculateCost(
|
||||||
currentModel.modelId,
|
currentModel.modelId,
|
||||||
numTokens,
|
result.usage?.promptTokens ?? 0,
|
||||||
result.usage?.completionTokens ?? 0,
|
result.usage?.completionTokens ?? 0,
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
@ -541,13 +524,13 @@ export async function generateCompletions({
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Since generateObject doesn't provide token usage, we'll estimate it
|
// Since generateObject doesn't provide token usage, we'll estimate it
|
||||||
const promptTokens = numTokens;
|
const promptTokens = result.usage?.promptTokens ?? 0;
|
||||||
const completionTokens = result?.usage?.completionTokens ?? 0;
|
const completionTokens = result.usage?.completionTokens ?? 0;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
extract,
|
extract,
|
||||||
warning,
|
warning,
|
||||||
numTokens,
|
numTokens: promptTokens,
|
||||||
totalUsage: {
|
totalUsage: {
|
||||||
promptTokens,
|
promptTokens,
|
||||||
completionTokens,
|
completionTokens,
|
||||||
@ -601,6 +584,7 @@ export async function performLLMExtract(
|
|||||||
extractOptions: generationOptions,
|
extractOptions: generationOptions,
|
||||||
urls: [meta.url],
|
urls: [meta.url],
|
||||||
useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model),
|
useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model),
|
||||||
|
scrapeId: meta.id,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (warning) {
|
if (warning) {
|
||||||
@ -761,6 +745,7 @@ export function removeDefaultProperty(schema: any): any {
|
|||||||
|
|
||||||
export async function generateSchemaFromPrompt(
|
export async function generateSchemaFromPrompt(
|
||||||
prompt: string,
|
prompt: string,
|
||||||
|
logger: Logger,
|
||||||
): Promise<{ extract: any; cost: number }> {
|
): Promise<{ extract: any; cost: number }> {
|
||||||
const model = getModel("gpt-4o", "openai");
|
const model = getModel("gpt-4o", "openai");
|
||||||
const retryModel = getModel("gpt-4o-mini", "openai");
|
const retryModel = getModel("gpt-4o-mini", "openai");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user