mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 20:15:54 +08:00
integrate smartscrape into llmExtract
This commit is contained in:
parent
c0fe770520
commit
fd58e782b1
198
apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
Normal file
198
apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
import { Logger } from "winston";
|
||||||
|
import { z } from "zod";
|
||||||
|
import {
|
||||||
|
generateCompletions,
|
||||||
|
GenerateCompletionsOptions,
|
||||||
|
} from "../transformers/llmExtract";
|
||||||
|
import { smartScrape } from "./smartScrape";
|
||||||
|
import { parseMarkdown } from "../../../lib/html-to-markdown";
|
||||||
|
|
||||||
|
// JSON Schema wrapper that augments a user-supplied extraction schema with
// SmartScrape decision fields. prepareSmartScrapeSchema() injects the user's
// schema under `properties.extractedData` before the LLM call.
const smartScrapeWrapperSchemaDefinition = {
  type: "object",
  properties: {
    // extractedData will be added dynamically
    shouldUseSmartscrape: {
      type: "boolean",
      description:
        "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
    },
    smartscrape_reasoning: {
      // Nullable so the model can return null when SmartScrape is not needed.
      type: ["string", "null"],
      description:
        "Fill this only if shouldUseSmartscrape is true. Reasoning for why you think the page requires or doesnt require smartscrape. If it does explain which data you can't get with the initial page load.",
    },
    smartscrape_prompt: {
      type: ["string", "null"],
      description:
        "Prompt to use for Smartscrape refinement if shouldUseSmartscrape is true. Explain exactly what actions smartscrape should do. Smartscrape is a tool that can perform actions on the page like clicking, scrolling, etc. It cant extract data it will just return the pages and we will handle the extraction.",
    },
  },
  additionalProperties: false, // Keep this for the top-level wrapper
  required: ["extractedData", "shouldUseSmartscrape"],
  // Note: Dynamically adding 'smartscrape_reasoning' and 'smartscrape_prompt' to required
  // based on shouldUseSmartscrape is complex in standard JSON schema and might depend on the LLM's interpretation.
  // Keeping extractedData and shouldUseSmartscrape as the base requirements.
};
|
||||||
|
|
||||||
|
//TODO: go over and check
|
||||||
|
// should add null to all types
|
||||||
|
// type:string should be type:["string","null"]
|
||||||
|
export function makeSchemaNullable(schema: any): any {
|
||||||
|
if (typeof schema !== "object" || schema === null) {
|
||||||
|
return schema; // Base case: not an object/array or is null
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Array.isArray(schema)) {
|
||||||
|
return schema.map(makeSchemaNullable); // Recurse for array items
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process object properties
|
||||||
|
const newSchema: { [key: string]: any } = {};
|
||||||
|
let isObject = false; // Flag to track if this level is an object type
|
||||||
|
|
||||||
|
for (const key in schema) {
|
||||||
|
if (key === "additionalProperties") {
|
||||||
|
continue; // Skip existing additionalProperties, we'll set it later if needed
|
||||||
|
}
|
||||||
|
|
||||||
|
if (key === "type") {
|
||||||
|
const currentType = schema[key];
|
||||||
|
let finalType: string | string[];
|
||||||
|
|
||||||
|
if (typeof currentType === "string") {
|
||||||
|
if (currentType === "object") isObject = true;
|
||||||
|
finalType =
|
||||||
|
currentType === "null" ? currentType : [currentType, "null"];
|
||||||
|
} else if (Array.isArray(currentType)) {
|
||||||
|
if (currentType.includes("object")) isObject = true;
|
||||||
|
finalType = currentType.includes("null")
|
||||||
|
? currentType
|
||||||
|
: [...currentType, "null"];
|
||||||
|
} else {
|
||||||
|
finalType = currentType; // Handle unexpected types?
|
||||||
|
}
|
||||||
|
newSchema[key] = finalType;
|
||||||
|
} else if (typeof schema[key] === "object" && schema[key] !== null) {
|
||||||
|
// Recurse for nested objects (properties, items, definitions, etc.)
|
||||||
|
newSchema[key] = makeSchemaNullable(schema[key]);
|
||||||
|
if (key === "properties") {
|
||||||
|
// Having a 'properties' key strongly implies an object type
|
||||||
|
isObject = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Copy other properties directly (like required, description, etc.)
|
||||||
|
newSchema[key] = schema[key];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// **Crucial Fix:** If this schema represents an object type, add additionalProperties: false
|
||||||
|
if (isObject) {
|
||||||
|
// Ensure 'properties' exists if 'type' was 'object' but 'properties' wasn't defined
|
||||||
|
if (!newSchema.properties) {
|
||||||
|
newSchema.properties = {};
|
||||||
|
}
|
||||||
|
newSchema.additionalProperties = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return newSchema;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wraps the original schema with SmartScrape fields if an original schema exists.
|
||||||
|
*
|
||||||
|
* @param originalSchema The user-provided schema (JSON Schema object or Zod schema).
|
||||||
|
* @param logger Winston logger instance.
|
||||||
|
* @returns An object containing the schema to use for the LLM call and whether wrapping occurred.
|
||||||
|
*/
|
||||||
|
export function prepareSmartScrapeSchema(
|
||||||
|
originalSchema: any | z.ZodTypeAny | undefined,
|
||||||
|
logger: Logger,
|
||||||
|
) {
|
||||||
|
// Make the user's schema nullable *and* ensure nested objects have additionalProperties:false
|
||||||
|
const nullableAndStrictSchema = makeSchemaNullable(originalSchema);
|
||||||
|
|
||||||
|
const wrappedSchema = {
|
||||||
|
...smartScrapeWrapperSchemaDefinition, // Uses the wrapper defined above
|
||||||
|
properties: {
|
||||||
|
extractedData: nullableAndStrictSchema, // Nest the modified original schema
|
||||||
|
...smartScrapeWrapperSchemaDefinition.properties, // Add smartscrape fields
|
||||||
|
},
|
||||||
|
// required is inherited from smartScrapeWrapperSchemaDefinition
|
||||||
|
// additionalProperties:false is inherited from smartScrapeWrapperSchemaDefinition for the top level
|
||||||
|
};
|
||||||
|
|
||||||
|
logger.info("Wrapping original schema with SmartScrape fields.", {
|
||||||
|
// Limit logging potentially large schemas
|
||||||
|
wrappedSchemaKeys: Object.keys(wrappedSchema.properties),
|
||||||
|
});
|
||||||
|
return { schemaToUse: wrappedSchema };
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function extractData({
|
||||||
|
extractOptions,
|
||||||
|
url,
|
||||||
|
}: {
|
||||||
|
extractOptions: GenerateCompletionsOptions;
|
||||||
|
url: string;
|
||||||
|
}): Promise<{ extractedDataArray: any[]; warning: any }> {
|
||||||
|
//WRAP SCHEMA
|
||||||
|
const schema = extractOptions.options.schema;
|
||||||
|
const logger = extractOptions.logger;
|
||||||
|
|
||||||
|
console.log("!!!!!!!!!!!!!!!!!!hereee");
|
||||||
|
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger);
|
||||||
|
const extractOptionsNewSchema = {
|
||||||
|
...extractOptions,
|
||||||
|
options: { ...extractOptions.options, schema: schemaToUse },
|
||||||
|
};
|
||||||
|
console.log("schema", schema);
|
||||||
|
console.log("schemaToUse", schemaToUse);
|
||||||
|
|
||||||
|
const { extract, warning, totalUsage } = await generateCompletions(
|
||||||
|
extractOptionsNewSchema,
|
||||||
|
);
|
||||||
|
console.log("extract", extract);
|
||||||
|
|
||||||
|
// const {
|
||||||
|
// extractedData,
|
||||||
|
// shouldUseSmartscrape,
|
||||||
|
// smartscrape_reasoning,
|
||||||
|
// smartscrape_prompt,
|
||||||
|
// } = processSmartScrapeResult(extract, logger);
|
||||||
|
|
||||||
|
const shouldUseSmartscrape = extract?.shouldUseSmartscrape;
|
||||||
|
const smartscrape_reasoning = extract?.smartscrape_reasoning;
|
||||||
|
const smartscrape_prompt = extract?.smartscrape_prompt;
|
||||||
|
let extractedData = extract?.extractedData;
|
||||||
|
|
||||||
|
console.log("shouldUseSmartscrape", shouldUseSmartscrape);
|
||||||
|
console.log("smartscrape_reasoning", smartscrape_reasoning);
|
||||||
|
console.log("smartscrape_prompt", smartscrape_prompt);
|
||||||
|
if (shouldUseSmartscrape) {
|
||||||
|
const smartscrapeResult = await smartScrape(url, smartscrape_prompt);
|
||||||
|
|
||||||
|
const htmls = smartscrapeResult.scrapedPages.map((page) => page.html);
|
||||||
|
const markdowns = await Promise.all(
|
||||||
|
htmls.map(async (html) => await parseMarkdown(html)),
|
||||||
|
);
|
||||||
|
|
||||||
|
extractedData = await Promise.all(
|
||||||
|
markdowns.map(async (markdown) => {
|
||||||
|
const newExtractOptions = {
|
||||||
|
...extractOptions,
|
||||||
|
markdown: markdown,
|
||||||
|
};
|
||||||
|
const { extract, warning, totalUsage, model } =
|
||||||
|
await generateCompletions(newExtractOptions);
|
||||||
|
return extract;
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
// console.log("markdowns", markdowns);
|
||||||
|
// extractedData = smartscrapeResult;
|
||||||
|
} else {
|
||||||
|
extractedData = [extractedData];
|
||||||
|
}
|
||||||
|
|
||||||
|
return { extractedDataArray: extractedData, warning: warning };
|
||||||
|
}
|
88
apps/api/src/scraper/scrapeURL/lib/smartScrape.ts
Normal file
88
apps/api/src/scraper/scrapeURL/lib/smartScrape.ts
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
import { z } from "zod";
|
||||||
|
import { logger } from "../../../lib/logger";
|
||||||
|
import { robustFetch } from "./fetch";
|
||||||
|
|
||||||
|
// Define schemas outside the function scope

// Per-model token accounting entry returned by the smart-scrape service.
const tokenUsageDetailSchema = z.object({
  input_tokens: z.number().int(),
  output_tokens: z.number().int(),
  total_cost: z.number().nullable(), // Allows number or null
});

// Schema for an individual scraped page object
const scrapedPageSchema = z.object({
  html: z.string(),
  // presumably the service's explanation of why this page was captured — confirm with service docs
  reason: z.string(),
  page: z.number().int(),
});

// Main schema for the structure returned by the smart-scrape endpoint
const smartScrapeResultSchema = z.object({
  sessionId: z.string(),
  success: z.boolean(),
  scrapedPages: z.array(scrapedPageSchema),
  tokenUsage: z.record(
    z.string(), // Key is the model name (string)
    tokenUsageDetailSchema, // Value matches the detail schema
  ),
});

// Infer the TypeScript type from the Zod schema
type SmartScrapeResult = z.infer<typeof smartScrapeResultSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sends a POST request to the internal /smart-scrape endpoint to extract
|
||||||
|
* structured data from a URL based on a prompt.
|
||||||
|
*
|
||||||
|
* @param url The URL of the page to scrape.
|
||||||
|
* @param prompt The prompt guiding the data extraction.
|
||||||
|
* @returns A promise that resolves to an object matching the SmartScrapeResult type.
|
||||||
|
* @throws Throws an error if the request fails or the response is invalid.
|
||||||
|
*/
|
||||||
|
export async function smartScrape(
|
||||||
|
url: string,
|
||||||
|
prompt: string,
|
||||||
|
): Promise<SmartScrapeResult> {
|
||||||
|
try {
|
||||||
|
logger.info("Initiating smart scrape request", { url, prompt });
|
||||||
|
|
||||||
|
// Pass schema type as generic parameter to robustFetch
|
||||||
|
const response = await robustFetch<typeof smartScrapeResultSchema>({
|
||||||
|
url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
|
||||||
|
method: "POST",
|
||||||
|
body: {
|
||||||
|
url,
|
||||||
|
prompt,
|
||||||
|
thinkingModel: {
|
||||||
|
model: "gemini-2.5-pro-exp-03-25",
|
||||||
|
provider: "google",
|
||||||
|
supportTools: true,
|
||||||
|
toolChoice: "required",
|
||||||
|
cost: {
|
||||||
|
input: 1.3,
|
||||||
|
output: 5,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
toolModel: {
|
||||||
|
model: "gemini-2.0-flash",
|
||||||
|
provider: "google",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
schema: smartScrapeResultSchema, // Pass the schema instance for validation
|
||||||
|
logger,
|
||||||
|
mock: null, // Keep mock null if not mocking
|
||||||
|
});
|
||||||
|
|
||||||
|
logger.info("Smart scrape successful", {
|
||||||
|
url,
|
||||||
|
prompt,
|
||||||
|
sessionId: response.sessionId,
|
||||||
|
});
|
||||||
|
return response; // The response type now matches SmartScrapeResult
|
||||||
|
} catch (error) {
|
||||||
|
logger.error("Smart scrape request failed", { url, prompt, error });
|
||||||
|
// Rethrowing the error to be handled by the caller
|
||||||
|
// Consider more specific error handling or wrapping if needed
|
||||||
|
throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
|
||||||
|
}
|
||||||
|
}
|
@ -15,6 +15,7 @@ import { getModel } from "../../../lib/generic-ai";
|
|||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import fs from "fs/promises";
|
import fs from "fs/promises";
|
||||||
import Ajv from "ajv";
|
import Ajv from "ajv";
|
||||||
|
import { extractData } from "../lib/extractSmartScrape";
|
||||||
|
|
||||||
// TODO: fix this, it's horrible
|
// TODO: fix this, it's horrible
|
||||||
type LanguageModelV1ProviderMetadata = {
|
type LanguageModelV1ProviderMetadata = {
|
||||||
@ -168,7 +169,16 @@ export function trimToTokenLimit(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Options bag for generateCompletions (extracted from the former inline
// destructured parameter type so callers like extractData can reuse it).
export type GenerateCompletionsOptions = {
  model?: LanguageModel; // defaults to getModel("gpt-4o-mini") in generateCompletions
  logger: Logger;
  options: ExtractOptions;
  markdown?: string;
  previousWarning?: string;
  isExtractEndpoint?: boolean;
  mode?: "object" | "no-object"; // defaults to "object" in generateCompletions
  providerOptions?: LanguageModelV1ProviderMetadata;
};
|
||||||
export async function generateCompletions({
|
export async function generateCompletions({
|
||||||
logger,
|
logger,
|
||||||
options,
|
options,
|
||||||
@ -178,16 +188,7 @@ export async function generateCompletions({
|
|||||||
model = getModel("gpt-4o-mini"),
|
model = getModel("gpt-4o-mini"),
|
||||||
mode = "object",
|
mode = "object",
|
||||||
providerOptions,
|
providerOptions,
|
||||||
}: {
|
}: GenerateCompletionsOptions): Promise<{
|
||||||
model?: LanguageModel;
|
|
||||||
logger: Logger;
|
|
||||||
options: ExtractOptions;
|
|
||||||
markdown?: string;
|
|
||||||
previousWarning?: string;
|
|
||||||
isExtractEndpoint?: boolean;
|
|
||||||
mode?: "object" | "no-object";
|
|
||||||
providerOptions?: LanguageModelV1ProviderMetadata;
|
|
||||||
}): Promise<{
|
|
||||||
extract: any;
|
extract: any;
|
||||||
numTokens: number;
|
numTokens: number;
|
||||||
warning: string | undefined;
|
warning: string | undefined;
|
||||||
@ -397,118 +398,110 @@ export async function performLLMExtract(
|
|||||||
document: Document,
|
document: Document,
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
if (meta.options.formats.includes("extract")) {
|
if (meta.options.formats.includes("extract")) {
|
||||||
const originalOptions = meta.options.extract!;
|
// const originalOptions = meta.options.extract!;
|
||||||
let generationOptions = { ...originalOptions }; // Start with original options
|
|
||||||
let schemaWasWrapped = false;
|
|
||||||
|
|
||||||
if (originalOptions.schema) {
|
// let generationOptions = { ...originalOptions }; // Start with original options
|
||||||
const wrappedSchema = {
|
|
||||||
type: "object",
|
|
||||||
properties: {
|
|
||||||
extractedData: originalOptions.schema, // Nest the original schema
|
|
||||||
shouldUseSmartscrape: {
|
|
||||||
type: "boolean",
|
|
||||||
description:
|
|
||||||
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
|
|
||||||
},
|
|
||||||
smartscrape_reasoning: {
|
|
||||||
type: "string",
|
|
||||||
description:
|
|
||||||
"Fill this only if shouldUseSmartscrape is true. Reasoning for why you think the page requires or doesnt require smartscrape. If it does explain which data you can't get with the initial page load.",
|
|
||||||
},
|
|
||||||
smartscrape_prompt: {
|
|
||||||
type: "string",
|
|
||||||
description:
|
|
||||||
"Prompt to use for Smartscrape refinement if shouldUseSmartscrape is true. Explain exactly what actions smartscrape should do. Smartscrape is a tool that can perform actions on the page like clicking, scrolling, etc.",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
required: ["reasoning", "shouldUseSmartscrape"],
|
|
||||||
// Conditionally require 'prompt' if 'shouldUseSmartscrape' is true
|
|
||||||
// if: {
|
|
||||||
// properties: {
|
|
||||||
// shouldUseSmartscrape: { const: true },
|
|
||||||
// },
|
|
||||||
// required: ["shouldUseSmartscrape"],
|
|
||||||
// },
|
|
||||||
// then: {
|
|
||||||
// required: ["prompt"],
|
|
||||||
// },
|
|
||||||
};
|
|
||||||
|
|
||||||
// Update generationOptions to use the wrapped schema
|
const generationOptions: GenerateCompletionsOptions = {
|
||||||
generationOptions.schema = wrappedSchema;
|
|
||||||
schemaWasWrapped = true;
|
|
||||||
meta.logger.info("Using wrapped schema for LLM extraction.", {
|
|
||||||
wrappedSchema,
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
meta.logger.info(
|
|
||||||
"No original schema provided, proceeding without wrapping.",
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
meta.internalOptions.abort?.throwIfAborted();
|
|
||||||
const { extract, warning, totalUsage, model } = await generateCompletions({
|
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
method: "performLLMExtract/generateCompletions",
|
method: "performLLMExtract/generateCompletions",
|
||||||
}),
|
}),
|
||||||
options: generationOptions, // Pass potentially modified options
|
options: meta.options.extract!,
|
||||||
markdown: document.markdown,
|
markdown: document.markdown,
|
||||||
previousWarning: document.warning,
|
previousWarning: document.warning,
|
||||||
// model: getModel("deepseek-ai/DeepSeek-R1", "deepinfra"),
|
// ... existing model and provider options ...
|
||||||
// model: getModel("deepseek-ai/DeepSeek-V3-0324", "deepinfra"),
|
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
||||||
|
|
||||||
// model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
|
||||||
// model: getModel("o3-mini", "openai"),
|
// model: getModel("o3-mini", "openai"),
|
||||||
|
// model: getModel("qwen-qwq-32b", "groq"),
|
||||||
|
model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
||||||
|
};
|
||||||
|
|
||||||
// model: getModel("gemini-2.0-flash", "google"),
|
const { extractedDataArray, warning } = await extractData({
|
||||||
// model: getModel("accounts/fireworks/models/deepseek-r1", "fireworks"),
|
extractOptions: generationOptions,
|
||||||
// model: getModel("gpt-4o-mini", "openai"),
|
url: meta.url,
|
||||||
// model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
|
||||||
// model: getModel("o3-mini", "openai"),
|
|
||||||
model: getModel("qwen-qwq-32b", "groq"),
|
|
||||||
|
|
||||||
// model: getModel("claude-3-7-sonnet", "anthropic"),
|
|
||||||
providerOptions: {
|
|
||||||
anthropic: {
|
|
||||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Log token usage
|
//TODO: add merge here
|
||||||
meta.logger.info("LLM extraction token usage", {
|
const extractedData = extractedDataArray[0];
|
||||||
model: model,
|
|
||||||
promptTokens: totalUsage.promptTokens,
|
|
||||||
completionTokens: totalUsage.completionTokens,
|
|
||||||
totalTokens: totalUsage.totalTokens,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Extract the actual data if the schema was wrapped
|
// // Prepare the schema, potentially wrapping it
|
||||||
let finalExtract = schemaWasWrapped ? extract?.extractedData : extract;
|
// const { schemaToUse, schemaWasWrapped } = prepareSmartScrapeSchema(
|
||||||
console.log({ extract });
|
// originalOptions.schema,
|
||||||
// Double-check extraction if wrapping occurred but extractedData is missing
|
// meta.logger,
|
||||||
if (
|
// );
|
||||||
schemaWasWrapped &&
|
|
||||||
finalExtract === undefined &&
|
// // Update generationOptions with the potentially wrapped schema
|
||||||
extract?.hasOwnProperty("extractedData")
|
// generationOptions.schema = schemaToUse;
|
||||||
) {
|
|
||||||
finalExtract = extract.extractedData;
|
// meta.internalOptions.abort?.throwIfAborted();
|
||||||
} else if (schemaWasWrapped && finalExtract === undefined) {
|
// const {
|
||||||
// Log a warning if wrapping occurred but the expected structure wasn't returned
|
// extract: rawExtract,
|
||||||
meta.logger.warn(
|
// warning,
|
||||||
"Schema was wrapped, but LLM result did not contain expected 'extractedData' property.",
|
// totalUsage,
|
||||||
{ extractResult: extract },
|
// model,
|
||||||
);
|
// } = await generateCompletions({
|
||||||
}
|
// logger: meta.logger.child({
|
||||||
|
// method: "performLLMExtract/generateCompletions",
|
||||||
|
// }),
|
||||||
|
// options: generationOptions, // Use the potentially modified options
|
||||||
|
// markdown: document.markdown,
|
||||||
|
// previousWarning: document.warning,
|
||||||
|
// // ... existing model and provider options ...
|
||||||
|
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
||||||
|
// providerOptions: {
|
||||||
|
// anthropic: {
|
||||||
|
// thinking: { type: "enabled", budgetTokens: 12000 },
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
// });
|
||||||
|
|
||||||
|
// // Log token usage
|
||||||
|
// meta.logger.info("LLM extraction token usage", {
|
||||||
|
// model: model,
|
||||||
|
// promptTokens: totalUsage.promptTokens,
|
||||||
|
// completionTokens: totalUsage.completionTokens,
|
||||||
|
// totalTokens: totalUsage.totalTokens,
|
||||||
|
// });
|
||||||
|
|
||||||
|
// // Process the result to extract data and SmartScrape decision
|
||||||
|
// const {
|
||||||
|
// extractedData,
|
||||||
|
// shouldUseSmartscrape,
|
||||||
|
// smartscrape_reasoning,
|
||||||
|
// smartscrape_prompt,
|
||||||
|
// } = processSmartScrapeResult(rawExtract, schemaWasWrapped, meta.logger);
|
||||||
|
|
||||||
|
// // Log the SmartScrape decision if applicable
|
||||||
|
// if (schemaWasWrapped) {
|
||||||
|
// meta.logger.info("SmartScrape decision processing result", {
|
||||||
|
// shouldUseSmartscrape,
|
||||||
|
// smartscrape_reasoning,
|
||||||
|
// // Don't log the full prompt potentially
|
||||||
|
// smartscrape_prompt_present: !!smartscrape_prompt,
|
||||||
|
// extractedDataIsPresent:
|
||||||
|
// extractedData !== undefined && extractedData !== null,
|
||||||
|
// });
|
||||||
|
|
||||||
|
// // TODO: Implement logic to ACTUALLY trigger SmartScrape based on the result
|
||||||
|
// // For example:
|
||||||
|
// // if (shouldUseSmartscrape && smartscrape_prompt) {
|
||||||
|
// // meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
|
||||||
|
// // // Call the smartScrape function (which needs to be implemented/imported)
|
||||||
|
// // // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
|
||||||
|
// // // Process/merge smartScrapedDocs with extractedData
|
||||||
|
// // // ... potentially update finalExtract ...
|
||||||
|
// // } else {
|
||||||
|
// // meta.logger.info("SmartScrape not required based on LLM output.");
|
||||||
|
// // }
|
||||||
|
// }
|
||||||
|
|
||||||
// Assign the final extracted data
|
// Assign the final extracted data
|
||||||
if (meta.options.formats.includes("json")) {
|
if (meta.options.formats.includes("json")) {
|
||||||
document.json = finalExtract;
|
document.json = extractedData;
|
||||||
} else {
|
} else {
|
||||||
document.extract = finalExtract;
|
document.extract = extractedData;
|
||||||
}
|
}
|
||||||
document.warning = warning;
|
// document.warning = warning;
|
||||||
}
|
}
|
||||||
|
|
||||||
return document;
|
return document;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user