Merge branch 'main' into rafa/sessionIdsExtract

This commit is contained in:
Nicolas 2025-04-16 01:48:13 -07:00
commit 51967c7c3d
7 changed files with 63 additions and 56 deletions

View File

@ -8,14 +8,15 @@ import {
buildAnalyzeSchemaPrompt,
buildAnalyzeSchemaUserPrompt,
} from "../build-prompts";
import { logger } from "../../../lib/logger";
import { jsonSchema } from "ai";
import { getModel } from "../../../lib/generic-ai";
import { Logger } from "winston";
export async function analyzeSchemaAndPrompt(
urls: string[],
schema: any,
prompt: string,
logger: Logger,
): Promise<{
isMultiEntity: boolean;
multiEntityKeys: string[];
@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt(
}> {
let cost = 0;
if (!schema) {
const genRes = await generateSchemaFromPrompt(prompt);
const genRes = await generateSchemaFromPrompt(prompt, logger);
schema = genRes.extract;
cost = genRes.cost;
}

View File

@ -23,7 +23,7 @@ export async function singleAnswerCompletion({
prompt: string;
systemPrompt: string;
useAgent: boolean;
extractId?: string;
extractId: string;
}): Promise<{
extract: any;
tokenUsage: TokenUsage;
@ -35,7 +35,11 @@ export async function singleAnswerCompletion({
}> {
const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
const generationOptions: GenerateCompletionsOptions = {
logger: logger.child({ module: "extract", method: "generateCompletions" }),
logger: logger.child({
module: "extract",
method: "generateCompletions",
extractId,
}),
options: {
mode: "llm",
systemPrompt:

View File

@ -178,7 +178,7 @@ export async function performExtraction(
let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
const schemaGenRes = await generateSchemaFromPrompt(request.prompt);
const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger);
reqSchema = schemaGenRes.extract;
costTracking.otherCallCount++;
costTracking.otherCost += schemaGenRes.cost;
@ -214,7 +214,7 @@ export async function performExtraction(
keyIndicators,
tokenUsage: schemaAnalysisTokenUsage,
cost: schemaAnalysisCost,
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "");
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger);
logger.debug("Analyzed schema.", {
isMultiEntity,

View File

@ -186,13 +186,15 @@ export async function extractData({
urls,
useAgent,
extractId,
sessionId
sessionId,
scrapeId,
}: {
extractOptions: GenerateCompletionsOptions;
urls: string[];
useAgent: boolean;
extractId?: string;
sessionId?: string;
scrapeId?: string;
}): Promise<{
extractedDataArray: any[];
warning: any;
@ -214,7 +216,7 @@ export async function extractData({
if (!schema && extractOptions.options.prompt) {
logger.info("Generating schema from prompt");
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt);
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger);
otherCallCount++;
otherCost += genRes.cost;
schema = genRes.extract;
@ -252,7 +254,7 @@ export async function extractData({
} catch (error) {
logger.error(
"failed during extractSmartScrape.ts:generateCompletions",
error,
{ error },
);
// console.log("failed during extractSmartScrape.ts:generateCompletions", error);
}
@ -263,34 +265,41 @@ export async function extractData({
// console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
// console.log("smartscrape_prompt", extract?.smartscrape_prompt);
try {
console.log("=========================================");
console.log(
"useAgent:",
logger.info("Smart schema resolved", {
useAgent,
"shouldUseSmartscrape:",
extract?.shouldUseSmartscrape,
);
console.log("url:", urls);
console.log("prompt:", extract?.smartscrape_prompt);
console.log("=========================================");
shouldUseSmartscrape: extract?.shouldUseSmartscrape,
url: urls,
prompt: extract?.smartscrape_prompt,
providedExtractId: extractId,
})
if (useAgent && extract?.shouldUseSmartscrape) {
let smartscrapeResults: SmartScrapeResult[];
if (isSingleUrl) {
smartscrapeResults = [
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId),
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId),
];
smartScrapeCost += smartscrapeResults[0].tokenUsage;
smartScrapeCallCount++;
} else {
const pages = extract?.smartscrapePages;
const pages = extract?.smartscrapePages ?? [];
//do it async promiseall instead
if (pages.length > 100) {
logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", {
pagesLength: pages.length,
extractId,
scrapeId,
});
}
smartscrapeResults = await Promise.all(
pages.map(async (page) => {
pages.slice(0, 100).map(async (page) => {
return await smartScrape(
urls[page.page_index],
page.smartscrape_prompt,
undefined,
extractId,
scrapeId,
);
}),
);

View File

@ -1,5 +1,5 @@
import { z } from "zod";
import { logger } from "../../../lib/logger";
import { logger as _logger } from "../../../lib/logger";
import { robustFetch } from "./fetch";
import fs from "fs/promises";
import { configDotenv } from "dotenv";
@ -50,9 +50,19 @@ export async function smartScrape(
prompt: string,
sessionId?: string,
extractId?: string,
scrapeId?: string,
): Promise<SmartScrapeResult> {
let logger = _logger.child({
method: "smartScrape",
module: "smartScrape",
extractId,
url,
prompt,
sessionId,
scrapeId,
});
try {
logger.info("Initiating smart scrape request", { url, prompt, sessionId });
logger.info("Initiating smart scrape request");
// Pass schema type as generic parameter to robustFetch
const response = await robustFetch<typeof smartScrapeResultSchema>({
@ -62,6 +72,8 @@ export async function smartScrape(
url,
prompt,
userProvidedId: sessionId ?? undefined,
extractId,
scrapeId,
models: {
thinkingModel: {
model: "gemini-2.5-pro-preview-03-25",
@ -115,8 +127,6 @@ export async function smartScrape(
}
logger.info("Smart scrape successful", {
url,
prompt,
sessionId: response.sessionId,
});
@ -154,8 +164,6 @@ export async function smartScrape(
};
logger.error("Smart scrape request failed", {
url,
prompt,
error: JSON.stringify(errorInfo),
});

View File

@ -25,7 +25,7 @@ export async function performAgent(
let smartscrapeResults: SmartScrapeResult;
try {
smartscrapeResults = await smartScrape(url, prompt, sessionId)
smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id)
} catch (error) {
if (error instanceof Error && error.message === "Cost limit exceeded") {
logger.error("Cost limit exceeded", { error })

View File

@ -259,23 +259,6 @@ export async function generateCompletions({
throw new Error("document.markdown is undefined -- this is unexpected");
}
const { maxInputTokens, maxOutputTokens } = getModelLimits(
currentModel.modelId,
);
// Calculate 80% of max input tokens (for content)
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
// Use the new trimming function
const {
text: trimmedMarkdown,
numTokens,
warning: trimWarning,
} = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning);
// WE USE BIG MODELS NOW
// markdown = trimmedMarkdown;
// warning = trimWarning;
try {
const prompt =
options.prompt !== undefined
@ -300,16 +283,16 @@ export async function generateCompletions({
return {
extract,
warning,
numTokens,
numTokens: result.usage?.promptTokens ?? 0,
totalUsage: {
promptTokens: numTokens,
promptTokens: result.usage?.promptTokens ?? 0,
completionTokens: result.usage?.completionTokens ?? 0,
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
},
model: currentModel.modelId,
cost: calculateCost(
currentModel.modelId,
numTokens,
result.usage?.promptTokens ?? 0,
result.usage?.completionTokens ?? 0,
),
};
@ -341,16 +324,16 @@ export async function generateCompletions({
return {
extract,
warning,
numTokens,
numTokens: result.usage?.promptTokens ?? 0,
totalUsage: {
promptTokens: numTokens,
promptTokens: result.usage?.promptTokens ?? 0,
completionTokens: result.usage?.completionTokens ?? 0,
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
},
model: currentModel.modelId,
cost: calculateCost(
currentModel.modelId,
numTokens,
result.usage?.promptTokens ?? 0,
result.usage?.completionTokens ?? 0,
),
};
@ -541,13 +524,13 @@ export async function generateCompletions({
}
// Since generateObject doesn't provide token usage, we'll estimate it
const promptTokens = numTokens;
const completionTokens = result?.usage?.completionTokens ?? 0;
const promptTokens = result.usage?.promptTokens ?? 0;
const completionTokens = result.usage?.completionTokens ?? 0;
return {
extract,
warning,
numTokens,
numTokens: promptTokens,
totalUsage: {
promptTokens,
completionTokens,
@ -601,6 +584,7 @@ export async function performLLMExtract(
extractOptions: generationOptions,
urls: [meta.url],
useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model),
scrapeId: meta.id,
});
if (warning) {
@ -761,6 +745,7 @@ export function removeDefaultProperty(schema: any): any {
export async function generateSchemaFromPrompt(
prompt: string,
logger: Logger,
): Promise<{ extract: any; cost: number }> {
const model = getModel("gpt-4o", "openai");
const retryModel = getModel("gpt-4o-mini", "openai");