Merge branch 'main' into rafa/sessionIdsExtract

This commit is contained in:
Nicolas 2025-04-16 01:48:13 -07:00
commit 51967c7c3d
7 changed files with 63 additions and 56 deletions

View File

@ -8,14 +8,15 @@ import {
buildAnalyzeSchemaPrompt, buildAnalyzeSchemaPrompt,
buildAnalyzeSchemaUserPrompt, buildAnalyzeSchemaUserPrompt,
} from "../build-prompts"; } from "../build-prompts";
import { logger } from "../../../lib/logger";
import { jsonSchema } from "ai"; import { jsonSchema } from "ai";
import { getModel } from "../../../lib/generic-ai"; import { getModel } from "../../../lib/generic-ai";
import { Logger } from "winston";
export async function analyzeSchemaAndPrompt( export async function analyzeSchemaAndPrompt(
urls: string[], urls: string[],
schema: any, schema: any,
prompt: string, prompt: string,
logger: Logger,
): Promise<{ ): Promise<{
isMultiEntity: boolean; isMultiEntity: boolean;
multiEntityKeys: string[]; multiEntityKeys: string[];
@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt(
}> { }> {
let cost = 0; let cost = 0;
if (!schema) { if (!schema) {
const genRes = await generateSchemaFromPrompt(prompt); const genRes = await generateSchemaFromPrompt(prompt, logger);
schema = genRes.extract; schema = genRes.extract;
cost = genRes.cost; cost = genRes.cost;
} }

View File

@ -23,7 +23,7 @@ export async function singleAnswerCompletion({
prompt: string; prompt: string;
systemPrompt: string; systemPrompt: string;
useAgent: boolean; useAgent: boolean;
extractId?: string; extractId: string;
}): Promise<{ }): Promise<{
extract: any; extract: any;
tokenUsage: TokenUsage; tokenUsage: TokenUsage;
@ -35,7 +35,11 @@ export async function singleAnswerCompletion({
}> { }> {
const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt; const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
const generationOptions: GenerateCompletionsOptions = { const generationOptions: GenerateCompletionsOptions = {
logger: logger.child({ module: "extract", method: "generateCompletions" }), logger: logger.child({
module: "extract",
method: "generateCompletions",
extractId,
}),
options: { options: {
mode: "llm", mode: "llm",
systemPrompt: systemPrompt:

View File

@ -178,7 +178,7 @@ export async function performExtraction(
let reqSchema = request.schema; let reqSchema = request.schema;
if (!reqSchema && request.prompt) { if (!reqSchema && request.prompt) {
const schemaGenRes = await generateSchemaFromPrompt(request.prompt); const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger);
reqSchema = schemaGenRes.extract; reqSchema = schemaGenRes.extract;
costTracking.otherCallCount++; costTracking.otherCallCount++;
costTracking.otherCost += schemaGenRes.cost; costTracking.otherCost += schemaGenRes.cost;
@ -214,7 +214,7 @@ export async function performExtraction(
keyIndicators, keyIndicators,
tokenUsage: schemaAnalysisTokenUsage, tokenUsage: schemaAnalysisTokenUsage,
cost: schemaAnalysisCost, cost: schemaAnalysisCost,
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? ""); } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger);
logger.debug("Analyzed schema.", { logger.debug("Analyzed schema.", {
isMultiEntity, isMultiEntity,

View File

@ -186,13 +186,15 @@ export async function extractData({
urls, urls,
useAgent, useAgent,
extractId, extractId,
sessionId sessionId,
scrapeId,
}: { }: {
extractOptions: GenerateCompletionsOptions; extractOptions: GenerateCompletionsOptions;
urls: string[]; urls: string[];
useAgent: boolean; useAgent: boolean;
extractId?: string; extractId?: string;
sessionId?: string; sessionId?: string;
scrapeId?: string;
}): Promise<{ }): Promise<{
extractedDataArray: any[]; extractedDataArray: any[];
warning: any; warning: any;
@ -214,7 +216,7 @@ export async function extractData({
if (!schema && extractOptions.options.prompt) { if (!schema && extractOptions.options.prompt) {
logger.info("Generating schema from prompt"); logger.info("Generating schema from prompt");
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt); const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger);
otherCallCount++; otherCallCount++;
otherCost += genRes.cost; otherCost += genRes.cost;
schema = genRes.extract; schema = genRes.extract;
@ -252,7 +254,7 @@ export async function extractData({
} catch (error) { } catch (error) {
logger.error( logger.error(
"failed during extractSmartScrape.ts:generateCompletions", "failed during extractSmartScrape.ts:generateCompletions",
error, { error },
); );
// console.log("failed during extractSmartScrape.ts:generateCompletions", error); // console.log("failed during extractSmartScrape.ts:generateCompletions", error);
} }
@ -263,34 +265,41 @@ export async function extractData({
// console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); // console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
// console.log("smartscrape_prompt", extract?.smartscrape_prompt); // console.log("smartscrape_prompt", extract?.smartscrape_prompt);
try { try {
console.log("========================================="); logger.info("Smart schema resolved", {
console.log(
"useAgent:",
useAgent, useAgent,
"shouldUseSmartscrape:", shouldUseSmartscrape: extract?.shouldUseSmartscrape,
extract?.shouldUseSmartscrape, url: urls,
); prompt: extract?.smartscrape_prompt,
console.log("url:", urls); providedExtractId: extractId,
console.log("prompt:", extract?.smartscrape_prompt); })
console.log("=========================================");
if (useAgent && extract?.shouldUseSmartscrape) { if (useAgent && extract?.shouldUseSmartscrape) {
let smartscrapeResults: SmartScrapeResult[]; let smartscrapeResults: SmartScrapeResult[];
if (isSingleUrl) { if (isSingleUrl) {
smartscrapeResults = [ smartscrapeResults = [
await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId), await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId),
]; ];
smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCost += smartscrapeResults[0].tokenUsage;
smartScrapeCallCount++; smartScrapeCallCount++;
} else { } else {
const pages = extract?.smartscrapePages; const pages = extract?.smartscrapePages ?? [];
//do it async promiseall instead //do it async promiseall instead
if (pages.length > 100) {
logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", {
pagesLength: pages.length,
extractId,
scrapeId,
});
}
smartscrapeResults = await Promise.all( smartscrapeResults = await Promise.all(
pages.map(async (page) => { pages.slice(0, 100).map(async (page) => {
return await smartScrape( return await smartScrape(
urls[page.page_index], urls[page.page_index],
page.smartscrape_prompt, page.smartscrape_prompt,
undefined,
extractId, extractId,
scrapeId,
); );
}), }),
); );

View File

@ -1,5 +1,5 @@
import { z } from "zod"; import { z } from "zod";
import { logger } from "../../../lib/logger"; import { logger as _logger } from "../../../lib/logger";
import { robustFetch } from "./fetch"; import { robustFetch } from "./fetch";
import fs from "fs/promises"; import fs from "fs/promises";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
@ -50,9 +50,19 @@ export async function smartScrape(
prompt: string, prompt: string,
sessionId?: string, sessionId?: string,
extractId?: string, extractId?: string,
scrapeId?: string,
): Promise<SmartScrapeResult> { ): Promise<SmartScrapeResult> {
let logger = _logger.child({
method: "smartScrape",
module: "smartScrape",
extractId,
url,
prompt,
sessionId,
scrapeId,
});
try { try {
logger.info("Initiating smart scrape request", { url, prompt, sessionId }); logger.info("Initiating smart scrape request");
// Pass schema type as generic parameter to robustFeth // Pass schema type as generic parameter to robustFeth
const response = await robustFetch<typeof smartScrapeResultSchema>({ const response = await robustFetch<typeof smartScrapeResultSchema>({
@ -62,6 +72,8 @@ export async function smartScrape(
url, url,
prompt, prompt,
userProvidedId: sessionId ?? undefined, userProvidedId: sessionId ?? undefined,
extractId,
scrapeId,
models: { models: {
thinkingModel: { thinkingModel: {
model: "gemini-2.5-pro-preview-03-25", model: "gemini-2.5-pro-preview-03-25",
@ -115,8 +127,6 @@ export async function smartScrape(
} }
logger.info("Smart scrape successful", { logger.info("Smart scrape successful", {
url,
prompt,
sessionId: response.sessionId, sessionId: response.sessionId,
}); });
@ -154,8 +164,6 @@ export async function smartScrape(
}; };
logger.error("Smart scrape request failed", { logger.error("Smart scrape request failed", {
url,
prompt,
error: JSON.stringify(errorInfo), error: JSON.stringify(errorInfo),
}); });

View File

@ -25,7 +25,7 @@ export async function performAgent(
let smartscrapeResults: SmartScrapeResult; let smartscrapeResults: SmartScrapeResult;
try { try {
smartscrapeResults = await smartScrape(url, prompt, sessionId) smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id)
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "Cost limit exceeded") { if (error instanceof Error && error.message === "Cost limit exceeded") {
logger.error("Cost limit exceeded", { error }) logger.error("Cost limit exceeded", { error })

View File

@ -259,23 +259,6 @@ export async function generateCompletions({
throw new Error("document.markdown is undefined -- this is unexpected"); throw new Error("document.markdown is undefined -- this is unexpected");
} }
const { maxInputTokens, maxOutputTokens } = getModelLimits(
currentModel.modelId,
);
// Calculate 80% of max input tokens (for content)
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
// Use the new trimming function
const {
text: trimmedMarkdown,
numTokens,
warning: trimWarning,
} = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning);
// WE USE BIG MODELS NOW
// markdown = trimmedMarkdown;
// warning = trimWarning;
try { try {
const prompt = const prompt =
options.prompt !== undefined options.prompt !== undefined
@ -300,16 +283,16 @@ export async function generateCompletions({
return { return {
extract, extract,
warning, warning,
numTokens, numTokens: result.usage?.promptTokens ?? 0,
totalUsage: { totalUsage: {
promptTokens: numTokens, promptTokens: result.usage?.promptTokens ?? 0,
completionTokens: result.usage?.completionTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0,
totalTokens: numTokens + (result.usage?.completionTokens ?? 0), totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
}, },
model: currentModel.modelId, model: currentModel.modelId,
cost: calculateCost( cost: calculateCost(
currentModel.modelId, currentModel.modelId,
numTokens, result.usage?.promptTokens ?? 0,
result.usage?.completionTokens ?? 0, result.usage?.completionTokens ?? 0,
), ),
}; };
@ -341,16 +324,16 @@ export async function generateCompletions({
return { return {
extract, extract,
warning, warning,
numTokens, numTokens: result.usage?.promptTokens ?? 0,
totalUsage: { totalUsage: {
promptTokens: numTokens, promptTokens: result.usage?.promptTokens ?? 0,
completionTokens: result.usage?.completionTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0,
totalTokens: numTokens + (result.usage?.completionTokens ?? 0), totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
}, },
model: currentModel.modelId, model: currentModel.modelId,
cost: calculateCost( cost: calculateCost(
currentModel.modelId, currentModel.modelId,
numTokens, result.usage?.promptTokens ?? 0,
result.usage?.completionTokens ?? 0, result.usage?.completionTokens ?? 0,
), ),
}; };
@ -541,13 +524,13 @@ export async function generateCompletions({
} }
// Since generateObject doesn't provide token usage, we'll estimate it // Since generateObject doesn't provide token usage, we'll estimate it
const promptTokens = numTokens; const promptTokens = result.usage?.promptTokens ?? 0;
const completionTokens = result?.usage?.completionTokens ?? 0; const completionTokens = result.usage?.completionTokens ?? 0;
return { return {
extract, extract,
warning, warning,
numTokens, numTokens: promptTokens,
totalUsage: { totalUsage: {
promptTokens, promptTokens,
completionTokens, completionTokens,
@ -601,6 +584,7 @@ export async function performLLMExtract(
extractOptions: generationOptions, extractOptions: generationOptions,
urls: [meta.url], urls: [meta.url],
useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model), useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model),
scrapeId: meta.id,
}); });
if (warning) { if (warning) {
@ -761,6 +745,7 @@ export function removeDefaultProperty(schema: any): any {
export async function generateSchemaFromPrompt( export async function generateSchemaFromPrompt(
prompt: string, prompt: string,
logger: Logger,
): Promise<{ extract: any; cost: number }> { ): Promise<{ extract: any; cost: number }> {
const model = getModel("gpt-4o", "openai"); const model = getModel("gpt-4o", "openai");
const retryModel = getModel("gpt-4o-mini", "openai"); const retryModel = getModel("gpt-4o-mini", "openai");