diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index b02588be..98e3ccc0 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -8,14 +8,15 @@ import { buildAnalyzeSchemaPrompt, buildAnalyzeSchemaUserPrompt, } from "../build-prompts"; -import { logger } from "../../../lib/logger"; import { jsonSchema } from "ai"; import { getModel } from "../../../lib/generic-ai"; +import { Logger } from "winston"; export async function analyzeSchemaAndPrompt( urls: string[], schema: any, prompt: string, + logger: Logger, ): Promise<{ isMultiEntity: boolean; multiEntityKeys: string[]; @@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt( }> { let cost = 0; if (!schema) { - const genRes = await generateSchemaFromPrompt(prompt); + const genRes = await generateSchemaFromPrompt(prompt, logger); schema = genRes.extract; cost = genRes.cost; } diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 5e76954e..4b36a58f 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -23,7 +23,7 @@ export async function singleAnswerCompletion({ prompt: string; systemPrompt: string; useAgent: boolean; - extractId?: string; + extractId: string; }): Promise<{ extract: any; tokenUsage: TokenUsage; @@ -35,7 +35,11 @@ export async function singleAnswerCompletion({ }> { const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt; const generationOptions: GenerateCompletionsOptions = { - logger: logger.child({ module: "extract", method: "generateCompletions" }), + logger: logger.child({ + module: "extract", + method: "generateCompletions", + extractId, + }), options: { mode: "llm", systemPrompt: diff --git a/apps/api/src/lib/extract/extraction-service.ts 
b/apps/api/src/lib/extract/extraction-service.ts index dd828f4f..0b72f379 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -178,7 +178,7 @@ export async function performExtraction( let reqSchema = request.schema; if (!reqSchema && request.prompt) { - const schemaGenRes = await generateSchemaFromPrompt(request.prompt); + const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger); reqSchema = schemaGenRes.extract; costTracking.otherCallCount++; costTracking.otherCost += schemaGenRes.cost; @@ -214,7 +214,7 @@ export async function performExtraction( keyIndicators, tokenUsage: schemaAnalysisTokenUsage, cost: schemaAnalysisCost, - } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? ""); + } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger); logger.debug("Analyzed schema.", { isMultiEntity, diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index f642383a..b6f19d17 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -186,13 +186,15 @@ export async function extractData({ urls, useAgent, extractId, - sessionId + sessionId, + scrapeId, }: { extractOptions: GenerateCompletionsOptions; urls: string[]; useAgent: boolean; extractId?: string; sessionId?: string; + scrapeId?: string; }): Promise<{ extractedDataArray: any[]; warning: any; @@ -214,7 +216,7 @@ export async function extractData({ if (!schema && extractOptions.options.prompt) { logger.info("Generating schema from prompt"); - const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt); + const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger); otherCallCount++; otherCost += genRes.cost; schema = genRes.extract; @@ -252,7 +254,7 @@ export async function extractData({ } catch (error) { 
logger.error( "failed during extractSmartScrape.ts:generateCompletions", - error, + { error }, ); // console.log("failed during extractSmartScrape.ts:generateCompletions", error); } @@ -263,34 +265,41 @@ export async function extractData({ // console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); // console.log("smartscrape_prompt", extract?.smartscrape_prompt); try { - console.log("========================================="); - console.log( - "useAgent:", + logger.info("Smart schema resolved", { useAgent, - "shouldUseSmartscrape:", - extract?.shouldUseSmartscrape, - ); - console.log("url:", urls); - console.log("prompt:", extract?.smartscrape_prompt); - console.log("========================================="); + shouldUseSmartscrape: extract?.shouldUseSmartscrape, + url: urls, + prompt: extract?.smartscrape_prompt, + providedExtractId: extractId, + }) if (useAgent && extract?.shouldUseSmartscrape) { let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId), + await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; } else { - const pages = extract?.smartscrapePages; + const pages = extract?.smartscrapePages ?? 
[]; //do it async promiseall instead + if (pages.length > 100) { + logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", { + pagesLength: pages.length, + extractId, + scrapeId, + }); + } + smartscrapeResults = await Promise.all( - pages.map(async (page) => { + pages.slice(0, 100).map(async (page) => { return await smartScrape( urls[page.page_index], page.smartscrape_prompt, + undefined, extractId, + scrapeId, ); }), ); diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 8e0b45f6..046a7b5e 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { logger } from "../../../lib/logger"; +import { logger as _logger } from "../../../lib/logger"; import { robustFetch } from "./fetch"; import fs from "fs/promises"; import { configDotenv } from "dotenv"; @@ -50,9 +50,19 @@ export async function smartScrape( url: string, prompt: string, sessionId?: string, extractId?: string, + scrapeId?: string, ): Promise<SmartScrapeResult> { + let logger = _logger.child({ + method: "smartScrape", + module: "smartScrape", + extractId, + url, + prompt, + sessionId, + scrapeId, + }); try { - logger.info("Initiating smart scrape request", { url, prompt, sessionId }); + logger.info("Initiating smart scrape request"); // Pass schema type as generic parameter to robustFeth const response = await robustFetch({ @@ -62,6 +72,8 @@ export async function smartScrape( url, prompt, userProvidedId: sessionId ??
undefined, + extractId, + scrapeId, models: { thinkingModel: { model: "gemini-2.5-pro-preview-03-25", @@ -115,8 +127,6 @@ export async function smartScrape( } logger.info("Smart scrape successful", { - url, - prompt, sessionId: response.sessionId, }); @@ -154,8 +164,6 @@ export async function smartScrape( }; logger.error("Smart scrape request failed", { - url, - prompt, error: JSON.stringify(errorInfo), }); diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts index 6ab32862..30a0f46f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/agent.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -25,7 +25,7 @@ export async function performAgent( let smartscrapeResults: SmartScrapeResult; try { - smartscrapeResults = await smartScrape(url, prompt, sessionId) + smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id) } catch (error) { if (error instanceof Error && error.message === "Cost limit exceeded") { logger.error("Cost limit exceeded", { error }) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 502e268e..fe65d591 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -259,23 +259,6 @@ export async function generateCompletions({ throw new Error("document.markdown is undefined -- this is unexpected"); } - const { maxInputTokens, maxOutputTokens } = getModelLimits( - currentModel.modelId, - ); - // Calculate 80% of max input tokens (for content) - const maxTokensSafe = Math.floor(maxInputTokens * 0.8); - - // Use the new trimming function - const { - text: trimmedMarkdown, - numTokens, - warning: trimWarning, - } = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning); - - // WE USE BIG MODELS NOW - // markdown = trimmedMarkdown; - // warning = trimWarning; - try { const 
prompt = options.prompt !== undefined @@ -300,16 +283,16 @@ return { extract, warning, - numTokens, + numTokens: result.usage?.promptTokens ?? 0, totalUsage: { - promptTokens: numTokens, + promptTokens: result.usage?.promptTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + totalTokens: (result.usage?.promptTokens ?? 0) + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, cost: calculateCost( currentModel.modelId, - numTokens, + result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0, ), }; @@ -341,16 +324,16 @@ return { extract, warning, - numTokens, + numTokens: result.usage?.promptTokens ?? 0, totalUsage: { - promptTokens: numTokens, + promptTokens: result.usage?.promptTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + totalTokens: (result.usage?.promptTokens ?? 0) + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, cost: calculateCost( currentModel.modelId, - numTokens, + result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0, ), }; @@ -541,13 +524,13 @@ } // Since generateObject doesn't provide token usage, we'll estimate it - const promptTokens = numTokens; - const completionTokens = result?.usage?.completionTokens ?? 0; + const promptTokens = result.usage?.promptTokens ?? 0; + const completionTokens = result.usage?.completionTokens ??
0; return { extract, warning, - numTokens, + numTokens: promptTokens, totalUsage: { promptTokens, completionTokens, @@ -601,6 +584,7 @@ export async function performLLMExtract( extractOptions: generationOptions, urls: [meta.url], useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model), + scrapeId: meta.id, }); if (warning) { @@ -761,6 +745,7 @@ export function removeDefaultProperty(schema: any): any { export async function generateSchemaFromPrompt( prompt: string, + logger: Logger, ): Promise<{ extract: any; cost: number }> { const model = getModel("gpt-4o", "openai"); const retryModel = getModel("gpt-4o-mini", "openai");