From b6abe4f26b9a4a4441252ba326147fcc5b08b662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:23:18 -0700 Subject: [PATCH 1/9] fix(smartScrape): pass extract id --- apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 8e0b45f6..ff0316c7 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -62,6 +62,7 @@ export async function smartScrape( url, prompt, userProvidedId: sessionId ?? undefined, + extractId, models: { thinkingModel: { model: "gemini-2.5-pro-preview-03-25", From 0935ec210e6ec7694d2ed46785e29dc09f3b5847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:34:00 -0700 Subject: [PATCH 2/9] feat(smartScrape): better logging --- .../api/src/scraper/scrapeURL/lib/smartScrape.ts | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index ff0316c7..4e4cbb20 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { logger } from "../../../lib/logger"; +import { logger as _logger } from "../../../lib/logger"; import { robustFetch } from "./fetch"; import fs from "fs/promises"; import { configDotenv } from "dotenv"; @@ -51,8 +51,16 @@ export async function smartScrape( sessionId?: string, extractId?: string, ): Promise { + let logger = _logger.child({ + method: "smartScrape", + module: "smartScrape", + extractId, + url, + prompt, + sessionId, + }); try { - logger.info("Initiating smart scrape request", { url, prompt, sessionId }); + logger.info("Initiating smart scrape request"); // Pass schema type as generic parameter to 
robustFeth const response = await robustFetch({ @@ -116,8 +124,6 @@ export async function smartScrape( } logger.info("Smart scrape successful", { - url, - prompt, sessionId: response.sessionId, }); @@ -155,8 +161,6 @@ export async function smartScrape( }; logger.error("Smart scrape request failed", { - url, - prompt, error: JSON.stringify(errorInfo), }); From 2193bee13356b814d280d7c799e6d0e851edb09b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:52:20 -0700 Subject: [PATCH 3/9] Improve logging --- .../lib/extract/completions/singleAnswer.ts | 8 ++++++-- .../scrapeURL/lib/extractSmartScrape.ts | 18 +++++++----------- .../scrapeURL/transformers/llmExtract.ts | 1 + 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 5e76954e..4b36a58f 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -23,7 +23,7 @@ export async function singleAnswerCompletion({ prompt: string; systemPrompt: string; useAgent: boolean; - extractId?: string; + extractId: string; }): Promise<{ extract: any; tokenUsage: TokenUsage; @@ -35,7 +35,11 @@ export async function singleAnswerCompletion({ }> { const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt; const generationOptions: GenerateCompletionsOptions = { - logger: logger.child({ module: "extract", method: "generateCompletions" }), + logger: logger.child({ + module: "extract", + method: "generateCompletions", + extractId, + }), options: { mode: "llm", systemPrompt: diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 7380f380..955832ff 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -211,7 +211,7 @@ export async 
function extractData({ if (!schema && extractOptions.options.prompt) { logger.info("Generating schema from prompt"); - const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt); + const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger); otherCallCount++; otherCost += genRes.cost; schema = genRes.extract; @@ -249,7 +249,7 @@ export async function extractData({ } catch (error) { logger.error( "failed during extractSmartScrape.ts:generateCompletions", - error, + { error }, ); // console.log("failed during extractSmartScrape.ts:generateCompletions", error); } @@ -260,16 +260,12 @@ export async function extractData({ // console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); // console.log("smartscrape_prompt", extract?.smartscrape_prompt); try { - console.log("========================================="); - console.log( - "useAgent:", + logger.info("Smart schema resolved", { useAgent, - "shouldUseSmartscrape:", - extract?.shouldUseSmartscrape, - ); - console.log("url:", urls); - console.log("prompt:", extract?.smartscrape_prompt); - console.log("========================================="); + shouldUseSmartscrape: extract?.shouldUseSmartscrape, + url: urls, + prompt: extract?.smartscrape_prompt, + }) if (useAgent && extract?.shouldUseSmartscrape) { let smartscrapeResults: SmartScrapeResult[]; diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 502e268e..ebdb7a6b 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -761,6 +761,7 @@ export function removeDefaultProperty(schema: any): any { export async function generateSchemaFromPrompt( prompt: string, + logger: Logger, ): Promise<{ extract: any; cost: number }> { const model = getModel("gpt-4o", "openai"); const retryModel = getModel("gpt-4o-mini", "openai"); From 
2245650bc3db47ad8c5ef12e40fe24bccc475375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 15 Apr 2025 23:54:56 -0700 Subject: [PATCH 4/9] fix --- .../src/lib/extract/completions/analyzeSchemaAndPrompt.ts | 5 +++-- apps/api/src/lib/extract/extraction-service.ts | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index b02588be..98e3ccc0 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -8,14 +8,15 @@ import { buildAnalyzeSchemaPrompt, buildAnalyzeSchemaUserPrompt, } from "../build-prompts"; -import { logger } from "../../../lib/logger"; import { jsonSchema } from "ai"; import { getModel } from "../../../lib/generic-ai"; +import { Logger } from "winston"; export async function analyzeSchemaAndPrompt( urls: string[], schema: any, prompt: string, + logger: Logger, ): Promise<{ isMultiEntity: boolean; multiEntityKeys: string[]; @@ -26,7 +27,7 @@ export async function analyzeSchemaAndPrompt( }> { let cost = 0; if (!schema) { - const genRes = await generateSchemaFromPrompt(prompt); + const genRes = await generateSchemaFromPrompt(prompt, logger); schema = genRes.extract; cost = genRes.cost; } diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 1a6a4262..87f4f76a 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -178,7 +178,7 @@ export async function performExtraction( let reqSchema = request.schema; if (!reqSchema && request.prompt) { - const schemaGenRes = await generateSchemaFromPrompt(request.prompt); + const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger); reqSchema = schemaGenRes.extract; costTracking.otherCallCount++; costTracking.otherCost += 
schemaGenRes.cost; @@ -214,7 +214,7 @@ export async function performExtraction( keyIndicators, tokenUsage: schemaAnalysisTokenUsage, cost: schemaAnalysisCost, - } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? ""); + } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger); logger.debug("Analyzed schema.", { isMultiEntity, From a06910115b8e4f0486b6015fcc7e316edecbe141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 00:03:07 -0700 Subject: [PATCH 5/9] feat(extractSmartScrape): log provided extract id --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 955832ff..cc63a09f 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -265,6 +265,7 @@ export async function extractData({ shouldUseSmartscrape: extract?.shouldUseSmartscrape, url: urls, prompt: extract?.smartscrape_prompt, + providedExtractId: extractId, }) if (useAgent && extract?.shouldUseSmartscrape) { From edd4c3090800ba3754a16fe5bb7922dbd4d866e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 00:11:27 -0700 Subject: [PATCH 6/9] fix(extractSmartScrape): pass extractId as the extractId argument, not sessionId --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index cc63a09f..d2ad2a06 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -272,7 +272,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, extractId), + await smartScrape(urls[0], 
extract?.smartscrape_prompt, undefined, extractId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -284,6 +284,7 @@ export async function extractData({ return await smartScrape( urls[page.page_index], page.smartscrape_prompt, + undefined, extractId, ); }), From 0d813b628bd4e2a2c81ba87a0b1d96fb8657b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 00:25:42 -0700 Subject: [PATCH 7/9] feat: correlate smart scrape --- apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts | 5 ++++- apps/api/src/scraper/scrapeURL/lib/smartScrape.ts | 3 +++ apps/api/src/scraper/scrapeURL/transformers/agent.ts | 2 +- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index d2ad2a06..cee1d5a6 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -185,11 +185,13 @@ export async function extractData({ urls, useAgent, extractId, + scrapeId, }: { extractOptions: GenerateCompletionsOptions; urls: string[]; useAgent: boolean; extractId?: string; + scrapeId?: string; }): Promise<{ extractedDataArray: any[]; warning: any; @@ -272,7 +274,7 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, undefined, extractId), + await smartScrape(urls[0], extract?.smartscrape_prompt, undefined, extractId, scrapeId), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -286,6 +288,7 @@ export async function extractData({ page.smartscrape_prompt, undefined, extractId, + scrapeId, ); }), ); diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts 
index 4e4cbb20..046a7b5e 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -50,6 +50,7 @@ export async function smartScrape( prompt: string, sessionId?: string, extractId?: string, + scrapeId?: string, ): Promise { let logger = _logger.child({ method: "smartScrape", @@ -58,6 +59,7 @@ export async function smartScrape( url, prompt, sessionId, + scrapeId, }); try { logger.info("Initiating smart scrape request"); @@ -71,6 +73,7 @@ export async function smartScrape( prompt, userProvidedId: sessionId ?? undefined, extractId, + scrapeId, models: { thinkingModel: { model: "gemini-2.5-pro-preview-03-25", diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts index 6ab32862..30a0f46f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/agent.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -25,7 +25,7 @@ export async function performAgent( let smartscrapeResults: SmartScrapeResult; try { - smartscrapeResults = await smartScrape(url, prompt, sessionId) + smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id) } catch (error) { if (error instanceof Error && error.message === "Cost limit exceeded") { logger.error("Cost limit exceeded", { error }) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index ebdb7a6b..804ead2e 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -601,6 +601,7 @@ export async function performLLMExtract( extractOptions: generationOptions, urls: [meta.url], useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model), + scrapeId: meta.id, }); if (warning) { From 129b10e4789b33fcda73542b0a5cf3023f798d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 
01:04:48 -0700 Subject: [PATCH 8/9] fix(llmExtract): cost calculation --- .../scrapeURL/transformers/llmExtract.ts | 39 ++++++------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 804ead2e..fe65d591 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -259,23 +259,6 @@ export async function generateCompletions({ throw new Error("document.markdown is undefined -- this is unexpected"); } - const { maxInputTokens, maxOutputTokens } = getModelLimits( - currentModel.modelId, - ); - // Calculate 80% of max input tokens (for content) - const maxTokensSafe = Math.floor(maxInputTokens * 0.8); - - // Use the new trimming function - const { - text: trimmedMarkdown, - numTokens, - warning: trimWarning, - } = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning); - - // WE USE BIG MODELS NOW - // markdown = trimmedMarkdown; - // warning = trimWarning; - try { const prompt = options.prompt !== undefined @@ -300,16 +283,16 @@ export async function generateCompletions({ return { extract, warning, - numTokens, + numTokens: result.usage?.promptTokens ?? 0, totalUsage: { - promptTokens: numTokens, + promptTokens: result.usage?.promptTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + totalTokens: (result.usage?.promptTokens ?? 0) + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, cost: calculateCost( currentModel.modelId, - numTokens, + result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0, ), }; @@ -341,16 +324,16 @@ export async function generateCompletions({ return { extract, warning, - numTokens, + numTokens: result.usage?.promptTokens ?? 
0, totalUsage: { - promptTokens: numTokens, + promptTokens: result.usage?.promptTokens ?? 0, completionTokens: result.usage?.completionTokens ?? 0, - totalTokens: numTokens + (result.usage?.completionTokens ?? 0), + totalTokens: (result.usage?.promptTokens ?? 0) + (result.usage?.completionTokens ?? 0), }, model: currentModel.modelId, cost: calculateCost( currentModel.modelId, - numTokens, + result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0, ), }; @@ -541,13 +524,13 @@ export async function generateCompletions({ } // Since generateObject doesn't provide token usage, we'll estimate it - const promptTokens = numTokens; - const completionTokens = result?.usage?.completionTokens ?? 0; + const promptTokens = result.usage?.promptTokens ?? 0; + const completionTokens = result.usage?.completionTokens ?? 0; return { extract, warning, - numTokens, + numTokens: promptTokens, totalUsage: { promptTokens, completionTokens, From dcef6fbc13fcc673f3fea91385996b95beebacf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 01:13:49 -0700 Subject: [PATCH 9/9] feat(extractSmartScrape): mog it to 100 pages max --- .../src/scraper/scrapeURL/lib/extractSmartScrape.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index cee1d5a6..a20c5d2c 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -279,10 +279,18 @@ export async function extractData({ smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; } else { - const pages = extract?.smartscrapePages; + const pages = extract?.smartscrapePages ?? 
[]; //do it async promiseall instead + if (pages.length > 100) { + logger.warn("Smart scrape pages limit exceeded, only first 100 pages will be scraped", { + pagesLength: pages.length, + extractId, + scrapeId, + }); + } + smartscrapeResults = await Promise.all( - pages.map(async (page) => { + pages.slice(0, 100).map(async (page) => { return await smartScrape( urls[page.page_index], page.smartscrape_prompt,