From 6a93293fd020eb077607c9c4280ff7a849f4c1ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 16 Apr 2025 16:39:48 -0700 Subject: [PATCH] feat(smart-scrape): use correct models for multi-entity assembly --- .../scrapeURL/lib/extractSmartScrape.ts | 20 +++++++++++++------ .../src/scraper/scrapeURL/lib/smartScrape.ts | 12 +++++++++-- .../scraper/scrapeURL/transformers/agent.ts | 7 ++++++- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 37205343..82f16d12 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -315,7 +315,13 @@ export async function extractData({ let smartscrapeResults: SmartScrapeResult[]; if (isSingleUrl) { smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt, sessionId, extractId, scrapeId), + await smartScrape({ + url: urls[0], + prompt: extract?.smartscrape_prompt, + sessionId, + extractId, + scrapeId, + }), ]; smartScrapeCost += smartscrapeResults[0].tokenUsage; smartScrapeCallCount++; @@ -332,13 +338,13 @@ export async function extractData({ smartscrapeResults = await Promise.all( pages.slice(0, 100).map(async (page) => { - return await smartScrape( - urls[page.page_index], - page.smartscrape_prompt, - undefined, + return await smartScrape({ + url: urls[page.page_index], + prompt: page.smartscrape_prompt, + sessionId, extractId, scrapeId, - ); + }); }), ); smartScrapeCost += smartscrapeResults.reduce( @@ -364,6 +370,8 @@ export async function extractData({ const newExtractOptions = { ...extractOptions, markdown: markdown, + model: getModel("gemini-2.5-pro-preview-03-25", "vertex"), + retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"), }; const { extract, warning, totalUsage, model, cost } = await generateCompletions(newExtractOptions); diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 046a7b5e..a913ec27 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -45,13 +45,21 @@ export type SmartScrapeResult = z.infer; * @returns A promise that resolves to an object matching the SmartScrapeResult type. * @throws Throws an error if the request fails or the response is invalid. */ -export async function smartScrape( +export async function smartScrape({ + url, + prompt, + sessionId, + extractId, + scrapeId, + beforeSubmission, +}: { url: string, prompt: string, sessionId?: string, extractId?: string, scrapeId?: string, -): Promise { + beforeSubmission?: () => unknown, +}): Promise { let logger = _logger.child({ method: "smartScrape", module: "smartScrape", diff --git a/apps/api/src/scraper/scrapeURL/transformers/agent.ts b/apps/api/src/scraper/scrapeURL/transformers/agent.ts index 30a0f46f..5ad304d3 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/agent.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/agent.ts @@ -25,7 +25,12 @@ export async function performAgent( let smartscrapeResults: SmartScrapeResult; try { - smartscrapeResults = await smartScrape(url, prompt, sessionId, undefined, meta.id) + smartscrapeResults = await smartScrape({ + url, + prompt, + sessionId, + scrapeId: meta.id, + }) } catch (error) { if (error instanceof Error && error.message === "Cost limit exceeded") { logger.error("Cost limit exceeded", { error })