This commit is contained in:
Thomas Kosmas 2025-04-03 23:01:18 +03:00
parent 2fdff9cc45
commit 2ffde5abc1
5 changed files with 36 additions and 28 deletions

View File

@ -1,5 +1,8 @@
import { logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract"; import {
generateCompletions,
GenerateCompletionsOptions,
} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document"; import { buildDocument } from "../build-document";
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types"; import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
import { Document } from "../../../controllers/v1/types"; import { Document } from "../../../controllers/v1/types";
@ -54,7 +57,7 @@ export async function batchExtractPromise(
const { extractedDataArray, warning } = await extractData({ const { extractedDataArray, warning } = await extractData({
extractOptions: generationOptions, extractOptions: generationOptions,
url: doc.metadata.sourceURL || doc.metadata.url || "", urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
}); });
await fs.writeFile( await fs.writeFile(
@ -74,5 +77,5 @@ export async function batchExtractPromise(
}, },
warning: warning, warning: warning,
sources: [doc.metadata.url || doc.metadata.sourceURL || ""], sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
} };
} }

View File

@ -1,5 +1,8 @@
import { logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract"; import {
generateCompletions,
GenerateCompletionsOptions,
} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document"; import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types"; import { Document, TokenUsage } from "../../../controllers/v1/types";
import { getModel } from "../../../lib/generic-ai"; import { getModel } from "../../../lib/generic-ai";
@ -35,7 +38,9 @@ export async function singleAnswerCompletion({
prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt, prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt,
schema: rSchema, schema: rSchema,
}, },
markdown: singleAnswerDocs.map((x, i) => `[ID: ${i}]` + buildDocument(x)).join("\n"), markdown: singleAnswerDocs
.map((x, i) => `[ID: ${i}]` + buildDocument(x))
.join("\n"),
isExtractEndpoint: true, isExtractEndpoint: true,
model: getModel("gemini-2.0-flash", "google"), model: getModel("gemini-2.0-flash", "google"),
}; };
@ -58,7 +63,6 @@ export async function singleAnswerCompletion({
), ),
}; };
// const completion = await generateCompletions({ // const completion = await generateCompletions({
// logger: logger.child({ module: "extract", method: "generateCompletions" }), // logger: logger.child({ module: "extract", method: "generateCompletions" }),
// options: { // options: {
@ -79,7 +83,7 @@ export async function singleAnswerCompletion({
// ); // );
return { return {
extract: completion.extract, extract: completion.extract,
tokenUsage: completion.totalUsage, tokenUsage: completion.tokenUsage,
sources: singleAnswerDocs.map( sources: singleAnswerDocs.map(
(doc) => doc.metadata.url || doc.metadata.sourceURL || "", (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
), ),

View File

@ -455,11 +455,12 @@ export async function performExtraction(
); );
// Race between timeout and completion // Race between timeout and completion
const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>; const multiEntityCompletion = (await completionPromise) as Awaited<
ReturnType<typeof batchExtractPromise>
>;
// TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
// Track multi-entity extraction tokens // Track multi-entity extraction tokens
if (multiEntityCompletion) { if (multiEntityCompletion) {
tokenUsage.push(multiEntityCompletion.totalUsage); tokenUsage.push(multiEntityCompletion.totalUsage);
@ -520,7 +521,9 @@ export async function performExtraction(
); );
extractionResults.push(...validResults); extractionResults.push(...validResults);
// Merge all extracts from valid results into a single array // Merge all extracts from valid results into a single array
const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]); const extractArrays = validResults.map((r) =>
Array.isArray(r.extract) ? r.extract : [r.extract],
);
const mergedExtracts = extractArrays.flat(); const mergedExtracts = extractArrays.flat();
multiEntityCompletions.push(...mergedExtracts); multiEntityCompletions.push(...mergedExtracts);
logger.debug("All multi-entity completion chunks finished.", { logger.debug("All multi-entity completion chunks finished.", {
@ -684,7 +687,7 @@ export async function performExtraction(
tokenUsage: singleAnswerTokenUsage, tokenUsage: singleAnswerTokenUsage,
sources: singleAnswerSources, sources: singleAnswerSources,
} = await singleAnswerCompletion({ } = await singleAnswerCompletion({
url: request.urls?.[0] || "", urls: [request.urls?.[0] || ""],
singleAnswerDocs, singleAnswerDocs,
rSchema, rSchema,
links, links,
@ -693,10 +696,7 @@ export async function performExtraction(
}); });
logger.debug("Done generating singleAnswer completions."); logger.debug("Done generating singleAnswer completions.");
singleAnswerResult = transformArrayToObject( singleAnswerResult = transformArrayToObject(rSchema, completionResult);
rSchema,
completionResult,
);
singleAnswerResult = deduplicateObjectsArray(singleAnswerResult); singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);

View File

@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
shouldUseSmartscrape: { shouldUseSmartscrape: {
type: "boolean", type: "boolean",
description: description:
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.", "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
}, },
// Note: extractedData is added dynamically in prepareSmartScrapeSchema // Note: extractedData is added dynamically in prepareSmartScrapeSchema
}; };
@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
type: ["string", "null"], type: ["string", "null"],
// Using the more detailed multi-step description as the common one // Using the more detailed multi-step description as the common one
description: description:
"Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.", "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
}, },
smartscrape_prompt: { smartscrape_prompt: {
type: ["string", "null"], type: ["string", "null"],
// Using the more detailed multi-step description as the common one // Using the more detailed multi-step description as the common one
description: description:
"Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').", "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
}, },
}; };
@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
smartScrapePages: { smartScrapePages: {
type: "array", type: "array",
description: description:
"Make an entry for each page we want to run smart scrape on.", "Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.",
items: { items: {
type: "object", type: "object",
properties: { properties: {
@ -185,7 +185,7 @@ export async function extractData({
//WRAP SCHEMA //WRAP SCHEMA
const schema = extractOptions.options.schema; const schema = extractOptions.options.schema;
const logger = extractOptions.logger; const logger = extractOptions.logger;
const isSingleUrl = urls.length === 0; const isSingleUrl = urls.length === 1;
console.log("!!!!!!!!!!!!!!!!!!hereee"); console.log("!!!!!!!!!!!!!!!!!!hereee");
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl); const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
const extractOptionsNewSchema = { const extractOptionsNewSchema = {

View File

@ -413,12 +413,13 @@ export async function performLLMExtract(
// model: getModel("o3-mini", "openai"), // Keeping existing model selection // model: getModel("o3-mini", "openai"), // Keeping existing model selection
// model: getModel("o3-mini", "openai"), // model: getModel("o3-mini", "openai"),
// model: getModel("qwen-qwq-32b", "groq"), // model: getModel("qwen-qwq-32b", "groq"),
// model: getModel("gemini-2.0-flash", "google"),
model: getModel("gemini-2.5-pro-exp-03-25", "google"), model: getModel("gemini-2.5-pro-exp-03-25", "google"),
}; };
const { extractedDataArray, warning } = await extractData({ const { extractedDataArray, warning } = await extractData({
extractOptions: generationOptions, extractOptions: generationOptions,
url: meta.url, urls: [meta.url],
}); });
//TODO: add merge here //TODO: add merge here