fixes

2025-08-16 11:45:56 +08:00 · 2025-04-03 23:01:18 +03:00 · 2025-04-03 23:01:18 +03:00 · 2ffde5abc1
commit 2ffde5abc1
parent 2fdff9cc45
5 changed files with 36 additions and 28 deletions
--- a/apps/api/src/lib/extract/completions/batchExtract.ts
+++ b/apps/api/src/lib/extract/completions/batchExtract.ts
@ -1,5 +1,8 @@
 import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
 import { Document } from "../../../controllers/v1/types";
@ -54,7 +57,7 @@ export async function batchExtractPromise(
  const { extractedDataArray, warning } = await extractData({
    extractOptions: generationOptions,
-    url: doc.metadata.sourceURL || doc.metadata.url || "",
+    urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
  });
  await fs.writeFile(
@ -74,5 +77,5 @@ export async function batchExtractPromise(
    },
    warning: warning,
    sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
-  }
+  };
 }
--- a/apps/api/src/lib/extract/completions/singleAnswer.ts
+++ b/apps/api/src/lib/extract/completions/singleAnswer.ts
@ -1,5 +1,8 @@
 import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { Document, TokenUsage } from "../../../controllers/v1/types";
 import { getModel } from "../../../lib/generic-ai";
@ -35,7 +38,9 @@ export async function singleAnswerCompletion({
      prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt,
      schema: rSchema,
    },
-      markdown: singleAnswerDocs.map((x, i) => `[ID: ${i}]` + buildDocument(x)).join("\n"),
+    markdown: singleAnswerDocs
+      .map((x, i) => `[ID: ${i}]` + buildDocument(x))
+      .join("\n"),
    isExtractEndpoint: true,
    model: getModel("gemini-2.0-flash", "google"),
  };
@ -58,7 +63,6 @@ export async function singleAnswerCompletion({
    ),
  };
  // const completion = await generateCompletions({
  //   logger: logger.child({ module: "extract", method: "generateCompletions" }),
  //   options: {
@ -79,7 +83,7 @@ export async function singleAnswerCompletion({
  // );
  return {
    extract: completion.extract,
-    tokenUsage: completion.totalUsage,
+    tokenUsage: completion.tokenUsage,
    sources: singleAnswerDocs.map(
      (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
    ),
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@ -455,11 +455,12 @@ export async function performExtraction(
          );
          // Race between timeout and completion
-          const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
+          const multiEntityCompletion = (await completionPromise) as Awaited<
+            ReturnType<typeof batchExtractPromise>
+          >;
          // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
          // Track multi-entity extraction tokens
          if (multiEntityCompletion) {
            tokenUsage.push(multiEntityCompletion.totalUsage);
@ -520,7 +521,9 @@ export async function performExtraction(
      );
      extractionResults.push(...validResults);
      // Merge all extracts from valid results into a single array
-      const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
+      const extractArrays = validResults.map((r) =>
+        Array.isArray(r.extract) ? r.extract : [r.extract],
+      );
      const mergedExtracts = extractArrays.flat();
      multiEntityCompletions.push(...mergedExtracts);
      logger.debug("All multi-entity completion chunks finished.", {
@ -684,7 +687,7 @@ export async function performExtraction(
      tokenUsage: singleAnswerTokenUsage,
      sources: singleAnswerSources,
    } = await singleAnswerCompletion({
-      url: request.urls?.[0] || "",
+      urls: [request.urls?.[0] || ""],
      singleAnswerDocs,
      rSchema,
      links,
@ -693,10 +696,7 @@ export async function performExtraction(
    });
    logger.debug("Done generating singleAnswer completions.");
-    singleAnswerResult = transformArrayToObject(
+    singleAnswerResult = transformArrayToObject(rSchema, completionResult);
-      rSchema,
-      completionResult,
-    );
    singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
  shouldUseSmartscrape: {
    type: "boolean",
    description:
-      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
+      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
  },
  // Note: extractedData is added dynamically in prepareSmartScrapeSchema
 };
@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
+      "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
  },
  smartscrape_prompt: {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
+      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
  },
 };
@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
    smartScrapePages: {
      type: "array",
      description:
-        "Make an entry for each page we want to run smart scrape on.",
+        "Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.",
      items: {
        type: "object",
        properties: {
@ -185,7 +185,7 @@ export async function extractData({
  //WRAP SCHEMA
  const schema = extractOptions.options.schema;
  const logger = extractOptions.logger;
-  const isSingleUrl = urls.length === 0;
+  const isSingleUrl = urls.length === 1;
  console.log("!!!!!!!!!!!!!!!!!!hereee");
  const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
  const extractOptionsNewSchema = {
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@ -413,12 +413,13 @@ export async function performLLMExtract(
      // model: getModel("o3-mini", "openai"), // Keeping existing model selection
      // model: getModel("o3-mini", "openai"),
      // model: getModel("qwen-qwq-32b", "groq"),
      // model: getModel("gemini-2.0-flash", "google"),
      model: getModel("gemini-2.5-pro-exp-03-25", "google"),
    };
    const { extractedDataArray, warning } = await extractData({
      extractOptions: generationOptions,
-      url: meta.url,
+      urls: [meta.url],
    });
    //TODO: add merge here