integrate smartscrape into llmExtract

Thomas Kosmas 2025-04-03 20:16:34 +03:00
parent c0fe770520
commit fd58e782b1
3 changed files with 385 additions and 106 deletions

View File

@@ -0,0 +1,198 @@
import { Logger } from "winston";
import { z } from "zod";
import {
generateCompletions,
GenerateCompletionsOptions,
} from "../transformers/llmExtract";
import { smartScrape } from "./smartScrape";
import { parseMarkdown } from "../../../lib/html-to-markdown";
const smartScrapeWrapperSchemaDefinition = {
type: "object",
properties: {
// extractedData will be added dynamically
shouldUseSmartscrape: {
type: "boolean",
description:
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
},
smartscrape_reasoning: {
type: ["string", "null"],
description:
"Fill this only if shouldUseSmartscrape is true. Reasoning for why you think the page requires or doesnt require smartscrape. If it does explain which data you can't get with the initial page load.",
},
smartscrape_prompt: {
type: ["string", "null"],
description:
"Prompt to use for Smartscrape refinement if shouldUseSmartscrape is true. Explain exactly what actions smartscrape should do. Smartscrape is a tool that can perform actions on the page like clicking, scrolling, etc. It cant extract data it will just return the pages and we will handle the extraction.",
},
},
additionalProperties: false, // Keep this for the top-level wrapper
required: ["extractedData", "shouldUseSmartscrape"],
// Note: Dynamically adding 'smartscrape_reasoning' and 'smartscrape_prompt' to required
// based on shouldUseSmartscrape is complex in standard JSON schema and might depend on the LLM's interpretation.
// Keeping extractedData and shouldUseSmartscrape as the base requirements.
};
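For illustration, a completion that satisfies this wrapper might look like the following (values are hypothetical and not part of this commit; extractedData follows whatever user schema gets nested in):
// Illustrative LLM output under the wrapper schema (hypothetical values):
// {
//   "extractedData": { "title": "Acme Widget", "price": null },
//   "shouldUseSmartscrape": true,
//   "smartscrape_reasoning": "The price is only rendered after clicking the 'Show pricing' button.",
//   "smartscrape_prompt": "Click the 'Show pricing' button and wait for the price to appear."
// }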
// TODO: go over and check: every type should allow null,
// i.e. type: "string" should become type: ["string", "null"]
export function makeSchemaNullable(schema: any): any {
if (typeof schema !== "object" || schema === null) {
return schema; // Base case: not an object/array or is null
}
if (Array.isArray(schema)) {
return schema.map(makeSchemaNullable); // Recurse for array items
}
// Process object properties
const newSchema: { [key: string]: any } = {};
let isObject = false; // Flag to track if this level is an object type
for (const key in schema) {
if (key === "additionalProperties") {
continue; // Skip existing additionalProperties, we'll set it later if needed
}
if (key === "type") {
const currentType = schema[key];
let finalType: string | string[];
if (typeof currentType === "string") {
if (currentType === "object") isObject = true;
finalType =
currentType === "null" ? currentType : [currentType, "null"];
} else if (Array.isArray(currentType)) {
if (currentType.includes("object")) isObject = true;
finalType = currentType.includes("null")
? currentType
: [...currentType, "null"];
} else {
finalType = currentType; // Handle unexpected types?
}
newSchema[key] = finalType;
} else if (typeof schema[key] === "object" && schema[key] !== null) {
// Recurse for nested objects (properties, items, definitions, etc.)
newSchema[key] = makeSchemaNullable(schema[key]);
if (key === "properties") {
// Having a 'properties' key strongly implies an object type
isObject = true;
}
} else {
// Copy other properties directly (like required, description, etc.)
newSchema[key] = schema[key];
}
}
// **Crucial Fix:** If this schema represents an object type, add additionalProperties: false
if (isObject) {
// Ensure 'properties' exists if 'type' was 'object' but 'properties' wasn't defined
if (!newSchema.properties) {
newSchema.properties = {};
}
newSchema.additionalProperties = false;
}
return newSchema;
}
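A before/after sketch of what makeSchemaNullable is expected to produce for a simple, hypothetical schema (illustrative only, not part of this commit):
// Input (hypothetical user schema):
//   {
//     type: "object",
//     properties: { title: { type: "string" }, price: { type: "number" } },
//     required: ["title"],
//   }
// Expected output:
//   {
//     type: ["object", "null"],
//     properties: {
//       title: { type: ["string", "null"] },
//       price: { type: ["number", "null"] },
//     },
//     required: ["title"],
//     additionalProperties: false,
//   }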
/**
* Wraps the user-provided schema with the SmartScrape decision fields.
*
* @param originalSchema The user-provided schema (JSON Schema object or Zod schema).
* @param logger Winston logger instance.
* @returns An object containing the schema to use for the LLM call.
*/
export function prepareSmartScrapeSchema(
originalSchema: any | z.ZodTypeAny | undefined,
logger: Logger,
) {
// Make the user's schema nullable *and* ensure nested objects have additionalProperties:false
const nullableAndStrictSchema = makeSchemaNullable(originalSchema);
const wrappedSchema = {
...smartScrapeWrapperSchemaDefinition, // Uses the wrapper defined above
properties: {
extractedData: nullableAndStrictSchema, // Nest the modified original schema
...smartScrapeWrapperSchemaDefinition.properties, // Add smartscrape fields
},
// required is inherited from smartScrapeWrapperSchemaDefinition
// additionalProperties:false is inherited from smartScrapeWrapperSchemaDefinition for the top level
};
logger.info("Wrapping original schema with SmartScrape fields.", {
// Limit logging potentially large schemas
wrappedSchemaKeys: Object.keys(wrappedSchema.properties),
});
return { schemaToUse: wrappedSchema };
}
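Continuing the hypothetical schema above, prepareSmartScrapeSchema nests the nullable version under extractedData next to the decision fields; roughly:
// Illustrative shape of schemaToUse (sketch):
// {
//   type: "object",
//   properties: {
//     extractedData: { /* nullable user schema from makeSchemaNullable */ },
//     shouldUseSmartscrape: { type: "boolean", description: "..." },
//     smartscrape_reasoning: { type: ["string", "null"], description: "..." },
//     smartscrape_prompt: { type: ["string", "null"], description: "..." },
//   },
//   required: ["extractedData", "shouldUseSmartscrape"],
//   additionalProperties: false,
// }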
export async function extractData({
extractOptions,
url,
}: {
extractOptions: GenerateCompletionsOptions;
url: string;
}): Promise<{ extractedDataArray: any[]; warning: any }> {
//WRAP SCHEMA
const schema = extractOptions.options.schema;
const logger = extractOptions.logger;
console.log("!!!!!!!!!!!!!!!!!!hereee");
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger);
const extractOptionsNewSchema = {
...extractOptions,
options: { ...extractOptions.options, schema: schemaToUse },
};
console.log("schema", schema);
console.log("schemaToUse", schemaToUse);
const { extract, warning, totalUsage } = await generateCompletions(
extractOptionsNewSchema,
);
console.log("extract", extract);
// const {
// extractedData,
// shouldUseSmartscrape,
// smartscrape_reasoning,
// smartscrape_prompt,
// } = processSmartScrapeResult(extract, logger);
const shouldUseSmartscrape = extract?.shouldUseSmartscrape;
const smartscrape_reasoning = extract?.smartscrape_reasoning;
const smartscrape_prompt = extract?.smartscrape_prompt;
let extractedData = extract?.extractedData;
console.log("shouldUseSmartscrape", shouldUseSmartscrape);
console.log("smartscrape_reasoning", smartscrape_reasoning);
console.log("smartscrape_prompt", smartscrape_prompt);
if (shouldUseSmartscrape) {
const smartscrapeResult = await smartScrape(url, smartscrape_prompt);
const htmls = smartscrapeResult.scrapedPages.map((page) => page.html);
const markdowns = await Promise.all(
htmls.map(async (html) => await parseMarkdown(html)),
);
extractedData = await Promise.all(
markdowns.map(async (markdown) => {
const newExtractOptions = {
...extractOptions,
markdown: markdown,
};
const { extract, warning, totalUsage, model } =
await generateCompletions(newExtractOptions);
return extract;
}),
);
// console.log("markdowns", markdowns);
// extractedData = smartscrapeResult;
} else {
extractedData = [extractedData];
}
return { extractedDataArray: extractedData, warning: warning };
}
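A minimal usage sketch for extractData; the option values shown are assumptions for illustration, not part of this commit:
// Illustrative usage (hypothetical values):
// const { extractedDataArray, warning } = await extractData({
//   extractOptions: {
//     logger,
//     options: { schema: userSchema, prompt: "Extract the product title and price" },
//     markdown: pageMarkdown,
//   },
//   url: "https://example.com/product/123",
// });
// // Each page returned by SmartScrape yields its own entry in extractedDataArray;
// // when SmartScrape is not used, the array has a single element.
// const firstResult = extractedDataArray[0];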

View File

@@ -0,0 +1,88 @@
import { z } from "zod";
import { logger } from "../../../lib/logger";
import { robustFetch } from "./fetch";
// Define schemas outside the function scope
const tokenUsageDetailSchema = z.object({
input_tokens: z.number().int(),
output_tokens: z.number().int(),
total_cost: z.number().nullable(), // Allows number or null
});
// Schema for an individual scraped page object
const scrapedPageSchema = z.object({
html: z.string(),
reason: z.string(),
page: z.number().int(),
});
// Main schema for the structure returned by the smart-scrape endpoint
const smartScrapeResultSchema = z.object({
sessionId: z.string(),
success: z.boolean(),
scrapedPages: z.array(scrapedPageSchema),
tokenUsage: z.record(
z.string(), // Key is the model name (string)
tokenUsageDetailSchema, // Value matches the detail schema
),
});
// Infer the TypeScript type from the Zod schema
type SmartScrapeResult = z.infer<typeof smartScrapeResultSchema>;
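For reference, an example payload that would validate against smartScrapeResultSchema (values are made up):
// Example response shape (hypothetical values):
// {
//   "sessionId": "abc123",
//   "success": true,
//   "scrapedPages": [
//     { "html": "<html>...</html>", "reason": "clicked 'Show more' accordion", "page": 1 }
//   ],
//   "tokenUsage": {
//     "gemini-2.5-pro-exp-03-25": { "input_tokens": 1200, "output_tokens": 300, "total_cost": null }
//   }
// }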
/**
* Sends a POST request to the internal /smart-scrape endpoint, which performs
* user-like interactions on the page (clicking, scrolling, etc.) guided by the
* prompt and returns the resulting pages for later extraction.
*
* @param url The URL of the page to scrape.
* @param prompt The prompt describing which interactions SmartScrape should perform.
* @returns A promise that resolves to an object matching the SmartScrapeResult type.
* @throws Throws an error if the request fails or the response is invalid.
*/
export async function smartScrape(
url: string,
prompt: string,
): Promise<SmartScrapeResult> {
try {
logger.info("Initiating smart scrape request", { url, prompt });
// Pass schema type as generic parameter to robustFetch
const response = await robustFetch<typeof smartScrapeResultSchema>({
url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
method: "POST",
body: {
url,
prompt,
thinkingModel: {
model: "gemini-2.5-pro-exp-03-25",
provider: "google",
supportTools: true,
toolChoice: "required",
cost: {
input: 1.3,
output: 5,
},
},
toolModel: {
model: "gemini-2.0-flash",
provider: "google",
},
},
schema: smartScrapeResultSchema, // Pass the schema instance for validation
logger,
mock: null, // Keep mock null if not mocking
});
logger.info("Smart scrape successful", {
url,
prompt,
sessionId: response.sessionId,
});
return response; // The response type now matches SmartScrapeResult
} catch (error) {
logger.error("Smart scrape request failed", { url, prompt, error });
// Rethrowing the error to be handled by the caller
// Consider more specific error handling or wrapping if needed
throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
}
}
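A minimal call sketch (illustrative, not part of this commit; assumes SMART_SCRAPE_API_URL is configured in the environment):
// const result = await smartScrape(
//   "https://example.com/pricing",
//   "Toggle the 'Annual' switch and expand every FAQ accordion, then return the pages.",
// );
// for (const page of result.scrapedPages) {
//   logger.info("SmartScrape page", { page: page.page, reason: page.reason });
// }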

View File

@@ -15,6 +1,7 @@ import { getModel } from "../../../lib/generic-ai";
 import { z } from "zod";
 import fs from "fs/promises";
 import Ajv from "ajv";
+import { extractData } from "../lib/extractSmartScrape";

 // TODO: fix this, it's horrible
 type LanguageModelV1ProviderMetadata = {
@@ -168,7 +169,16 @@ export function trimToTokenLimit(
     };
   }
 }
+export type GenerateCompletionsOptions = {
+  model?: LanguageModel;
+  logger: Logger;
+  options: ExtractOptions;
+  markdown?: string;
+  previousWarning?: string;
+  isExtractEndpoint?: boolean;
+  mode?: "object" | "no-object";
+  providerOptions?: LanguageModelV1ProviderMetadata;
+};
 export async function generateCompletions({
   logger,
   options,
@@ -178,16 +188,7 @@ export async function generateCompletions({
   model = getModel("gpt-4o-mini"),
   mode = "object",
   providerOptions,
-}: {
-  model?: LanguageModel;
-  logger: Logger;
-  options: ExtractOptions;
-  markdown?: string;
-  previousWarning?: string;
-  isExtractEndpoint?: boolean;
-  mode?: "object" | "no-object";
-  providerOptions?: LanguageModelV1ProviderMetadata;
-}): Promise<{
+}: GenerateCompletionsOptions): Promise<{
   extract: any;
   numTokens: number;
   warning: string | undefined;
@@ -397,118 +398,110 @@ export async function performLLMExtract(
   document: Document,
 ): Promise<Document> {
   if (meta.options.formats.includes("extract")) {
-    const originalOptions = meta.options.extract!;
-    let generationOptions = { ...originalOptions }; // Start with original options
-    let schemaWasWrapped = false;
-    if (originalOptions.schema) {
-      const wrappedSchema = {
-        type: "object",
-        properties: {
-          extractedData: originalOptions.schema, // Nest the original schema
-          shouldUseSmartscrape: {
-            type: "boolean",
-            description:
-              "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
-          },
-          smartscrape_reasoning: {
-            type: "string",
-            description:
-              "Fill this only if shouldUseSmartscrape is true. Reasoning for why you think the page requires or doesnt require smartscrape. If it does explain which data you can't get with the initial page load.",
-          },
-          smartscrape_prompt: {
-            type: "string",
-            description:
-              "Prompt to use for Smartscrape refinement if shouldUseSmartscrape is true. Explain exactly what actions smartscrape should do. Smartscrape is a tool that can perform actions on the page like clicking, scrolling, etc.",
-          },
-        },
-        required: ["reasoning", "shouldUseSmartscrape"],
-        // Conditionally require 'prompt' if 'shouldUseSmartscrape' is true
-        // if: {
-        //   properties: {
-        //     shouldUseSmartscrape: { const: true },
-        //   },
-        //   required: ["shouldUseSmartscrape"],
-        // },
-        // then: {
-        //   required: ["prompt"],
-        // },
-      };
-      // Update generationOptions to use the wrapped schema
-      generationOptions.schema = wrappedSchema;
-      schemaWasWrapped = true;
-      meta.logger.info("Using wrapped schema for LLM extraction.", {
-        wrappedSchema,
-      });
-    } else {
-      meta.logger.info(
-        "No original schema provided, proceeding without wrapping.",
-      );
-    }
-    meta.internalOptions.abort?.throwIfAborted();
-    const { extract, warning, totalUsage, model } = await generateCompletions({
+    // const originalOptions = meta.options.extract!;
+    // let generationOptions = { ...originalOptions }; // Start with original options
+    const generationOptions: GenerateCompletionsOptions = {
       logger: meta.logger.child({
         method: "performLLMExtract/generateCompletions",
       }),
-      options: generationOptions, // Pass potentially modified options
+      options: meta.options.extract!,
       markdown: document.markdown,
       previousWarning: document.warning,
-      // model: getModel("deepseek-ai/DeepSeek-R1", "deepinfra"),
-      // model: getModel("deepseek-ai/DeepSeek-V3-0324", "deepinfra"),
-      // model: getModel("gemini-2.5-pro-exp-03-25", "google"),
+      // ... existing model and provider options ...
+      // model: getModel("o3-mini", "openai"), // Keeping existing model selection
       // model: getModel("o3-mini", "openai"),
-      // model: getModel("qwen-qwq-32b", "groq"),
-      // model: getModel("gemini-2.0-flash", "google"),
-      // model: getModel("accounts/fireworks/models/deepseek-r1", "fireworks"),
-      // model: getModel("gpt-4o-mini", "openai"),
-      // model: getModel("gemini-2.5-pro-exp-03-25", "google"),
-      // model: getModel("o3-mini", "openai"),
-      model: getModel("qwen-qwq-32b", "groq"),
-      // model: getModel("claude-3-7-sonnet", "anthropic"),
-      providerOptions: {
-        anthropic: {
-          thinking: { type: "enabled", budgetTokens: 12000 },
-        },
-      },
+      model: getModel("gemini-2.5-pro-exp-03-25", "google"),
+    };
+    const { extractedDataArray, warning } = await extractData({
+      extractOptions: generationOptions,
+      url: meta.url,
     });
-    // Log token usage
-    meta.logger.info("LLM extraction token usage", {
-      model: model,
-      promptTokens: totalUsage.promptTokens,
-      completionTokens: totalUsage.completionTokens,
-      totalTokens: totalUsage.totalTokens,
-    });
-    // Extract the actual data if the schema was wrapped
-    let finalExtract = schemaWasWrapped ? extract?.extractedData : extract;
-    console.log({ extract });
-    // Double-check extraction if wrapping occurred but extractedData is missing
-    if (
-      schemaWasWrapped &&
-      finalExtract === undefined &&
-      extract?.hasOwnProperty("extractedData")
-    ) {
-      finalExtract = extract.extractedData;
-    } else if (schemaWasWrapped && finalExtract === undefined) {
-      // Log a warning if wrapping occurred but the expected structure wasn't returned
-      meta.logger.warn(
-        "Schema was wrapped, but LLM result did not contain expected 'extractedData' property.",
-        { extractResult: extract },
-      );
-    }
+    //TODO: add merge here
+    const extractedData = extractedDataArray[0];
+    // // Prepare the schema, potentially wrapping it
+    // const { schemaToUse, schemaWasWrapped } = prepareSmartScrapeSchema(
+    //   originalOptions.schema,
+    //   meta.logger,
+    // );
+    // // Update generationOptions with the potentially wrapped schema
+    // generationOptions.schema = schemaToUse;
+    // meta.internalOptions.abort?.throwIfAborted();
+    // const {
+    //   extract: rawExtract,
+    //   warning,
+    //   totalUsage,
+    //   model,
+    // } = await generateCompletions({
+    //   logger: meta.logger.child({
+    //     method: "performLLMExtract/generateCompletions",
+    //   }),
+    //   options: generationOptions, // Use the potentially modified options
+    //   markdown: document.markdown,
+    //   previousWarning: document.warning,
+    //   // ... existing model and provider options ...
+    //   model: getModel("o3-mini", "openai"), // Keeping existing model selection
+    //   providerOptions: {
+    //     anthropic: {
+    //       thinking: { type: "enabled", budgetTokens: 12000 },
+    //     },
+    //   },
+    // });
+    // // Log token usage
+    // meta.logger.info("LLM extraction token usage", {
+    //   model: model,
+    //   promptTokens: totalUsage.promptTokens,
+    //   completionTokens: totalUsage.completionTokens,
+    //   totalTokens: totalUsage.totalTokens,
+    // });
+    // // Process the result to extract data and SmartScrape decision
+    // const {
+    //   extractedData,
+    //   shouldUseSmartscrape,
+    //   smartscrape_reasoning,
+    //   smartscrape_prompt,
+    // } = processSmartScrapeResult(rawExtract, schemaWasWrapped, meta.logger);
+    // // Log the SmartScrape decision if applicable
+    // if (schemaWasWrapped) {
+    //   meta.logger.info("SmartScrape decision processing result", {
+    //     shouldUseSmartscrape,
+    //     smartscrape_reasoning,
+    //     // Don't log the full prompt potentially
+    //     smartscrape_prompt_present: !!smartscrape_prompt,
+    //     extractedDataIsPresent:
+    //       extractedData !== undefined && extractedData !== null,
+    //   });
+    //   // TODO: Implement logic to ACTUALLY trigger SmartScrape based on the result
+    //   // For example:
+    //   // if (shouldUseSmartscrape && smartscrape_prompt) {
+    //   //   meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
+    //   //   // Call the smartScrape function (which needs to be implemented/imported)
+    //   //   // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
+    //   //   // Process/merge smartScrapedDocs with extractedData
+    //   //   // ... potentially update finalExtract ...
+    //   // } else {
+    //   //   meta.logger.info("SmartScrape not required based on LLM output.");
+    //   // }
+    // }
     // Assign the final extracted data
     if (meta.options.formats.includes("json")) {
-      document.json = finalExtract;
+      document.json = extractedData;
     } else {
-      document.extract = finalExtract;
+      document.extract = extractedData;
     }
-    document.warning = warning;
+    // document.warning = warning;
   }
   return document;
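Taken together, the new extraction path in performLLMExtract reduces to roughly this flow (a sketch grounded in the code above, not additional behavior):
// 1. Build GenerateCompletionsOptions from meta.options.extract, document.markdown,
//    and the configured model.
// 2. extractData wraps the schema (prepareSmartScrapeSchema) and runs one completion.
// 3. If the model sets shouldUseSmartscrape, smartScrape performs the requested page
//    interactions and each returned page is re-extracted.
// 4. The first entry of extractedDataArray is written to document.json or
//    document.extract; merging multiple pages and re-attaching the warning remain TODOs.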