Nick: extract without a schema should work as expected

This commit is contained in:
Nicolas 2025-01-14 11:37:00 -03:00
parent 61e6af2b16
commit 957eea4113
3 changed files with 152 additions and 53 deletions
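In short, this commit lets extract run with only a prompt: when the request carries no schema, one is generated from the prompt (via the new generateSchemaFromPrompt) before extraction proceeds, and the analysis and final-merge steps now tolerate a derived schema. A minimal sketch of the new control flow, using only names that appear in the diff below:

// Sketch of the fallback introduced here (condensed, not a verbatim excerpt):
let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
  // derive a JSON schema from the natural-language prompt
  reqSchema = await generateSchemaFromPrompt(request.prompt);
}
if (reqSchema) {
  // presumably resolves $ref pointers so downstream steps see a flat schema
  reqSchema = await dereferenceSchema(reqSchema);
}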

View File

@ -9,9 +9,7 @@ export enum ExtractStep {
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
SCRAPE = "scrape",
EXTRACT = "extract",
COMPLETE = "complete",
}

View File

@ -8,7 +8,10 @@ import { PlanType } from "../../types";
import { logger } from "../logger";
import { processUrl } from "./url-processor";
import { scrapeDocument } from "./document-scraper";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import {
generateOpenAICompletions,
generateSchemaFromPrompt,
} from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
@ -45,7 +48,6 @@ interface ExtractResult {
error?: string;
}
async function analyzeSchemaAndPrompt(
urls: string[],
schema: any,
@ -56,6 +58,10 @@ async function analyzeSchemaAndPrompt(
reasoning?: string;
keyIndicators?: string[];
}> {
if (!schema) {
schema = await generateSchemaFromPrompt(prompt);
}
const schemaString = JSON.stringify(schema);
const checkSchema = z.object({
@ -132,7 +138,7 @@ type completions = {
extract: Record<string, any>;
numTokens: number;
warning?: string;
}
};
function getRootDomain(url: string): string {
try {
@ -199,7 +205,9 @@ export async function performExtraction(
},
],
});
}));
},
),
);
const processedUrls = await Promise.all(urlPromises);
const links = processedUrls.flat().filter((url) => url);
@ -227,7 +235,13 @@ export async function performExtraction(
});
let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
reqSchema = await generateSchemaFromPrompt(request.prompt);
}
if (reqSchema) {
reqSchema = await dereferenceSchema(reqSchema);
}
// agent evaluates if the schema or the prompt has an array with a large number of items
// it also checks whether the schema has any other properties that are not arrays
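To make the multi-entity analysis above concrete, here is a hedged illustration; the example schema and values are invented, and only analyzeSchemaAndPrompt, spreadSchemas, and multiEntityKeys come from this diff:

// Hypothetical schema mixing one large array with single-answer fields.
const exampleSchema = {
  type: "object",
  properties: {
    company_name: { type: "string" },
    products: {
      type: "array",
      items: {
        type: "object",
        properties: { name: { type: "string" }, price: { type: "number" } },
      },
    },
  },
};
// analyzeSchemaAndPrompt would flag this as multi-entity with
// multiEntityKeys = ["products"]; spreadSchemas then splits it so that
// "products" is extracted per document (multiEntitySchema) while
// "company_name" is answered once (singleAnswerSchema).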
@ -236,7 +250,7 @@ export async function performExtraction(
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
await analyzeSchemaAndPrompt(links, request.schema, request.prompt ?? "");
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
// console.log("\nIs Multi Entity:", isMultiEntity);
// console.log("\nMulti Entity Keys:", multiEntityKeys);
@ -244,8 +258,11 @@ export async function performExtraction(
// console.log("\nKey Indicators:", keyIndicators);
let rSchema = reqSchema;
if (isMultiEntity) {
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(reqSchema, multiEntityKeys)
if (isMultiEntity && reqSchema) {
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
reqSchema,
multiEntityKeys,
);
rSchema = singleAnswerSchema;
await updateExtract(extractId, {
@ -260,7 +277,6 @@ export async function performExtraction(
],
});
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
await updateExtract(extractId, {
@ -287,10 +303,10 @@ export async function performExtraction(
timeout,
},
urlTraces,
)
);
}
return docsMap.get(url);
})
});
let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
(doc): doc is Document => doc !== null,
@ -342,25 +358,28 @@ export async function performExtraction(
logger.child({ method: "extractService/checkShouldExtract" }),
{
mode: "llm",
systemPrompt: "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
systemPrompt:
"You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
prompt: `Should the following content be used to extract information for this prompt: "${request.prompt}" User schema is: ${JSON.stringify(multiEntitySchema)}\nReturn only true or false.`,
schema: {
"type": "object",
"properties": {
"extract": {
"type": "boolean"
}
type: "object",
properties: {
extract: {
type: "boolean",
},
},
required: ["extract"],
},
"required": ["extract"]
}
},
buildDocument(doc),
undefined,
true
true,
);
if (!shouldExtractCheck.extract["extract"]) {
console.log(`Skipping extraction for ${doc.metadata.url} as content is irrelevant`);
console.log(
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
);
return null;
}
// Add confidence score to schema with 5 levels
@ -370,10 +389,14 @@ export async function performExtraction(
...multiEntitySchema.properties,
is_content_relevant: {
type: "boolean",
description: "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information."
}
description:
"Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
},
required: [...(multiEntitySchema.required || []), "is_content_relevant"]
},
required: [
...(multiEntitySchema.required || []),
"is_content_relevant",
],
};
// console.log("schemaWithConfidence", schemaWithConfidence);
@ -384,13 +407,17 @@ export async function performExtraction(
step: ExtractStep.MULTI_ENTITY_EXTRACT,
startedAt: startScrape,
finishedAt: Date.now(),
discoveredLinks: [doc.metadata.url || doc.metadata.sourceURL || ""],
discoveredLinks: [
doc.metadata.url || doc.metadata.sourceURL || "",
],
},
],
});
const completionPromise = generateOpenAICompletions(
logger.child({ method: "extractService/generateOpenAICompletions" }),
logger.child({
method: "extractService/generateOpenAICompletions",
}),
{
mode: "llm",
systemPrompt:
@ -406,10 +433,10 @@ export async function performExtraction(
);
// Race between timeout and completion
const multiEntityCompletion = await Promise.race([
const multiEntityCompletion = (await Promise.race([
completionPromise,
timeoutPromise
]) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
timeoutPromise,
])) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
// console.log(multiEntityCompletion.extract)
// if (!multiEntityCompletion.extract?.is_content_relevant) {
@ -452,11 +479,16 @@ export async function performExtraction(
// Wait for current chunk to complete before processing next chunk
const chunkResults = await Promise.all(chunkPromises);
multiEntityCompletions.push(...chunkResults.filter(result => result !== null));
multiEntityCompletions.push(
...chunkResults.filter((result) => result !== null),
);
}
try {
multiEntityResult = transformArrayToObject(multiEntitySchema, multiEntityCompletions);
multiEntityResult = transformArrayToObject(
multiEntitySchema,
multiEntityCompletions,
);
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
multiEntityResult = mergeNullValObjs(multiEntityResult);
// @nick: maybe we can add an LLM here that checks whether the array likely has a primary key?
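A rough illustration of the post-processing chain above; it assumes deduplicateObjectsArray removes exact duplicates and mergeNullValObjs backfills null fields across entries describing the same item, which is inferred from the function names rather than their source:

// Hypothetical multi-entity completions gathered from two pages:
const multiEntityCompletionsExample = [
  { products: [{ name: "Widget", price: 10 }, { name: "Gadget", price: null }] },
  { products: [{ name: "Gadget", price: 20 }] },
];
// transformArrayToObject folds these back into one object shaped like
// multiEntitySchema, concatenating the array entries; the dedupe/merge passes
// would then reduce it to something like:
//   { products: [{ name: "Widget", price: 10 }, { name: "Gadget", price: 20 }] }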
@ -464,13 +496,19 @@ export async function performExtraction(
logger.error(`Failed to transform array to object: ${error}`);
return {
success: false,
error: "An unexpected error occurred. Please contact help@firecrawl.com for help.",
error:
"An unexpected error occurred. Please contact help@firecrawl.com for help.",
extractId,
urlTrace: urlTraces,
};
}
}
if (rSchema && Object.keys(rSchema).length > 0 && rSchema.properties && Object.keys(rSchema.properties).length > 0) {
if (
rSchema &&
Object.keys(rSchema).length > 0 &&
rSchema.properties &&
Object.keys(rSchema.properties).length > 0
) {
// Scrape documents
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
let singleAnswerDocs: Document[] = [];
@ -513,7 +551,9 @@ export async function performExtraction(
}
}
singleAnswerDocs.push(...results.filter((doc): doc is Document => doc !== null));
singleAnswerDocs.push(
...results.filter((doc): doc is Document => doc !== null),
);
} catch (error) {
return {
success: false,
@ -527,7 +567,8 @@ export async function performExtraction(
// All urls are invalid
return {
success: false,
error: "All provided URLs are invalid. Please check your input and try again.",
error:
"All provided URLs are invalid. Please check your input and try again.",
extractId,
urlTrace: request.urlTrace ? urlTraces : undefined,
};
@ -584,7 +625,9 @@ export async function performExtraction(
// }
}
const finalResult = await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult);
const finalResult = reqSchema
? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult)
: singleAnswerResult || multiEntityResult;
let linksBilled = links.length * 5;
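For context, a hedged illustration of the final-merge change; mixSchemaObjects is not shown in this commit, so its behavior is assumed from its usage:

// Hypothetical halves produced by the two extraction passes:
const singleAnswerResultExample = { company_name: "Acme Corp" };
const multiEntityResultExample = { products: [{ name: "Widget", price: 10 }] };
// With a schema, mixSchemaObjects recombines both halves into one object that
// follows reqSchema; the new ternary above guards the case where reqSchema is
// still undefined by falling back to whichever half is non-empty, e.g.:
//   { company_name: "Acme Corp", products: [{ name: "Widget", price: 10 }] }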

View File

@ -4,6 +4,7 @@ import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
const maxTokens = 32000;
const modifier = 4;
@ -254,3 +255,60 @@ export function removeDefaultProperty(schema: any): any {
return rest;
}
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
const openai = new OpenAI();
const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
let lastError: Error | null = null;
for (const temp of temperatures) {
try {
const result = await openai.beta.chat.completions.parse({
model: "gpt-4o",
temperature: temp,
messages: [
{
role: "system",
content: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
Consider:
1. The type of data being requested
2. Required fields vs optional fields
3. Appropriate data types for each field
4. Nested objects and arrays where appropriate
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
},
{
role: "user",
content: `Generate a JSON schema for extracting the following information: ${prompt}`,
},
],
response_format: {
type: "json_object",
},
});
if (result.choices[0].message.refusal !== null) {
throw new Error("LLM refused to generate schema");
}
let schema;
try {
schema = JSON.parse(result.choices[0].message.content ?? "");
return schema;
} catch (e) {
throw new Error("Failed to parse schema JSON from LLM response");
}
} catch (error) {
lastError = error as Error;
logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
continue;
}
}
// If we get here, all attempts failed
throw new Error(
`Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
);
}
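As a usage note, a hedged sketch of calling the new generateSchemaFromPrompt; the returned shape depends entirely on the model output, so the schema shown is only a plausible example:

// Hypothetical call (the function is exported from llmExtract.ts above):
const generated = await generateSchemaFromPrompt(
  "Extract the blog post titles and their authors",
);
// A plausible (non-deterministic) result:
// {
//   type: "object",
//   properties: {
//     posts: {
//       type: "array",
//       items: {
//         type: "object",
//         properties: { title: { type: "string" }, author: { type: "string" } },
//         required: ["title"],
//       },
//     },
//   },
//   required: ["posts"],
// }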