Merge branch 'tom/extract-v2' of https://github.com/mendableai/firecrawl into tom/extract-v2

2025-08-18 07:35:55 +08:00 · 2025-04-03 17:37:23 -03:00 · 2025-04-03 17:37:23 -03:00 · 9786bc2fc0
commit 9786bc2fc0
parent 17e8b91109 fea102dac6
6 changed files with 118 additions and 25 deletions
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@ -12,6 +12,7 @@ content-type: application/json
  "url":"https://firecrawl.dev"
 }
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
@ -120,3 +121,91 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 ### Scrape with JSON Schema Extraction
 # POSTs to /v1/scrape asking for the "json" format of https://firecrawl.dev.
 # The schema declares two properties, both required:
 #   - description: a string
 #   - respect_robots_txt: a boolean or null
 # The "systemPrompt" line inside the body is commented out.
 # @name scrapeWithSchema
 POST {{baseUrl}}/v1/scrape HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": { 
          "type": "string",
          "description": "Describe the site"
        },
        "respect_robots_txt": {
          "type": ["boolean","null"],
          "description": "Does firecrawl respect the robots.txt files?"
        }
      },
      "required": ["description", "respect_robots_txt"]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
 }
 ### Scrape with JSON Schema Extraction (description only)
 # POSTs to /v1/scrape asking for the "json" format of https://firecrawl.dev.
 # The schema declares a single required string property, `description`.
 # The "systemPrompt" line inside the body is commented out.
 # Named distinctly because an earlier request in this file is already called
 # scrapeWithSchema; a shared name makes references by request name ambiguous.
 # @name scrapeWithDescriptionSchema
 POST {{baseUrl}}/v1/scrape HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": { 
          "type": "string",
          "description": "Describe the site"
        }
      },
      "required": ["description" ]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
 }
 ### Scrape to Extract Array of Titles
 # POSTs to /v1/scrape asking for the "json" format of the
 # /load-more/with-params page on the e2e test deployment.
 # Sends both a free-text "prompt" and a schema. The schema requires an
 # `items` array whose elements are objects with a required string `title`.
 # The "systemPrompt" line inside the body is commented out.
 # @name scrapeItemsArray
 POST {{baseUrl}}/v1/scrape HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
    "formats": ["json"],
  "jsonOptions": {
    "prompt": "Extract all the main article or blog post titles from the page into an array.",
    "schema": {
      "type": "object",
      "properties": {
        "items": {
          "type": "array",
          "description": "An array containing the extracted items.",
          "items": { 
            "type": "object",
            "properties": {
              "title": {
                "type": "string",
                "description": "The title of a single article or blog post."
              }
            },
            "required": ["title"] 
          }
        }
      },
      "required": ["items"] 
    }
    // "systemPrompt": "You are an expert structured data extractor." 
  }
 }
--- a/apps/api/src/lib/extract/completions/batchExtract.ts
+++ b/apps/api/src/lib/extract/completions/batchExtract.ts
@ -1,5 +1,8 @@
 import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
  generateCompletions,
  GenerateCompletionsOptions,
 } from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
 import { Document } from "../../../controllers/v1/types";
@ -74,5 +77,5 @@ export async function batchExtractPromise(
    },
    warning: warning,
    sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
-  }
+  };
 }
--- a/apps/api/src/lib/extract/completions/singleAnswer.ts
+++ b/apps/api/src/lib/extract/completions/singleAnswer.ts
@ -1,5 +1,8 @@
 import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
  generateCompletions,
  GenerateCompletionsOptions,
 } from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { Document, TokenUsage } from "../../../controllers/v1/types";
 import { getModel } from "../../../lib/generic-ai";
@ -39,7 +42,7 @@ export async function singleAnswerCompletion({
      model: getModel("gemini-2.0-flash", "google"),
    };
-    const { extractedDataArray, warning } = await extractData({
+  const { extractedDataArray, warning } = await extractData({
    extractOptions: generationOptions,
    urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""),
  });
@ -57,7 +60,6 @@ export async function singleAnswerCompletion({
    ),
  };
  // const completion = await generateCompletions({
  //   logger: logger.child({ module: "extract", method: "generateCompletions" }),
  //   options: {
@ -78,12 +80,7 @@ export async function singleAnswerCompletion({
  // );
  return {
    extract: completion.extract,
-    tokenUsage: {
+    tokenUsage: completion.tokenUsage,
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: 0,
      model: "gemini-2.0-flash",
    },
    sources: singleAnswerDocs.map(
      (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
    ),
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@ -455,10 +455,11 @@ export async function performExtraction(
          );
          // Race between timeout and completion
-          const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
+          const multiEntityCompletion = (await completionPromise) as Awaited<
            ReturnType<typeof batchExtractPromise>
          >;
          // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
          // Track multi-entity extraction tokens
          if (multiEntityCompletion) {
@ -520,7 +521,9 @@ export async function performExtraction(
      );
      extractionResults.push(...validResults);
      // Merge all extracts from valid results into a single array
-      const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
+      const extractArrays = validResults.map((r) =>
        Array.isArray(r.extract) ? r.extract : [r.extract],
      );
      const mergedExtracts = extractArrays.flat();
      multiEntityCompletions.push(...mergedExtracts);
      logger.debug("All multi-entity completion chunks finished.", {
@ -692,10 +695,7 @@ export async function performExtraction(
    });
    logger.debug("Done generating singleAnswer completions.");
-    singleAnswerResult = transformArrayToObject(
+    singleAnswerResult = transformArrayToObject(rSchema, completionResult);
      rSchema,
      completionResult,
    );
    singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
  shouldUseSmartscrape: {
    type: "boolean",
    description:
-      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
+      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
  },
  // Note: extractedData is added dynamically in prepareSmartScrapeSchema
 };
@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
+      "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
  },
  smartscrape_prompt: {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
+      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
  },
 };
@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
    smartScrapePages: {
      type: "array",
      description:
-        "Make an entry for each page we want to run smart scrape on.",
+        "Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.",
      items: {
        type: "object",
        properties: {
@ -185,7 +185,7 @@ export async function extractData({
  //WRAP SCHEMA
  const schema = extractOptions.options.schema;
  const logger = extractOptions.logger;
-  const isSingleUrl = urls.length === 0;
+  const isSingleUrl = urls.length === 1;
  console.log("!!!!!!!!!!!!!!!!!!hereee");
  const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
  const extractOptionsNewSchema = {
@ -239,15 +239,18 @@ export async function extractData({
        }),
      );
    }
    console.log("smartscrapeResults", smartscrapeResults);
    const scrapedPages = smartscrapeResults.map(
      (result) => result.scrapedPages,
    );
-    const htmls = scrapedPages.map((page) => page.html);
+    console.log("scrapedPages", scrapedPages);
    const htmls = scrapedPages.flat().map((page) => page.html);
    console.log("htmls", htmls);
    const markdowns = await Promise.all(
      htmls.map(async (html) => await parseMarkdown(html)),
    );
-
+    console.log("markdowns", markdowns);
    extractedData = await Promise.all(
      markdowns.map(async (markdown) => {
        const newExtractOptions = {
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@ -413,6 +413,7 @@ export async function performLLMExtract(
      // model: getModel("o3-mini", "openai"), // Keeping existing model selection
      // model: getModel("o3-mini", "openai"),
      // model: getModel("qwen-qwq-32b", "groq"),
      // model: getModel("gemini-2.0-flash", "google"),
      model: getModel("gemini-2.5-pro-exp-03-25", "google"),
    };