Merge branch 'tom/extract-v2' of https://github.com/mendableai/firecrawl into tom/extract-v2
Commit 9786bc2fc0
@@ -12,6 +12,7 @@ content-type: application/json
  "url":"https://firecrawl.dev"
}

+
### Crawl Website
# @name crawl
POST {{baseUrl}}/v1/crawl HTTP/1.1
@@ -120,3 +121,91 @@ content-type: application/json
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}

### Scrape with JSON Schema Extraction
# @name scrapeWithSchema
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": {
          "type": "string",
          "description": "Describe the site"
        },
        "respect_robots_txt": {
          "type": ["boolean", "null"],
          "description": "Does firecrawl respect the robots.txt files?"
        }
      },
      "required": ["description", "respect_robots_txt"]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
}
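
For reference, the request above can also be issued from code. A minimal TypeScript sketch using global fetch; it assumes Node 18+, an API key in a FIRECRAWL_API_KEY environment variable, and the hosted API base URL standing in for {{baseUrl}} — the body simply mirrors the request shown:

// Sketch only, not the committed code: POST /v1/scrape with a JSON Schema.
// Assumes Node 18+ (global fetch) and FIRECRAWL_API_KEY in the environment.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    "content-type": "application/json",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    formats: ["json"],
    jsonOptions: {
      schema: {
        type: "object",
        properties: {
          description: { type: "string", description: "Describe the site" },
          respect_robots_txt: {
            type: ["boolean", "null"],
            description: "Does firecrawl respect the robots.txt files?",
          },
        },
        required: ["description", "respect_robots_txt"],
      },
    },
  }),
});
console.log(await res.json());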

### Scrape with JSON Schema Extraction
# @name scrapeWithSchema
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": {
          "type": "string",
          "description": "Describe the site"
        }
      },
      "required": ["description"]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
}

### Scrape to Extract Array of Titles
# @name scrapeItemsArray
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
  "formats": ["json"],
  "jsonOptions": {
    "prompt": "Extract all the main article or blog post titles from the page into an array.",
    "schema": {
      "type": "object",
      "properties": {
        "items": {
          "type": "array",
          "description": "An array containing the extracted items.",
          "items": {
            "type": "object",
            "properties": {
              "title": {
                "type": "string",
                "description": "The title of a single article or blog post."
              }
            },
            "required": ["title"]
          }
        }
      },
      "required": ["items"]
    }
    // "systemPrompt": "You are an expert structured data extractor."
  }
}
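
The array schema above implies a result shape like the following; a hedged TypeScript sketch in which the type names are hypothetical and only the fields and descriptions come from the schema:

// Hypothetical types mirroring the scrapeItemsArray schema above.
interface ExtractedItem {
  // "The title of a single article or blog post."
  title: string;
}

interface ItemsArrayExtract {
  // "An array containing the extracted items."
  items: ExtractedItem[];
}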

@@ -1,5 +1,8 @@
import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
import { Document } from "../../../controllers/v1/types";
@@ -74,5 +77,5 @@ export async function batchExtractPromise(
    },
    warning: warning,
    sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
-  }
+  };
}

@@ -1,5 +1,8 @@
import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
import { getModel } from "../../../lib/generic-ai";
@@ -57,7 +60,6 @@ export async function singleAnswerCompletion({
  ),
};

-
// const completion = await generateCompletions({
//   logger: logger.child({ module: "extract", method: "generateCompletions" }),
//   options: {
@@ -78,12 +80,7 @@ export async function singleAnswerCompletion({
// );
return {
  extract: completion.extract,
-  tokenUsage: {
-    promptTokens: 0,
-    completionTokens: 0,
-    totalTokens: 0,
-    model: "gemini-2.0-flash",
-  },
+  tokenUsage: completion.tokenUsage,
  sources: singleAnswerDocs.map(
    (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
  ),
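
The hunk above replaces a hard-coded usage object with the completion's actual numbers; the removed literal documents the fields involved. For orientation, a sketch of that shape — the interface itself is an assumption inferred from the removed lines (the real TokenUsage type lives in ../../../controllers/v1/types):

// Assumed shape, inferred from the removed literal above.
interface TokenUsage {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  model: string;
}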
@@ -455,11 +455,12 @@ export async function performExtraction(
  );

  // Race between timeout and completion
-  const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
+  const multiEntityCompletion = (await completionPromise) as Awaited<
+    ReturnType<typeof batchExtractPromise>
+  >;

  // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema

  // Track multi-entity extraction tokens
  if (multiEntityCompletion) {
    tokenUsage.push(multiEntityCompletion.totalUsage);
@@ -520,7 +521,9 @@ export async function performExtraction(
  );
  extractionResults.push(...validResults);
  // Merge all extracts from valid results into a single array
-  const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
+  const extractArrays = validResults.map((r) =>
+    Array.isArray(r.extract) ? r.extract : [r.extract],
+  );
  const mergedExtracts = extractArrays.flat();
  multiEntityCompletions.push(...mergedExtracts);
  logger.debug("All multi-entity completion chunks finished.", {
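
The merge step above normalizes each result's extract to an array before flattening, so single-object and array extracts combine uniformly. A standalone TypeScript sketch of that pattern, with hypothetical inputs:

// Hypothetical inputs: extract may be a single object or an array.
const sampleResults = [
  { extract: { title: "A" } },
  { extract: [{ title: "B" }, { title: "C" }] },
];
const merged = sampleResults
  .map((r) => (Array.isArray(r.extract) ? r.extract : [r.extract]))
  .flat();
console.log(merged); // [{ title: "A" }, { title: "B" }, { title: "C" }]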
@@ -692,10 +695,7 @@ export async function performExtraction(
  });
  logger.debug("Done generating singleAnswer completions.");

-  singleAnswerResult = transformArrayToObject(
-    rSchema,
-    completionResult,
-  );
+  singleAnswerResult = transformArrayToObject(rSchema, completionResult);

  singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
@@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
  shouldUseSmartscrape: {
    type: "boolean",
    description:
-      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
+      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
  },
  // Note: extractedData is added dynamically in prepareSmartScrapeSchema
};
@@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
+      "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
  },
  smartscrape_prompt: {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
+      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X'). Don't mention anything about extraction; SmartScrape just returns page content.",
  },
};
@@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
  smartScrapePages: {
    type: "array",
    description:
-      "Make an entry for each page we want to run smart scrape on.",
+      "Make an entry for each page we want to run smart scrape on; no matter how many actions a page needs, it should be one entry per page.",
    items: {
      type: "object",
      properties: {
@@ -185,7 +185,7 @@ export async function extractData({
  //WRAP SCHEMA
  const schema = extractOptions.options.schema;
  const logger = extractOptions.logger;
-  const isSingleUrl = urls.length === 0;
+  const isSingleUrl = urls.length === 1;
  console.log("!!!!!!!!!!!!!!!!!!hereee");
  const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
  const extractOptionsNewSchema = {
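
The -/+ pair above fixes the single-URL predicate: a single-page extraction has exactly one URL, not zero. A trivial check of the corrected logic, with a hypothetical input:

// With the old predicate, a one-URL call was treated as multi-URL.
const urls = ["https://firecrawl.dev"];
const isSingleUrl = urls.length === 1; // true, as intended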
@@ -239,15 +239,18 @@ export async function extractData({
      }),
    );
  }
  console.log("smartscrapeResults", smartscrapeResults);

  const scrapedPages = smartscrapeResults.map(
    (result) => result.scrapedPages,
  );
-  const htmls = scrapedPages.map((page) => page.html);
+  console.log("scrapedPages", scrapedPages);
+  const htmls = scrapedPages.flat().map((page) => page.html);
+  console.log("htmls", htmls);
  const markdowns = await Promise.all(
    htmls.map(async (html) => await parseMarkdown(html)),
  );
+  console.log("markdowns", markdowns);
  extractedData = await Promise.all(
    markdowns.map(async (markdown) => {
      const newExtractOptions = {
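
The key fix above is the flat(): mapping each smartscrape result to its scrapedPages yields an array of page arrays, so the pages must be flattened before pulling out each page's html. A minimal illustration with hypothetical data:

// Hypothetical: two results, each contributing a list of scraped pages.
const pages = [
  [{ html: "<p>a</p>" }],
  [{ html: "<p>b</p>" }, { html: "<p>c</p>" }],
];
const htmls = pages.flat().map((page) => page.html);
console.log(htmls); // ["<p>a</p>", "<p>b</p>", "<p>c</p>"]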
@@ -413,6 +413,7 @@ export async function performLLMExtract(
-  // model: getModel("o3-mini", "openai"), // Keeping existing model selection
+  // model: getModel("o3-mini", "openai"),
+  // model: getModel("qwen-qwq-32b", "groq"),
  // model: getModel("gemini-2.0-flash", "google"),
  model: getModel("gemini-2.5-pro-exp-03-25", "google"),
};