Merge pull request #1072 from mendableai/nsc/json-format

(feat/formats) Extract format renamed to json format
Nicolas, 2025-01-18 17:38:52 -03:00, committed by GitHub
commit 9109e78e15
10 changed files with 106 additions and 21 deletions

View File

@@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
   -H 'Authorization: Bearer YOUR_API_KEY' \
   -d '{
     "url": "https://www.mendable.ai/",
-    "formats": ["extract"],
-    "extract": {
+    "formats": ["json"],
+    "jsonOptions": {
       "schema": {
         "type": "object",
         "properties": {
@@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     "ogSiteName": "Mendable",
     "sourceURL": "https://mendable.ai/"
   },
-  "llm_extraction": {
+  "json": {
     "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
     "supports_sso": true,
     "is_open_source": false,
@@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
   -H 'Authorization: Bearer YOUR_API_KEY' \
   -d '{
     "url": "https://docs.firecrawl.dev/",
-    "formats": ["extract"],
-    "extract": {
+    "formats": ["json"],
+    "jsonOptions": {
       "prompt": "Extract the company mission from the page."
     }
   }'
@@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
 
 data = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
+    'formats': ['json'],
+    'jsonOptions': {
         'schema': TopArticlesSchema.model_json_schema()
     }
 })
 
-print(data["extract"])
+print(data["json"])
 ```
## Using the Node SDK
@@ -526,10 +526,10 @@ const schema = z.object({
 });
 
 const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: schema },
+  jsonOptions: { extractionSchema: schema },
 });
 
-console.log(scrapeResult.data["llm_extraction"]);
+console.log(scrapeResult.data["json"]);
 ```
 ## Open Source vs Cloud Offering

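Taken together, the README hunks above amount to a straight rename on the wire: `extract` becomes `json` in `formats`, the `extract` options object becomes `jsonOptions`, and the result key becomes `json`. A minimal TypeScript sketch of the new request shape (the URL, key, and prompt are placeholders taken from the examples above):

```ts
// Sketch only: field names follow the README diff above; the API key and
// prompt are placeholders.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify({
    url: "https://docs.firecrawl.dev/",
    formats: ["json"],  // was: ["extract"]
    jsonOptions: {      // was: "extract"
      prompt: "Extract the company mission from the page.",
    },
  }),
});
const { data } = await res.json();
console.log(data.json); // was: data.llm_extraction
```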
View File

@@ -33,6 +33,7 @@ export async function scrapeController(
     basePriority: 10,
   });
+
   await addScrapeJob(
     {
       url: req.body.url,
@@ -96,7 +97,7 @@ export async function scrapeController(
       // Don't bill if we're early returning
       return;
     }
-    if (req.body.extract && req.body.formats.includes("extract")) {
+    if (req.body.extract && req.body.formats.includes("extract") ) {
       creditsToBeBilled = 5;
     }
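Given that the request transform below pushes "extract" onto formats whenever "json" is requested and mirrors jsonOptions into extract, json-format scrapes presumably fall into this same branch and are billed the same 5 credits.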

View File

@@ -125,6 +125,7 @@ export const scrapeOptions = z
       "screenshot",
       "screenshot@fullPage",
       "extract",
+      "json"
     ])
     .array()
     .optional()
@@ -139,7 +140,10 @@ export const scrapeOptions = z
     onlyMainContent: z.boolean().default(true),
     timeout: z.number().int().positive().finite().safe().optional(),
     waitFor: z.number().int().nonnegative().finite().safe().default(0),
+    // Deprecate this to jsonOptions
     extract: extractOptions.optional(),
+    // New
+    jsonOptions: extractOptions.optional(),
     mobile: z.boolean().default(false),
     parsePDF: z.boolean().default(true),
     actions: actionsSchema.optional(),
@@ -242,20 +246,43 @@ export const scrapeRequestSchema = scrapeOptions
     (obj) => {
       const hasExtractFormat = obj.formats?.includes("extract");
       const hasExtractOptions = obj.extract !== undefined;
+      const hasJsonFormat = obj.formats?.includes("json");
+      const hasJsonOptions = obj.jsonOptions !== undefined;
       return (
         (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
+        (!hasExtractFormat && !hasExtractOptions) ||
+        (hasJsonFormat && hasJsonOptions) ||
+        (!hasJsonFormat && !hasJsonOptions)
       );
     },
     {
       message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
     },
   )
   .transform((obj) => {
-    if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-      return { ...obj, timeout: 60000 };
+    // Handle timeout
+    if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
+      obj = { ...obj, timeout: 60000 };
     }
+    if(obj.formats?.includes("json")) {
+      obj.formats.push("extract");
+    }
+    // Convert JSON options to extract options if needed
+    if (obj.jsonOptions && !obj.extract) {
+      obj = {
+        ...obj,
+        extract: {
+          prompt: obj.jsonOptions.prompt,
+          systemPrompt: obj.jsonOptions.systemPrompt,
+          schema: obj.jsonOptions.schema,
+          mode: "llm"
+        }
+      };
+    }
     return obj;
   });
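The net effect is that "json" stays a thin alias over the existing extract pipeline: the transform appends "extract" to formats and copies jsonOptions into extract, so nothing downstream of request parsing has to know about the new name. A standalone sketch of that aliasing (simplified types; not the actual zod pipeline above):

```ts
// Simplified stand-in for the transform above; Req approximates the parsed
// request object with unrelated fields omitted.
type ExtractOpts = { prompt?: string; systemPrompt?: string; schema?: unknown; mode?: string };
type Req = { formats: string[]; timeout?: number; extract?: ExtractOpts; jsonOptions?: ExtractOpts };

function aliasJsonToExtract(obj: Req): Req {
  // Default the timeout for any LLM-backed format, old name or new.
  if ((obj.formats.includes("extract") || obj.extract || obj.formats.includes("json") || obj.jsonOptions) && !obj.timeout) {
    obj = { ...obj, timeout: 60000 };
  }
  // "json" rides on the extract pipeline.
  if (obj.formats.includes("json")) {
    obj.formats.push("extract");
  }
  // Mirror jsonOptions into extract unless extract was set explicitly.
  if (obj.jsonOptions && !obj.extract) {
    obj = { ...obj, extract: { ...obj.jsonOptions, mode: "llm" } };
  }
  return obj;
}

console.log(aliasJsonToExtract({ formats: ["json"], jsonOptions: { prompt: "Extract the title." } }));
// -> { formats: ["json", "extract"], timeout: 60000, jsonOptions: {...},
//      extract: { prompt: "Extract the title.", mode: "llm" } }
```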
@@ -410,6 +437,7 @@ export type Document = {
   links?: string[];
   screenshot?: string;
   extract?: any;
+  json?: any;
   warning?: string;
   actions?: {
     screenshots?: string[];

View File

@@ -233,7 +233,12 @@ export async function performLLMExtract(
       document.markdown,
       document.warning,
     );
-    document.extract = extract;
+    if (meta.options.formats.includes("json")) {
+      document.json = extract;
+    } else {
+      document.extract = extract;
+    }
     document.warning = warning;
   }
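Because the transform leaves the original "json" entry in formats, this branch can tell which name the caller used and file the LLM output under the matching key. A rough sketch of the routing (Doc is a placeholder type; the real Document lives in types.ts above):

```ts
// Placeholder Doc type standing in for the Document type above.
type Doc = { extract?: unknown; json?: unknown; warning?: string };

function attachLLMResult(doc: Doc, formats: string[], extract: unknown, warning?: string): Doc {
  if (formats.includes("json")) {
    doc.json = extract;    // requested as "json": result lands on document.json
  } else {
    doc.extract = extract; // legacy "extract" requests are unchanged
  }
  doc.warning = warning;
  return doc;
}

console.log(attachLLMResult({}, ["json"], { company_mission: "..." }));
// -> { json: { company_mission: "..." }, warning: undefined }
```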

View File

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.14.1",
+  "version": "1.15.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

View File

@@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  * Defines the options and configurations available for scraping web content.
  */
 export interface CrawlScrapeOptions {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
@@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
     schema?: LLMSchema;
     systemPrompt?: string;
   };
+  json?:{
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  }
   actions?: ActionsSchema;
 }
@@ -393,6 +398,23 @@ export default class FirecrawlApp {
       },
     };
   }
+  if (jsonData?.jsonOptions?.schema) {
+    let schema = jsonData.jsonOptions.schema;
+    // Try parsing the schema as a Zod schema
+    try {
+      schema = zodToJsonSchema(schema);
+    } catch (error) {
+    }
+    jsonData = {
+      ...jsonData,
+      jsonOptions: {
+        ...jsonData.jsonOptions,
+        schema: schema,
+      },
+    };
+  }
   try {
     const response: AxiosResponse = await axios.post(
       this.apiUrl + `/v1/scrape`,
@@ -772,6 +794,23 @@ export default class FirecrawlApp {
       },
     };
   }
+  if (jsonData?.jsonOptions?.schema) {
+    let schema = jsonData.jsonOptions.schema;
+    // Try parsing the schema as a Zod schema
+    try {
+      schema = zodToJsonSchema(schema);
+    } catch (error) {
+    }
+    jsonData = {
+      ...jsonData,
+      jsonOptions: {
+        ...jsonData.jsonOptions,
+        schema: schema,
+      },
+    };
+  }
   try {
     const response: AxiosResponse = await this.postRequest(
       this.apiUrl + `/v1/batch/scrape`,
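On the SDK side, jsonOptions.schema is run through zodToJsonSchema before the request goes out, so callers can hand over a Zod schema directly. A hedged usage sketch follows (URL and key are placeholders; note that the type declarations in this commit name the request field json while this runtime path reads jsonOptions, so the params object is kept loosely typed):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const schema = z.object({
  company_mission: z.string(),
  supports_sso: z.boolean(),
});

// Loosely typed on purpose: the interface above declares `json`, but the
// runtime code reads `jsonOptions`.
const params: any = {
  formats: ["json"],
  jsonOptions: { schema }, // converted via zodToJsonSchema inside the SDK
};

const scrapeResult: any = await app.scrapeUrl("https://mendable.ai", params);
// Result placement per the README diff above; newer SDKs may surface it at
// the top level instead.
console.log(scrapeResult?.data?.json ?? scrapeResult?.json);
```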

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.9.0"
+__version__ = "1.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -112,6 +112,18 @@ class FirecrawlApp:
                 if key not in ['extract']:
                     scrape_params[key] = value
 
+            json = params.get("jsonOptions", {})
+            if json:
+                if 'schema' in json and hasattr(json['schema'], 'schema'):
+                    json['schema'] = json['schema'].schema()
+                scrape_params['jsonOptions'] = json
+
+            # Include any other params directly at the top level of scrape_params
+            for key, value in params.items():
+                if key not in ['jsonOptions']:
+                    scrape_params[key] = value
+
         endpoint = f'/v1/scrape'
 
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(
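This mirrors the JS SDK handling above: a pydantic model passed in jsonOptions['schema'] is detected via hasattr and serialized with its .schema() method before jsonOptions is attached to the outgoing request body.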

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 dynamic = ["version"]
-name = "firecrawl-py"
+name = "firecrawl"
 description = "Python SDK for Firecrawl API"
 readme = {file="README.md", content-type = "text/markdown"}
 requires-python = ">=3.8"

View File

@@ -17,7 +17,7 @@ def get_version():
 
 setup(
-    name="firecrawl-py",
+    name="firecrawl",
     version=get_version(),
     url="https://github.com/mendableai/firecrawl",
     author="Mendable.ai",