Merge pull request #1072 from mendableai/nsc/json-format

(feat/formats) Extract format renamed to json format
Nicolas, 2025-01-18 17:38:52 -03:00, committed by GitHub
commit 9109e78e15
10 changed files with 106 additions and 21 deletions
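
The rename is mechanical but touches every layer: the request format "extract" becomes "json", the `extract` options block becomes `jsonOptions`, and the structured output moves from the old `llm_extraction` / `extract` keys to a `json` key on the returned document. A minimal before/after sketch of an API call (plain `fetch` in TypeScript; the endpoint, field names, and response shape are taken from the docs diff below, while the wrapper itself is illustrative, not SDK code):

```
// Caller-side sketch of the rename; not part of the SDK.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify({
    url: "https://docs.firecrawl.dev/",
    formats: ["json"],     // was: ["extract"]
    jsonOptions: {         // was: "extract": { ... }
      prompt: "Extract the company mission from the page.",
    },
  }),
});
const body = await res.json();
console.log(body.data.json); // was: body.data.llm_extraction
```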

View File

@@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
       "url": "https://www.mendable.ai/",
-      "formats": ["extract"],
-      "extract": {
+      "formats": ["json"],
+      "jsonOptions": {
         "schema": {
           "type": "object",
           "properties": {
@@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
       "ogSiteName": "Mendable",
       "sourceURL": "https://mendable.ai/"
     },
-    "llm_extraction": {
+    "json": {
       "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
       "supports_sso": true,
       "is_open_source": false,
@@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
       "url": "https://docs.firecrawl.dev/",
-      "formats": ["extract"],
-      "extract": {
+      "formats": ["json"],
+      "jsonOptions": {
         "prompt": "Extract the company mission from the page."
       }
     }'
@@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

 data = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
+    'formats': ['json'],
+    'jsonOptions': {
         'schema': TopArticlesSchema.model_json_schema()
     }
 })

-print(data["extract"])
+print(data["json"])
 ```

 ## Using the Node SDK
@@ -526,10 +526,10 @@ const schema = z.object({
 });

 const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: schema },
+  jsonOptions: { extractionSchema: schema },
 });

-console.log(scrapeResult.data["llm_extraction"]);
+console.log(scrapeResult.data["json"]);
 ```

 ## Open Source vs Cloud Offering

View File

@@ -33,6 +33,7 @@ export async function scrapeController(
     basePriority: 10,
   });
+
   await addScrapeJob(
     {
       url: req.body.url,
@@ -96,7 +97,7 @@ export async function scrapeController(
     // Don't bill if we're early returning
     return;
   }
-  if (req.body.extract && req.body.formats.includes("extract")) {
+  if (req.body.extract && req.body.formats.includes("extract") ) {
     creditsToBeBilled = 5;
   }

View File

@@ -125,6 +125,7 @@ export const scrapeOptions = z
       "screenshot",
       "screenshot@fullPage",
       "extract",
+      "json"
     ])
     .array()
     .optional()
@@ -139,7 +140,10 @@ export const scrapeOptions = z
     onlyMainContent: z.boolean().default(true),
     timeout: z.number().int().positive().finite().safe().optional(),
     waitFor: z.number().int().nonnegative().finite().safe().default(0),
+    // Deprecate this to jsonOptions
     extract: extractOptions.optional(),
+    // New
+    jsonOptions: extractOptions.optional(),
     mobile: z.boolean().default(false),
     parsePDF: z.boolean().default(true),
     actions: actionsSchema.optional(),
@@ -242,20 +246,43 @@ export const scrapeRequestSchema = scrapeOptions
     (obj) => {
       const hasExtractFormat = obj.formats?.includes("extract");
       const hasExtractOptions = obj.extract !== undefined;
+      const hasJsonFormat = obj.formats?.includes("json");
+      const hasJsonOptions = obj.jsonOptions !== undefined;
       return (
         (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
+        (!hasExtractFormat && !hasExtractOptions) ||
+        (hasJsonFormat && hasJsonOptions) ||
+        (!hasJsonFormat && !hasJsonOptions)
       );
     },
     {
       message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
     },
   )
   .transform((obj) => {
-    if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-      return { ...obj, timeout: 60000 };
+    // Handle timeout
+    if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
+      obj = { ...obj, timeout: 60000 };
     }
+
+    if(obj.formats?.includes("json")) {
+      obj.formats.push("extract");
+    }
+
+    // Convert JSON options to extract options if needed
+    if (obj.jsonOptions && !obj.extract) {
+      obj = {
+        ...obj,
+        extract: {
+          prompt: obj.jsonOptions.prompt,
+          systemPrompt: obj.jsonOptions.systemPrompt,
+          schema: obj.jsonOptions.schema,
+          mode: "llm"
+        }
+      };
+    }
+
     return obj;
   });
@@ -410,6 +437,7 @@ export type Document = {
   links?: string[];
   screenshot?: string;
   extract?: any;
+  json?: any;
   warning?: string;
   actions?: {
     screenshots?: string[];
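
The refine/transform pair above treats `json` as a thin alias over the legacy `extract` machinery: the new names are validated, then rewritten into the old shape before anything downstream sees them. Note that because "extract" is pushed into `formats`, checks keyed off "extract" (such as the billing branch in the controller diff above) presumably fire for `json` requests too. A standalone sketch of the aliasing step, with simplified illustrative types:

```
// Simplified mirror of the transform's aliasing logic (types are illustrative).
type ExtractOptions = { prompt?: string; systemPrompt?: string; schema?: unknown; mode?: string };
type ScrapeBody = { formats?: string[]; extract?: ExtractOptions; jsonOptions?: ExtractOptions };

function aliasJsonToExtract(obj: ScrapeBody): ScrapeBody {
  // "json" requests also carry the legacy format so downstream code is untouched
  if (obj.formats?.includes("json")) {
    obj.formats.push("extract");
  }
  // jsonOptions is copied into the legacy extract slot if it isn't already set
  if (obj.jsonOptions && !obj.extract) {
    obj = { ...obj, extract: { ...obj.jsonOptions, mode: "llm" } };
  }
  return obj;
}

// aliasJsonToExtract({ formats: ["json"], jsonOptions: { prompt: "..." } })
// => { formats: ["json", "extract"], jsonOptions: { ... }, extract: { prompt: "...", mode: "llm" } }
```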

View File

@@ -233,7 +233,12 @@ export async function performLLMExtract(
       document.markdown,
       document.warning,
     );
+
+    if (meta.options.formats.includes("json")) {
+      document.json = extract;
+    } else {
       document.extract = extract;
+    }
     document.warning = warning;
   }
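
With that branch in place, the single LLM-extract pass writes its result to whichever key matches the requested format. From the consumer's side (a sketch; `doc` stands for the returned Document from the type diff above):

```
// Read the structured output regardless of which format was requested.
const structured = doc.json ?? doc.extract; // "json" fills doc.json, legacy "extract" fills doc.extract
```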

View File

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.14.1",
+  "version": "1.15.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

View File

@@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  * Defines the options and configurations available for scraping web content.
  */
 export interface CrawlScrapeOptions {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
@@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
     schema?: LLMSchema;
     systemPrompt?: string;
   };
+  json?: {
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  }
   actions?: ActionsSchema;
 }
@@ -393,6 +398,23 @@ export default class FirecrawlApp {
         },
       };
     }
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
     try {
       const response: AxiosResponse = await axios.post(
         this.apiUrl + `/v1/scrape`,
@@ -772,6 +794,23 @@ export default class FirecrawlApp {
         },
       };
     }
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
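
Both code paths above let the SDK accept a Zod schema directly in `jsonOptions` and serialize it with `zodToJsonSchema` before the request is posted (silently falling back to the raw value if conversion throws). A usage sketch against SDK 1.15.0; note the PR itself is slightly inconsistent in naming (`json` in the `ScrapeParams` interface, `jsonOptions` in the request prep), so this follows the request-prep path:

```
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });

// A Zod schema; the SDK converts it via zodToJsonSchema before POSTing.
const schema = z.object({
  top: z.array(z.object({ title: z.string(), points: z.number() })).max(5),
});

const result = await app.scrapeUrl("https://news.ycombinator.com", {
  formats: ["json"],
  jsonOptions: { schema },
});

// Assumption: on success, the structured output lands on the `json` key.
console.log((result as any).json);
```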

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.9.0"
+__version__ = "1.10.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -112,6 +112,18 @@ class FirecrawlApp:
                 if key not in ['extract']:
                     scrape_params[key] = value
+
+        json = params.get("jsonOptions", {})
+        if json:
+            if 'schema' in json and hasattr(json['schema'], 'schema'):
+                json['schema'] = json['schema'].schema()
+            scrape_params['jsonOptions'] = json
+
+        # Include any other params directly at the top level of scrape_params
+        for key, value in params.items():
+            if key not in ['jsonOptions']:
+                scrape_params[key] = value
+
         endpoint = f'/v1/scrape'
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 dynamic = ["version"]
-name = "firecrawl-py"
+name = "firecrawl"
 description = "Python SDK for Firecrawl API"
 readme = {file="README.md", content-type = "text/markdown"}
 requires-python = ">=3.8"

View File

@@ -17,7 +17,7 @@ def get_version():

 setup(
-    name="firecrawl-py",
+    name="firecrawl",
     version=get_version(),
     url="https://github.com/mendableai/firecrawl",
     author="Mendable.ai",