Merge pull request #1072 from mendableai/nsc/json-format

(feat/formats) Extract format renamed to json format
Nicolas, 2025-01-18 17:38:52 -03:00, committed by GitHub
commit 9109e78e15
10 changed files with 106 additions and 21 deletions
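
The rename is mechanical but touches every layer: the request format "extract" becomes "json", the `extract` options block becomes `jsonOptions`, and the structured output moves from the old `llm_extraction` / `extract` keys to a `json` key on the returned document. A minimal before/after sketch of an API call (plain `fetch` in TypeScript; the endpoint, field names, and response shape are taken from the docs diff below, while the wrapper itself is illustrative, not SDK code):

```
// Caller-side sketch of the rename; not part of the SDK.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify({
    url: "https://docs.firecrawl.dev/",
    formats: ["json"],     // was: ["extract"]
    jsonOptions: {         // was: "extract": { ... }
      prompt: "Extract the company mission from the page.",
    },
  }),
});
const body = await res.json();
console.log(body.data.json); // was: body.data.llm_extraction
```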

View File

@@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
       "url": "https://www.mendable.ai/",
-      "formats": ["extract"],
-      "extract": {
+      "formats": ["json"],
+      "jsonOptions": {
         "schema": {
           "type": "object",
           "properties": {
@@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
       "ogSiteName": "Mendable",
       "sourceURL": "https://mendable.ai/"
     },
-    "llm_extraction": {
+    "json": {
       "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
       "supports_sso": true,
       "is_open_source": false,
@@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
       "url": "https://docs.firecrawl.dev/",
-      "formats": ["extract"],
-      "extract": {
+      "formats": ["json"],
+      "jsonOptions": {
         "prompt": "Extract the company mission from the page."
       }
     }'
@@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

 data = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
+    'formats': ['json'],
+    'jsonOptions': {
         'schema': TopArticlesSchema.model_json_schema()
     }
 })

-print(data["extract"])
+print(data["json"])
 ```

 ## Using the Node SDK
@@ -526,10 +526,10 @@ const schema = z.object({
 });

 const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: schema },
+  jsonOptions: { extractionSchema: schema },
 });

-console.log(scrapeResult.data["llm_extraction"]);
+console.log(scrapeResult.data["json"]);
 ```

 ## Open Source vs Cloud Offering

View File

@@ -33,6 +33,7 @@ export async function scrapeController(
     basePriority: 10,
   });
+
   await addScrapeJob(
     {
       url: req.body.url,
@@ -96,7 +97,7 @@ export async function scrapeController(
     // Don't bill if we're early returning
     return;
   }
-  if (req.body.extract && req.body.formats.includes("extract")) {
+  if (req.body.extract && req.body.formats.includes("extract") ) {
     creditsToBeBilled = 5;
   }

View File

@@ -125,6 +125,7 @@ export const scrapeOptions = z
       "screenshot",
       "screenshot@fullPage",
       "extract",
+      "json"
     ])
     .array()
     .optional()
@@ -139,7 +140,10 @@ export const scrapeOptions = z
     onlyMainContent: z.boolean().default(true),
     timeout: z.number().int().positive().finite().safe().optional(),
     waitFor: z.number().int().nonnegative().finite().safe().default(0),
+    // Deprecate this to jsonOptions
     extract: extractOptions.optional(),
+    // New
+    jsonOptions: extractOptions.optional(),
     mobile: z.boolean().default(false),
     parsePDF: z.boolean().default(true),
     actions: actionsSchema.optional(),
@@ -242,20 +246,43 @@ export const scrapeRequestSchema = scrapeOptions
     (obj) => {
       const hasExtractFormat = obj.formats?.includes("extract");
       const hasExtractOptions = obj.extract !== undefined;
+      const hasJsonFormat = obj.formats?.includes("json");
+      const hasJsonOptions = obj.jsonOptions !== undefined;
       return (
         (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
+        (!hasExtractFormat && !hasExtractOptions) ||
+        (hasJsonFormat && hasJsonOptions) ||
+        (!hasJsonFormat && !hasJsonOptions)
       );
     },
     {
       message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
     },
   )
   .transform((obj) => {
-    if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-      return { ...obj, timeout: 60000 };
+    // Handle timeout
+    if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
+      obj = { ...obj, timeout: 60000 };
     }
+
+    if(obj.formats?.includes("json")) {
+      obj.formats.push("extract");
+    }
+
+    // Convert JSON options to extract options if needed
+    if (obj.jsonOptions && !obj.extract) {
+      obj = {
+        ...obj,
+        extract: {
+          prompt: obj.jsonOptions.prompt,
+          systemPrompt: obj.jsonOptions.systemPrompt,
+          schema: obj.jsonOptions.schema,
+          mode: "llm"
+        }
+      };
+    }
+
     return obj;
   });
@@ -410,6 +437,7 @@ export type Document = {
   links?: string[];
   screenshot?: string;
   extract?: any;
+  json?: any;
   warning?: string;
   actions?: {
     screenshots?: string[];
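
The refine/transform pair above treats `json` as a thin alias over the legacy `extract` machinery: the new names are validated, then rewritten into the old shape before anything downstream sees them. Note that because "extract" is pushed into `formats`, checks keyed off "extract" (such as the billing branch in the controller diff above) presumably fire for `json` requests too. A standalone sketch of the aliasing step, with simplified illustrative types:

```
// Simplified mirror of the transform's aliasing logic (types are illustrative).
type ExtractOptions = { prompt?: string; systemPrompt?: string; schema?: unknown; mode?: string };
type ScrapeBody = { formats?: string[]; extract?: ExtractOptions; jsonOptions?: ExtractOptions };

function aliasJsonToExtract(obj: ScrapeBody): ScrapeBody {
  // "json" requests also carry the legacy format so downstream code is untouched
  if (obj.formats?.includes("json")) {
    obj.formats.push("extract");
  }
  // jsonOptions is copied into the legacy extract slot if it isn't already set
  if (obj.jsonOptions && !obj.extract) {
    obj = { ...obj, extract: { ...obj.jsonOptions, mode: "llm" } };
  }
  return obj;
}

// aliasJsonToExtract({ formats: ["json"], jsonOptions: { prompt: "..." } })
// => { formats: ["json", "extract"], jsonOptions: { ... }, extract: { prompt: "...", mode: "llm" } }
```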

View File

@@ -233,7 +233,12 @@ export async function performLLMExtract(
       document.markdown,
       document.warning,
     );
+
+    if (meta.options.formats.includes("json")) {
+      document.json = extract;
+    } else {
       document.extract = extract;
+    }
     document.warning = warning;
   }
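
With that branch in place, the single LLM-extract pass writes its result to whichever key matches the requested format. From the consumer's side (a sketch; `doc` stands for the returned Document from the type diff above):

```
// Read the structured output regardless of which format was requested.
const structured = doc.json ?? doc.extract; // "json" fills doc.json, legacy "extract" fills doc.extract
```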

View File

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.14.1",
+  "version": "1.15.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

View File

@@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  * Defines the options and configurations available for scraping web content.
  */
 export interface CrawlScrapeOptions {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
@@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
     schema?: LLMSchema;
     systemPrompt?: string;
   };
+  json?: {
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  }
   actions?: ActionsSchema;
 }
@@ -393,6 +398,23 @@ export default class FirecrawlApp {
         },
       };
     }
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
     try {
       const response: AxiosResponse = await axios.post(
         this.apiUrl + `/v1/scrape`,
@@ -772,6 +794,23 @@ export default class FirecrawlApp {
         },
       };
     }
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
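
Both code paths above let the SDK accept a Zod schema directly in `jsonOptions` and serialize it with `zodToJsonSchema` before the request is posted (silently falling back to the raw value if conversion throws). A usage sketch against SDK 1.15.0; note the PR itself is slightly inconsistent in naming (`json` in the `ScrapeParams` interface, `jsonOptions` in the request prep), so this follows the request-prep path:

```
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });

// A Zod schema; the SDK converts it via zodToJsonSchema before POSTing.
const schema = z.object({
  top: z.array(z.object({ title: z.string(), points: z.number() })).max(5),
});

const result = await app.scrapeUrl("https://news.ycombinator.com", {
  formats: ["json"],
  jsonOptions: { schema },
});

// Assumption: on success, the structured output lands on the `json` key.
console.log((result as any).json);
```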

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.9.0"
+__version__ = "1.10.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -112,6 +112,18 @@ class FirecrawlApp:
                 if key not in ['extract']:
                     scrape_params[key] = value
+
+        json = params.get("jsonOptions", {})
+        if json:
+            if 'schema' in json and hasattr(json['schema'], 'schema'):
+                json['schema'] = json['schema'].schema()
+            scrape_params['jsonOptions'] = json
+
+        # Include any other params directly at the top level of scrape_params
+        for key, value in params.items():
+            if key not in ['jsonOptions']:
+                scrape_params[key] = value
+
         endpoint = f'/v1/scrape'
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 dynamic = ["version"]
-name = "firecrawl-py"
+name = "firecrawl"
 description = "Python SDK for Firecrawl API"
 readme = {file="README.md", content-type = "text/markdown"}
 requires-python = ">=3.8"

View File

@@ -17,7 +17,7 @@ def get_version():

 setup(
-    name="firecrawl-py",
+    name="firecrawl",
     version=get_version(),
     url="https://github.com/mendableai/firecrawl",
     author="Mendable.ai",