Merge pull request #1072 from mendableai/nsc/json-format

(feat/formats) Extract format renamed to json format
Nicolas, 2025-01-18 17:38:52 -03:00, committed by GitHub
commit 9109e78e15
10 changed files with 106 additions and 21 deletions

View File

@@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
   -H 'Authorization: Bearer YOUR_API_KEY' \
   -d '{
     "url": "https://www.mendable.ai/",
-    "formats": ["extract"],
-    "extract": {
+    "formats": ["json"],
+    "jsonOptions": {
       "schema": {
         "type": "object",
         "properties": {
@@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     "ogSiteName": "Mendable",
     "sourceURL": "https://mendable.ai/"
   },
-  "llm_extraction": {
+  "json": {
     "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
     "supports_sso": true,
     "is_open_source": false,
@@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
   -H 'Authorization: Bearer YOUR_API_KEY' \
   -d '{
     "url": "https://docs.firecrawl.dev/",
-    "formats": ["extract"],
-    "extract": {
+    "formats": ["json"],
+    "jsonOptions": {
       "prompt": "Extract the company mission from the page."
     }
   }'
@@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
 
 data = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
+    'formats': ['json'],
+    'jsonOptions': {
         'schema': TopArticlesSchema.model_json_schema()
     }
 })
 
-print(data["extract"])
+print(data["json"])
 ```
## Using the Node SDK
@@ -526,10 +526,10 @@ const schema = z.object({
 });
 
 const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: schema },
+  jsonOptions: { extractionSchema: schema },
 });
 
-console.log(scrapeResult.data["llm_extraction"]);
+console.log(scrapeResult.data["json"]);
 ```
 ## Open Source vs Cloud Offering

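Taken together, the README hunks above amount to a straight rename on the wire: `extract` becomes `json` in `formats`, the `extract` options object becomes `jsonOptions`, and the result key becomes `json`. A minimal TypeScript sketch of the new request shape (the URL, key, and prompt are placeholders taken from the examples above):

```ts
// Sketch only: field names follow the README diff above; the API key and
// prompt are placeholders.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify({
    url: "https://docs.firecrawl.dev/",
    formats: ["json"],  // was: ["extract"]
    jsonOptions: {      // was: "extract"
      prompt: "Extract the company mission from the page.",
    },
  }),
});
const { data } = await res.json();
console.log(data.json); // was: data.llm_extraction
```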
View File

@@ -33,6 +33,7 @@ export async function scrapeController(
     basePriority: 10,
   });
+
   await addScrapeJob(
     {
       url: req.body.url,
@@ -96,7 +97,7 @@ export async function scrapeController(
       // Don't bill if we're early returning
       return;
     }
-    if (req.body.extract && req.body.formats.includes("extract")) {
+    if (req.body.extract && req.body.formats.includes("extract") ) {
       creditsToBeBilled = 5;
     }
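Given that the request transform below pushes "extract" onto formats whenever "json" is requested and mirrors jsonOptions into extract, json-format scrapes presumably fall into this same branch and are billed the same 5 credits.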

View File

@@ -125,6 +125,7 @@ export const scrapeOptions = z
       "screenshot",
       "screenshot@fullPage",
       "extract",
+      "json"
     ])
     .array()
     .optional()
@@ -139,7 +140,10 @@ export const scrapeOptions = z
     onlyMainContent: z.boolean().default(true),
     timeout: z.number().int().positive().finite().safe().optional(),
     waitFor: z.number().int().nonnegative().finite().safe().default(0),
+    // Deprecate this to jsonOptions
     extract: extractOptions.optional(),
+    // New
+    jsonOptions: extractOptions.optional(),
     mobile: z.boolean().default(false),
     parsePDF: z.boolean().default(true),
     actions: actionsSchema.optional(),
@@ -242,20 +246,43 @@ export const scrapeRequestSchema = scrapeOptions
     (obj) => {
       const hasExtractFormat = obj.formats?.includes("extract");
       const hasExtractOptions = obj.extract !== undefined;
+      const hasJsonFormat = obj.formats?.includes("json");
+      const hasJsonOptions = obj.jsonOptions !== undefined;
       return (
         (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
+        (!hasExtractFormat && !hasExtractOptions) ||
+        (hasJsonFormat && hasJsonOptions) ||
+        (!hasJsonFormat && !hasJsonOptions)
       );
     },
     {
       message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
     },
   )
   .transform((obj) => {
-    if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-      return { ...obj, timeout: 60000 };
+    // Handle timeout
+    if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
+      obj = { ...obj, timeout: 60000 };
     }
+    if(obj.formats?.includes("json")) {
+      obj.formats.push("extract");
+    }
+    // Convert JSON options to extract options if needed
+    if (obj.jsonOptions && !obj.extract) {
+      obj = {
+        ...obj,
+        extract: {
+          prompt: obj.jsonOptions.prompt,
+          systemPrompt: obj.jsonOptions.systemPrompt,
+          schema: obj.jsonOptions.schema,
+          mode: "llm"
+        }
+      };
+    }
     return obj;
   });
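The net effect is that "json" stays a thin alias over the existing extract pipeline: the transform appends "extract" to formats and copies jsonOptions into extract, so nothing downstream of request parsing has to know about the new name. A standalone sketch of that aliasing (simplified types; not the actual zod pipeline above):

```ts
// Simplified stand-in for the transform above; Req approximates the parsed
// request object with unrelated fields omitted.
type ExtractOpts = { prompt?: string; systemPrompt?: string; schema?: unknown; mode?: string };
type Req = { formats: string[]; timeout?: number; extract?: ExtractOpts; jsonOptions?: ExtractOpts };

function aliasJsonToExtract(obj: Req): Req {
  // Default the timeout for any LLM-backed format, old name or new.
  if ((obj.formats.includes("extract") || obj.extract || obj.formats.includes("json") || obj.jsonOptions) && !obj.timeout) {
    obj = { ...obj, timeout: 60000 };
  }
  // "json" rides on the extract pipeline.
  if (obj.formats.includes("json")) {
    obj.formats.push("extract");
  }
  // Mirror jsonOptions into extract unless extract was set explicitly.
  if (obj.jsonOptions && !obj.extract) {
    obj = { ...obj, extract: { ...obj.jsonOptions, mode: "llm" } };
  }
  return obj;
}

console.log(aliasJsonToExtract({ formats: ["json"], jsonOptions: { prompt: "Extract the title." } }));
// -> { formats: ["json", "extract"], timeout: 60000, jsonOptions: {...},
//      extract: { prompt: "Extract the title.", mode: "llm" } }
```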
@@ -410,6 +437,7 @@ export type Document = {
   links?: string[];
   screenshot?: string;
   extract?: any;
+  json?: any;
   warning?: string;
   actions?: {
     screenshots?: string[];

View File

@@ -233,7 +233,12 @@ export async function performLLMExtract(
       document.markdown,
       document.warning,
     );
-    document.extract = extract;
+    if (meta.options.formats.includes("json")) {
+      document.json = extract;
+    } else {
+      document.extract = extract;
+    }
     document.warning = warning;
   }
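Because the transform leaves the original "json" entry in formats, this branch can tell which name the caller used and file the LLM output under the matching key. A rough sketch of the routing (Doc is a placeholder type; the real Document lives in types.ts above):

```ts
// Placeholder Doc type standing in for the Document type above.
type Doc = { extract?: unknown; json?: unknown; warning?: string };

function attachLLMResult(doc: Doc, formats: string[], extract: unknown, warning?: string): Doc {
  if (formats.includes("json")) {
    doc.json = extract;    // requested as "json": result lands on document.json
  } else {
    doc.extract = extract; // legacy "extract" requests are unchanged
  }
  doc.warning = warning;
  return doc;
}

console.log(attachLLMResult({}, ["json"], { company_mission: "..." }));
// -> { json: { company_mission: "..." }, warning: undefined }
```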

View File

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.14.1",
+  "version": "1.15.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

View File

@@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  * Defines the options and configurations available for scraping web content.
  */
 export interface CrawlScrapeOptions {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
@@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
     schema?: LLMSchema;
     systemPrompt?: string;
   };
+  json?:{
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  }
   actions?: ActionsSchema;
 }
@@ -393,6 +398,23 @@ export default class FirecrawlApp {
       },
     };
   }
+  if (jsonData?.jsonOptions?.schema) {
+    let schema = jsonData.jsonOptions.schema;
+    // Try parsing the schema as a Zod schema
+    try {
+      schema = zodToJsonSchema(schema);
+    } catch (error) {
+    }
+    jsonData = {
+      ...jsonData,
+      jsonOptions: {
+        ...jsonData.jsonOptions,
+        schema: schema,
+      },
+    };
+  }
   try {
     const response: AxiosResponse = await axios.post(
       this.apiUrl + `/v1/scrape`,
@@ -772,6 +794,23 @@ export default class FirecrawlApp {
       },
     };
   }
+  if (jsonData?.jsonOptions?.schema) {
+    let schema = jsonData.jsonOptions.schema;
+    // Try parsing the schema as a Zod schema
+    try {
+      schema = zodToJsonSchema(schema);
+    } catch (error) {
+    }
+    jsonData = {
+      ...jsonData,
+      jsonOptions: {
+        ...jsonData.jsonOptions,
+        schema: schema,
+      },
+    };
+  }
   try {
     const response: AxiosResponse = await this.postRequest(
       this.apiUrl + `/v1/batch/scrape`,
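On the SDK side, jsonOptions.schema is run through zodToJsonSchema before the request goes out, so callers can hand over a Zod schema directly. A hedged usage sketch follows (URL and key are placeholders; note that the type declarations in this commit name the request field json while this runtime path reads jsonOptions, so the params object is kept loosely typed):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const schema = z.object({
  company_mission: z.string(),
  supports_sso: z.boolean(),
});

// Loosely typed on purpose: the interface above declares `json`, but the
// runtime code reads `jsonOptions`.
const params: any = {
  formats: ["json"],
  jsonOptions: { schema }, // converted via zodToJsonSchema inside the SDK
};

const scrapeResult: any = await app.scrapeUrl("https://mendable.ai", params);
// Result placement per the README diff above; newer SDKs may surface it at
// the top level instead.
console.log(scrapeResult?.data?.json ?? scrapeResult?.json);
```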

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.9.0"
+__version__ = "1.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -112,6 +112,18 @@ class FirecrawlApp:
                 if key not in ['extract']:
                     scrape_params[key] = value
 
+            json = params.get("jsonOptions", {})
+            if json:
+                if 'schema' in json and hasattr(json['schema'], 'schema'):
+                    json['schema'] = json['schema'].schema()
+                scrape_params['jsonOptions'] = json
+
+            # Include any other params directly at the top level of scrape_params
+            for key, value in params.items():
+                if key not in ['jsonOptions']:
+                    scrape_params[key] = value
+
         endpoint = f'/v1/scrape'
 
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(
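This mirrors the JS SDK handling above: a pydantic model passed in jsonOptions['schema'] is detected via hasattr and serialized with its .schema() method before jsonOptions is attached to the outgoing request body.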

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 dynamic = ["version"]
-name = "firecrawl-py"
+name = "firecrawl"
 description = "Python SDK for Firecrawl API"
 readme = {file="README.md", content-type = "text/markdown"}
 requires-python = ">=3.8"

View File

@@ -17,7 +17,7 @@ def get_version():
 
 setup(
-    name="firecrawl-py",
+    name="firecrawl",
     version=get_version(),
     url="https://github.com/mendableai/firecrawl",
     author="Mendable.ai",