Nick: extract to json in the sdks as well

This commit is contained in:
Nicolas 2025-01-18 17:23:21 -03:00
parent 34b40f6a23
commit b030a1c5da
3 changed files with 62 additions and 11 deletions

View File

@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Authorization: Bearer YOUR_API_KEY' \ -H 'Authorization: Bearer YOUR_API_KEY' \
-d '{ -d '{
"url": "https://www.mendable.ai/", "url": "https://www.mendable.ai/",
"formats": ["extract"], "formats": ["json"],
"extract": { "jsonOptions": {
"schema": { "schema": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
"ogSiteName": "Mendable", "ogSiteName": "Mendable",
"sourceURL": "https://mendable.ai/" "sourceURL": "https://mendable.ai/"
}, },
"llm_extraction": { "json": {
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
"supports_sso": true, "supports_sso": true,
"is_open_source": false, "is_open_source": false,
@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Authorization: Bearer YOUR_API_KEY' \ -H 'Authorization: Bearer YOUR_API_KEY' \
-d '{ -d '{
"url": "https://docs.firecrawl.dev/", "url": "https://docs.firecrawl.dev/",
"formats": ["extract"], "formats": ["json"],
"extract": { "jsonOptions": {
"prompt": "Extract the company mission from the page." "prompt": "Extract the company mission from the page."
} }
}' }'
@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
data = app.scrape_url('https://news.ycombinator.com', { data = app.scrape_url('https://news.ycombinator.com', {
'formats': ['extract'], 'formats': ['json'],
'extract': { 'jsonOptions': {
'schema': TopArticlesSchema.model_json_schema() 'schema': TopArticlesSchema.model_json_schema()
} }
}) })
print(data["extract"]) print(data["json"])
``` ```
## Using the Node SDK ## Using the Node SDK
@ -526,10 +526,10 @@ const schema = z.object({
}); });
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", { const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: schema }, jsonOptions: { schema: schema },
}); });
console.log(scrapeResult.data["llm_extraction"]); console.log(scrapeResult.data["json"]);
``` ```
## Open Source vs Cloud Offering ## Open Source vs Cloud Offering

View File

@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
* Defines the options and configurations available for scraping web content. * Defines the options and configurations available for scraping web content.
*/ */
export interface CrawlScrapeOptions { export interface CrawlScrapeOptions {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[]; formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
headers?: Record<string, string>; headers?: Record<string, string>;
includeTags?: string[]; includeTags?: string[];
excludeTags?: string[]; excludeTags?: string[];
@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
schema?: LLMSchema; schema?: LLMSchema;
systemPrompt?: string; systemPrompt?: string;
}; };
/**
 * Options for the "json" output format.
 *
 * This is the key the request-building code inspects
 * (`jsonData?.jsonOptions?.schema`) when converting a schema before sending.
 */
jsonOptions?: {
  /** Prompt for the JSON extraction. */
  prompt?: string;
  /** Schema for the extracted JSON; the SDK attempts a Zod-to-JSON-Schema conversion on it. */
  schema?: LLMSchema;
  /** System prompt for the JSON extraction. */
  systemPrompt?: string;
};
/**
 * @deprecated Use `jsonOptions`. Kept only so existing typed callers keep compiling.
 * NOTE(review): the request-building code never reads `json`, so a schema passed
 * here is not converted — confirm nothing relies on this key, then remove it.
 */
json?: {
  prompt?: string;
  schema?: LLMSchema;
  systemPrompt?: string;
};
actions?: ActionsSchema; actions?: ActionsSchema;
} }
@ -393,6 +398,23 @@ export default class FirecrawlApp {
}, },
}; };
} }
// Normalize a caller-supplied `jsonOptions.schema` before the request below is built.
if (jsonData?.jsonOptions?.schema) {
  const { jsonOptions: currentJsonOptions } = jsonData;
  let normalizedSchema = currentJsonOptions.schema;

  try {
    // Attempt the Zod -> JSON Schema conversion.
    normalizedSchema = zodToJsonSchema(normalizedSchema);
  } catch (error) {
    // Conversion threw: keep the schema exactly as the caller supplied it
    // (presumably it is already plain JSON Schema — TODO confirm).
  }

  // Rebuild the payload rather than assigning into the existing objects.
  jsonData = {
    ...jsonData,
    jsonOptions: { ...currentJsonOptions, schema: normalizedSchema },
  };
}
try { try {
const response: AxiosResponse = await axios.post( const response: AxiosResponse = await axios.post(
this.apiUrl + `/v1/scrape`, this.apiUrl + `/v1/scrape`,
@ -772,6 +794,23 @@ export default class FirecrawlApp {
}, },
}; };
} }
// Normalize a caller-supplied `jsonOptions.schema` before the batch request below is built.
if (jsonData?.jsonOptions?.schema) {
  const { jsonOptions: currentJsonOptions } = jsonData;
  let normalizedSchema = currentJsonOptions.schema;

  try {
    // Attempt the Zod -> JSON Schema conversion.
    normalizedSchema = zodToJsonSchema(normalizedSchema);
  } catch (error) {
    // Conversion threw: keep the schema exactly as the caller supplied it
    // (presumably it is already plain JSON Schema — TODO confirm).
  }

  // Rebuild the payload rather than assigning into the existing objects.
  jsonData = {
    ...jsonData,
    jsonOptions: { ...currentJsonOptions, schema: normalizedSchema },
  };
}
try { try {
const response: AxiosResponse = await this.postRequest( const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`, this.apiUrl + `/v1/batch/scrape`,

View File

@ -112,6 +112,18 @@ class FirecrawlApp:
if key not in ['extract']: if key not in ['extract']:
scrape_params[key] = value scrape_params[key] = value
# Read the caller's "jsonOptions" entry; falls back to an empty dict when the key is absent.
# NOTE(review): the local name `json` would shadow the stdlib `json` module inside this
# scope if the file imports it — confirm, and consider renaming (e.g. `json_options`).
json = params.get("jsonOptions", {})
if json:
# If the schema value exposes a `schema` attribute (presumably a Pydantic model — TODO confirm),
# replace it with the result of calling `.schema()`. The assignment writes into the dict
# object obtained from `params`, so the caller's dict is modified as well.
if 'schema' in json and hasattr(json['schema'], 'schema'):
json['schema'] = json['schema'].schema()
scrape_params['jsonOptions'] = json
# Include any other params directly at the top level of scrape_params
# NOTE(review): only 'jsonOptions' is skipped here, so an 'extract' entry — which the
# preceding `if key not in ['extract']` check leaves out — is copied across by this loop.
# Confirm that is intended.
for key, value in params.items():
if key not in ['jsonOptions']:
scrape_params[key] = value
endpoint = f'/v1/scrape' endpoint = f'/v1/scrape'
# Make the POST request with the prepared headers and JSON data # Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(