Nick: extract to json in the sdks as well

This commit is contained in:
Nicolas 2025-01-18 17:23:21 -03:00
parent 34b40f6a23
commit b030a1c5da
3 changed files with 62 additions and 11 deletions

View File

@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://www.mendable.ai/",
"formats": ["extract"],
"extract": {
"formats": ["json"],
"jsonOptions": {
"schema": {
"type": "object",
"properties": {
@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
"ogSiteName": "Mendable",
"sourceURL": "https://mendable.ai/"
},
"llm_extraction": {
"json": {
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
"supports_sso": true,
"is_open_source": false,
@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://docs.firecrawl.dev/",
"formats": ["extract"],
"extract": {
"formats": ["json"],
"jsonOptions": {
"prompt": "Extract the company mission from the page."
}
}'
@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
data = app.scrape_url('https://news.ycombinator.com', {
'formats': ['extract'],
'extract': {
'formats': ['json'],
'jsonOptions': {
'schema': TopArticlesSchema.model_json_schema()
}
})
print(data["extract"])
print(data["json"])
```
## Using the Node SDK
@ -526,10 +526,10 @@ const schema = z.object({
});
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: schema },
formats: ["json"],
jsonOptions: { schema: schema },
});
console.log(scrapeResult.data["llm_extraction"]);
console.log(scrapeResult.data["json"]);
```
## Open Source vs Cloud Offering

View File

@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
* Defines the options and configurations available for scraping web content.
*/
export interface CrawlScrapeOptions {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
headers?: Record<string, string>;
includeTags?: string[];
excludeTags?: string[];
@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
schema?: LLMSchema;
systemPrompt?: string;
};
/**
 * Options for the "json" output format: an optional natural-language prompt,
 * an optional schema describing the desired output, and an optional system prompt.
 * This is the key the request-building code in this file inspects
 * (`jsonData.jsonOptions.schema`) when converting a Zod schema to JSON Schema.
 */
jsonOptions?: {
  prompt?: string;
  schema?: LLMSchema;
  systemPrompt?: string;
};
/**
 * Earlier spelling of the same option bag, retained so existing typed callers
 * keep compiling. The schema-conversion code in this file reads `jsonOptions`,
 * not `json`, so a schema passed here is not converted.
 * NOTE(review): confirm whether the API accepts a top-level `json` key at all.
 */
json?: {
  prompt?: string;
  schema?: LLMSchema;
  systemPrompt?: string;
};
actions?: ActionsSchema;
}
@ -393,6 +398,23 @@ export default class FirecrawlApp {
},
};
}
// When the caller supplied a schema under `jsonOptions`, normalise it before
// the request body is posted.
if (jsonData?.jsonOptions?.schema) {
let schema = jsonData.jsonOptions.schema;
// Attempt to convert the value with zodToJsonSchema; on success the converted
// result replaces the original schema.
try {
schema = zodToJsonSchema(schema);
} catch (error) {
// NOTE(review): the error is swallowed and `schema` keeps the caller's
// original value — presumably because it is already plain JSON Schema; confirm.
}
// Rebuild jsonData with the (possibly converted) schema, copying every other
// top-level field and every other jsonOptions field unchanged.
jsonData = {
...jsonData,
jsonOptions: {
...jsonData.jsonOptions,
schema: schema,
},
};
}
try {
const response: AxiosResponse = await axios.post(
this.apiUrl + `/v1/scrape`,
@ -772,6 +794,23 @@ export default class FirecrawlApp {
},
};
}
// When the caller supplied a schema under `jsonOptions`, normalise it before
// the batch request body is posted.
if (jsonData?.jsonOptions?.schema) {
let schema = jsonData.jsonOptions.schema;
// Attempt to convert the value with zodToJsonSchema; on success the converted
// result replaces the original schema.
try {
schema = zodToJsonSchema(schema);
} catch (error) {
// NOTE(review): the error is swallowed and `schema` keeps the caller's
// original value — presumably because it is already plain JSON Schema; confirm.
}
// Rebuild jsonData with the (possibly converted) schema, copying every other
// top-level field and every other jsonOptions field unchanged.
jsonData = {
...jsonData,
jsonOptions: {
...jsonData.jsonOptions,
schema: schema,
},
};
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,

View File

@ -112,6 +112,18 @@ class FirecrawlApp:
if key not in ['extract']:
scrape_params[key] = value
# Pull the JSON-format options (if any) out of the caller's params.
# NOTE(review): the local name `json` would shadow the stdlib `json` module
# within this scope if the file imports it — confirm.
json = params.get("jsonOptions", {})
if json:
# If the supplied schema object exposes a `schema` attribute, call it and send
# the result instead (presumably a Pydantic model class — TODO confirm).
# Values without that attribute, such as plain dicts, are passed through as-is.
# This assignment mutates the caller's `jsonOptions` dict in place.
if 'schema' in json and hasattr(json['schema'], 'schema'):
json['schema'] = json['schema'].schema()
scrape_params['jsonOptions'] = json
# Include any other params directly at the top level of scrape_params
# NOTE(review): this loop skips only 'jsonOptions', so it also re-copies
# 'extract', which the loop just above deliberately excluded — verify that
# this overwrite is harmless.
for key, value in params.items():
if key not in ['jsonOptions']:
scrape_params[key] = value
endpoint = f'/v1/scrape'
# Make the POST request with the prepared headers and JSON data
response = requests.post(