mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 00:12:02 +08:00
Nick: extract to json in the sdks as well
This commit is contained in:
parent
34b40f6a23
commit
b030a1c5da
20
README.md
20
README.md
@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://www.mendable.ai/",
|
||||
"formats": ["extract"],
|
||||
"extract": {
|
||||
"formats": ["json"],
|
||||
"jsonOptions": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
|
||||
"ogSiteName": "Mendable",
|
||||
"sourceURL": "https://mendable.ai/"
|
||||
},
|
||||
"llm_extraction": {
|
||||
"json": {
|
||||
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
|
||||
"supports_sso": true,
|
||||
"is_open_source": false,
|
||||
@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://docs.firecrawl.dev/",
|
||||
"formats": ["extract"],
|
||||
"extract": {
|
||||
"formats": ["json"],
|
||||
"jsonOptions": {
|
||||
"prompt": "Extract the company mission from the page."
|
||||
}
|
||||
}'
|
||||
@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||
|
||||
data = app.scrape_url('https://news.ycombinator.com', {
|
||||
'formats': ['extract'],
|
||||
'extract': {
|
||||
'formats': ['json'],
|
||||
'jsonOptions': {
|
||||
'schema': TopArticlesSchema.model_json_schema()
|
||||
}
|
||||
})
|
||||
print(data["extract"])
|
||||
print(data["json"])
|
||||
```
|
||||
|
||||
## Using the Node SDK
|
||||
@ -526,10 +526,10 @@ const schema = z.object({
|
||||
});
|
||||
|
||||
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: schema },
|
||||
jsonOptions: { extractionSchema: schema },
|
||||
});
|
||||
|
||||
console.log(scrapeResult.data["llm_extraction"]);
|
||||
console.log(scrapeResult.data["json"]);
|
||||
```
|
||||
|
||||
## Open Source vs Cloud Offering
|
||||
|
@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
||||
* Defines the options and configurations available for scraping web content.
|
||||
*/
|
||||
export interface CrawlScrapeOptions {
|
||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
|
||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
|
||||
headers?: Record<string, string>;
|
||||
includeTags?: string[];
|
||||
excludeTags?: string[];
|
||||
@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
|
||||
schema?: LLMSchema;
|
||||
systemPrompt?: string;
|
||||
};
|
||||
json?:{
|
||||
prompt?: string;
|
||||
schema?: LLMSchema;
|
||||
systemPrompt?: string;
|
||||
}
|
||||
actions?: ActionsSchema;
|
||||
}
|
||||
|
||||
@ -393,6 +398,23 @@ export default class FirecrawlApp {
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
if (jsonData?.jsonOptions?.schema) {
|
||||
let schema = jsonData.jsonOptions.schema;
|
||||
// Try parsing the schema as a Zod schema
|
||||
try {
|
||||
schema = zodToJsonSchema(schema);
|
||||
} catch (error) {
|
||||
|
||||
}
|
||||
jsonData = {
|
||||
...jsonData,
|
||||
jsonOptions: {
|
||||
...jsonData.jsonOptions,
|
||||
schema: schema,
|
||||
},
|
||||
};
|
||||
}
|
||||
try {
|
||||
const response: AxiosResponse = await axios.post(
|
||||
this.apiUrl + `/v1/scrape`,
|
||||
@ -772,6 +794,23 @@ export default class FirecrawlApp {
|
||||
},
|
||||
};
|
||||
}
|
||||
if (jsonData?.jsonOptions?.schema) {
|
||||
let schema = jsonData.jsonOptions.schema;
|
||||
|
||||
// Try parsing the schema as a Zod schema
|
||||
try {
|
||||
schema = zodToJsonSchema(schema);
|
||||
} catch (error) {
|
||||
|
||||
}
|
||||
jsonData = {
|
||||
...jsonData,
|
||||
jsonOptions: {
|
||||
...jsonData.jsonOptions,
|
||||
schema: schema,
|
||||
},
|
||||
};
|
||||
}
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + `/v1/batch/scrape`,
|
||||
|
@ -112,6 +112,18 @@ class FirecrawlApp:
|
||||
if key not in ['extract']:
|
||||
scrape_params[key] = value
|
||||
|
||||
json = params.get("jsonOptions", {})
|
||||
if json:
|
||||
if 'schema' in json and hasattr(json['schema'], 'schema'):
|
||||
json['schema'] = json['schema'].schema()
|
||||
scrape_params['jsonOptions'] = json
|
||||
|
||||
# Include any other params directly at the top level of scrape_params
|
||||
for key, value in params.items():
|
||||
if key not in ['jsonOptions']:
|
||||
scrape_params[key] = value
|
||||
|
||||
|
||||
endpoint = f'/v1/scrape'
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
|
Loading…
x
Reference in New Issue
Block a user