mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 20:52:02 +08:00
Nick: extract to json in the sdks as well
This commit is contained in:
parent
34b40f6a23
commit
b030a1c5da
20
README.md
20
README.md
@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
|
|||||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||||
-d '{
|
-d '{
|
||||||
"url": "https://www.mendable.ai/",
|
"url": "https://www.mendable.ai/",
|
||||||
"formats": ["extract"],
|
"formats": ["json"],
|
||||||
"extract": {
|
"jsonOptions": {
|
||||||
"schema": {
|
"schema": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
|
|||||||
"ogSiteName": "Mendable",
|
"ogSiteName": "Mendable",
|
||||||
"sourceURL": "https://mendable.ai/"
|
"sourceURL": "https://mendable.ai/"
|
||||||
},
|
},
|
||||||
"llm_extraction": {
|
"json": {
|
||||||
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
|
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
|
||||||
"supports_sso": true,
|
"supports_sso": true,
|
||||||
"is_open_source": false,
|
"is_open_source": false,
|
||||||
@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
|
|||||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||||
-d '{
|
-d '{
|
||||||
"url": "https://docs.firecrawl.dev/",
|
"url": "https://docs.firecrawl.dev/",
|
||||||
"formats": ["extract"],
|
"formats": ["json"],
|
||||||
"extract": {
|
"jsonOptions": {
|
||||||
"prompt": "Extract the company mission from the page."
|
"prompt": "Extract the company mission from the page."
|
||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel):
|
|||||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||||
|
|
||||||
data = app.scrape_url('https://news.ycombinator.com', {
|
data = app.scrape_url('https://news.ycombinator.com', {
|
||||||
'formats': ['extract'],
|
'formats': ['json'],
|
||||||
'extract': {
|
'jsonOptions': {
|
||||||
'schema': TopArticlesSchema.model_json_schema()
|
'schema': TopArticlesSchema.model_json_schema()
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
print(data["extract"])
|
print(data["json"])
|
||||||
```
|
```
|
||||||
|
|
||||||
## Using the Node SDK
|
## Using the Node SDK
|
||||||
@ -526,10 +526,10 @@ const schema = z.object({
|
|||||||
});
|
});
|
||||||
|
|
||||||
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||||
extractorOptions: { extractionSchema: schema },
|
jsonOptions: { extractionSchema: schema },
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(scrapeResult.data["llm_extraction"]);
|
console.log(scrapeResult.data["json"]);
|
||||||
```
|
```
|
||||||
|
|
||||||
## Open Source vs Cloud Offering
|
## Open Source vs Cloud Offering
|
||||||
|
@ -78,7 +78,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
|||||||
* Defines the options and configurations available for scraping web content.
|
* Defines the options and configurations available for scraping web content.
|
||||||
*/
|
*/
|
||||||
export interface CrawlScrapeOptions {
|
export interface CrawlScrapeOptions {
|
||||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
|
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
includeTags?: string[];
|
includeTags?: string[];
|
||||||
excludeTags?: string[];
|
excludeTags?: string[];
|
||||||
@ -127,6 +127,11 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
|
|||||||
schema?: LLMSchema;
|
schema?: LLMSchema;
|
||||||
systemPrompt?: string;
|
systemPrompt?: string;
|
||||||
};
|
};
|
||||||
|
json?:{
|
||||||
|
prompt?: string;
|
||||||
|
schema?: LLMSchema;
|
||||||
|
systemPrompt?: string;
|
||||||
|
}
|
||||||
actions?: ActionsSchema;
|
actions?: ActionsSchema;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -393,6 +398,23 @@ export default class FirecrawlApp {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (jsonData?.jsonOptions?.schema) {
|
||||||
|
let schema = jsonData.jsonOptions.schema;
|
||||||
|
// Try parsing the schema as a Zod schema
|
||||||
|
try {
|
||||||
|
schema = zodToJsonSchema(schema);
|
||||||
|
} catch (error) {
|
||||||
|
|
||||||
|
}
|
||||||
|
jsonData = {
|
||||||
|
...jsonData,
|
||||||
|
jsonOptions: {
|
||||||
|
...jsonData.jsonOptions,
|
||||||
|
schema: schema,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
const response: AxiosResponse = await axios.post(
|
const response: AxiosResponse = await axios.post(
|
||||||
this.apiUrl + `/v1/scrape`,
|
this.apiUrl + `/v1/scrape`,
|
||||||
@ -772,6 +794,23 @@ export default class FirecrawlApp {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
if (jsonData?.jsonOptions?.schema) {
|
||||||
|
let schema = jsonData.jsonOptions.schema;
|
||||||
|
|
||||||
|
// Try parsing the schema as a Zod schema
|
||||||
|
try {
|
||||||
|
schema = zodToJsonSchema(schema);
|
||||||
|
} catch (error) {
|
||||||
|
|
||||||
|
}
|
||||||
|
jsonData = {
|
||||||
|
...jsonData,
|
||||||
|
jsonOptions: {
|
||||||
|
...jsonData.jsonOptions,
|
||||||
|
schema: schema,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
const response: AxiosResponse = await this.postRequest(
|
const response: AxiosResponse = await this.postRequest(
|
||||||
this.apiUrl + `/v1/batch/scrape`,
|
this.apiUrl + `/v1/batch/scrape`,
|
||||||
|
@ -112,6 +112,18 @@ class FirecrawlApp:
|
|||||||
if key not in ['extract']:
|
if key not in ['extract']:
|
||||||
scrape_params[key] = value
|
scrape_params[key] = value
|
||||||
|
|
||||||
|
json = params.get("jsonOptions", {})
|
||||||
|
if json:
|
||||||
|
if 'schema' in json and hasattr(json['schema'], 'schema'):
|
||||||
|
json['schema'] = json['schema'].schema()
|
||||||
|
scrape_params['jsonOptions'] = json
|
||||||
|
|
||||||
|
# Include any other params directly at the top level of scrape_params
|
||||||
|
for key, value in params.items():
|
||||||
|
if key not in ['jsonOptions']:
|
||||||
|
scrape_params[key] = value
|
||||||
|
|
||||||
|
|
||||||
endpoint = f'/v1/scrape'
|
endpoint = f'/v1/scrape'
|
||||||
# Make the POST request with the prepared headers and JSON data
|
# Make the POST request with the prepared headers and JSON data
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user