commit 45e33563eb
parent bb4808443c

    Nick: python working
@@ -75,6 +75,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata: FirecrawlDocumentMetadata;
 }
@@ -344,20 +345,30 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
     let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
+    if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) {
+      let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema;
       // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
       if (schema instanceof z.ZodSchema) {
         schema = zodToJsonSchema(schema);
       }
-      jsonData = {
-        ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
-        },
-      };
+      if (this.version === 'v0') {
+        jsonData = {
+          ...jsonData,
+          extractorOptions: {
+            ...jsonData.extractorOptions,
+            extractionSchema: schema,
+            mode: jsonData.extractorOptions.mode || "llm-extraction",
+          },
+        };
+      } else {
+        jsonData = {
+          ...jsonData,
+          extract: {
+            ...jsonData.extract,
+            schema: schema,
+          },
+        };
+      }
     }
     try {
       const response: AxiosResponse = await axios.post(
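Note on the hunk above: scrapeUrl now accepts the schema from either the v0 field (extractorOptions.extractionSchema) or the v1 field (extract.schema), normalizes Zod schemas to plain JSON Schema via zodToJsonSchema, and rebuilds the request body for whichever API version the client targets. A minimal sketch of the two payload shapes this produces (Python, illustrative only; the URL and schema are placeholders, not values from this commit):

# A plain JSON Schema dict; in the JS SDK a Zod schema would first be
# converted to this form with zodToJsonSchema().
schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}},
    "required": ["title"],
}

# v0: the schema travels under extractorOptions, alongside an extraction mode.
v0_payload = {
    "url": "https://example.com",  # placeholder URL
    "extractorOptions": {"extractionSchema": schema, "mode": "llm-extraction"},
}

# v1: the schema travels under extract.schema instead.
v1_payload = {
    "url": "https://example.com",  # placeholder URL
    "extract": {"schema": schema},
}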
@@ -1,17 +1,18 @@
 import uuid
 from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")

 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
 print(scrape_result['markdown'])

 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2)
 print(crawl_result)

+
+
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
@@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

 llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
+    'formats': ['extract'],
+    'extract': {
+        'schema': TopArticlesSchema.model_json_schema()
     }
 })

-print(llm_extraction_result['llm_extraction'])
+print(llm_extraction_result['extract'])

-# Define schema to extract contents into using json schema
+# # Define schema to extract contents into using json schema
 json_schema = {
     "type": "object",
     "properties": {
@@ -62,7 +60,10 @@ json_schema = {
     "required": ["top"]
 }

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+app2 = FirecrawlApp(api_key="fc-", version="v0")
+
+
+llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
     'extractorOptions': {
         'extractionSchema': json_schema,
         'mode': 'llm-extraction'
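As the example now shows, the same extraction runs against both API versions: the default client requests 'formats': ['extract'] and reads the result from the 'extract' key, while the v0 client (app2, constructed with version="v0") still sends extractorOptions and reads 'llm_extraction'. A condensed side-by-side sketch, assuming the json_schema and truncated "fc-" placeholder keys defined above:

# v1 (default client): extraction is requested as a format.
app = FirecrawlApp(api_key="fc-")
v1_result = app.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    'extract': {'schema': json_schema},
})
print(v1_result['extract'])

# v0 client: extraction still goes through extractorOptions.
app2 = FirecrawlApp(api_key="fc-", version="v0")
v0_result = app2.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {'extractionSchema': json_schema, 'mode': 'llm-extraction'},
})
print(v0_result['llm_extraction'])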
@@ -58,21 +58,31 @@ class FirecrawlApp:

         # If there are additional params, process them
         if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
+            if self.version == 'v0':
+                # Handle extractorOptions (for v0 compatibility)
+                extractor_options = params.get('extractorOptions', {})
+                if extractor_options:
+                    if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'):
+                        extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
+                    extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
+                    scrape_params['extractorOptions'] = extractor_options

-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key != 'extractorOptions':
-                    scrape_params[key] = value
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extractorOptions']:
+                        scrape_params[key] = value
+            elif self.version == 'v1':
+                # Handle extract (for v1)
+                extract = params.get('extract', {})
+                if extract:
+                    if 'schema' in extract and hasattr(extract['schema'], 'schema'):
+                        extract['schema'] = extract['schema'].schema()
+                    scrape_params['extract'] = extract
+
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extract']:
+                        scrape_params[key] = value

         endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data
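Since scrape_params is now assembled per version, callers can hand scrape_url a Pydantic model directly: the hasattr(..., 'schema') checks above detect the model's .schema() classmethod and serialize it to a JSON Schema dict before the request is sent. A minimal usage sketch under that assumption (the API key is a placeholder, and the v1 default is inferred from the example above):

from pydantic import BaseModel, Field
from firecrawl.firecrawl import FirecrawlApp

class ArticleSchema(BaseModel):
    title: str = Field(..., description="Article title")

app = FirecrawlApp(api_key="fc-")  # defaults to the v1 API

# The v1 branch sees hasattr(ArticleSchema, 'schema') and calls
# ArticleSchema.schema() to produce a plain JSON Schema dict.
result = app.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    'extract': {'schema': ArticleSchema},
})
print(result['extract'])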