Nick: python working

Nicolas 2024-08-30 12:58:38 -03:00
parent bb4808443c
commit 45e33563eb
3 changed files with 58 additions and 36 deletions

View File

@@ -75,6 +75,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata: FirecrawlDocumentMetadata;
 }
@@ -344,20 +345,30 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
     let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
+    if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) {
+      let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema;
       // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
       if (schema instanceof z.ZodSchema) {
         schema = zodToJsonSchema(schema);
       }
-      jsonData = {
-        ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
-        },
-      };
+      if(this.version === 'v0') {
+        jsonData = {
+          ...jsonData,
+          extractorOptions: {
+            ...jsonData.extractorOptions,
+            extractionSchema: schema,
+            mode: jsonData.extractorOptions.mode || "llm-extraction",
+          },
+        };
+      } else {
+        jsonData = {
+          ...jsonData,
+          extract: {
+            ...jsonData.extract,
+            schema: schema,
+          },
+        };
+      }
     }
     try {
       const response: AxiosResponse = await axios.post(
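
Note: the branch above changes only the shape of the request payload. On v0 the JSON schema stays under extractorOptions; on v1 it moves under extract. A minimal Python sketch of the two payload shapes this branch produces (the url and schema values are hypothetical placeholders, not SDK output):

# Sketch of the two request-body shapes built by the branch above.
# The url and schema values here are hypothetical placeholders.
def build_payload(version: str, url: str, schema: dict) -> dict:
    if version == 'v0':
        return {
            'url': url,
            'extractorOptions': {
                'extractionSchema': schema,
                'mode': 'llm-extraction',
            },
        }
    # v1: the schema travels under 'extract' instead
    return {
        'url': url,
        'extract': {'schema': schema},
    }

print(build_payload('v1', 'https://example.com', {'type': 'object'}))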

View File

@@ -1,17 +1,18 @@
 import uuid
 from firecrawl.firecrawl import FirecrawlApp
 
-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")
 
 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
 print(scrape_result['markdown'])
 
 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2)
 print(crawl_result)
 
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
@@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
 
 llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
+    'formats': ['extract'],
+    'extract': {
+        'schema': TopArticlesSchema.model_json_schema()
     }
 })
 
-print(llm_extraction_result['llm_extraction'])
+print(llm_extraction_result['extract'])
 
-# Define schema to extract contents into using json schema
+# # Define schema to extract contents into using json schema
 json_schema = {
     "type": "object",
     "properties": {
@@ -62,7 +60,10 @@ json_schema = {
     "required": ["top"]
 }
 
-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+app2 = FirecrawlApp(api_key="fc-", version="v0")
+
+llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
     'extractorOptions': {
         'extractionSchema': json_schema,
         'mode': 'llm-extraction'
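
Read together, the two calls in this example file run the same extraction against both API versions. A condensed, self-contained sketch of the contrast ("fc-YOUR_API_KEY" is a placeholder, and it assumes v1 is the constructor's default version, as the example above implies):

# Condensed sketch of the v1 vs. v0 extraction styles shown in this file.
# "fc-YOUR_API_KEY" is a placeholder; parameter shapes follow the diff above.
from firecrawl.firecrawl import FirecrawlApp

schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}},
    "required": ["title"],
}

# v1: schema travels under 'extract'; result comes back under 'extract'
app_v1 = FirecrawlApp(api_key="fc-YOUR_API_KEY")
v1_result = app_v1.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    'extract': {'schema': schema},
})
print(v1_result['extract'])

# v0: schema travels under 'extractorOptions'; result under 'llm_extraction'
app_v0 = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
v0_result = app_v0.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': schema,
        'mode': 'llm-extraction',
    },
})
print(v0_result['llm_extraction'])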

View File

@@ -58,21 +58,31 @@ class FirecrawlApp:
         # If there are additional params, process them
         if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key != 'extractorOptions':
-                    scrape_params[key] = value
+            if self.version == 'v0':
+                # Handle extractorOptions (for v0 compatibility)
+                extractor_options = params.get('extractorOptions', {})
+                if extractor_options:
+                    if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'):
+                        extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
+                    extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
+                    scrape_params['extractorOptions'] = extractor_options
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extractorOptions']:
+                        scrape_params[key] = value
+            elif self.version == 'v1':
+                # Handle extract (for v1)
+                extract = params.get('extract', {})
+                if extract:
+                    if 'schema' in extract and hasattr(extract['schema'], 'schema'):
+                        extract['schema'] = extract['schema'].schema()
+                    scrape_params['extract'] = extract
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extract']:
+                        scrape_params[key] = value
         endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data
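
Because both branches above check hasattr(..., 'schema'), callers can pass either a plain JSON-schema dict or a Pydantic model, and the SDK converts the latter with .schema(). A small illustrative sketch of the v1 path (the model, URL, and API key are hypothetical):

# Sketch: passing a Pydantic model instead of a dict; per the branch above,
# the SDK converts it with .schema(). Model, URL, and key are hypothetical.
from pydantic import BaseModel, Field
from firecrawl.firecrawl import FirecrawlApp

class PageSummary(BaseModel):
    title: str = Field(..., description="Page title")
    summary: str = Field(..., description="One-sentence summary")

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
result = app.scrape_url('https://firecrawl.dev', {
    'formats': ['extract'],
    'extract': {'schema': PageSummary},  # converted via PageSummary.schema()
})
print(result['extract'])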