Nick: python working

Nicolas 2024-08-30 12:58:38 -03:00
parent bb4808443c
commit 45e33563eb
3 changed files with 58 additions and 36 deletions

View File

@@ -75,6 +75,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata: FirecrawlDocumentMetadata;
 }
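
Editor's note: the new optional field carries schema-driven extraction output on each scraped document. Roughly, a v1 scrape result now looks like this (an illustrative Python sketch; only the field names come from the interface above, the values are invented):

# Illustrative shape of a FirecrawlDocument with the new field (not real API output).
doc = {
    "markdown": "# Example page...",
    "links": ["https://example.com/about"],
    "extract": {"title": "Example", "points": 123},  # new: LLM-extraction result
    "metadata": {"sourceURL": "https://example.com"},  # assumed metadata key
}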
@@ -344,20 +345,30 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
     let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
+    if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) {
+      let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema;
       // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
       if (schema instanceof z.ZodSchema) {
         schema = zodToJsonSchema(schema);
       }
-      jsonData = {
-        ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
-        },
-      };
+      if(this.version === 'v0') {
+        jsonData = {
+          ...jsonData,
+          extractorOptions: {
+            ...jsonData.extractorOptions,
+            extractionSchema: schema,
+            mode: jsonData.extractorOptions.mode || "llm-extraction",
+          },
+        };
+      } else {
+        jsonData = {
+          ...jsonData,
+          extract: {
+            ...jsonData.extract,
+            schema: schema,
+          },
+        };
+      }
     }
     try {
       const response: AxiosResponse = await axios.post(
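
Editor's note: the v0 and v1 request bodies built by the branch above differ only in where the (JSON-converted) schema travels. A minimal sketch of the two payload shapes as Python dicts (key names taken from the diff; URL and schema values are invented for illustration):

# Hypothetical JSON schema for illustration.
json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}

# v0: schema nested under extractorOptions, with an extraction mode.
v0_body = {
    "url": "https://example.com",
    "extractorOptions": {
        "extractionSchema": json_schema,
        "mode": "llm-extraction",
    },
}

# v1: schema nested under a top-level extract key; no mode field.
v1_body = {
    "url": "https://example.com",
    "extract": {
        "schema": json_schema,
    },
}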

View File

@@ -1,17 +1,18 @@
 import uuid
 from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")

 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
 print(scrape_result['markdown'])

 # Crawl a website:
 idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2)
 print(crawl_result)

 # LLM Extraction:
 # Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
@@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

 llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
+    'formats': ['extract'],
+    'extract': {
+        'schema': TopArticlesSchema.model_json_schema()
     }
 })
-print(llm_extraction_result['llm_extraction'])
+print(llm_extraction_result['extract'])

-# Define schema to extract contents into using json schema
+# # Define schema to extract contents into using json schema
 json_schema = {
     "type": "object",
     "properties": {
@@ -62,7 +60,10 @@ json_schema = {
     "required": ["top"]
 }

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+app2 = FirecrawlApp(api_key="fc-", version="v0")
+llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
     'extractorOptions': {
         'extractionSchema': json_schema,
         'mode': 'llm-extraction'
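
Editor's note: for comparison, the v1 equivalent of this v0 call would mirror the Pydantic example earlier in the file (a sketch, not part of the commit; assumes app is the v1 client created at the top of the example):

llm_extraction_result_v1 = app.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    'extract': {
        'schema': json_schema  # a plain JSON-schema dict works in v1 as well
    }
})
print(llm_extraction_result_v1['extract'])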

View File

@@ -58,21 +58,31 @@ class FirecrawlApp:
         # If there are additional params, process them
         if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
+            if self.version == 'v0':
+                # Handle extractorOptions (for v0 compatibility)
+                extractor_options = params.get('extractorOptions', {})
+                if extractor_options:
+                    if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'):
+                        extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
+                    extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
+                    scrape_params['extractorOptions'] = extractor_options
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key != 'extractorOptions':
+                        scrape_params[key] = value
-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key not in ['extractorOptions']:
-                    scrape_params[key] = value
+            elif self.version == 'v1':
+                # Handle extract (for v1)
+                extract = params.get('extract', {})
+                if extract:
+                    if 'schema' in extract and hasattr(extract['schema'], 'schema'):
+                        extract['schema'] = extract['schema'].schema()
+                    scrape_params['extract'] = extract
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extract']:
+                        scrape_params[key] = value
         endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data
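
Editor's note: a minimal usage sketch of the new v1 branch (assumes v1 is the client default, as in the example file above; the Article model and its fields are hypothetical):

from pydantic import BaseModel
from firecrawl.firecrawl import FirecrawlApp

class Article(BaseModel):  # hypothetical schema for illustration
    title: str
    points: int

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
result = app.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    # Passing the model class itself: hasattr(Article, 'schema') is true,
    # so the v1 branch above replaces it with Article.schema(), a JSON-schema dict.
    'extract': {'schema': Article},
})
print(result['extract'])

Note the duck-typed check in the branch: anything exposing a .schema() callable gets converted, while plain dict schemas pass through untouched.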