From 45e33563ebe87df0065ed2ce500009c0d4a247f0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 30 Aug 2024 12:58:38 -0300 Subject: [PATCH] Nick: python working --- apps/js-sdk/firecrawl/src/index.ts | 31 ++++++++++++++------- apps/python-sdk/example.py | 25 +++++++++-------- apps/python-sdk/firecrawl/firecrawl.py | 38 ++++++++++++++++---------- 3 files changed, 58 insertions(+), 36 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 2527be96..a5b3af2f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -75,6 +75,7 @@ export interface FirecrawlDocument { html?: string; rawHtml?: string; links?: string[]; + extract?: Record; screenshot?: string; metadata: FirecrawlDocumentMetadata; } @@ -344,20 +345,30 @@ export default class FirecrawlApp { Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; let jsonData: any = { url, ...params }; - if (jsonData?.extractorOptions?.extractionSchema) { - let schema = jsonData.extractorOptions.extractionSchema; + if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) { + let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema; // Check if schema is an instance of ZodSchema to correctly identify Zod schemas if (schema instanceof z.ZodSchema) { schema = zodToJsonSchema(schema); } - jsonData = { - ...jsonData, - extractorOptions: { - ...jsonData.extractorOptions, - extractionSchema: schema, - mode: jsonData.extractorOptions.mode || "llm-extraction", - }, - }; + if(this.version === 'v0') { + jsonData = { + ...jsonData, + extractorOptions: { + ...jsonData.extractorOptions, + extractionSchema: schema, + mode: jsonData.extractorOptions.mode || "llm-extraction", + }, + }; + } else { + jsonData = { + ...jsonData, + extract: { + ...jsonData.extract, + schema: schema, + }, + }; + } } try { const response: AxiosResponse = await axios.post( diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index d80fa795..9d439a3b 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -1,17 +1,18 @@ import uuid from firecrawl.firecrawl import FirecrawlApp -app = FirecrawlApp(api_key="fc-YOUR_API_KEY") +app = FirecrawlApp(api_key="fc-") # Scrape a website: scrape_result = app.scrape_url('firecrawl.dev') print(scrape_result['markdown']) # Crawl a website: -idempotency_key = str(uuid.uuid4()) # optional idempotency key -crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key) +crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2) print(crawl_result) + + # LLM Extraction: # Define schema to extract contents into using pydantic from pydantic import BaseModel, Field @@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': TopArticlesSchema.model_json_schema(), - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True + 'formats': ['extract'], + 'extract': { + 'schema': TopArticlesSchema.model_json_schema() } }) -print(llm_extraction_result['llm_extraction']) +print(llm_extraction_result['extract']) -# Define schema to extract contents into using json schema +# # Define schema to extract contents into using json schema json_schema = { "type": "object", "properties": { @@ -62,7 +60,10 @@ json_schema = { "required": ["top"] } -llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { +app2 = FirecrawlApp(api_key="fc-", version="v0") + + +llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', { 'extractorOptions': { 'extractionSchema': json_schema, 'mode': 'llm-extraction' diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 89c51803..4e9a7dab 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -58,21 +58,31 @@ class FirecrawlApp: # If there are additional params, process them if params: - # Initialize extractorOptions if present - extractor_options = params.get('extractorOptions', {}) - # Check and convert the extractionSchema if it's a Pydantic model - if 'extractionSchema' in extractor_options: - if hasattr(extractor_options['extractionSchema'], 'schema'): - extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() - # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided - extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') - # Update the scrape_params with the processed extractorOptions - scrape_params['extractorOptions'] = extractor_options + if self.version == 'v0': + # Handle extractorOptions (for v0 compatibility) + extractor_options = params.get('extractorOptions', {}) + if extractor_options: + if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'): + extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() + extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') + scrape_params['extractorOptions'] = extractor_options - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key != 'extractorOptions': - scrape_params[key] = value + # Include any other params directly at the top level of scrape_params + for key, value in params.items(): + if key not in ['extractorOptions']: + scrape_params[key] = value + elif self.version == 'v1': + # Handle extract (for v1) + extract = params.get('extract', {}) + if extract: + if 'schema' in extract and hasattr(extract['schema'], 'schema'): + extract['schema'] = extract['schema'].schema() + scrape_params['extract'] = extract + + # Include any other params directly at the top level of scrape_params + for key, value in params.items(): + if key not in ['extract']: + scrape_params[key] = value endpoint = f'/{self.version}/scrape' # Make the POST request with the prepared headers and JSON data