From 9443a823b2da4a32fffa3e4c6a329d0ea8b72b46 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Date: Fri, 30 May 2025 17:09:03 -0300 Subject: [PATCH] feat: script that generates all sdk examples for openapi --- apps/api/v1-openapi-with-examples.json | 2951 +++++++++++++++++ .../async_batch_scrape_urls_example.py | 55 + .../examples/async_crawl_url_example.py | 38 + .../examples/async_extract_example.py | 30 + .../batch_scrape_urls_and_watch_example.py | 55 + .../examples/batch_scrape_urls_example.py | 56 + .../examples/cancel_crawl_example.py | 25 + .../check_batch_scrape_errors_example.py | 25 + .../check_batch_scrape_status_example.py | 26 + .../examples/check_crawl_errors_example.py | 25 + .../examples/check_crawl_status_example.py | 25 + .../examples/crawl_url_and_watch_example.py | 37 + apps/python-sdk/examples/crawl_url_example.py | 39 + apps/python-sdk/examples/extract_example.py | 30 + .../examples/get_extract_status_example.py | 25 + apps/python-sdk/examples/map_url_example.py | 31 + .../python-sdk/examples/scrape_url_example.py | 55 + apps/python-sdk/examples/search_example.py | 32 + apps/python-sdk/scripts/generate_examples.py | 691 ++++ 19 files changed, 4251 insertions(+) create mode 100644 apps/api/v1-openapi-with-examples.json create mode 100644 apps/python-sdk/examples/async_batch_scrape_urls_example.py create mode 100644 apps/python-sdk/examples/async_crawl_url_example.py create mode 100644 apps/python-sdk/examples/async_extract_example.py create mode 100644 apps/python-sdk/examples/batch_scrape_urls_and_watch_example.py create mode 100644 apps/python-sdk/examples/batch_scrape_urls_example.py create mode 100644 apps/python-sdk/examples/cancel_crawl_example.py create mode 100644 apps/python-sdk/examples/check_batch_scrape_errors_example.py create mode 100644 apps/python-sdk/examples/check_batch_scrape_status_example.py create mode 100644 apps/python-sdk/examples/check_crawl_errors_example.py create mode 100644 apps/python-sdk/examples/check_crawl_status_example.py create mode 100644 apps/python-sdk/examples/crawl_url_and_watch_example.py create mode 100644 apps/python-sdk/examples/crawl_url_example.py create mode 100644 apps/python-sdk/examples/extract_example.py create mode 100644 apps/python-sdk/examples/get_extract_status_example.py create mode 100644 apps/python-sdk/examples/map_url_example.py create mode 100644 apps/python-sdk/examples/scrape_url_example.py create mode 100644 apps/python-sdk/examples/search_example.py create mode 100644 apps/python-sdk/scripts/generate_examples.py diff --git a/apps/api/v1-openapi-with-examples.json b/apps/api/v1-openapi-with-examples.json new file mode 100644 index 00000000..9c5a26b7 --- /dev/null +++ b/apps/api/v1-openapi-with-examples.json @@ -0,0 +1,2951 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "Firecrawl API", + "version": "v1", + "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.", + "contact": { + "name": "Firecrawl Support", + "url": "https://firecrawl.dev/support", + "email": "support@firecrawl.dev" + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v1" + } + ], + "paths": { + "/scrape": { + "post": { + "summary": "Scrape a single URL and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrl", + "tags": [ + "Scraping" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "allOf": [ + 
{ + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } + }, + "required": [ + "url" + ] + }, + { + "$ref": "#/components/schemas/ScrapeOptions" + } + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\nfrom firecrawl import (\n WaitAction, ScreenshotAction, ClickAction, WriteAction, \n PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction,\n LocationConfig, JsonConfig, ChangeTrackingOptions\n)\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.scrape_url(\n url='https://example.com',\n formats=['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'],\n include_tags=['div', 'p', 'span'],\n exclude_tags=['div', 'p', 'span'],\n only_main_content=True,\n wait_for=30000,\n timeout=30000,\n location=LocationConfig(country=\"US\", languages=[\"en\"]),\n mobile=True,\n skip_tls_verification=True,\n remove_base64_images=True,\n block_ads=True,\n proxy='basic',\n extract=JsonConfig(schema={\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"description\": {\"type\": \"string\"}}}),\n json_options=JsonConfig(schema={\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"description\": {\"type\": \"string\"}}}),\n actions=[\n WaitAction(milliseconds=1000, selector=\"#content\"),\n ScreenshotAction(full_page=True),\n ClickAction(selector=\"button.submit\"),\n WriteAction(text=\"example@email.com\"),\n PressAction(key=\"Enter\"),\n ScrollAction(direction=\"down\", selector=\".scrollable-container\"),\n ScrapeAction(),\n ExecuteJavascriptAction(script=\"function get_title() { return document.title; }; get_title();\")\n ],\n change_tracking_options=ChangeTrackingOptions(modes=[\"git-diff\", \"json\"], schema={\"type\": \"object\", \"properties\": {\"changes\": {\"type\": \"array\"}, \"timestamp\": {\"type\": \"string\"}}})\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/batch/scrape": { + "post": { + "summary": "Scrape multiple URLs and optionally extract information using an LLM", + "operationId": 
"scrapeAndExtractFromUrls", + "tags": [ + "Scraping" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "allOf": [ + { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } + }, + "webhook": { + "type": "object", + "description": "A webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for batch scrape started (batch_scrape.started), every page scraped (batch_scrape.page) and when the batch scrape is completed (batch_scrape.completed or batch_scrape.failed). The response will be the same as the `/scrape` endpoint." + }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true + }, + "events": { + "type": "array", + "description": "Type of events that should be sent to the webhook URL. (default: all)", + "items": { + "type": "string", + "enum": [ + "completed", + "page", + "failed", + "started" + ] + } + } + }, + "required": [ + "url" + ] + }, + "ignoreInvalidURLs": { + "type": "boolean", + "default": false, + "description": "If invalid URLs are specified in the urls array, they will be ignored. Instead of them failing the entire request, a batch scrape using the remaining valid URLs will be created, and the invalid URLs will be returned in the invalidURLs field of the response." + } + }, + "required": [ + "urls" + ] + }, + { + "$ref": "#/components/schemas/ScrapeOptions" + } + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchScrapeResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + } + }, + "/batch/scrape/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the batch scrape job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of a batch scrape job", + "operationId": "getBatchScrapeStatus", + "tags": [ + "Scraping" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchScrapeStatusResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + }, + "delete": { + "summary": "Cancel a batch scrape job", + "operationId": "cancelBatchScrape", + "tags": [ + "Scraping" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful cancellation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "message": { + "type": "string", + "example": "Batch scrape job successfully cancelled." + } + } + } + } + } + }, + "404": { + "description": "Batch scrape job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Batch scrape job not found." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/batch/scrape/{id}/errors": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the batch scrape job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the errors of a batch scrape job", + "operationId": "getBatchScrapeErrors", + "tags": [ + "Scraping" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlErrorsResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
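Unlike /scrape and /crawl, the /batch/scrape endpoints above carry no embedded x-codeSamples entry. A minimal sketch of the same flow with the Python SDK follows; the method names mirror the example files this patch adds (async_batch_scrape_urls_example.py, check_batch_scrape_status_example.py), but the exact signatures and response attributes are assumptions, not taken from this spec.

import os
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

# Start a batch scrape job without blocking (POST /batch/scrape).
job = app.async_batch_scrape_urls(
    urls=["https://example.com", "https://firecrawl.dev"],
    formats=["markdown"],
)

# Check progress (GET /batch/scrape/{id}); fields mirror BatchScrapeStatusResponseObj.
# Attribute names are assumed from the schema; verify against your SDK version.
status = app.check_batch_scrape_status(job.id)
print(f"{status.status}: {status.completed}/{status.total} pages")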
+ } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/crawl/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the crawl job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of a crawl job", + "operationId": "getCrawlStatus", + "tags": [ + "Crawling" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlStatusResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + }, + "delete": { + "summary": "Cancel a crawl job", + "operationId": "cancelCrawl", + "tags": [ + "Crawling" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful cancellation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "cancelled" + ], + "example": "cancelled" + } + } + } + } + } + }, + "404": { + "description": "Crawl job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Crawl job not found." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
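A polling sketch for the crawl status endpoint above, mirroring check_crawl_status_example.py from this patch. The job ID is a placeholder, and the attribute names are assumed to follow the CrawlStatusResponseObj fields.

import os
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

crawl_id = "example-crawl-id"  # placeholder: the ID returned when the crawl was started
while True:
    status = app.check_crawl_status(crawl_id)
    print(f"{status.status}: {status.completed}/{status.total} pages")
    if status.status in ("completed", "failed"):
        break
    time.sleep(5)  # GET /crawl/{id} is rate limited like any other endpoint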
+ } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.cancel_crawl(\n id='example_id'\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/crawl/{id}/errors": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the crawl job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the errors of a crawl job", + "operationId": "getCrawlErrors", + "tags": [ + "Crawling" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlErrorsResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/crawl": { + "post": { + "summary": "Crawl multiple URLs based on options", + "operationId": "crawlUrls", + "tags": [ + "Crawling" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "excludePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL pathname regex patterns that exclude matching URLs from the crawl. For example, if you set \"excludePaths\": [\"blog/.*\"] for the base URL firecrawl.dev, any results matching that pattern will be excluded, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap." + }, + "includePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL pathname regex patterns that include matching URLs in the crawl. Only the paths that match the specified patterns will be included in the response. For example, if you set \"includePaths\": [\"blog/.*\"] for the base URL firecrawl.dev, only results matching that pattern will be included, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap." + }, + "maxDepth": { + "type": "integer", + "description": "Maximum depth to crawl relative to the base URL. 
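The errors endpoint above also lacks an embedded sample; here is a sketch mirroring check_crawl_errors_example.py, with the response shape taken from CrawlErrorsResponseObj. Dict-style access is an assumption; the SDK may wrap this in a typed object instead.

import os
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

# GET /crawl/{id}/errors -- returns `errors` and `robotsBlocked` per CrawlErrorsResponseObj.
report = app.check_crawl_errors("example-crawl-id")  # placeholder ID
for err in report.get("errors", []):
    print(f"{err['url']}: {err['error']}")
print("Blocked by robots.txt:", report.get("robotsBlocked", []))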
In effect, the maximum number of slashes the pathname of a scraped URL may contain.", + "default": 10 + }, + "maxDiscoveryDepth": { + "type": "integer", + "description": "Maximum depth to crawl based on discovery order. The root site and sitemapped pages have a discovery depth of 0. For example, if you set it to 1, and you set ignoreSitemap, you will only crawl the entered URL and all URLs that are linked on that page." + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, + "ignoreQueryParameters": { + "type": "boolean", + "description": "Do not re-scrape the same path with different (or no) query parameters", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of pages to crawl. Default limit is 10000.", + "default": 10000 + }, + "allowBackwardLinks": { + "type": "boolean", + "description": "Allows the crawler to follow internal links to sibling or parent URLs, not just child paths.\n\nfalse: Only crawls deeper (child) URLs.\n\u2192 e.g. /features/feature-1 \u2192 /features/feature-1/tips \u2705\n\u2192 Won't follow /pricing or / \u274c\n\ntrue: Crawls any internal links, including siblings and parents.\n\u2192 e.g. /features/feature-1 \u2192 /pricing, /, etc. \u2705\n\nUse true for broader internal coverage beyond nested paths.", + "default": false + }, + "allowExternalLinks": { + "type": "boolean", + "description": "Allows the crawler to follow links to external websites.", + "default": false + }, + "delay": { + "type": "number", + "description": "Delay in seconds between scrapes. This helps respect website rate limits." + }, + "webhook": { + "type": "object", + "description": "A webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started), every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true + }, + "events": { + "type": "array", + "description": "Type of events that should be sent to the webhook URL. (default: all)", + "items": { + "type": "string", + "enum": [ + "completed", + "page", + "failed", + "started" + ] + } + } + }, + "required": [ + "url" + ] + }, + "scrapeOptions": { + "$ref": "#/components/schemas/ScrapeOptions" + } + }, + "required": [ + "url" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later."
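The webhook object above only defines the outgoing specification. A hypothetical receiver sketch illustrates what handling those events could look like; the event names follow the url description (crawl.started, crawl.page, crawl.completed, crawl.failed), but the payload field names are not defined in this spec and are assumptions.

from flask import Flask, request

app = Flask(__name__)

@app.post("/firecrawl-webhook")
def firecrawl_webhook():
    event = request.get_json()
    kind = event.get("type")  # assumed field name; not specified in the schema above
    if kind == "crawl.page":
        # Per the spec, per-page payloads match the /scrape response format.
        print("page scraped:", event.get("data"))
    elif kind in ("crawl.completed", "crawl.failed"):
        print("crawl finished:", kind, "metadata:", event.get("metadata"))
    return "", 204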
+ } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.crawl_url(\n url='https://example.com',\n include_paths=['example1', 'example2'],\n exclude_paths=['example1', 'example2'],\n max_depth=10,\n max_discovery_depth=10,\n limit=10,\n allow_backward_links=True,\n allow_external_links=True,\n ignore_sitemap=True,\n deduplicate_similar_urls=True,\n ignore_query_parameters=True,\n regex_on_full_url=True,\n delay=10,\n poll_interval=10,\n idempotency_key='example_idempotency_key'\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/map": { + "post": { + "summary": "Map multiple URLs based on options", + "operationId": "mapUrls", + "tags": [ + "Mapping" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "search": { + "type": "string", + "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling.", + "default": true + }, + "sitemapOnly": { + "type": "boolean", + "description": "Only return links found in the website sitemap", + "default": false + }, + "includeSubdomains": { + "type": "boolean", + "description": "Include subdomains of the website", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of links to return", + "default": 5000, + "maximum": 30000 + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds. There is no timeout by default." + } + }, + "required": [ + "url" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MapResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
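The crawl sample above blocks until the job finishes. A non-blocking variant, mirroring async_crawl_url_example.py from this patch, starts the job and polls; the signature and response attributes are assumed from that example file, not from this spec.

import os
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

job = app.async_crawl_url(url="https://example.com", limit=10)  # returns an ID immediately
while True:
    status = app.check_crawl_status(job.id)
    if status.status in ("completed", "failed"):
        break
    time.sleep(5)
print(f"Crawled {status.completed} of {status.total} pages")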
+ } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.map_url(\n url='https://example.com',\n search='example_search',\n ignore_sitemap=True,\n include_subdomains=True,\n sitemap_only=True,\n limit=10,\n timeout=30000\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/extract": { + "post": { + "summary": "Extract structured data from pages using LLMs", + "operationId": "extractData", + "tags": [ + "Extraction" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URLs to extract data from. URLs should be in glob format." + } + }, + "prompt": { + "type": "string", + "description": "Prompt to guide the extraction process" + }, + "schema": { + "type": "object", + "description": "Schema to define the structure of the extracted data. Must conform to [JSON Schema](https://json-schema.org/)." + }, + "enableWebSearch": { + "type": "boolean", + "description": "When true, the extraction will use web search to find additional data", + "default": false + }, + "ignoreSitemap": { + "type": "boolean", + "description": "When true, sitemap.xml files will be ignored during website scanning", + "default": false + }, + "includeSubdomains": { + "type": "boolean", + "description": "When true, subdomains of the provided URLs will also be scanned", + "default": true + }, + "showSources": { + "type": "boolean", + "description": "When true, the sources used to extract the data will be included in the response as `sources` key", + "default": false + }, + "scrapeOptions": { + "$ref": "#/components/schemas/ScrapeOptions" + }, + "ignoreInvalidURLs": { + "type": "boolean", + "default": false, + "description": "If invalid URLs are specified in the urls array, they will be ignored. Instead of them failing the entire request, an extract using the remaining valid URLs will be performed, and the invalid URLs will be returned in the invalidURLs field of the response." + } + }, + "required": [ + "urls" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful extraction", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExtractResponse" + } + } + } + }, + "400": { + "description": "Invalid request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Invalid input data." 
+ } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.extract(\n urls=['https://example1.com', 'https://example2.com', 'https://blog.example.com'],\n prompt='example_prompt',\n system_prompt='example_system_prompt',\n allow_external_links=True,\n enable_web_search=True,\n show_sources=True\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/extract/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the extract job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of an extract job", + "operationId": "getExtractStatus", + "tags": [ + "Extraction" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExtractStatusResponse" + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.get_extract_status(\n job_id='example_job_id'\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/deep-research": { + "post": { + "summary": "Start a deep research operation on a query", + "operationId": "startDeepResearch", + "tags": [ + "Research" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query to research" + }, + "maxDepth": { + "type": "integer", + "minimum": 1, + "maximum": 12, + "default": 7, + "description": "Maximum depth of research iterations" + }, + "timeLimit": { + "type": "integer", + "minimum": 30, + "maximum": 600, + "default": 300, + "description": "Time limit in seconds" + }, + "maxUrls": { + "type": "integer", + "minimum": 1, + "maximum": 1000, + "default": 20, + "description": "Maximum number of URLs to analyze" + }, + "analysisPrompt": { + "type": "string", + "description": "The prompt to use for the final analysis. Useful to format the final analysis markdown in a specific way." + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the research agent. Useful to steer the research agent to a specific direction." 
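The extract sample above runs synchronously. An async variant, mirroring async_extract_example.py and get_extract_status_example.py from this patch, with an illustrative JSON Schema; the keyword arguments are assumptions based on those example files.

import os
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

# POST /extract without waiting for completion; the schema is a plain JSON Schema object.
job = app.async_extract(
    urls=["https://example.com"],
    prompt="Extract the page title and a one-line summary",
    schema={
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "summary": {"type": "string"},
        },
    },
)

# GET /extract/{id} to check on it later.
status = app.get_extract_status(job.id)
print(status.status)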
+ }, + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "json" + ], + "default": [ + "markdown" + ] + } + }, + "jsonOptions": { + "type": "object", + "description": "Options for JSON output", + "properties": { + "schema": { + "type": "object", + "description": "The schema to use for the JSON output. Must conform to [JSON Schema](https://json-schema.org/)." + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the JSON output" + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the JSON output" + } + } + } + }, + "required": [ + "query" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Research job started successfully", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "id": { + "type": "string", + "format": "uuid", + "description": "ID of the research job" + } + } + } + } + } + }, + "400": { + "description": "Invalid request parameters", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Invalid parameters provided" + } + } + } + } + } + } + } + } + }, + "/deep-research/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the research job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status and results of a deep research operation", + "operationId": "getDeepResearchStatus", + "tags": [ + "Research" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "finalAnalysis": { + "type": "string" + }, + "json": { + "type": "object", + "description": "Displayed when using JSON format", + "nullable": true + }, + "activities": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "status": { + "type": "string" + }, + "message": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "depth": { + "type": "integer" + } + } + } + }, + "sources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "favicon": { + "type": "string" + } + } + } + }, + "status": { + "type": "string", + "enum": [ + "processing", + "completed", + "failed" + ] + }, + "error": { + "type": "string" + }, + "expiresAt": { + "type": "string", + "format": "date-time" + }, + "currentDepth": { + "type": "integer" + }, + "maxDepth": { + "type": "integer" + }, + "totalUrls": { + "type": "integer" + } + } + } + } + } + } + } + }, + "404": { + "description": "Research job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Research job not found" + } + } + } + } + } + } + } + } + }, + "/team/credit-usage": { + "get": { + "summary": "Get remaining credits for the authenticated team", + "operationId": "getCreditUsage", + "tags": [ + 
"Billing" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "data": { + "type": "object", + "properties": { + "remaining_credits": { + "type": "number", + "description": "Number of credits remaining for the team", + "example": 1000 + } + } + } + } + } + } + } + }, + "404": { + "description": "Credit usage information not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Could not find credit usage information" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Internal server error while fetching credit usage" + } + } + } + } + } + } + } + } + }, + "/team/token-usage": { + "get": { + "summary": "Get remaining tokens for the authenticated team (Extract only)", + "operationId": "getTokenUsage", + "tags": [ + "Billing" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "data": { + "type": "object", + "properties": { + "remaining_tokens": { + "type": "number", + "description": "Number of tokens remaining for the team", + "example": 1000 + } + } + } + } + } + } + } + }, + "404": { + "description": "Token usage information not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Could not find token usage information" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Internal server error while fetching token usage" + } + } + } + } + } + } + } + } + }, + "/search": { + "post": { + "summary": "Search and optionally scrape search results", + "operationId": "searchAndScrape", + "tags": [ + "Search" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query" + }, + "limit": { + "type": "integer", + "description": "Maximum number of results to return", + "default": 5, + "maximum": 100, + "minimum": 1 + }, + "tbs": { + "type": "string", + "description": "Time-based search parameter" + }, + "lang": { + "type": "string", + "description": "Language code for search results", + "default": "en" + }, + "country": { + "type": "string", + "description": "Country code for search results", + "default": "us" + }, + "location": { + "type": "string", + "description": "Location parameter for search results" + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds", + "default": 60000 + }, + "ignoreInvalidURLs": { + "type": "boolean", + 
"description": "Excludes URLs from the search results that are invalid for other Firecrawl endpoints. This helps reduce errors if you are piping data from search into other Firecrawl API endpoints.", + "default": false + }, + "scrapeOptions": { + "type": "object", + "description": "Options for scraping search results", + "properties": { + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "json" + ] + }, + "description": "Formats to include in the output", + "default": [] + } + }, + "default": {} + } + }, + "required": [ + "query" + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Title from search result" + }, + "description": { + "type": "string", + "description": "Description from search result" + }, + "url": { + "type": "string", + "description": "URL of the search result" + }, + "markdown": { + "type": "string", + "nullable": true, + "description": "Markdown content if scraping was requested" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML content if requested in formats" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content if requested in formats" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Links found if requested in formats" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot URL if requested in formats" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "statusCode": { + "type": "integer" + }, + "error": { + "type": "string", + "nullable": true + } + } + } + } + } + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Warning message if any issues occurred" + } + } + } + } + } + }, + "408": { + "description": "Request timeout", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Request timed out" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python SDK", + "source": "import os\nfrom firecrawl import FirecrawlApp\n\ndef main():\n # Initialize the FirecrawlApp\n api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n if not api_key:\n raise ValueError(\"Please set FIRECRAWL_API_KEY environment variable\")\n \n app = FirecrawlApp(api_key=api_key)\n \n # Example with all parameters\n try:\n result = app.search(\n query='example_query',\n limit=10,\n tbs='example_tbs',\n filter='example_filter',\n lang='example_lang',\n country='example_country',\n location='US',\n timeout=30000\n )\n \n print(\"Success!\")\n print(f\"Result: {result}\")\n \n except Exception as e:\n print(f\"Error: {e}\")\n\nif __name__ == \"__main__\":\n main()\n" + } + ] + } + }, + "/llmstxt": { + "post": { + "summary": "Generate LLMs.txt for a website", + "operationId": "generateLLMsTxt", + "tags": [ + "LLMs.txt" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to generate LLMs.txt from" + }, + "maxUrls": { + "type": "integer", + "description": "Maximum number of URLs to analyze", + "default": 2 + }, + "showFullText": { + "type": "boolean", + "description": "Include full text content in the response", + "default": false + } + }, + "required": [ + "url" + ] + } + } + } + }, + "responses": { + "200": { + "description": "LLMs.txt generation job started successfully", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "id": { + "type": "string", + "format": "uuid", + "description": "ID of the LLMs.txt generation job" + } + } + } + } + } + }, + "400": { + "description": "Invalid request parameters", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Invalid parameters provided" + } + } + } + } + } + } + } + } + }, + "/llmstxt/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the LLMs.txt generation job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status and results of an LLMs.txt generation job", + "operationId": "getLLMsTxtStatus", + "tags": [ + "LLMs.txt" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "status": { + "type": "string", + "enum": [ + "processing", + "completed", + "failed" + ] + }, + "data": { + "type": "object", + "properties": { + "llmstxt": { + "type": "string", + "description": "The generated LLMs.txt content" + }, + "llmsfulltxt": { + "type": "string", + "description": "The full text content when showFullText is true" + } + } + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "When the generated content will expire" + } + } + } + } + } + }, + "404": { + "description": "LLMs.txt generation job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + 
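The LLMs.txt endpoints above likewise have no SDK example in this patch; a raw-HTTP sketch of the start-then-poll flow they define:

import os
import time

import requests

API = "https://api.firecrawl.dev/v1"
HEADERS = {"Authorization": f"Bearer {os.getenv('FIRECRAWL_API_KEY')}"}

# POST /llmstxt: start the generation job.
resp = requests.post(f"{API}/llmstxt", headers=HEADERS, json={
    "url": "https://example.com",
    "maxUrls": 2,
    "showFullText": False,
})
job_id = resp.json()["id"]

# GET /llmstxt/{id}: poll until the job leaves 'processing'.
while True:
    status = requests.get(f"{API}/llmstxt/{job_id}", headers=HEADERS).json()
    if status["status"] != "processing":
        break
    time.sleep(5)
print(status["data"]["llmstxt"])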
"type": "string", + "example": "LLMs.txt generation job not found" + } + } + } + } + } + } + } + } + } + }, + "components": { + "securitySchemes": { + "bearerAuth": { + "type": "http", + "scheme": "bearer" + } + }, + "schemas": { + "ScrapeOptions": { + "type": "object", + "properties": { + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "json", + "changeTracking" + ] + }, + "description": "Formats to include in the output.", + "default": [ + "markdown" + ] + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include in the output." + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude from the output." + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "waitFor": { + "type": "integer", + "description": "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.", + "default": 0 + }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests", + "default": false + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 + }, + "parsePDF": { + "type": "boolean", + "description": "Controls how PDF files are processed during scraping. When true, the PDF content is extracted and converted to markdown format, with billing based on the number of pages (1 credit per page). When false, the PDF file is returned in base64 encoding with a flat rate of 1 credit total.", + "default": true + }, + "jsonOptions": { + "type": "object", + "description": "JSON options object", + "properties": { + "schema": { + "type": "object", + "description": "The schema to use for the extraction (Optional). Must conform to [JSON Schema](https://json-schema.org/)." + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the extraction (Optional)" + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the extraction without a schema (Optional)" + } + } + }, + "actions": { + "type": "array", + "description": "Actions to perform on the page before grabbing the content", + "items": { + "oneOf": [ + { + "type": "object", + "title": "Wait", + "properties": { + "type": { + "type": "string", + "enum": [ + "wait" + ], + "description": "Wait for a specified amount of milliseconds" + }, + "milliseconds": { + "type": "integer", + "minimum": 1, + "description": "Number of milliseconds to wait" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#my-element" + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "title": "Screenshot", + "properties": { + "type": { + "type": "string", + "enum": [ + "screenshot" + ], + "description": "Take a screenshot. The links will be in the response's `actions.screenshots` array." 
+ }, + "fullPage": { + "type": "boolean", + "description": "Should the screenshot be full-page or viewport sized?", + "default": false + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "title": "Click", + "properties": { + "type": { + "type": "string", + "enum": [ + "click" + ], + "description": "Click on an element" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#load-more-button" + }, + "all": { + "type": "boolean", + "description": "Clicks all elements matched by the selector, not just the first one. Does not throw an error if no elements match the selector.", + "default": false + } + }, + "required": [ + "type", + "selector" + ] + }, + { + "type": "object", + "title": "Write text", + "properties": { + "type": { + "type": "string", + "enum": [ + "write" + ], + "description": "Write text into an input field, text area, or contenteditable element. Note: You must first focus the element using a 'click' action before writing. The text will be typed character by character to simulate keyboard input." + }, + "text": { + "type": "string", + "description": "Text to type", + "example": "Hello, world!" + } + }, + "required": [ + "type", + "text" + ] + }, + { + "type": "object", + "title": "Press a key", + "description": "Press a key on the page. See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.", + "properties": { + "type": { + "type": "string", + "enum": [ + "press" + ], + "description": "Press a key on the page" + }, + "key": { + "type": "string", + "description": "Key to press", + "example": "Enter" + } + }, + "required": [ + "type", + "key" + ] + }, + { + "type": "object", + "title": "Scroll", + "properties": { + "type": { + "type": "string", + "enum": [ + "scroll" + ], + "description": "Scroll the page or a specific element" + }, + "direction": { + "type": "string", + "enum": [ + "up", + "down" + ], + "description": "Direction to scroll", + "default": "down" + }, + "selector": { + "type": "string", + "description": "Query selector for the element to scroll", + "example": "#my-element" + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "title": "Scrape", + "properties": { + "type": { + "type": "string", + "enum": [ + "scrape" + ], + "description": "Scrape the current page content, returns the url and the html." + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "title": "Execute JavaScript", + "properties": { + "type": { + "type": "string", + "enum": [ + "executeJavascript" + ], + "description": "Execute JavaScript code on the page" + }, + "script": { + "type": "string", + "description": "JavaScript code to execute", + "example": "document.querySelector('.button').click();" + } + }, + "required": [ + "type", + "script" + ] + } + ] + } + }, + "location": { + "type": "object", + "description": "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified.", + "properties": { + "country": { + "type": "string", + "description": "ISO 3166-1 alpha-2 country code (e.g., 'US', 'AU', 'DE', 'JP')", + "pattern": "^[A-Z]{2}$", + "default": "US" + }, + "languages": { + "type": "array", + "description": "Preferred languages and locales for the request in order of priority. Defaults to the language of the specified location. 
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language", + "items": { + "type": "string", + "example": "en-US" + } + } + } + }, + "removeBase64Images": { + "type": "boolean", + "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder." + }, + "blockAds": { + "type": "boolean", + "description": "Enables ad-blocking and cookie popup blocking.", + "default": true + }, + "proxy": { + "type": "string", + "enum": [ + "basic", + "stealth", + "auto" + ], + "description": "Specifies the type of proxy to use.\n\n - **basic**: Proxies for scraping sites with none to basic anti-bot solutions. Fast and usually works.\n - **stealth**: Stealth proxies for scraping sites with advanced anti-bot solutions. Slower, but more reliable on certain sites. Costs up to 5 credits per request.\n - **auto**: Firecrawl will automatically retry scraping with stealth proxies if the basic proxy fails. If the retry with stealth is successful, 5 credits will be billed for the scrape. If the first attempt with basic is successful, only the regular cost will be billed.\n\nIf you do not specify a proxy, Firecrawl will default to basic." + }, + "changeTrackingOptions": { + "type": "object", + "description": "Options for change tracking (Beta). Only applicable when 'changeTracking' is included in formats. The 'markdown' format must also be specified when using change tracking.", + "properties": { + "modes": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "git-diff", + "json" + ] + }, + "description": "The mode to use for change tracking. 'git-diff' provides a detailed diff, and 'json' compares extracted JSON data." + }, + "schema": { + "type": "object", + "description": "Schema for JSON extraction when using 'json' mode. Defines the structure of data to extract and compare. Must conform to [JSON Schema](https://json-schema.org/)." + }, + "prompt": { + "type": "string", + "description": "Prompt to use for change tracking when using 'json' mode. If not provided, the default prompt will be used." + } + } + } + } + }, + "ScrapeResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `html` is in `formats`" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `rawHtml` is in `formats`" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `screenshot` is in `formats`" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `links` is in `formats`" + }, + "actions": { + "type": "object", + "nullable": true, + "description": "Results of the actions specified in the `actions` parameter. 
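A change-tracking sketch derived from changeTrackingOptions above; note the spec requires the markdown format alongside change tracking. The format and option names follow the /scrape code sample; the returned object's fields are documented in ScrapeResponse below, and the exact attribute casing is an assumption.

import os
from firecrawl import FirecrawlApp, ChangeTrackingOptions

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

result = app.scrape_url(
    url="https://example.com",
    formats=["markdown", "change_tracking"],  # markdown is mandatory with change tracking
    change_tracking_options=ChangeTrackingOptions(modes=["git-diff"]),
)
# Fields follow the changeTracking schema below (changeStatus, diff, json, ...);
# exact attribute names depend on the SDK version.
print(result.change_tracking)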
Only present if the `actions` parameter was provided in the request", + "properties": { + "screenshots": { + "type": "array", + "description": "Screenshot URLs, in the same order as the screenshot actions provided.", + "items": { + "type": "string", + "format": "url" + } + }, + "scrapes": { + "type": "array", + "description": "Scrape contents, in the same order as the scrape actions provided.", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "html": { + "type": "string" + } + } + } + }, + "javascriptReturns": { + "type": "array", + "description": "JavaScript return values, in the same order as the executeJavascript actions provided.", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "value": {} + } + } + } + } + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + "<any other metadata> ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + }, + "llm_extraction": { + "type": "object", + "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", + "nullable": true + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." + }, + "changeTracking": { + "type": "object", + "nullable": true, + "description": "Change tracking information if `changeTracking` is in `formats`. Only present when the `changeTracking` format is requested.", + "properties": { + "previousScrapeAt": { + "type": "string", + "format": "date-time", + "nullable": true, + "description": "The timestamp of the previous scrape that the current page is being compared against. Null if no previous scrape exists." + }, + "changeStatus": { + "type": "string", + "enum": [ + "new", + "same", + "changed", + "removed" + ], + "description": "The result of the comparison between the two page versions. 'new' means this page did not exist before, 'same' means content has not changed, 'changed' means content has changed, 'removed' means the page was removed." + }, + "visibility": { + "type": "string", + "enum": [ + "visible", + "hidden" + ], + "description": "The visibility of the current page/URL. 'visible' means the URL was discovered through an organic route (links or sitemap), 'hidden' means the URL was discovered through memory from previous crawls." + }, + "diff": { + "type": "string", + "nullable": true, + "description": "Git-style diff of changes when using 'git-diff' mode. Only present when the mode is set to 'git-diff'." + }, + "json": { + "type": "object", + "nullable": true, + "description": "JSON comparison results when using 'json' mode. Only present when the mode is set to 'json'. This will emit a list of all the keys and their values from the `previous` and `current` scrapes based on the type defined in the `schema`. Example [here](/features/change-tracking)" + } + } + } + } + } + } + }, + "CrawlStatusResponseObj": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "The current status of the crawl. Can be `scraping`, `completed`, or `failed`."
+ }, + "total": { + "type": "integer", + "description": "The total number of pages that were attempted to be crawled." + }, + "completed": { + "type": "integer", + "description": "The number of pages that have been successfully crawled." + }, + "creditsUsed": { + "type": "integer", + "description": "The number of credits used for the crawl." + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "The date and time when the crawl will expire." + }, + "next": { + "type": "string", + "nullable": true, + "description": "The URL to retrieve the next 10MB of data. Returned if the crawl is not completed or if the response is larger than 10MB." + }, + "data": { + "type": "array", + "description": "The data of the crawl.", + "items": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `includeLinks` is true" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `includeScreenshot` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + } + } + } + }, + "CrawlErrorsResponseObj": { + "type": "object", + "properties": { + "errors": { + "type": "array", + "description": "Errored scrape jobs and error details", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "timestamp": { + "type": "string", + "nullable": true, + "description": "ISO timestamp of failure" + }, + "url": { + "type": "string", + "description": "Scraped URL" + }, + "error": { + "type": "string", + "description": "Error message" + } + } + } + }, + "robotsBlocked": { + "type": "array", + "description": "List of URLs that were attempted in scraping but were blocked by robots.txt", + "items": { + "type": "string" + } + } + } + }, + "BatchScrapeStatusResponseObj": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "The current status of the batch scrape. Can be `scraping`, `completed`, or `failed`." + }, + "total": { + "type": "integer", + "description": "The total number of pages that were attempted to be scraped." + }, + "completed": { + "type": "integer", + "description": "The number of pages that have been successfully scraped." + }, + "creditsUsed": { + "type": "integer", + "description": "The number of credits used for the batch scrape." + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "The date and time when the batch scrape will expire." + }, + "next": { + "type": "string", + "nullable": true, + "description": "The URL to retrieve the next 10MB of data. Returned if the batch scrape is not completed or if the response is larger than 10MB." 
+ }, + "data": { + "type": "array", + "description": "The data of the batch scrape.", + "items": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `includeLinks` is true" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `includeScreenshot` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + } + } + } + }, + "CrawlResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string", + "format": "uri" + } + } + }, + "BatchScrapeResponseObj": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string", + "format": "uri" + }, + "invalidURLs": { + "type": "array", + "nullable": true, + "items": { + "type": "string" + }, + "description": "If ignoreInvalidURLs is true, this is an array containing the invalid URLs that were specified in the request. If there were no invalid URLs, this will be an empty array. If ignoreInvalidURLs is false, this field will be undefined." + } + } + }, + "MapResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "ExtractResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "invalidURLs": { + "type": "array", + "nullable": true, + "items": { + "type": "string" + }, + "description": "If ignoreInvalidURLs is true, this is an array containing the invalid URLs that were specified in the request. If there were no invalid URLs, this will be an empty array. If ignoreInvalidURLs is false, this field will be undefined." 
+ } + } + }, + "ExtractStatusResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object" + }, + "status": { + "type": "string", + "enum": [ + "completed", + "processing", + "failed", + "cancelled" + ], + "description": "The current status of the extract job" + }, + "expiresAt": { + "type": "string", + "format": "date-time" + } + } + } + } + }, + "security": [ + { + "bearerAuth": [] + } + ] +} \ No newline at end of file diff --git a/apps/python-sdk/examples/async_batch_scrape_urls_example.py b/apps/python-sdk/examples/async_batch_scrape_urls_example.py new file mode 100644 index 00000000..74ded0c7 --- /dev/null +++ b/apps/python-sdk/examples/async_batch_scrape_urls_example.py @@ -0,0 +1,55 @@ +import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, JsonConfig +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.async_batch_scrape_urls( + urls=['https://example1.com', 'https://example2.com', 'https://blog.example.com'], + formats=['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'], + include_tags=['div', 'p', 'span'], + exclude_tags=['div', 'p', 'span'], + only_main_content=True, + wait_for=30000, + timeout=30000, + location=LocationConfig(country="US", languages=["en"]), + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy='basic', + extract=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + json_options=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + actions=[ + WaitAction(milliseconds=1000, selector="#content"), + ScreenshotAction(full_page=True), + ClickAction(selector="button.submit"), + WriteAction(text="example@email.com"), + PressAction(key="Enter"), + ScrollAction(direction="down", selector=".scrollable-container"), + ScrapeAction(), + ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();") + ], + idempotency_key='example_idempotency_key' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/async_crawl_url_example.py b/apps/python-sdk/examples/async_crawl_url_example.py new file mode 100644 index 00000000..4fa55bcf --- /dev/null +++ b/apps/python-sdk/examples/async_crawl_url_example.py @@ -0,0 +1,38 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.async_crawl_url( + url='https://example.com', + include_paths=['example1', 'example2'], + exclude_paths=['example1', 'example2'], + max_depth=10, + max_discovery_depth=10, + limit=10, + allow_backward_links=True, + allow_external_links=True, + ignore_sitemap=True, + deduplicate_similar_urls=True, + 
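+            # The remaining options are illustrative: ignore_query_parameters and regex_on_full_url control URL matching, delay paces requests, and idempotency_key guards against duplicate job submissions.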
ignore_query_parameters=True, + regex_on_full_url=True, + delay=10, + idempotency_key='example_idempotency_key' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/async_extract_example.py b/apps/python-sdk/examples/async_extract_example.py new file mode 100644 index 00000000..bf95c785 --- /dev/null +++ b/apps/python-sdk/examples/async_extract_example.py @@ -0,0 +1,30 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.async_extract( + urls=['https://example1.com', 'https://example2.com', 'https://blog.example.com'], + prompt='example_prompt', + system_prompt='example_system_prompt', + allow_external_links=True, + enable_web_search=True, + show_sources=True + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/batch_scrape_urls_and_watch_example.py b/apps/python-sdk/examples/batch_scrape_urls_and_watch_example.py new file mode 100644 index 00000000..6a4a3c61 --- /dev/null +++ b/apps/python-sdk/examples/batch_scrape_urls_and_watch_example.py @@ -0,0 +1,55 @@ +import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, JsonConfig +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.batch_scrape_urls_and_watch( + urls=['https://example1.com', 'https://example2.com', 'https://blog.example.com'], + formats=['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'], + include_tags=['div', 'p', 'span'], + exclude_tags=['div', 'p', 'span'], + only_main_content=True, + wait_for=30000, + timeout=30000, + location=LocationConfig(country="US", languages=["en"]), + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy='basic', + extract=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + json_options=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + actions=[ + WaitAction(milliseconds=1000, selector="#content"), + ScreenshotAction(full_page=True), + ClickAction(selector="button.submit"), + WriteAction(text="example@email.com"), + PressAction(key="Enter"), + ScrollAction(direction="down", selector=".scrollable-container"), + ScrapeAction(), + ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();") + ], + idempotency_key='example_idempotency_key' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/batch_scrape_urls_example.py b/apps/python-sdk/examples/batch_scrape_urls_example.py new file mode 100644 
index 00000000..1d90952a --- /dev/null +++ b/apps/python-sdk/examples/batch_scrape_urls_example.py @@ -0,0 +1,56 @@ +import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, JsonConfig +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.batch_scrape_urls( + urls=['https://example1.com', 'https://example2.com', 'https://blog.example.com'], + formats=['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'], + include_tags=['div', 'p', 'span'], + exclude_tags=['div', 'p', 'span'], + only_main_content=True, + wait_for=30000, + timeout=30000, + location=LocationConfig(country="US", languages=["en"]), + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy='basic', + extract=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + json_options=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + actions=[ + WaitAction(milliseconds=1000, selector="#content"), + ScreenshotAction(full_page=True), + ClickAction(selector="button.submit"), + WriteAction(text="example@email.com"), + PressAction(key="Enter"), + ScrollAction(direction="down", selector=".scrollable-container"), + ScrapeAction(), + ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();") + ], + poll_interval=10, + idempotency_key='example_idempotency_key' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/cancel_crawl_example.py b/apps/python-sdk/examples/cancel_crawl_example.py new file mode 100644 index 00000000..94f40601 --- /dev/null +++ b/apps/python-sdk/examples/cancel_crawl_example.py @@ -0,0 +1,25 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.cancel_crawl( + id='example_id' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/check_batch_scrape_errors_example.py b/apps/python-sdk/examples/check_batch_scrape_errors_example.py new file mode 100644 index 00000000..3e603cf6 --- /dev/null +++ b/apps/python-sdk/examples/check_batch_scrape_errors_example.py @@ -0,0 +1,25 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.check_batch_scrape_errors( + id='example_id' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == 
"__main__": + main() diff --git a/apps/python-sdk/examples/check_batch_scrape_status_example.py b/apps/python-sdk/examples/check_batch_scrape_status_example.py new file mode 100644 index 00000000..04f87d53 --- /dev/null +++ b/apps/python-sdk/examples/check_batch_scrape_status_example.py @@ -0,0 +1,26 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.check_batch_scrape_status( + id='example_id', + poll_interval=10 + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/check_crawl_errors_example.py b/apps/python-sdk/examples/check_crawl_errors_example.py new file mode 100644 index 00000000..41ee2631 --- /dev/null +++ b/apps/python-sdk/examples/check_crawl_errors_example.py @@ -0,0 +1,25 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.check_crawl_errors( + id='example_id' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/check_crawl_status_example.py b/apps/python-sdk/examples/check_crawl_status_example.py new file mode 100644 index 00000000..abd3fc2e --- /dev/null +++ b/apps/python-sdk/examples/check_crawl_status_example.py @@ -0,0 +1,25 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.check_crawl_status( + id='example_id' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/crawl_url_and_watch_example.py b/apps/python-sdk/examples/crawl_url_and_watch_example.py new file mode 100644 index 00000000..8c4068e9 --- /dev/null +++ b/apps/python-sdk/examples/crawl_url_and_watch_example.py @@ -0,0 +1,37 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.crawl_url_and_watch( + url='https://example.com', + include_paths=['example1', 'example2'], + exclude_paths=['example1', 'example2'], + max_depth=10, + max_discovery_depth=10, + limit=10, + allow_backward_links=True, + allow_external_links=True, + ignore_sitemap=True, + deduplicate_similar_urls=True, + ignore_query_parameters=True, + regex_on_full_url=True, + idempotency_key='example_idempotency_key' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git 
a/apps/python-sdk/examples/crawl_url_example.py b/apps/python-sdk/examples/crawl_url_example.py new file mode 100644 index 00000000..fbd0be42 --- /dev/null +++ b/apps/python-sdk/examples/crawl_url_example.py @@ -0,0 +1,39 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.crawl_url( + url='https://example.com', + include_paths=['example1', 'example2'], + exclude_paths=['example1', 'example2'], + max_depth=10, + max_discovery_depth=10, + limit=10, + allow_backward_links=True, + allow_external_links=True, + ignore_sitemap=True, + deduplicate_similar_urls=True, + ignore_query_parameters=True, + regex_on_full_url=True, + delay=10, + poll_interval=10, + idempotency_key='example_idempotency_key' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/extract_example.py b/apps/python-sdk/examples/extract_example.py new file mode 100644 index 00000000..11fde632 --- /dev/null +++ b/apps/python-sdk/examples/extract_example.py @@ -0,0 +1,30 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.extract( + urls=['https://example1.com', 'https://example2.com', 'https://blog.example.com'], + prompt='example_prompt', + system_prompt='example_system_prompt', + allow_external_links=True, + enable_web_search=True, + show_sources=True + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/get_extract_status_example.py b/apps/python-sdk/examples/get_extract_status_example.py new file mode 100644 index 00000000..34788f12 --- /dev/null +++ b/apps/python-sdk/examples/get_extract_status_example.py @@ -0,0 +1,25 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.get_extract_status( + job_id='example_job_id' + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/map_url_example.py b/apps/python-sdk/examples/map_url_example.py new file mode 100644 index 00000000..0eb4d155 --- /dev/null +++ b/apps/python-sdk/examples/map_url_example.py @@ -0,0 +1,31 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.map_url( + url='https://example.com', + search='example_search', + ignore_sitemap=True, + include_subdomains=True, + sitemap_only=True, + limit=10, + timeout=30000 + ) + + 
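+        # The parameter values above are placeholders; per the MapResponse schema earlier in this patch, the result exposes success and links (the discovered URLs).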
print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/scrape_url_example.py b/apps/python-sdk/examples/scrape_url_example.py new file mode 100644 index 00000000..20b78370 --- /dev/null +++ b/apps/python-sdk/examples/scrape_url_example.py @@ -0,0 +1,55 @@ +import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, JsonConfig, ChangeTrackingOptions +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.scrape_url( + url='https://example.com', + formats=['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'], + include_tags=['div', 'p', 'span'], + exclude_tags=['div', 'p', 'span'], + only_main_content=True, + wait_for=30000, + timeout=30000, + location=LocationConfig(country="US", languages=["en"]), + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy='basic', + extract=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + json_options=JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}}), + actions=[ + WaitAction(milliseconds=1000, selector="#content"), + ScreenshotAction(full_page=True), + ClickAction(selector="button.submit"), + WriteAction(text="example@email.com"), + PressAction(key="Enter"), + ScrollAction(direction="down", selector=".scrollable-container"), + ScrapeAction(), + ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();") + ], + change_tracking_options=ChangeTrackingOptions(modes=["git-diff", "json"], schema={"type": "object", "properties": {"changes": {"type": "array"}, "timestamp": {"type": "string"}}}) + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/examples/search_example.py b/apps/python-sdk/examples/search_example.py new file mode 100644 index 00000000..019d4926 --- /dev/null +++ b/apps/python-sdk/examples/search_example.py @@ -0,0 +1,32 @@ +import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: + result = app.search( + query='example_query', + limit=10, + tbs='example_tbs', + filter='example_filter', + lang='example_lang', + country='example_country', + location='US', + timeout=30000 + ) + + print("Success!") + print(f"Result: {result}") + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() diff --git a/apps/python-sdk/scripts/generate_examples.py b/apps/python-sdk/scripts/generate_examples.py new file mode 100644 index 00000000..d3f0b25e --- /dev/null +++ b/apps/python-sdk/scripts/generate_examples.py @@ -0,0 +1,691 @@ +import inspect +import sys +import os +from typing import get_origin, get_args 
+import json + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../'))) +from firecrawl.firecrawl import FirecrawlApp + +class MethodInfo: +    def __init__(self, name: str, signature: str, parameters: dict[str, dict]): +        self.name = name +        self.signature = signature +        self.parameters = parameters + +def extract_method_info(cls, method_name) -> MethodInfo: +    """Extract signature, parameters, types, defaults""" +    method = getattr(cls, method_name) +    signature = inspect.signature(method) +    parameters = signature.parameters + +    param_info = {} +    for param_name, param in parameters.items(): +        param_info[param_name] = { +            "type": param.annotation, +            "default": param.default, +            "kind": param.kind.name +        } + +    return MethodInfo( +        name=method_name, +        signature=str(signature), +        parameters=param_info +    ) + +def generate_example_value(param_name: str, param_type, method_name: str): +    """Generate realistic example values based on parameter type and name""" + +    # Check for actions parameter first, before any type processing +    if param_name == 'actions' or 'Action' in str(param_type): +        return "ACTIONS_EXAMPLE" + +    # Check for formats parameter to show all available formats +    if param_name == 'formats': +        return ['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'] + +    # Check for urls parameter to show proper URL examples +    if param_name == 'urls': +        return ['https://example1.com', 'https://example2.com', 'https://blog.example.com'] + +    # Check for location parameter to use LocationConfig object +    if param_name == 'location': +        # For the search method, location is a string, not a LocationConfig +        if method_name == 'search': +            return 'US' +        return "LOCATION_EXAMPLE" + +    # Check for extract and json_options parameters to use JsonConfig object +    if param_name in ['extract', 'json_options']: +        return "JSONCONFIG_EXAMPLE" + +    # Check for change_tracking_options parameter to use ChangeTrackingOptions object +    if param_name == 'change_tracking_options': +        return "CHANGETRACKING_EXAMPLE" + +    # Handle Optional types +    if hasattr(param_type, '__origin__') and param_type.__origin__ is type(None): +        return None + +    if str(param_type).startswith('typing.Optional'): +        # Extract the inner type from Optional[T] +        args = get_args(param_type) +        if args: +            param_type = args[0] + +    # String parameters +    if param_type == str or str(param_type) == "<class 'str'>": +        if param_name == 'url': +            return 'https://example.com' +        return f'example_{param_name}' + +    # Boolean parameters +    if param_type == bool or str(param_type) == "<class 'bool'>": +        return True + +    # Integer parameters +    if param_type == int or str(param_type) == "<class 'int'>": +        if 'timeout' in param_name or 'wait' in param_name: +            return 30000 +        return 10 + +    # List parameters +    if hasattr(param_type, '__origin__') and param_type.__origin__ is list: +        args = get_args(param_type) +        if args: +            inner_type = args[0] +            if inner_type == str: +                if 'tags' in param_name: +                    return ['div', 'p', 'span'] +                elif 'formats' in param_name: +                    return ['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'] +                return ['example1', 'example2'] +        return [] + +    # Literal types +    if 'Literal' in str(param_type): +        args = get_args(param_type) +        if args: +            return args[0]  # Return first literal value + +    # Complex types - return example structures +    if 'LocationConfig' in str(param_type): +        return {'country': 'US', 'languages': ['en']} + +    if 'JsonConfig' in str(param_type): +        return { 
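+            # Illustrative JsonConfig-shaped structure; the schema values below are placeholders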
+ 'schema': { + 'type': 'object', + 'properties': { + 'title': {'type': 'string'}, + 'description': {'type': 'string'} + } + } + } + + if 'ChangeTrackingOptions' in str(param_type): + return { + 'modes': ['git-diff', 'json'], + 'schema': { + 'type': 'object', + 'properties': { + 'changes': {'type': 'array'}, + 'timestamp': {'type': 'string'} + } + }, + 'prompt': 'Detect and extract any content changes from the page' + } + + return None + +def create_example_files(): + """Generate individual .py files for each method""" + + # Get all public methods from FirecrawlApp + methods = [method for method in dir(FirecrawlApp) + if not method.startswith('_') and callable(getattr(FirecrawlApp, method))] + + # Create examples directory + examples_dir = os.path.join(os.path.dirname(__file__), '../examples') + os.makedirs(examples_dir, exist_ok=True) + + for method_name in methods: + try: + method_info = extract_method_info(FirecrawlApp, method_name) + + # Skip if no parameters (besides self) + relevant_params = {k: v for k, v in method_info.parameters.items() + if k not in ['self', 'kwargs']} + + if not relevant_params: + continue + + # Generate example file content + example_content = generate_example_file_content(method_name, method_info) + + # Write to file + filename = f"{method_name}_example.py" + filepath = os.path.join(examples_dir, filename) + + with open(filepath, 'w') as f: + f.write(example_content) + + print(f"Generated: {filename}") + + except Exception as e: + print(f"Error generating example for {method_name}: {e}") + +def generate_example_file_content(method_name: str, method_info: MethodInfo) -> str: + """Generate the content for an example file""" + + # Define example templates + actions_example = """[ + WaitAction(milliseconds=1000), + ScreenshotAction(fullPage=True), + ClickAction(selector="button.submit"), + WriteAction(selector="input[name='email']", text="test@example.com"), + PressAction(key="Enter"), + ScrollAction(direction="down"), + ScrapeAction(), + ExecuteJavascriptAction(script=''' + function getDocumentTitle() { + return document.title; + } + return getDocumentTitle(); + ''') + ]""" + + formats_example = """["markdown", "html", "rawHtml", "screenshot", "links", "screenshot@fullPage", "extract"]""" + + urls_example = """["https://example1.com", "https://example2.com", "https://example3.com"]""" + + location_example = """LocationConfig( + country="US", + languages=["en"] + )""" + + jsonconfig_example = """JsonConfig( + prompt="Extract the main content and metadata", + schema={ + "type": "object", + "properties": { + "title": {"type": "string"}, + "content": {"type": "string"}, + "author": {"type": "string"}, + "date": {"type": "string"} + }, + "required": ["title", "content"] + }, + system_prompt="You are a helpful assistant that extracts structured data from web pages.", + agent="gpt-4" + )""" + + changetracking_example = """ChangeTrackingOptions( + modes=["git-diff", "json"], + schema={ + "type": "object", + "properties": { + "changes": {"type": "array"}, + "timestamp": {"type": "string"} + } + }, + prompt="Detect and extract any content changes from the page" + )""" + + # Check if this method uses actions to determine imports + has_actions = any('Action' in str(param_details['type']) + for param_details in method_info.parameters.values()) + has_location = any('LocationConfig' in str(param_details['type']) + for param_details in method_info.parameters.values()) + has_jsonconfig = any('JsonConfig' in str(param_details['type']) + for param_details in 
method_info.parameters.values()) + has_changetracking = any('ChangeTrackingOptions' in str(param_details['type']) + for param_details in method_info.parameters.values()) + + # File header with conditional imports + if has_actions and has_location and has_jsonconfig and has_changetracking: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, JsonConfig, ChangeTrackingOptions +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_actions and has_location and has_jsonconfig: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, JsonConfig +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_actions and has_location and has_changetracking: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig, ChangeTrackingOptions +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_actions and has_location: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + LocationConfig +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_actions and has_jsonconfig: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, + JsonConfig +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_location and has_jsonconfig: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import LocationConfig, JsonConfig + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_actions: + content = f"""import os +from firecrawl import FirecrawlApp +from 
firecrawl import ( + WaitAction, ScreenshotAction, ClickAction, WriteAction, + PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction +) + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_location: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import LocationConfig + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_jsonconfig: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import JsonConfig + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + elif has_changetracking: + content = f"""import os +from firecrawl import FirecrawlApp +from firecrawl import ChangeTrackingOptions + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + else: + content = f"""import os +from firecrawl import FirecrawlApp + +def main(): + # Initialize the FirecrawlApp + api_key = os.getenv("FIRECRAWL_API_KEY") + if not api_key: + raise ValueError("Please set FIRECRAWL_API_KEY environment variable") + + app = FirecrawlApp(api_key=api_key) + + # Example with all parameters + try: +""" + + # Generate method call with all parameters + params = [] + for param_name, param_details in method_info.parameters.items(): + if param_name in ['self', 'kwargs']: + continue + + param_type = param_details['type'] + example_value = generate_example_value(param_name, param_type, method_name) + + if example_value is not None: + if example_value == "ACTIONS_EXAMPLE": + # Special handling for actions + actions_code = """[ + WaitAction(milliseconds=1000, selector="#content"), + ScreenshotAction(full_page=True), + ClickAction(selector="button.submit"), + WriteAction(text="example@email.com"), + PressAction(key="Enter"), + ScrollAction(direction="down", selector=".scrollable-container"), + ScrapeAction(), + ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();") + ]""" + params.append(f" {param_name}={actions_code}") + elif example_value == "LOCATION_EXAMPLE": + # Special handling for location + location_code = 'LocationConfig(country="US", languages=["en"])' + params.append(f" {param_name}={location_code}") + elif example_value == "JSONCONFIG_EXAMPLE": + # Special handling for extract and json_options + json_config_code = 'JsonConfig(schema={"type": "object", "properties": {"title": {"type": "string"}, "description": {"type": "string"}}})' + params.append(f" {param_name}={json_config_code}") + elif example_value == "CHANGETRACKING_EXAMPLE": + # Special handling for change_tracking_options + change_tracking_code = 'ChangeTrackingOptions(modes=["git-diff", "json"], schema={"type": "object", "properties": {"changes": {"type": "array"}, "timestamp": {"type": "string"}}})' + 
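+                # The snippet string above is emitted verbatim as source text into the generated example file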
params.append(f" {param_name}={change_tracking_code}") + elif isinstance(example_value, str): + params.append(f" {param_name}='{example_value}'") + elif isinstance(example_value, dict): + # Format dict with proper indentation + dict_str = json.dumps(example_value, indent=4) + # Add proper indentation to each line except the first + lines = dict_str.split('\n') + if len(lines) > 1: + indented_lines = [lines[0]] # First line without extra indent + for line in lines[1:]: + indented_lines.append(' ' + line) + indented_dict = '\n'.join(indented_lines) + else: + indented_dict = dict_str + params.append(f" {param_name}={indented_dict}") + elif isinstance(example_value, list): + # Format list with proper indentation + if all(isinstance(item, str) for item in example_value): + # Simple string list + params.append(f" {param_name}={example_value}") + else: + # Complex list with dicts + list_str = json.dumps(example_value, indent=4) + indented_list = '\n'.join(' ' + line for line in list_str.split('\n')) + params.append(f" {param_name}={indented_list}") + else: + params.append(f" {param_name}={example_value}") + elif param_name == 'actions': + # Always include actions parameter even if example_value is None + actions_code = """[ + WaitAction(milliseconds=1000, selector="#content"), + ScreenshotAction(full_page=True), + ClickAction(selector="button.submit"), + WriteAction(text="example@email.com"), + PressAction(key="Enter"), + ScrollAction(direction="down", selector=".scrollable-container"), + ScrapeAction(), + ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();") + ]""" + params.append(f" {param_name}={actions_code}") + elif param_name == 'formats': + # Always include formats parameter to show all available options + formats_list = ['markdown', 'html', 'raw_html', 'links', 'screenshot', 'screenshot@full_page', 'extract', 'json', 'change_tracking'] + params.append(f" {param_name}={formats_list}") + + # Add method call + if params: + params_str = ',\n'.join(params) + content += f""" result = app.{method_name}( +{params_str} + ) + + print("Success!") + print(f"Result: {{result}}") + + except Exception as e: + print(f"Error: {{e}}") + +if __name__ == "__main__": + main() +""" + else: + content += f""" result = app.{method_name}() + print("Success!") + print(f"Result: {{result}}") + + except Exception as e: + print(f"Error: {{e}}") + +if __name__ == "__main__": + main() +""" + + # Replace placeholders with actual examples + content = content.replace('"ACTIONS_EXAMPLE"', actions_example) + content = content.replace('"FORMATS_EXAMPLE"', formats_example) + content = content.replace('"URLS_EXAMPLE"', urls_example) + content = content.replace('"LOCATION_EXAMPLE"', location_example) + content = content.replace('"JSONCONFIG_EXAMPLE"', jsonconfig_example) + content = content.replace('"CHANGETRACKING_EXAMPLE"', changetracking_example) + + return content + +def add_examples_to_openapi(): + """Add Python SDK examples to OpenAPI spec for Mintlify documentation""" + import json + + # Load the OpenAPI specification + openapi_path = os.path.join(os.path.dirname(__file__), '../../api/v1-openapi.json') + + try: + with open(openapi_path, 'r') as f: + openapi_spec = json.load(f) + except FileNotFoundError: + print(f"OpenAPI spec not found at {openapi_path}") + return + + # Mapping of OpenAPI operation IDs to Python SDK methods + operation_to_method_mapping = { + 'scrapeAndExtractFromUrl': 'scrape_url', + 'scrapeAndExtractFromUrls': 'batch_scrape', + 'getBatchScrapeStatus': 
'get_batch_scrape_status', + 'cancelBatchScrape': 'cancel_batch_scrape', + 'getBatchScrapeErrors': 'get_batch_scrape_errors', + 'getCrawlStatus': 'get_crawl_status', + 'cancelCrawl': 'cancel_crawl', + 'getCrawlErrors': 'get_crawl_errors', + 'crawlUrls': 'crawl_url', + 'mapUrls': 'map_url', + 'extractData': 'extract', + 'getExtractStatus': 'get_extract_status', + 'startDeepResearch': 'deep_research', + 'getDeepResearchStatus': 'get_deep_research_status', + 'getCreditUsage': 'get_credit_usage', + 'getTokenUsage': 'get_token_usage', + 'searchAndScrape': 'search', + 'generateLLMsTxt': 'generate_llms_txt', + 'getLLMsTxtStatus': 'get_llms_txt_status' + } + + # Path to examples directory + examples_dir = os.path.join(os.path.dirname(__file__), '../examples') + + # Check if examples directory exists + if not os.path.exists(examples_dir): + print(f"Examples directory not found at {examples_dir}") + return + + # Generate code samples for each endpoint + for path, path_item in openapi_spec.get('paths', {}).items(): + for method, operation in path_item.items(): + if method.upper() not in ['GET', 'POST', 'PUT', 'DELETE', 'PATCH']: + continue + + operation_id = operation.get('operationId') + if not operation_id or operation_id not in operation_to_method_mapping: + continue + + python_method = operation_to_method_mapping[operation_id] + + # Look for the corresponding example file + example_filename = f"{python_method}_example.py" + example_filepath = os.path.join(examples_dir, example_filename) + + # Only proceed if the example file exists + if not os.path.exists(example_filepath): + print(f"Skipping {operation_id} -> {python_method} (no example file found)") + continue + + try: + # Read the example file content + with open(example_filepath, 'r') as f: + example_content = f.read() + + # Add x-codeSamples to the operation + if 'x-codeSamples' not in operation: + operation['x-codeSamples'] = [] + + # Remove existing Python samples to avoid duplicates + operation['x-codeSamples'] = [ + sample for sample in operation['x-codeSamples'] + if sample.get('lang') != 'python' + ] + + # Add new Python sample using the example file content + operation['x-codeSamples'].append({ + 'lang': 'python', + 'label': 'Python SDK', + 'source': example_content + }) + + print(f"Added Python example for {operation_id} -> {python_method}") + + except Exception as e: + print(f"Error reading example file for {python_method}: {e}") + continue + + # Save the updated OpenAPI spec + output_path = os.path.join(os.path.dirname(__file__), '../../api/v1-openapi-with-examples.json') + with open(output_path, 'w') as f: + json.dump(openapi_spec, f, indent=2) + + print(f"Updated OpenAPI spec saved to: {output_path}") + +def main(): + print("Generating comprehensive SDK examples...") + + # First, show method info for scrape_url as before + method_info = extract_method_info(FirecrawlApp, "scrape_url") + print(f"\nMethod: {method_info.name}") + print(f"Signature: {method_info.signature}") + print("Parameters:") + for param_name, param_details in method_info.parameters.items(): + print(f" {param_name}: {param_details}") + + print("\n" + "="*50) + print("Generating example files...") + + # Generate all example files + create_example_files() + + print("\n" + "="*50) + print("Adding examples to OpenAPI specification...") + + # Add examples to OpenAPI spec for Mintlify + add_examples_to_openapi() + + print("\nDone! 
Check the examples/ directory for generated files.") + print("Updated OpenAPI spec with Python SDK examples saved to v1-openapi-with-examples.json") + +if __name__ == "__main__": + main() \ No newline at end of file