From 7b03ab36a721744e6220b009be8383a74c9b2d6d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 8 May 2025 20:15:49 -0300 Subject: [PATCH] Update openapi.json --- apps/api/openapi.json | 2794 +++++++++++++++++++++++++++++++++-------- 1 file changed, 2305 insertions(+), 489 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 5bd3e3d8..41c98049 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -12,14 +12,14 @@ }, "servers": [ { - "url": "https://api.firecrawl.dev/v0" + "url": "https://api.firecrawl.dev/v1" } ], "paths": { "/scrape": { "post": { - "summary": "Scrape a single URL", - "operationId": "scrape", + "summary": "Scrape a single URL and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrl", "tags": ["Scraping"], "security": [ { @@ -31,57 +31,22 @@ "content": { "application/json": { "schema": { - "type": "object", - "properties": { - "url": { - "type": "string", - "format": "uri", - "description": "The URL to scrape" - }, - "formats": { - "type": "array", - "items": { - "type": "string", - "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"] - }, - "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)", - "default": ["markdown"] - }, - "headers": { + "allOf": [ + { "type": "object", - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." 
- }, - "includeTags": { - "type": "array", - "items": { - "type": "string" + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } }, - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + "required": ["url"] }, - "excludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": true - }, - "timeout": { - "type": "integer", - "description": "Timeout in milliseconds for the request", - "default": 30000 - }, - "waitFor": { - "type": "integer", - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "default": 0 + { + "$ref": "#/components/schemas/ScrapeOptions" } - }, - "required": ["url"] + ] } } } @@ -148,6 +113,603 @@ } } }, + "/batch/scrape": { + "post": { + "summary": "Scrape multiple URLs and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrls", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "allOf": [ + { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } + }, + "webhook": { + "type": "object", + "description": "A webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. 
This will trigger for batch scrape started (batch_scrape.started), every page scraped (batch_scrape.page) and when the batch scrape is completed (batch_scrape.completed or batch_scrape.failed). The response will be the same as the `/scrape` endpoint." + }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this batch scrape", + "additionalProperties": true + }, + "events": { + "type": "array", + "description": "Type of events that should be sent to the webhook URL. (default: all)", + "items": { + "type": "string", + "enum": ["completed", "page", "failed", "started"] + } + } + }, + "required": ["url"] + }, + "ignoreInvalidURLs": { + "type": "boolean", + "default": false, + "description": "If invalid URLs are specified in the urls array, they will be ignored. Instead of them failing the entire request, a batch scrape using the remaining valid URLs will be created, and the invalid URLs will be returned in the invalidURLs field of the response." + } + }, + "required": ["urls"] + }, + { + "$ref": "#/components/schemas/ScrapeOptions" + } + ] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchScrapeResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
+ } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/batch/scrape/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the batch scrape job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of a batch scrape job", + "operationId": "getBatchScrapeStatus", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchScrapeStatusResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + }, + "delete": { + "summary": "Cancel a batch scrape job", + "operationId": "cancelBatchScrape", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful cancellation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "message": { + "type": "string", + "example": "Batch scrape job successfully cancelled." + } + } + } + } + } + }, + "404": { + "description": "Batch scrape job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Batch scrape job not found." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/batch/scrape/{id}/errors": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the batch scrape job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the errors of a batch scrape job", + "operationId": "getBatchScrapeErrors", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlErrorsResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." 
+ } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/crawl/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the crawl job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of a crawl job", + "operationId": "getCrawlStatus", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlStatusResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + }, + "delete": { + "summary": "Cancel a crawl job", + "operationId": "cancelCrawl", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful cancellation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": ["cancelled"], + "example": "cancelled" + } + } + } + } + } + }, + "404": { + "description": "Crawl job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Crawl job not found." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/crawl/{id}/errors": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the crawl job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the errors of a crawl job", + "operationId": "getCrawlErrors", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlErrorsResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. 
Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, "/crawl": { "post": { "summary": "Crawl multiple URLs based on options", @@ -170,122 +732,91 @@ "format": "uri", "description": "The base URL to start crawling from" }, - "crawlerOptions": { - "type": "object", - "properties": { - "includes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to include" - }, - "excludes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "URL patterns to exclude" - }, - "generateImgAltText": { - "type": "boolean", - "description": "Generate alt text for images using LLMs (must have a paid plan)", - "default": false - }, - "returnOnlyUrls": { - "type": "boolean", - "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", - "default": false - }, - "maxDepth": { - "type": "integer", - "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern." - }, - "mode": { - "type": "string", - "enum": ["default", "fast"], - "description": "The crawling mode to use. 
Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", - "default": "default" - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore the website sitemap when crawling", - "default": false - }, - "limit": { - "type": "integer", - "description": "Maximum number of pages to crawl", - "default": 10000 - }, - "allowBackwardCrawling": { - "type": "boolean", - "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'", - "default": false - }, - "allowExternalContentLinks": { - "type": "boolean", - "description": "Allows the crawler to follow links to external websites.", - "default": false - } - } + "excludePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL pathname regex patterns that exclude matching URLs from the crawl. For example, if you set \"excludePaths\": [\"blog/.*\"] for the base URL firecrawl.dev, any results matching that pattern will be excluded, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap." }, - "pageOptions": { + "includePaths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL pathname regex patterns that include matching URLs in the crawl. Only the paths that match the specified patterns will be included in the response. For example, if you set \"includePaths\": [\"blog/.*\"] for the base URL firecrawl.dev, only results matching that pattern will be included, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap." + }, + "maxDepth": { + "type": "integer", + "description": "Maximum depth to crawl relative to the base URL. Basically, the max number of slashes the pathname of a scraped URL may contain.", + "default": 10 + }, + "maxDiscoveryDepth": { + "type": "integer", + "description": "Maximum depth to crawl based on discovery order. 
The root site and sitemapped pages have a discovery depth of 0. For example, if you set it to 1, and you set ignoreSitemap, you will only crawl the entered URL and all URLs that are linked on that page." + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, + "ignoreQueryParameters": { + "type": "boolean", + "description": "Do not re-scrape the same path with different (or no) query parameters", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of pages to crawl. Default limit is 10000.", + "default": 10000 + }, + "allowBackwardLinks": { + "type": "boolean", + "description": "Enables the crawler to navigate from a specific URL to previously linked pages.", + "default": false + }, + "allowExternalLinks": { + "type": "boolean", + "description": "Allows the crawler to follow links to external websites.", + "default": false + }, + "delay": { + "type": "number", + "description": "Delay in seconds between scrapes. This helps respect website rate limits." + }, + "webhook": { "type": "object", + "description": "A webhook specification object.", "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started), every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + }, "headers": { "type": "object", - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." - }, - "includeHtml": { - "type": "boolean", - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "default": false - }, - "includeRawHtml": { - "type": "boolean", - "description": "Include the raw HTML content of the page. 
Will output a rawHtml key in the response.", - "default": false - }, - "onlyIncludeTags": { - "type": "array", - "items": { + "description": "Headers to send to the webhook URL.", + "additionalProperties": { "type": "string" - }, - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + } }, - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true }, - "removeTags": { + "events": { "type": "array", + "description": "Type of events that should be sent to the webhook URL. (default: all)", "items": { - "type": "string" - }, - "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, - "screenshot": { - "type": "boolean", - "description": "Include a screenshot of the top of the page that you are scraping.", - "default": false - }, - "fullPageScreenshot": { - "type": "boolean", - "description": "Include a full page screenshot of the page that you are scraping.", - "default": false - }, - "waitFor": { - "type": "integer", - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "default": 0 + "type": "string", + "enum": ["completed", "page", "failed", "started"] + } } - } + }, + "required": ["url"] + }, + "scrapeOptions": { + "$ref": "#/components/schemas/ScrapeOptions" } }, "required": ["url"] @@ -355,10 +886,710 @@ } } }, + "/map": { + "post": { + "summary": "Map multiple URLs based on options", + "operationId": "mapUrls", + "tags": 
["Mapping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "search": { + "type": "string", + "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling.", + "default": true + }, + "sitemapOnly": { + "type": "boolean", + "description": "Only return links found in the website sitemap", + "default": false + }, + "includeSubdomains": { + "type": "boolean", + "description": "Include subdomains of the website", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of links to return", + "default": 5000, + "maximum": 30000 + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds. There is no timeout by default." + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MapResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
+ } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/extract": { + "post": { + "summary": "Extract structured data from pages using LLMs", + "operationId": "extractData", + "tags": ["Extraction"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URLs to extract data from. URLs should be in glob format." + } + }, + "prompt": { + "type": "string", + "description": "Prompt to guide the extraction process" + }, + "schema": { + "type": "object", + "description": "Schema to define the structure of the extracted data", + "properties": { + "property1": { + "type": "string", + "description": "Description of property1" + }, + "property2": { + "type": "integer", + "description": "Description of property2" + } + }, + "required": ["property1", "property2"] + }, + "enableWebSearch": { + "type": "boolean", + "description": "When true, the extraction will use web search to find additional data", + "default": false + }, + "ignoreSitemap": { + "type": "boolean", + "description": "When true, sitemap.xml files will be ignored during website scanning", + "default": false + }, + "includeSubdomains": { + "type": "boolean", + "description": "When true, subdomains of the provided URLs will also be scanned", + "default": true + }, + "showSources": { + "type": "boolean", + "description": "When true, the sources used to extract the data will be included in the response as `sources` key", + "default": false + }, + "scrapeOptions": { + "$ref": "#/components/schemas/ScrapeOptions" + } + }, + "required": ["urls"] + } + } + } + }, + 
"responses": { + "200": { + "description": "Successful extraction", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExtractResponse" + } + } + } + }, + "400": { + "description": "Invalid request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Invalid input data." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/extract/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the extract job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of an extract job", + "operationId": "getExtractStatus", + "tags": ["Extraction"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExtractStatusResponse" + } + } + } + } + } + } + }, + + "/deep-research": { + "post": { + "summary": "Start a deep research operation on a query", + "operationId": "startDeepResearch", + "tags": ["Research"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query to research" + }, + "maxDepth": { + "type": "integer", + "minimum": 1, + "maximum": 12, + "default": 7, + "description": "Maximum depth of research iterations" + }, + "timeLimit": { + "type": "integer", + "minimum": 30, + "maximum": 600, + "default": 300, + "description": "Time limit in seconds" + }, + "maxUrls": { + "type": "integer", 
+ "minimum": 1, + "maximum": 1000, + "default": 20, + "description": "Maximum number of URLs to analyze" + }, + "analysisPrompt": { + "type": "string", + "description": "The prompt to use for the final analysis. Useful to format the final analysis markdown in a specific way." + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the research agent. Useful to steer the research agent to a specific direction." + }, + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": ["markdown", "json"], + "default": ["markdown"] + } + }, + "jsonOptions": { + "type": "object", + "description": "Options for JSON output", + "properties": { + "schema": { + "type": "object", + "description": "The schema to use for the JSON output" + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the JSON output" + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the JSON output" + } + } + } + }, + "required": ["query"] + } + } + } + }, + "responses": { + "200": { + "description": "Research job started successfully", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "id": { + "type": "string", + "format": "uuid", + "description": "ID of the research job" + } + } + } + } + } + }, + "400": { + "description": "Invalid request parameters", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Invalid parameters provided" + } + } + } + } + } + } + } + } + }, + "/deep-research/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the research job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status and results of a deep research 
operation", + "operationId": "getDeepResearchStatus", + "tags": ["Research"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "finalAnalysis": { + "type": "string" + }, + "json": { + "type": "object", + "description": "Displayed when using JSON format", + "nullable": true + }, + "activities": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "status": { + "type": "string" + }, + "message": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "depth": { + "type": "integer" + } + } + } + }, + "sources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "favicon": { + "type": "string" + } + } + } + }, + "status": { + "type": "string", + "enum": ["processing", "completed", "failed"] + }, + "error": { + "type": "string" + }, + "expiresAt": { + "type": "string", + "format": "date-time" + }, + "currentDepth": { + "type": "integer" + }, + "maxDepth": { + "type": "integer" + }, + "totalUrls": { + "type": "integer" + } + } + } + } + } + } + } + }, + "404": { + "description": "Research job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Research job not found" + } + } + } + } + } + } + } + } + }, + "/team/credit-usage": { + "get": { + "summary": "Get remaining credits for the authenticated team", + "operationId": "getCreditUsage", + "tags": ["Billing"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + 
"description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "data": { + "type": "object", + "properties": { + "remaining_credits": { + "type": "number", + "description": "Number of credits remaining for the team", + "example": 1000 + } + } + } + } + } + } + } + }, + "404": { + "description": "Credit usage information not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Could not find credit usage information" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Internal server error while fetching credit usage" + } + } + } + } + } + } + } + } + }, + "/team/token-usage": { + "get": { + "summary": "Get remaining tokens for the authenticated team (Extract only)", + "operationId": "getTokenUsage", + "tags": ["Billing"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "data": { + "type": "object", + "properties": { + "remaining_tokens": { + "type": "number", + "description": "Number of tokens remaining for the team", + "example": 1000 + } + } + } + } + } + } + } + }, + "404": { + "description": "Token usage information not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Could not find token usage 
information" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Internal server error while fetching token usage" + } + } + } + } + } + } + } + } + }, "/search": { "post": { - "summary": "Search for a keyword in Google, returns top page results with markdown content for each page", - "operationId": "searchGoogle", + "summary": "Search and optionally scrape search results", + "operationId": "searchAndScrape", "tags": ["Search"], "security": [ { @@ -374,42 +1605,61 @@ "properties": { "query": { "type": "string", - "format": "uri", - "description": "The query to search for" + "description": "The search query" }, - "pageOptions": { - "type": "object", - "properties": { - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false - }, - "fetchPageContent": { - "type": "boolean", - "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", - "default": true - }, - "includeHtml": { - "type": "boolean", - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "default": false - }, - "includeRawHtml": { - "type": "boolean", - "description": "Include the raw HTML content of the page. 
Will output a rawHtml key in the response.", - "default": false - } - } + "limit": { + "type": "integer", + "description": "Maximum number of results to return", + "default": 5, + "maximum": 50, + "minimum": 1 }, - "searchOptions": { + "tbs": { + "type": "string", + "description": "Time-based search parameter" + }, + "lang": { + "type": "string", + "description": "Language code for search results", + "default": "en" + }, + "country": { + "type": "string", + "description": "Country code for search results", + "default": "us" + }, + "location": { + "type": "string", + "description": "Location parameter for search results" + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds", + "default": 60000 + }, + "scrapeOptions": { "type": "object", + "description": "Options for scraping search results", "properties": { - "limit": { - "type": "integer", - "description": "Maximum number of results. Max is 20 during beta." + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "extract" + ] + }, + "description": "Formats to include in the output", + "default": [] } - } + }, + "default": {} } }, "required": ["query"] @@ -420,151 +1670,106 @@ "responses": { "200": { "description": "Successful response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SearchResponse" - } - } - } - }, - "402": { - "description": "Payment required", "content": { "application/json": { "schema": { "type": "object", "properties": { - "error": { - "type": "string", - "example": "Payment required to access this resource." - } - } - } - } - } - }, - "429": { - "description": "Too many requests", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Request rate limit exceeded. Please wait and try again later." 
- } - } - } - } - } - }, - "500": { - "description": "Server error", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "An unexpected error occurred on the server." - } - } - } - } - } - } - } - } - }, - "/crawl/status/{jobId}": { - "get": { - "tags": ["Crawl"], - "summary": "Get the status of a crawl job", - "operationId": "getCrawlStatus", - "security": [ - { - "bearerAuth": [] - } - ], - "parameters": [ - { - "name": "jobId", - "in": "path", - "description": "ID of the crawl job", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "status": { - "type": "string", - "description": "Status of the job (completed, active, failed, paused)" - }, - "current": { - "type": "integer", - "description": "Current page number" - }, - "total": { - "type": "integer", - "description": "Total number of pages" + "success": { + "type": "boolean" }, "data": { "type": "array", "items": { - "$ref": "#/components/schemas/CrawlStatusResponseObj" - }, - "description": "Data returned from the job (null when it is in progress)" + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Title from search result" + }, + "description": { + "type": "string", + "description": "Description from search result" + }, + "url": { + "type": "string", + "description": "URL of the search result" + }, + "markdown": { + "type": "string", + "nullable": true, + "description": "Markdown content if scraping was requested" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML content if requested in formats" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content if requested in formats" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, 
+ "description": "Links found if requested in formats" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot URL if requested in formats" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "statusCode": { + "type": "integer" + }, + "error": { + "type": "string", + "nullable": true + } + } + } + } + } }, - "partial_data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/CrawlStatusResponseObj" - }, - "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." + "warning": { + "type": "string", + "nullable": true, + "description": "Warning message if any issues occurred" } } } } } }, - "402": { - "description": "Payment required", + "408": { + "description": "Request timeout", "content": { "application/json": { "schema": { "type": "object", "properties": { + "success": { + "type": "boolean", + "example": false + }, "error": { "type": "string", - "example": "Payment required to access this resource." - } - } - } - } - } - }, - "429": { - "description": "Too many requests", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Request rate limit exceeded. Please wait and try again later." 
+ "example": "Request timed out" } } } @@ -578,6 +1783,10 @@ "schema": { "type": "object", "properties": { + "success": { + "type": "boolean", + "example": false + }, "error": { "type": "string", "example": "An unexpected error occurred on the server." @@ -590,26 +1799,110 @@ } } }, - "/crawl/cancel/{jobId}": { - "delete": { - "tags": ["Crawl"], - "summary": "Cancel a crawl job", - "operationId": "cancelCrawlJob", + "/llmstxt": { + "post": { + "summary": "Generate LLMs.txt for a website", + "operationId": "generateLLMsTxt", + "tags": ["LLMs.txt"], "security": [ { "bearerAuth": [] } ], - "parameters": [ - { - "name": "jobId", - "in": "path", - "description": "ID of the crawl job", - "required": true, - "schema": { - "type": "string" + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to generate LLMs.txt from" + }, + "maxUrls": { + "type": "integer", + "description": "Maximum number of URLs to analyze", + "default": 2 + }, + "showFullText": { + "type": "boolean", + "description": "Include full text content in the response", + "default": false + } + }, + "required": ["url"] + } } } + }, + "responses": { + "200": { + "description": "LLMs.txt generation job started successfully", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "id": { + "type": "string", + "format": "uuid", + "description": "ID of the LLMs.txt generation job" + } + } + } + } + } + }, + "400": { + "description": "Invalid request parameters", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Invalid parameters provided" + } + } + } + } + } + } + } + } + }, + "/llmstxt/{id}": { + "parameters": [ + { + 
"name": "id", + "in": "path", + "description": "The ID of the LLMs.txt generation job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status and results of an LLMs.txt generation job", + "operationId": "getLLMsTxtStatus", + "tags": ["LLMs.txt"], + "security": [ + { + "bearerAuth": [] + } ], "responses": { "200": { @@ -619,57 +1912,50 @@ "schema": { "type": "object", "properties": { + "success": { + "type": "boolean" + }, "status": { "type": "string", - "description": "Returns cancelled." + "enum": ["processing", "completed", "failed"] + }, + "data": { + "type": "object", + "properties": { + "llmstxt": { + "type": "string", + "description": "The generated LLMs.txt content" + }, + "llmsfulltxt": { + "type": "string", + "description": "The full text content when showFullText is true" + } + } + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "When the generated content will expire" } } } } } }, - "402": { - "description": "Payment required", + "404": { + "description": "LLMs.txt generation job not found", "content": { "application/json": { "schema": { "type": "object", "properties": { + "success": { + "type": "boolean", + "example": false + }, "error": { "type": "string", - "example": "Payment required to access this resource." - } - } - } - } - } - }, - "429": { - "description": "Too many requests", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Request rate limit exceeded. Please wait and try again later." - } - } - } - } - } - }, - "500": { - "description": "Server error", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "An unexpected error occurred on the server." 
+ "example": "LLMs.txt generation job not found" } } } @@ -688,48 +1974,378 @@ } }, "schemas": { + "ScrapeOptions": { + "type": "object", + "properties": { + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "json", + "changeTracking" + ] + }, + "description": "Formats to include in the output.", + "default": ["markdown"] + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include in the output." + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude from the output." + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "waitFor": { + "type": "integer", + "description": "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.", + "default": 0 + }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. 
Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests", + "default": false + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 + }, + "jsonOptions": { + "type": "object", + "description": "Extract object", + "properties": { + "schema": { + "type": "object", + "description": "The schema to use for the extraction (Optional)" + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the extraction (Optional)" + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the extraction without a schema (Optional)" + } + } + }, + "actions": { + "type": "array", + "description": "Actions to perform on the page before grabbing the content", + "items": { + "oneOf": [ + { + "type": "object", + "title": "Wait", + "properties": { + "type": { + "type": "string", + "enum": ["wait"], + "description": "Wait for a specified amount of milliseconds" + }, + "milliseconds": { + "type": "integer", + "minimum": 1, + "description": "Number of milliseconds to wait" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#my-element" + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Screenshot", + "properties": { + "type": { + "type": "string", + "enum": ["screenshot"], + "description": "Take a screenshot. The links will be in the response's `actions.screenshots` array." 
+ }, + "fullPage": { + "type": "boolean", + "description": "Should the screenshot be full-page or viewport sized?", + "default": false + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Click", + "properties": { + "type": { + "type": "string", + "enum": ["click"], + "description": "Click on an element" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#load-more-button" + }, + "all": { + "type": "boolean", + "description": "Clicks all elements matched by the selector, not just the first one. Does not throw an error if no elements match the selector.", + "default": false + } + }, + "required": ["type", "selector"] + }, + { + "type": "object", + "title": "Write text", + "properties": { + "type": { + "type": "string", + "enum": ["write"], + "description": "Write text into an input field, text area, or contenteditable element. Note: You must first focus the element using a 'click' action before writing. The text will be typed character by character to simulate keyboard input." + }, + "text": { + "type": "string", + "description": "Text to type", + "example": "Hello, world!" + } + }, + "required": ["type", "text"] + }, + { + "type": "object", + "title": "Press a key", + "description": "Press a key on the page. 
See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.", + "properties": { + "type": { + "type": "string", + "enum": ["press"], + "description": "Press a key on the page" + }, + "key": { + "type": "string", + "description": "Key to press", + "example": "Enter" + } + }, + "required": ["type", "key"] + }, + { + "type": "object", + "title": "Scroll", + "properties": { + "type": { + "type": "string", + "enum": ["scroll"], + "description": "Scroll the page or a specific element" + }, + "direction": { + "type": "string", + "enum": ["up", "down"], + "description": "Direction to scroll", + "default": "down" + }, + "selector": { + "type": "string", + "description": "Query selector for the element to scroll", + "example": "#my-element" + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Scrape", + "properties": { + "type": { + "type": "string", + "enum": ["scrape"], + "description": "Scrape the current page content, returns the url and the html." + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Execute JavaScript", + "properties": { + "type": { + "type": "string", + "enum": ["executeJavascript"], + "description": "Execute JavaScript code on the page" + }, + "script": { + "type": "string", + "description": "JavaScript code to execute", + "example": "document.querySelector('.button').click();" + } + }, + "required": ["type", "script"] + } + ] + } + }, + "location": { + "type": "object", + "description": "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified.", + "properties": { + "country": { + "type": "string", + "description": "ISO 3166-1 alpha-2 country code (e.g., 'US', 'AU', 'DE', 'JP')", + "pattern": "^[A-Z]{2}$", + "default": "US" + }, + "languages": { + "type": "array", + "description": "Preferred languages and locales for the request in order of priority. 
Defaults to the language of the specified location. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language", + "items": { + "type": "string", + "example": "en-US" + } + } + } + }, + "removeBase64Images": { + "type": "boolean", + "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder." + }, + "blockAds": { + "type": "boolean", + "description": "Enables ad-blocking and cookie popup blocking.", + "default": true + }, + "proxy": { + "type": "string", + "enum": ["basic", "stealth"], + "description": "Specifies the type of proxy to use.\n\n - **basic**: Proxies for scraping sites with none to basic anti-bot solutions. Fast and usually works.\n - **stealth**: Stealth proxies for scraping sites with advanced anti-bot solutions. Slower, but more reliable on certain sites. Starting May 8th, stealth will cost 5 credits per request.\n\nIf you do not specify a proxy, Firecrawl will default to basic." + }, + "changeTrackingOptions": { + "type": "object", + "description": "Options for change tracking (Beta). Only applicable when 'changeTracking' is included in formats. The 'markdown' format must also be specified when using change tracking.", + "properties": { + "modes": { + "type": "array", + "items": { + "type": "string", + "enum": ["git-diff", "json"] + }, + "description": "The mode to use for change tracking. 'git-diff' provides a detailed diff, and 'json' compares extracted JSON data." + }, + "schema": { + "type": "object", + "description": "Schema for JSON extraction when using 'json' mode. Defines the structure of data to extract and compare." + }, + "prompt": { + "type": "string", + "description": "Prompt to use for change tracking when using 'json' mode. If not provided, the default prompt will be used." 
+ } + } + } + } + }, "ScrapeResponse": { "type": "object", "properties": { "success": { "type": "boolean" }, - "warning": { - "type": "string", - "nullable": true, - "description": "Warning message to let you know of any issues." - }, "data": { "type": "object", "properties": { "markdown": { - "type": "string", - "nullable": true, - "description": "Markdown content of the page if the `markdown` format was specified (default)" + "type": "string" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if the `html` format was specified" + "description": "HTML version of the content on page if `html` is in `formats`" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if the `rawHtml` format was specified" - }, - "links": { - "type": "array", - "items": { - "type": "string", - "format": "uri" - }, - "nullable": true, - "description": "Links on the page if the `links` format was specified" + "description": "Raw HTML content of the page if `rawHtml` is in `formats`" }, "screenshot": { "type": "string", "nullable": true, - "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" + "description": "Screenshot of the page if `screenshot` is in `formats`" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `links` is in `formats`" + }, + "actions": { + "type": "object", + "nullable": true, + "description": "Results of the actions specified in the `actions` parameter. 
Only present if the `actions` parameter was provided in the request", + "properties": { + "screenshots": { + "type": "array", + "description": "Screenshot URLs, in the same order as the screenshot actions provided.", + "items": { + "type": "string", + "format": "url" + } + }, + "scrapes": { + "type": "array", + "description": "Scrape contents, in the same order as the scrape actions provided.", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "html": { + "type": "string" + } + } + } + }, + "javascriptReturns": { + "type": "array", + "description": "JavaScript return values, in the same order as the executeJavascript actions provided.", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "value": {} + } + } + } + } }, "metadata": { "type": "object", @@ -761,6 +2377,49 @@ "description": "The error message of the page" } } + }, + "llm_extraction": { + "type": "object", + "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", + "nullable": true + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." + }, + "changeTracking": { + "type": "object", + "nullable": true, + "description": "Change tracking information if `changeTracking` is in `formats`. Only present when the `changeTracking` format is requested.", + "properties": { + "previousScrapeAt": { + "type": "string", + "format": "date-time", + "nullable": true, + "description": "The timestamp of the previous scrape that the current page is being compared against. Null if no previous scrape exists." + }, + "changeStatus": { + "type": "string", + "enum": ["new", "same", "changed", "removed"], + "description": "The result of the comparison between the two page versions. 
'new' means this page did not exist before, 'same' means content has not changed, 'changed' means content has changed, 'removed' means the page was removed." + }, + "visibility": { + "type": "string", + "enum": ["visible", "hidden"], + "description": "The visibility of the current page/URL. 'visible' means the URL was discovered through an organic route (links or sitemap), 'hidden' means the URL was discovered through memory from previous crawls." + }, + "diff": { + "type": "string", + "nullable": true, + "description": "Git-style diff of changes when using 'git-diff' mode. Only present when the mode is set to 'git-diff'." + }, + "json": { + "type": "object", + "nullable": true, + "description": "JSON comparison results when using 'json' mode. Only present when the mode is set to 'json'. This will emit a list of all the keys and their values from the `previous` and `current` scrapes based on the type defined in the `schema`. Example [here](/features/change-tracking)" + } + } } } } @@ -769,134 +2428,223 @@ "CrawlStatusResponseObj": { "type": "object", "properties": { - "markdown": { + "status": { + "type": "string", + "description": "The current status of the crawl. Can be `scraping`, `completed`, or `failed`." + }, + "total": { + "type": "integer", + "description": "The total number of pages that were attempted to be crawled." + }, + "completed": { + "type": "integer", + "description": "The number of pages that have been successfully crawled." + }, + "creditsUsed": { + "type": "integer", + "description": "The number of credits used for the crawl." + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "The date and time when the crawl will expire." + }, + "next": { "type": "string", "nullable": true, - "description": "Markdown content of the page if the `markdown` format was specified (default)" + "description": "The URL to retrieve the next 10MB of data. Returned if the crawl is not completed or if the response is larger than 10MB." 
}, - "html": { - "type": "string", - "nullable": true, - "description": "HTML version of the content on page if the `html` format was specified" - }, - "rawHtml": { - "type": "string", - "nullable": true, - "description": "Raw HTML content of the page if the `rawHtml` format was specified" - }, - "links": { + "data": { "type": "array", + "description": "The data of the crawl.", "items": { - "type": "string", - "format": "uri" - }, - "nullable": true, - "description": "Links on the page if the `links` format was specified" - }, - "screenshot": { - "type": "string", - "nullable": true, - "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" - }, - " ": { - "type": "string" - }, - "statusCode": { - "type": "integer", - "description": "The status code of the page" - }, - "error": { - "type": "string", - "nullable": true, - "description": "The error message of the page" + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `includeLinks` is true" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `includeScreenshot` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + 
"nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } } } } } }, - "SearchResponse": { + "CrawlErrorsResponseObj": { "type": "object", "properties": { - "success": { - "type": "boolean" + "errors": { + "type": "array", + "description": "Errored scrape jobs and error details", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "timestamp": { + "type": "string", + "nullable": true, + "description": "ISO timestamp of failure" + }, + "url": { + "type": "string", + "description": "Scraped URL" + }, + "error": { + "type": "string", + "description": "Error message" + } + } + } + }, + "robotsBlocked": { + "type": "array", + "description": "List of URLs that were attempted in scraping but were blocked by robots.txt", + "items": { "type": "string" } + } + } + }, + "BatchScrapeStatusResponseObj": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "The current status of the batch scrape. Can be `scraping`, `completed`, or `failed`." + }, + "total": { + "type": "integer", + "description": "The total number of pages that were attempted to be scraped." + }, + "completed": { + "type": "integer", + "description": "The number of pages that have been successfully scraped." + }, + "creditsUsed": { + "type": "integer", + "description": "The number of credits used for the batch scrape." + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "The date and time when the batch scrape will expire." + }, + "next": { + "type": "string", + "nullable": true, + "description": "The URL to retrieve the next 10MB of data. Returned if the batch scrape is not completed or if the response is larger than 10MB." 
}, "data": { "type": "array", + "description": "The data of the batch scrape.", "items": { - "markdown": { - "type": "string", - "nullable": true, - "description": "Markdown content of the page if the `markdown` format was specified (default)" - }, - "html": { - "type": "string", - "nullable": true, - "description": "HTML version of the content on page if the `html` format was specified" - }, - "rawHtml": { - "type": "string", - "nullable": true, - "description": "Raw HTML content of the page if the `rawHtml` format was specified" - }, - "links": { - "type": "array", - "items": { - "type": "string", - "format": "uri" + "type": "object", + "properties": { + "markdown": { + "type": "string" }, - "nullable": true, - "description": "Links on the page if the `links` format was specified" - }, - "screenshot": { - "type": "string", - "nullable": true, - "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" - }, - "metadata": { - "type": "object", - "properties": { - "title": { + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "links": { + "type": "array", + "items": { "type": "string" }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" - }, - " ": { - "type": "string" - }, - "statusCode": { - "type": "integer", - "description": "The status code of the page" - }, - "error": { - "type": "string", - "nullable": true, - "description": "The error message of the page" + "description": "List of links on the page if `includeLinks` is true" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `includeScreenshot` is true" + }, + 
"metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } } } } @@ -918,6 +2666,74 @@ "format": "uri" } } + }, + "BatchScrapeResponseObj": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string", + "format": "uri" + }, + "invalidURLs": { + "type": "array", + "nullable": true, + "items": { + "type": "string" + }, + "description": "If ignoreInvalidURLs is true, this is an array containing the invalid URLs that were specified in the request. If there were no invalid URLs, this will be an empty array. If ignoreInvalidURLs is false, this field will be undefined." + } + } + }, + "MapResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "links": { + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "ExtractResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + } + } + }, + "ExtractStatusResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object" + }, + "status": { + "type": "string", + "enum": ["completed", "processing", "failed", "cancelled"], + "description": "The current status of the extract job" + }, + "expiresAt": { + "type": "string", + "format": "date-time" + } + } } } }, @@ -926,4 +2742,4 @@ "bearerAuth": [] } ] -} \ No newline at end of file +}