From 8b7569f8f3bbc01c2e6e80129fcf902ddd4d1aeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 15 Aug 2024 23:30:33 +0200 Subject: [PATCH] add zod, create middleware, update openapi declaration, add crawl logic --- apps/api/openapi-v0.json | 924 ++++++++++++++++++++ apps/api/openapi.json | 287 +++--- apps/api/package.json | 2 +- apps/api/pnpm-lock.yaml | 2 +- apps/api/src/controllers/v1/crawl-status.ts | 134 ++- apps/api/src/controllers/v1/crawl.ts | 211 ++--- apps/api/src/controllers/v1/map.ts | 145 +-- apps/api/src/controllers/v1/scrape.ts | 262 +++--- apps/api/src/controllers/v1/types.ts | 154 +++- apps/api/src/routes/v1.ts | 112 ++- 10 files changed, 1604 insertions(+), 629 deletions(-) create mode 100644 apps/api/openapi-v0.json diff --git a/apps/api/openapi-v0.json b/apps/api/openapi-v0.json new file mode 100644 index 00000000..40272385 --- /dev/null +++ b/apps/api/openapi-v0.json @@ -0,0 +1,924 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "Firecrawl API", + "version": "0.0.0", + "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.", + "contact": { + "name": "Firecrawl Support", + "url": "https://firecrawl.dev/support", + "email": "support@firecrawl.dev" + } + }, + "servers": [ + { + "url": "https://api.firecrawl.dev/v0" + } + ], + "paths": { + "/scrape": { + "post": { + "summary": "Scrape a single URL and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrl", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + }, + "pageOptions": { + "type": "object", + "properties": { + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "includeHtml": { + "type": "boolean", + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "default": false + }, + "includeRawHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "default": false + }, + "onlyIncludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false + }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, + "waitFor": { + "type": "integer", + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "default": 0 + } + } + }, + "extractorOptions": { + "type": "object", + "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", + "default": {}, + "properties": { + "mode": { + "type": "string", + "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"], + "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM." + }, + "extractionPrompt": { + "type": "string", + "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes." + }, + "extractionSchema": { + "type": "object", + "additionalProperties": true, + "description": "The schema for the data to be extracted, required only for LLM extraction modes.", + "required": [ + "company_mission", + "supports_sso", + "is_open_source" + ] + } + } + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + } + }, + "/crawl": { + "post": { + "summary": "Crawl multiple URLs based on options", + "operationId": "crawlUrls", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "crawlerOptions": { + "type": "object", + "properties": { + "includes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to include" + }, + "excludes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "URL patterns to exclude" + }, + "generateImgAltText": { + "type": "boolean", + "description": "Generate alt text for images using LLMs (must have a paid plan)", + "default": false + }, + "returnOnlyUrls": { + "type": "boolean", + "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", + "default": false + }, + "maxDepth": { + "type": "integer", + "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern." + }, + "mode": { + "type": "string", + "enum": ["default", "fast"], + "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", + "default": "default" + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of pages to crawl", + "default": 10000 + }, + "allowBackwardCrawling": { + "type": "boolean", + "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'", + "default": false + }, + "allowExternalContentLinks": { + "type": "boolean", + "description": "Allows the crawler to follow links to external websites.", + "default": false + } + } + }, + "pageOptions": { + "type": "object", + "properties": { + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "includeHtml": { + "type": "boolean", + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "default": false + }, + "includeRawHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "default": false + }, + "onlyIncludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false + }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, + "waitFor": { + "type": "integer", + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "default": 0 + } + } + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/search": { + "post": { + "summary": "Search for a keyword in Google, returns top page results with markdown content for each page", + "operationId": "searchGoogle", + "tags": ["Search"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "format": "uri", + "description": "The query to search for" + }, + "pageOptions": { + "type": "object", + "properties": { + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false + }, + "fetchPageContent": { + "type": "boolean", + "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", + "default": true + }, + "includeHtml": { + "type": "boolean", + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "default": false + }, + "includeRawHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", + "default": false + } + } + }, + "searchOptions": { + "type": "object", + "properties": { + "limit": { + "type": "integer", + "description": "Maximum number of results. Max is 20 during beta." + } + } + } + }, + "required": ["query"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SearchResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." 
+ } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/crawl/status/{jobId}": { + "get": { + "tags": ["Crawl"], + "summary": "Get the status of a crawl job", + "operationId": "getCrawlStatus", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Status of the job (completed, active, failed, paused)" + }, + "current": { + "type": "integer", + "description": "Current page number" + }, + "total": { + "type": "integer", + "description": "Total number of pages" + }, + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CrawlStatusResponseObj" + }, + "description": "Data returned from the job (null when it is in progress)" + }, + "partial_data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CrawlStatusResponseObj" + }, + "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." + } + } + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + } + }, + "/crawl/cancel/{jobId}": { + "delete": { + "tags": ["Crawl"], + "summary": "Cancel a crawl job", + "operationId": "cancelCrawlJob", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Returns cancelled." + } + } + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + } + }, + "components": { + "securitySchemes": { + "bearerAuth": { + "type": "http", + "scheme": "bearer" + } + }, + "schemas": { + "ScrapeResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + + } + }, + "llm_extraction": { + "type": "object", + "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", + "nullable": true + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." + } + } + } + } + }, + "CrawlStatusResponseObj": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + }, + "SearchResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } + } + } + } + } + } + } + }, + "CrawlResponse": { + "type": "object", + "properties": { + "jobId": { + "type": "string" + } + } + } + } + }, + "security": [ + { + "bearerAuth": [] + } + ] +} \ No newline at end of file diff --git a/apps/api/openapi.json b/apps/api/openapi.json index fb0c4305..5bd3e3d8 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -18,8 +18,8 @@ "paths": { "/scrape": { "post": { - "summary": "Scrape a single URL and optionally extract information using an LLM", - "operationId": "scrapeAndExtractFromUrl", + "summary": "Scrape a single URL", + "operationId": "scrape", "tags": ["Scraping"], "security": [ { @@ -38,94 +38,47 @@ "format": "uri", "description": "The URL to scrape" }, - "pageOptions": { - "type": "object", - "properties": { - "headers": { - "type": "object", - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." - }, - "includeHtml": { - "type": "boolean", - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "default": false - }, - "includeRawHtml": { - "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "default": false - }, - "onlyIncludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false - }, - "removeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" - }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, - "screenshot": { - "type": "boolean", - "description": "Include a screenshot of the top of the page that you are scraping.", - "default": false - }, - "fullPageScreenshot": { - "type": "boolean", - "description": "Include a full page screenshot of the page that you are scraping.", - "default": false - }, - "waitFor": { - "type": "integer", - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "default": 0 - } - } + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"] + }, + "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)", + "default": ["markdown"] }, - "extractorOptions": { + "headers": { "type": "object", - "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", - "default": {}, - "properties": { - "mode": { - "type": "string", - "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"], - "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM." - }, - "extractionPrompt": { - "type": "string", - "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes." - }, - "extractionSchema": { - "type": "object", - "additionalProperties": true, - "description": "The schema for the data to be extracted, required only for LLM extraction modes.", - "required": [ - "company_mission", - "supports_sso", - "is_open_source" - ] - } - } + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true }, "timeout": { "type": "integer", "description": "Timeout in milliseconds for the request", "default": 30000 + }, + "waitFor": { + "type": "integer", + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "default": 0 } }, "required": ["url"] @@ -741,24 +694,42 @@ "success": { "type": "boolean" }, + "warning": { + "type": "string", + "nullable": true, + "description": "Warning message to let you know of any issues." + }, "data": { "type": "object", "properties": { "markdown": { - "type": "string" - }, - "content": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if `includeHtml` is true" + "description": "HTML version of the content on page if the `html` format was specified" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeRawHtml` is true" + "description": "Raw HTML content of the page if the `rawHtml` format was specified" + }, + "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" }, "metadata": { "type": "object", @@ -780,27 +751,16 @@ " ": { "type": "string" }, - "pageStatusCode": { + "statusCode": { "type": "integer", "description": "The status code of the page" }, - "pageError": { + "error": { "type": "string", "nullable": true, "description": "The error message of the page" } - } - }, - "llm_extraction": { - "type": "object", - "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", - "nullable": true - }, - "warning": { - "type": "string", - "nullable": true, - "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." } } } @@ -810,24 +770,33 @@ "type": "object", "properties": { "markdown": { - "type": "string" - }, - "content": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if `includeHtml` is true" + "description": "HTML version of the content on page if the `html` format was specified" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeRawHtml` is true" + "description": "Raw HTML content of the page if the `rawHtml` format was specified" }, - "index": { - "type": "integer", - "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" }, "metadata": { "type": "object", @@ -849,11 +818,11 @@ " ": { "type": "string" }, - "pageStatusCode": { + "statusCode": { "type": "integer", "description": "The status code of the page" }, - "pageError": { + "error": { "type": "string", "nullable": true, "description": "The error message of the page" @@ -871,34 +840,63 @@ "data": { "type": "array", "items": { - "type": "object", - "properties": { - "url": { - "type": "string" + "markdown": { + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if the `html` format was specified" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if the `rawHtml` format was specified" + }, + "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" }, - "markdown": { - "type": "string" - }, - "content": { - "type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" - } + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } } @@ -909,8 +907,15 @@ "CrawlResponse": { "type": "object", "properties": { - "jobId": { + "success": { + "type": "boolean" + }, + "id": { "type": "string" + }, + "url": { + "type": "string", + "format": "uri" } } } diff --git a/apps/api/package.json b/apps/api/package.json index bd35dc65..732472e2 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -106,7 +106,7 @@ "uuid": "^10.0.0", "wordpos": "^2.1.0", "xml2js": "^0.6.2", - "zod": "^3.23.4", + "zod": "^3.23.8", "zod-to-json-schema": "^3.23.1" }, "nodemonConfig": { diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 4b590380..7b2e07fa 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -189,7 +189,7 @@ importers: specifier: ^0.6.2 version: 0.6.2 zod: - specifier: ^3.23.4 + specifier: ^3.23.8 version: 3.23.8 zod-to-json-schema: specifier: ^3.23.1 diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index e148f8db..4f65bdb1 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -1,86 +1,68 @@ -import { 
Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../../src/types"; -import { Logger } from "../../../src/lib/logger"; +import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; +import { RequestWithAuth } from "./types"; -export async function crawlStatusController(req: Request, res: Response) { - // TODO: validate req.params.jobId +export async function crawlStatusController(req: RequestWithAuth, res: Response) { + // const job = await getWebScraperQueue().getJob(req.params.jobId); + // if (!job) { + // return res.status(404).json({ error: "Job not found" }); + // } - try { - const { success, team_id, error, status } = await authenticateUser( - req, - res, - RateLimiterMode.CrawlStatus - ); - if (!success) { - return res.status(status).json({ error }); - } + // const { current, current_url, total, current_step, partialDocs } = await job.progress(); - // const job = await getWebScraperQueue().getJob(req.params.jobId); - // if (!job) { - // return res.status(404).json({ error: "Job not found" }); - // } + // let data = job.returnvalue; + // if (process.env.USE_DB_AUTHENTICATION === "true") { + // const supabaseData = await supabaseGetJobById(req.params.jobId); - // const { current, current_url, total, current_step, partialDocs } = await job.progress(); + // if (supabaseData) { + // data = supabaseData.docs; + // } + // } - // let data = job.returnvalue; - // if (process.env.USE_DB_AUTHENTICATION === "true") { - // const supabaseData = await supabaseGetJobById(req.params.jobId); + // const jobStatus = await job.getState(); - // if (supabaseData) { - // data = supabaseData.docs; - // } - // } - - // const jobStatus = await job.getState(); - - // mock: - const id = uuidv4(); - const result = { - totalCount: 100, - creditsUsed: 2, - expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(), - status: "scraping", // scraping, completed, failed - next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`, - data: [{ - markdown: "test", - content: "test", - html: "test", - rawHtml: "test", - linksOnPage: ["test1", "test2"], - screenshot: "test", - metadata: { - title: "test", - description: "test", - language: "test", - sourceURL: "test", - statusCode: 200, - error: "test" - } - }, - { - markdown: "test", - content: "test", - html: "test", - rawHtml: "test", - linksOnPage: ["test1", "test2"], - screenshot: "test", - metadata: { - title: "test", - description: "test", - language: "test", - sourceURL: "test", - statusCode: 200, - error: "test" - } - }] - } - - res.status(200).json(result); - } catch (error) { - Logger.error(error); - return res.status(500).json({ error: error.message }); + // mock: + const id = uuidv4(); + const result = { + totalCount: 100, + creditsUsed: 2, + expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(), + status: "scraping", // scraping, completed, failed + next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`, + data: [{ + markdown: "test", + content: "test", + html: "test", + rawHtml: "test", + linksOnPage: ["test1", "test2"], + screenshot: "test", + metadata: { + title: "test", + description: "test", + language: "test", + sourceURL: "test", + statusCode: 200, + error: "test" + } + }, + { + markdown: "test", + content: "test", + html: "test", + rawHtml: "test", + linksOnPage: ["test1", "test2"], + screenshot: "test", + metadata: { + title: "test", + description: "test", + language: "test", + sourceURL: "test", + statusCode: 200, + error: "test" + } + }] } + + 
res.status(200).json(result); } diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index b4ce293e..52fe22e6 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -1,140 +1,87 @@ -import { Request, Response } from "express"; -import { checkTeamCredits } from "../../../src/services/billing/credit_billing"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../../src/types"; -import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; -import { validateIdempotencyKey } from "../../../src/services/idempotency/validate"; -import { createIdempotencyKey } from "../../../src/services/idempotency/create"; +import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../../src/lib/logger"; -import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; -import { CrawlRequest, CrawlResponse } from "./types"; +import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types"; +import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis"; +import { logCrawl } from "../../services/logging/crawl_log"; +import { getScrapeQueue } from "../../services/queue-service"; +import { addScrapeJob } from "../../services/queue-jobs"; +import { Logger } from "../../lib/logger"; -export async function crawlController(req: Request<{}, {}, CrawlRequest>, res: Response) { - // expected req.body +export async function crawlController(req: RequestWithAuth, res: Response) { + req.body = crawlRequestSchema.parse(req.body); + + const id = uuidv4(); - // req.body = { - // url: string - // crawlerOptions: { - // includePaths: string[] - // excludePaths: string[] - // maxDepth: number - // limit: number - // allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME??? 
- // allowExternalLinks: boolean - // ignoreSitemap: number - // } - // scrapeOptions: Exclude - // } + await logCrawl(id, req.auth.team_id); + const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions), + pageOptions = legacyScrapeOptions(req.body.scrapeOptions); + + const sc: StoredCrawl = { + originUrl: req.body.url, + crawlerOptions, + pageOptions, + team_id: req.auth.team_id, + createdAt: Date.now(), + }; + + const crawler = crawlToCrawler(id, sc); try { - const { success, team_id, error, status } = await authenticateUser( - req, - res, - RateLimiterMode.Crawl - ); - if (!success) { - return res.status(status).json({ success: false, error }); - } - - if (req.headers["x-idempotency-key"]) { - const isIdempotencyValid = await validateIdempotencyKey(req); - if (!isIdempotencyValid) { - return res.status(409).json({ success: false, error: "Idempotency key already used" }); - } - try { - createIdempotencyKey(req); - } catch (error) { - Logger.error(error); - return res.status(500).json({ success: false, error: error.message }); - } - } - - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ success: false, error: "Insufficient credits" }); - } - - let url = req.body.url; - if (!url) { - return res.status(400).json({ success: false, error: "Url is required" }); - } - - if (isUrlBlocked(url)) { - return res - .status(403) - .json({ - success: false, - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - }); - } - - try { - url = checkAndUpdateURL(url).url; - } catch (error) { - return res.status(400).json({ success: false, error: 'Invalid Url' }); - } - - // TODO: add job to queue - - const id = uuidv4(); - return res.status(200).json({ - success: true, - id, - url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`, - }); - - // const mode = req.body.mode ?? "crawl"; - - // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; - // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; - - // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? - // try { - // const a = new WebScraperDataProvider(); - // await a.setOptions({ - // jobId: uuidv4(), - // mode: "single_urls", - // urls: [url], - // crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, - // pageOptions: pageOptions, - // }); - - // const docs = await a.getDocuments(false, (progress) => { - // job.progress({ - // current: progress.current, - // total: progress.total, - // current_step: "SCRAPING", - // current_url: progress.currentDocumentUrl, - // }); - // }); - // return res.json({ - // success: true, - // documents: docs, - // }); - // } catch (error) { - // Logger.error(error); - // return res.status(500).json({ error: error.message }); - // } - // } - - // const job = await addWebScraperJob({ - // url: url, - // mode: mode ?? "crawl", // fix for single urls not working - // crawlerOptions: crawlerOptions, - // team_id: team_id, - // pageOptions: pageOptions, - // origin: req.body.origin ?? 
defaultOrigin, - // }); - - // await logCrawl(job.id.toString(), team_id); - - // res.json({ jobId: job.id }); - } catch (error) { - Logger.error(error); - return res.status(500).json({ success: false, error: error.message }); + sc.robots = await crawler.getRobotsTxt(); + } catch (e) { + Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`); } + + await saveCrawl(id, sc); + + const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap(); + + if (sitemap !== null) { + const jobs = sitemap.map(x => { + const url = x.url; + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls", + team_id: req.auth.team_id, + crawlerOptions, + pageOptions, + origin: "api", + crawl_id: id, + sitemapped: true, + }, + opts: { + jobId: uuid, + priority: 20, + } + }; + }) + + await lockURLs(id, jobs.map(x => x.data.url)); + await addCrawlJobs(id, jobs.map(x => x.opts.jobId)); + await getScrapeQueue().addBulk(jobs); + } else { + await lockURL(id, sc, req.body.url); + const job = await addScrapeJob({ + url: req.body.url, + mode: "single_urls", + crawlerOptions: crawlerOptions, + team_id: req.auth.team_id, + pageOptions: pageOptions, + origin: "api", + crawl_id: id, + }, { + priority: 15, + }); + await addCrawlJob(id, job.id); + } + + return res.status(200).json({ + success: true, + id, + url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`, + }); } diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index f4546abe..49bc86ed 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,12 +1,12 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../../src/types"; -import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { Logger } from "../../../src/lib/logger"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; -import { MapRequest, MapResponse } from "./types"; +import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types"; +import { checkTeamCredits } from "../../services/billing/credit_billing"; -export async function mapController(req: Request<{}, MapResponse, MapRequest>, res: Response) { +export async function mapController(req: RequestWithAuth, res: Response) { + req.body = mapRequestSchema.parse(req.body); + console.log(req.body); // expected req.body // req.body = { @@ -14,106 +14,53 @@ export async function mapController(req: Request<{}, MapResponse, MapRequest>, r // crawlerOptions: // } - try { - const { success, team_id, error, status } = await authenticateUser( - req, - res, - RateLimiterMode.Crawl - ); - if (!success) { - return res.status(status).json({ success: false, error }); - } - // if (req.headers["x-idempotency-key"]) { - // const isIdempotencyValid = await validateIdempotencyKey(req); - // if (!isIdempotencyValid) { - // return res.status(409).json({ error: "Idempotency key already used" }); - // } - // try { - // createIdempotencyKey(req); - // } catch (error) { - // Logger.error(error); - // return res.status(500).json({ error: error.message }); - // } - // } + return res.status(200).json({ success: true, links: [ "test1", "test2" ] }); - // const { success: creditsCheckSuccess, message: creditsCheckMessage } = - // await checkTeamCredits(team_id, 1); - // if (!creditsCheckSuccess) { - // return res.status(402).json({ error: "Insufficient credits" }); - // } + // const mode = 
req.body.mode ?? "crawl"; - let url = req.body.url; - if (!url) { - return res.status(400).json({ success: false, error: "Url is required" }); - } + // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; + // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; - if (isUrlBlocked(url)) { - return res - .status(403) - .json({ - success: false, - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - }); - } + // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? + // try { + // const a = new WebScraperDataProvider(); + // await a.setOptions({ + // jobId: uuidv4(), + // mode: "single_urls", + // urls: [url], + // crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, + // pageOptions: pageOptions, + // }); - try { - url = checkAndUpdateURL(url).url; - } catch (error) { - return res.status(400).json({ success: false, error: 'Invalid Url' }); - } + // const docs = await a.getDocuments(false, (progress) => { + // job.progress({ + // current: progress.current, + // total: progress.total, + // current_step: "SCRAPING", + // current_url: progress.currentDocumentUrl, + // }); + // }); + // return res.json({ + // success: true, + // documents: docs, + // }); + // } catch (error) { + // Logger.error(error); + // return res.status(500).json({ error: error.message }); + // } + // } - return res.status(200).json({ success: true, links: [ "test1", "test2" ] }); + // const job = await addWebScraperJob({ + // url: url, + // mode: mode ?? "crawl", // fix for single urls not working + // crawlerOptions: crawlerOptions, + // team_id: team_id, + // pageOptions: pageOptions, + // origin: req.body.origin ?? defaultOrigin, + // }); - // const mode = req.body.mode ?? "crawl"; + // await logCrawl(job.id.toString(), team_id); - // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; - // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; - - // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? - // try { - // const a = new WebScraperDataProvider(); - // await a.setOptions({ - // jobId: uuidv4(), - // mode: "single_urls", - // urls: [url], - // crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, - // pageOptions: pageOptions, - // }); - - // const docs = await a.getDocuments(false, (progress) => { - // job.progress({ - // current: progress.current, - // total: progress.total, - // current_step: "SCRAPING", - // current_url: progress.currentDocumentUrl, - // }); - // }); - // return res.json({ - // success: true, - // documents: docs, - // }); - // } catch (error) { - // Logger.error(error); - // return res.status(500).json({ error: error.message }); - // } - // } - - // const job = await addWebScraperJob({ - // url: url, - // mode: mode ?? "crawl", // fix for single urls not working - // crawlerOptions: crawlerOptions, - // team_id: team_id, - // pageOptions: pageOptions, - // origin: req.body.origin ?? 
defaultOrigin, - // }); - - // await logCrawl(job.id.toString(), team_id); - - // res.json({ jobId: job.id }); - } catch (error) { - Logger.error(error); - return res.status(500).json({ success: false, error: error.message }); - } + // res.json({ jobId: job.id }); } diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index e3cfcbdc..a61a7f6a 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -1,26 +1,11 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../types"; -import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { Logger } from '../../lib/logger'; import { checkAndUpdateURL } from '../../lib/validateUrl'; -import { ScrapeRequest, ScrapeResponse } from "./types"; +import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types"; -export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRequest>, res: Response) { - let url = req.body.url; - if (!url) { - return { success: false, error: "Url is required", returnCode: 400 }; - } - - if (isUrlBlocked(url)) { - return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; - } - - try { - url = checkAndUpdateURL(url).url; - } catch (error) { - return { success: false, error: "Invalid URL", returnCode: 400 }; - } +export async function scrapeController(req: RequestWithAuth, res: Response) { + req.body = scrapeRequestSchema.parse(req.body); + console.log(req.body); // TODO: check req.body // mockup req.body @@ -37,137 +22,124 @@ export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRe // waitFor: number // } - try { - let earlyReturn = false; - // make sure to authenticate user first, Bearer - const { success, team_id, error, status, plan } = await authenticateUser( - req, - res, - RateLimiterMode.Scrape - ); - if (!success) { - return res.status(status).json({ success: false, error }); - } + let earlyReturn = false; + // make sure to authenticate user first, Bearer - // check credits + // check credits - const result: ScrapeResponse = { - success: true, - warning: "test", - data: { - markdown: "test", - html: "test", - rawHtml: "test", - links: ["test1", "test2"], - screenshot: "test", - metadata: { - title: "test", - description: "test", - language: "test", - sourceURL: "test", - statusCode: 200, - error: "test" - } + const result: ScrapeResponse = { + success: true, + warning: "test", + data: { + markdown: "test", + html: "test", + rawHtml: "test", + links: ["test1", "test2"], + screenshot: "test", + metadata: { + title: "test", + description: "test", + language: "test", + sourceURL: "test", + statusCode: 200, + error: "test" } } - - return res.status(200).json(result); - - // const crawlerOptions = req.body.crawlerOptions ?? {}; - // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; - // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; - // const origin = req.body.origin ?? defaultOrigin; - // let timeout = req.body.timeout ?? defaultTimeout; - - // if (extractorOptions.mode.includes("llm-extraction")) { - // pageOptions.onlyMainContent = true; - // timeout = req.body.timeout ?? 
90000; - // } - - // const checkCredits = async () => { - // try { - // const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); - // if (!creditsCheckSuccess) { - // earlyReturn = true; - // return res.status(402).json({ error: "Insufficient credits" }); - // } - // } catch (error) { - // Logger.error(error); - // earlyReturn = true; - // return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." }); - // } - // }; - - - // await checkCredits(); - - // const jobId = uuidv4(); - - // const startTime = new Date().getTime(); - // const result = await scrapeHelper( - // jobId, - // req, - // team_id, - // crawlerOptions, - // pageOptions, - // extractorOptions, - // timeout, - // plan - // ); - // const endTime = new Date().getTime(); - // const timeTakenInSeconds = (endTime - startTime) / 1000; - // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0; - - // if (result.success) { - // let creditsToBeBilled = 1; // Assuming 1 credit per document - // const creditsPerLLMExtract = 50; - - // if (extractorOptions.mode.includes("llm-extraction")) { - // // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); - // creditsToBeBilled += creditsPerLLMExtract; - // } - - // let startTimeBilling = new Date().getTime(); - - // if (earlyReturn) { - // // Don't bill if we're early returning - // return; - // } - // const billingResult = await billTeam( - // team_id, - // creditsToBeBilled - // ); - // if (!billingResult.success) { - // return res.status(402).json({ - // success: false, - // error: "Failed to bill team. Insufficient credits or subscription not found.", - // }); - // } - // } - - // logJob({ - // job_id: jobId, - // success: result.success, - // message: result.error, - // num_docs: 1, - // docs: [result.data], - // time_taken: timeTakenInSeconds, - // team_id: team_id, - // mode: "scrape", - // url: req.body.url, - // crawlerOptions: crawlerOptions, - // pageOptions: pageOptions, - // origin: origin, - // extractor_options: extractorOptions, - // num_tokens: numTokens, - // }); - - - // return res.status(result.returnCode).json(result); - } catch (error) { - Logger.error(error); - return res.status(500).json({ success: false, error: error.message }); } + + return res.status(200).json(result); + + // const crawlerOptions = req.body.crawlerOptions ?? {}; + // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; + // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; + // const origin = req.body.origin ?? defaultOrigin; + // let timeout = req.body.timeout ?? defaultTimeout; + + // if (extractorOptions.mode.includes("llm-extraction")) { + // pageOptions.onlyMainContent = true; + // timeout = req.body.timeout ?? 90000; + // } + + // const checkCredits = async () => { + // try { + // const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); + // if (!creditsCheckSuccess) { + // earlyReturn = true; + // return res.status(402).json({ error: "Insufficient credits" }); + // } + // } catch (error) { + // Logger.error(error); + // earlyReturn = true; + // return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." 
}); + // } + // }; + + + // await checkCredits(); + + // const jobId = uuidv4(); + + // const startTime = new Date().getTime(); + // const result = await scrapeHelper( + // jobId, + // req, + // team_id, + // crawlerOptions, + // pageOptions, + // extractorOptions, + // timeout, + // plan + // ); + // const endTime = new Date().getTime(); + // const timeTakenInSeconds = (endTime - startTime) / 1000; + // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0; + + // if (result.success) { + // let creditsToBeBilled = 1; // Assuming 1 credit per document + // const creditsPerLLMExtract = 50; + + // if (extractorOptions.mode.includes("llm-extraction")) { + // // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); + // creditsToBeBilled += creditsPerLLMExtract; + // } + + // let startTimeBilling = new Date().getTime(); + + // if (earlyReturn) { + // // Don't bill if we're early returning + // return; + // } + // const billingResult = await billTeam( + // team_id, + // creditsToBeBilled + // ); + // if (!billingResult.success) { + // return res.status(402).json({ + // success: false, + // error: "Failed to bill team. Insufficient credits or subscription not found.", + // }); + // } + // } + + // logJob({ + // job_id: jobId, + // success: result.success, + // message: result.error, + // num_docs: 1, + // docs: [result.data], + // time_taken: timeTakenInSeconds, + // team_id: team_id, + // mode: "scrape", + // url: req.body.url, + // crawlerOptions: crawlerOptions, + // pageOptions: pageOptions, + // origin: origin, + // extractor_options: extractorOptions, + // num_tokens: numTokens, + // }); + + + // return res.status(result.returnCode).json(result); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 07d56e17..bd8e44f6 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -1,36 +1,96 @@ +import { Request } from "express"; +import { z } from "zod"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; +import { PageOptions } from "../../lib/entities"; + export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage"; -export type ScrapeRequest = { - url: string; - formats?: Format[]; - headers?: { [K: string]: string }; - includeTags?: string[]; - excludeTags?: string[]; - onlyMainContent?: boolean; - timeout?: number; - waitFor?: number; -} +const url = z.preprocess(x => { + if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) { + if (x.startsWith("://")) { + return "http" + x; + } else { + return "http://" + x; + } + } else { + return x; + } +}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.")); -export type CrawlerOptions = { - includePaths?: string[]; - excludePaths?: string[]; - maxDepth?: number; - limit?: number; - allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME??? 
-  allowExternalLinks?: boolean;
-  ignoreSitemap?: boolean;
-};
+export const scrapeOptions = z.object({
+  formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
+    .array()
+    .optional()
+    .default(["markdown"]),
+  headers: z.record(z.string(), z.string()).optional(),
+  includeTags: z.string().array().optional(),
+  excludeTags: z.string().array().optional(),
+  onlyMainContent: z.boolean().default(true),
+  timeout: z.number().int().positive().finite().safe().default(30000), // default?
+  waitFor: z.number().int().nonnegative().finite().safe().default(0),
+});
 
-export type CrawlRequest = {
-  url: string;
-  crawlerOptions?: CrawlerOptions;
-  scrapeOptions?: Exclude<ScrapeRequest, "url">;
-};
+export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
-export type MapRequest = {
-  url: string;
-  crawlerOptions?: CrawlerOptions;
-};
+export const scrapeRequestSchema = scrapeOptions.extend({ url });
+
+// export type ScrapeRequest = {
+//   url: string;
+//   formats?: Format[];
+//   headers?: { [K: string]: string };
+//   includeTags?: string[];
+//   excludeTags?: string[];
+//   onlyMainContent?: boolean;
+//   timeout?: number;
+//   waitFor?: number;
+// }
+
+export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
+
+const crawlerOptions = z.object({
+  includePaths: z.string().array().default([]),
+  excludePaths: z.string().array().default([]),
+  maxDepth: z.number().default(10), // default?
+  limit: z.number().default(10000), // default?
+  allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
+  allowExternalLinks: z.boolean().default(false),
+  ignoreSitemap: z.boolean().default(true),
+});
+
+// export type CrawlerOptions = {
+//   includePaths?: string[];
+//   excludePaths?: string[];
+//   maxDepth?: number;
+//   limit?: number;
+//   allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
+//   allowExternalLinks?: boolean;
+//   ignoreSitemap?: boolean;
+// };
+
+export type CrawlerOptions = z.infer<typeof crawlerOptions>;
+
+export const crawlRequestSchema = z.object({
+  url,
+  crawlerOptions: crawlerOptions.default({}),
+  scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+});
+
+// export type CrawlRequest = {
+//   url: string;
+//   crawlerOptions?: CrawlerOptions;
+//   scrapeOptions?: Exclude<ScrapeRequest, "url">;
+// };
+
+export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
+
+export const mapRequestSchema = crawlerOptions.extend({ url });
+
+// export type MapRequest = {
+//   url: string;
+//   crawlerOptions?: CrawlerOptions;
+// };
+
+export type MapRequest = z.infer<typeof mapRequestSchema>;
 
 export type Document = {
   markdown?: string,
@@ -77,6 +137,7 @@ export type Document = {
 export type ErrorResponse = {
   success: false;
   error: string;
+  details?: any;
 };
 
 export type ScrapeResponse = ErrorResponse | {
@@ -95,3 +156,40 @@ export type MapResponse = ErrorResponse | {
   success: true;
   links: string[];
 }
+
+type AuthObject = {
+  team_id: string;
+  plan: string;
+}
+
+export interface RequestWithMaybeAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
+  auth?: AuthObject;
+}
+
+export interface RequestWithAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
+  auth: AuthObject;
+}
+
+export function legacyCrawlerOptions(x: CrawlerOptions) {
+  return {
+    includes: x.includePaths,
+    excludes: x.excludePaths,
+    maxCrawledLinks: x.limit,
+    maxCrawledDepth: x.maxDepth,
+    limit: x.limit,
+    generateImgAltText: false,
+    allowBackwardCrawling: x.allowBackwardLinks,
+    allowExternalContentLinks: x.allowExternalLinks,
+  };
+}
+
+export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
+  return {
+    includeHtml: x.formats.includes("html"),
+    includeRawHtml: x.formats.includes("rawHtml"),
+    onlyIncludeTags: x.includeTags,
+    removeTags: x.excludeTags,
+    onlyMainContent: x.onlyMainContent,
+    waitFor: x.waitFor,
+  };
+}
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 5099fee9..c3ea8633 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -1,9 +1,18 @@
-import express from "express";
+import express, { NextFunction, Request, Response } from "express";
 import { crawlController } from "../../src/controllers/v1/crawl";
 // import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
 import { scrapeController } from "../../src/controllers/v1/scrape";
 import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
 import { mapController } from "../../src/controllers/v1/map";
+import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
+import { RateLimiterMode } from "../types";
+import { authenticateUser } from "../controllers/v1/auth";
+import { Logger } from "../lib/logger";
+import { createIdempotencyKey } from "../services/idempotency/create";
+import { validateIdempotencyKey } from "../services/idempotency/validate";
+import { ZodError } from "zod";
+import { checkTeamCredits } from "../services/billing/credit_billing";
+import { v4 as uuidv4 } from "uuid";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -12,13 +21,96 @@ import { mapController } from "../../src/controllers/v1/map";
 // import { livenessController } from "../controllers/v1/liveness";
 // import { readinessController } from "../controllers/v1/readiness";
 
+function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
+  return (req, res, next) => {
+    (async () => {
+      if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
+        return res.status(402).json({ success: false, error: "Insufficient credits" });
+      }
+      next();
+    })()
+      .catch(err => next(err));
+  };
+}
+
+function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
+  return (req, res, next) => {
+    (async () => {
+      const { success, team_id, error, status, plan } = await authenticateUser(
+        req,
+        res,
+        rateLimiterMode,
+      );
+
+      if (!success) {
+        return res.status(status).json({ success: false, error });
+      }
+
+      req.auth = { team_id, plan };
+      next();
+    })()
+      .catch(err => next(err));
+  }
+}
+
+function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
+  (async () => {
+    if (req.headers["x-idempotency-key"]) {
+      const isIdempotencyValid = await validateIdempotencyKey(req);
+      if (!isIdempotencyValid) {
+        return res.status(409).json({ success: false, error: "Idempotency key already used" });
+      }
+      // try {
+      createIdempotencyKey(req);
+      // } catch (error) {
+      //   Logger.error(error);
+      //   return res.status(500).json({ success: false, error: error.message });
+      // }
+    }
+    next();
+  })()
+    .catch(err => next(err));
+}
+
+function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
+  return (req, res, next) => {
+    controller(req, res)
+      .catch(err => next(err))
+  }
+}
+
 export const v1Router = express.Router();
 
-v1Router.post("/v1/scrape", scrapeController);
-v1Router.post("/v1/crawl", crawlController);
-v1Router.get("/v1/crawl/:jobId", crawlStatusController);
+v1Router.post(
+  "/v1/scrape",
+  authMiddleware(RateLimiterMode.Scrape),
+  checkCreditsMiddleware(1),
+  wrap(scrapeController)
+);
+
+v1Router.post(
+  "/v1/crawl",
+  authMiddleware(RateLimiterMode.Crawl),
+  idempotencyMiddleware,
+  checkCreditsMiddleware(1),
+  wrap(crawlController)
+);
+
+v1Router.post(
+  "/v1/map",
+  authMiddleware(RateLimiterMode.Crawl),
+  checkCreditsMiddleware(1),
+  wrap(mapController)
+);
+
+v1Router.get(
+  "/v1/crawl/:jobId",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlStatusController)
+);
+
 // v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
-// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
+// v1Router.delete("/v1/crawl/:jobId", crawlCancelController);
 // v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);
 
 // // Auth route for key based authentication
@@ -31,4 +123,12 @@ v1Router.get("/v1/crawl/:jobId", crawlStatusController);
 // v1Router.get("/v1/health/liveness", livenessController);
 // v1Router.get("/v1/health/readiness", readinessController);
 
-v1Router.post("/v1/map", mapController);
\ No newline at end of file
+v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => {
+  if (err instanceof ZodError) {
+    res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
+  } else {
+    const id = uuidv4();
+    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + JSON.stringify(err));
+    res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
+  }
+});
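
To make the new v1 request validation concrete, here is a minimal sketch of how the schemas in apps/api/src/controllers/v1/types.ts are meant to be consumed from a controller. The sample body, the variable names, and the relative import path are illustrative assumptions rather than part of the patch:

    // Parse an incoming scrape body with the schema introduced above.
    // scrapeRequestSchema fills in the declared defaults (formats: ["markdown"],
    // onlyMainContent: true, timeout: 30000, waitFor: 0), and the shared url
    // preprocessor prepends "http://" when no scheme is given, so a bare host
    // still passes the .url() and protocol checks before the blocklist refine runs.
    import { scrapeRequestSchema, legacyScrapeOptions } from "./types";

    const body = scrapeRequestSchema.parse({
      url: "firecrawl.dev",            // hypothetical input; normalized to "http://firecrawl.dev"
      formats: ["markdown", "html"],
    });

    // legacyScrapeOptions bridges the parsed v1 options onto the existing PageOptions
    // shape, e.g. an "html" entry in formats becomes includeHtml: true.
    const pageOptions = legacyScrapeOptions(body);

A body that fails validation makes .parse() throw a ZodError, which the router-level handler in routes/v1.ts converts into a 400 response with the issues attached as details.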
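
On the routing side, authentication, credit checks, idempotency, and async error handling are now plain Express middleware, so the composition order in routes/v1.ts tells the whole story: authMiddleware resolves team_id and plan onto req.auth, checkCreditsMiddleware rejects with 402 when the team is out of credits, wrap() forwards controller rejections to next(), and the final v1Router.use handler maps ZodError to 400 and anything else to 500 with a logged exception ID. A rough sketch of mounting the router in an app follows; the port and the body-parsing setup outside the router are assumptions:

    import express from "express";
    import { v1Router } from "./routes/v1";

    const app = express();
    app.use(express.json()); // controllers read a parsed JSON body
    app.use(v1Router);       // routes already carry the full "/v1/..." prefix

    app.listen(3002);        // placeholder port for a local run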