Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 06:28:59 +08:00)

add zod, create middleware, update openapi declaration, add crawl logic

parent 4165de1773
commit 8b7569f8f3

apps/api/openapi-v0.json (new file, 924 lines)

@@ -0,0 +1,924 @@
{
  "openapi": "3.0.0",
  "info": {
    "title": "Firecrawl API",
    "version": "0.0.0",
    "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
    "contact": {
      "name": "Firecrawl Support",
      "url": "https://firecrawl.dev/support",
      "email": "support@firecrawl.dev"
    }
  },
  "servers": [
    {
      "url": "https://api.firecrawl.dev/v0"
    }
  ],
  "paths": {
    "/scrape": {
      "post": {
        "summary": "Scrape a single URL and optionally extract information using an LLM",
        "operationId": "scrapeAndExtractFromUrl",
        "tags": ["Scraping"],
        "security": [
          {
            "bearerAuth": []
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "type": "object",
                "properties": {
                  "url": {
                    "type": "string",
                    "format": "uri",
                    "description": "The URL to scrape"
                  },
                  "pageOptions": {
                    "type": "object",
                    "properties": {
                      "headers": {
                        "type": "object",
                        "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
                      },
                      "includeHtml": {
                        "type": "boolean",
                        "description": "Include the HTML version of the content on page. Will output a html key in the response.",
                        "default": false
                      },
                      "includeRawHtml": {
                        "type": "boolean",
                        "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
                        "default": false
                      },
                      "onlyIncludeTags": {
                        "type": "array",
                        "items": {
                          "type": "string"
                        },
                        "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
                      },
                      "onlyMainContent": {
                        "type": "boolean",
                        "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
                        "default": false
                      },
                      "removeTags": {
                        "type": "array",
                        "items": {
                          "type": "string"
                        },
                        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
                      },
                      "replaceAllPathsWithAbsolutePaths": {
                        "type": "boolean",
                        "description": "Replace all relative paths with absolute paths for images and links",
                        "default": false
                      },
                      "screenshot": {
                        "type": "boolean",
                        "description": "Include a screenshot of the top of the page that you are scraping.",
                        "default": false
                      },
                      "fullPageScreenshot": {
                        "type": "boolean",
                        "description": "Include a full page screenshot of the page that you are scraping.",
                        "default": false
                      },
                      "waitFor": {
                        "type": "integer",
                        "description": "Wait x amount of milliseconds for the page to load to fetch content",
                        "default": 0
                      }
                    }
                  },
                  "extractorOptions": {
                    "type": "object",
                    "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
                    "default": {},
                    "properties": {
                      "mode": {
                        "type": "string",
                        "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
                        "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
                      },
                      "extractionPrompt": {
                        "type": "string",
                        "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
                      },
                      "extractionSchema": {
                        "type": "object",
                        "additionalProperties": true,
                        "description": "The schema for the data to be extracted, required only for LLM extraction modes.",
                        "required": [
                          "company_mission",
                          "supports_sso",
                          "is_open_source"
                        ]
                      }
                    }
                  },
                  "timeout": {
                    "type": "integer",
                    "description": "Timeout in milliseconds for the request",
                    "default": 30000
                  }
                },
                "required": ["url"]
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Successful response",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ScrapeResponse"
                }
              }
            }
          },
          "402": {
            "description": "Payment required",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Payment required to access this resource."
                    }
                  }
                }
              }
            }
          },
          "429": {
            "description": "Too many requests",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Request rate limit exceeded. Please wait and try again later."
                    }
                  }
                }
              }
            }
          },
          "500": {
            "description": "Server error",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "An unexpected error occurred on the server."
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "/crawl": {
      "post": {
        "summary": "Crawl multiple URLs based on options",
        "operationId": "crawlUrls",
        "tags": ["Crawling"],
        "security": [
          {
            "bearerAuth": []
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "type": "object",
                "properties": {
                  "url": {
                    "type": "string",
                    "format": "uri",
                    "description": "The base URL to start crawling from"
                  },
                  "crawlerOptions": {
                    "type": "object",
                    "properties": {
                      "includes": {
                        "type": "array",
                        "items": {
                          "type": "string"
                        },
                        "description": "URL patterns to include"
                      },
                      "excludes": {
                        "type": "array",
                        "items": {
                          "type": "string"
                        },
                        "description": "URL patterns to exclude"
                      },
                      "generateImgAltText": {
                        "type": "boolean",
                        "description": "Generate alt text for images using LLMs (must have a paid plan)",
                        "default": false
                      },
                      "returnOnlyUrls": {
                        "type": "boolean",
                        "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
                        "default": false
                      },
                      "maxDepth": {
                        "type": "integer",
                        "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
                      },
                      "mode": {
                        "type": "string",
                        "enum": ["default", "fast"],
                        "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
                        "default": "default"
                      },
                      "ignoreSitemap": {
                        "type": "boolean",
                        "description": "Ignore the website sitemap when crawling",
                        "default": false
                      },
                      "limit": {
                        "type": "integer",
                        "description": "Maximum number of pages to crawl",
                        "default": 10000
                      },
                      "allowBackwardCrawling": {
                        "type": "boolean",
                        "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
                        "default": false
                      },
                      "allowExternalContentLinks": {
                        "type": "boolean",
                        "description": "Allows the crawler to follow links to external websites.",
                        "default": false
                      }
                    }
                  },
                  "pageOptions": {
                    "type": "object",
                    "properties": {
                      "headers": {
                        "type": "object",
                        "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
                      },
                      "includeHtml": {
                        "type": "boolean",
                        "description": "Include the HTML version of the content on page. Will output a html key in the response.",
                        "default": false
                      },
                      "includeRawHtml": {
                        "type": "boolean",
                        "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
                        "default": false
                      },
                      "onlyIncludeTags": {
                        "type": "array",
                        "items": {
                          "type": "string"
                        },
                        "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
                      },
                      "onlyMainContent": {
                        "type": "boolean",
                        "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
                        "default": false
                      },
                      "removeTags": {
                        "type": "array",
                        "items": {
                          "type": "string"
                        },
                        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
                      },
                      "replaceAllPathsWithAbsolutePaths": {
                        "type": "boolean",
                        "description": "Replace all relative paths with absolute paths for images and links",
                        "default": false
                      },
                      "screenshot": {
                        "type": "boolean",
                        "description": "Include a screenshot of the top of the page that you are scraping.",
                        "default": false
                      },
                      "fullPageScreenshot": {
                        "type": "boolean",
                        "description": "Include a full page screenshot of the page that you are scraping.",
                        "default": false
                      },
                      "waitFor": {
                        "type": "integer",
                        "description": "Wait x amount of milliseconds for the page to load to fetch content",
                        "default": 0
                      }
                    }
                  }
                },
                "required": ["url"]
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Successful response",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/CrawlResponse"
                }
              }
            }
          },
          "402": {
            "description": "Payment required",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Payment required to access this resource."
                    }
                  }
                }
              }
            }
          },
          "429": {
            "description": "Too many requests",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Request rate limit exceeded. Please wait and try again later."
                    }
                  }
                }
              }
            }
          },
          "500": {
            "description": "Server error",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "An unexpected error occurred on the server."
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "/search": {
      "post": {
        "summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
        "operationId": "searchGoogle",
        "tags": ["Search"],
        "security": [
          {
            "bearerAuth": []
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "type": "object",
                "properties": {
                  "query": {
                    "type": "string",
                    "format": "uri",
                    "description": "The query to search for"
                  },
                  "pageOptions": {
                    "type": "object",
                    "properties": {
                      "onlyMainContent": {
                        "type": "boolean",
                        "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
                        "default": false
                      },
                      "fetchPageContent": {
                        "type": "boolean",
                        "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
                        "default": true
                      },
                      "includeHtml": {
                        "type": "boolean",
                        "description": "Include the HTML version of the content on page. Will output a html key in the response.",
                        "default": false
                      },
                      "includeRawHtml": {
                        "type": "boolean",
                        "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
                        "default": false
                      }
                    }
                  },
                  "searchOptions": {
                    "type": "object",
                    "properties": {
                      "limit": {
                        "type": "integer",
                        "description": "Maximum number of results. Max is 20 during beta."
                      }
                    }
                  }
                },
                "required": ["query"]
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Successful response",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/SearchResponse"
                }
              }
            }
          },
          "402": {
            "description": "Payment required",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Payment required to access this resource."
                    }
                  }
                }
              }
            }
          },
          "429": {
            "description": "Too many requests",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Request rate limit exceeded. Please wait and try again later."
                    }
                  }
                }
              }
            }
          },
          "500": {
            "description": "Server error",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "An unexpected error occurred on the server."
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "/crawl/status/{jobId}": {
      "get": {
        "tags": ["Crawl"],
        "summary": "Get the status of a crawl job",
        "operationId": "getCrawlStatus",
        "security": [
          {
            "bearerAuth": []
          }
        ],
        "parameters": [
          {
            "name": "jobId",
            "in": "path",
            "description": "ID of the crawl job",
            "required": true,
            "schema": {
              "type": "string"
            }
          }
        ],
        "responses": {
          "200": {
            "description": "Successful response",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "status": {
                      "type": "string",
                      "description": "Status of the job (completed, active, failed, paused)"
                    },
                    "current": {
                      "type": "integer",
                      "description": "Current page number"
                    },
                    "total": {
                      "type": "integer",
                      "description": "Total number of pages"
                    },
                    "data": {
                      "type": "array",
                      "items": {
                        "$ref": "#/components/schemas/CrawlStatusResponseObj"
                      },
                      "description": "Data returned from the job (null when it is in progress)"
                    },
                    "partial_data": {
                      "type": "array",
                      "items": {
                        "$ref": "#/components/schemas/CrawlStatusResponseObj"
                      },
                      "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
                    }
                  }
                }
              }
            }
          },
          "402": {
            "description": "Payment required",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Payment required to access this resource."
                    }
                  }
                }
              }
            }
          },
          "429": {
            "description": "Too many requests",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Request rate limit exceeded. Please wait and try again later."
                    }
                  }
                }
              }
            }
          },
          "500": {
            "description": "Server error",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "An unexpected error occurred on the server."
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "/crawl/cancel/{jobId}": {
      "delete": {
        "tags": ["Crawl"],
        "summary": "Cancel a crawl job",
        "operationId": "cancelCrawlJob",
        "security": [
          {
            "bearerAuth": []
          }
        ],
        "parameters": [
          {
            "name": "jobId",
            "in": "path",
            "description": "ID of the crawl job",
            "required": true,
            "schema": {
              "type": "string"
            }
          }
        ],
        "responses": {
          "200": {
            "description": "Successful response",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "status": {
                      "type": "string",
                      "description": "Returns cancelled."
                    }
                  }
                }
              }
            }
          },
          "402": {
            "description": "Payment required",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Payment required to access this resource."
                    }
                  }
                }
              }
            }
          },
          "429": {
            "description": "Too many requests",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "Request rate limit exceeded. Please wait and try again later."
                    }
                  }
                }
              }
            }
          },
          "500": {
            "description": "Server error",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "properties": {
                    "error": {
                      "type": "string",
                      "example": "An unexpected error occurred on the server."
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "components": {
    "securitySchemes": {
      "bearerAuth": {
        "type": "http",
        "scheme": "bearer"
      }
    },
    "schemas": {
      "ScrapeResponse": {
        "type": "object",
        "properties": {
          "success": {
            "type": "boolean"
          },
          "data": {
            "type": "object",
            "properties": {
              "markdown": {
                "type": "string"
              },
              "content": {
                "type": "string"
              },
              "html": {
                "type": "string",
                "nullable": true,
                "description": "HTML version of the content on page if `includeHtml` is true"
              },
              "rawHtml": {
                "type": "string",
                "nullable": true,
                "description": "Raw HTML content of the page if `includeRawHtml` is true"
              },
              "metadata": {
                "type": "object",
                "properties": {
                  "title": {
                    "type": "string"
                  },
                  "description": {
                    "type": "string"
                  },
                  "language": {
                    "type": "string",
                    "nullable": true
                  },
                  "sourceURL": {
                    "type": "string",
                    "format": "uri"
                  },
                  "<any other metadata> ": {
                    "type": "string"
                  },
                  "pageStatusCode": {
                    "type": "integer",
                    "description": "The status code of the page"
                  },
                  "pageError": {
                    "type": "string",
                    "nullable": true,
                    "description": "The error message of the page"
                  }

                }
              },
              "llm_extraction": {
                "type": "object",
                "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
                "nullable": true
              },
              "warning": {
                "type": "string",
                "nullable": true,
                "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
              }
            }
          }
        }
      },
      "CrawlStatusResponseObj": {
        "type": "object",
        "properties": {
          "markdown": {
            "type": "string"
          },
          "content": {
            "type": "string"
          },
          "html": {
            "type": "string",
            "nullable": true,
            "description": "HTML version of the content on page if `includeHtml` is true"
          },
          "rawHtml": {
            "type": "string",
            "nullable": true,
            "description": "Raw HTML content of the page if `includeRawHtml` is true"
          },
          "index": {
            "type": "integer",
            "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
          },
          "metadata": {
            "type": "object",
            "properties": {
              "title": {
                "type": "string"
              },
              "description": {
                "type": "string"
              },
              "language": {
                "type": "string",
                "nullable": true
              },
              "sourceURL": {
                "type": "string",
                "format": "uri"
              },
              "<any other metadata> ": {
                "type": "string"
              },
              "pageStatusCode": {
                "type": "integer",
                "description": "The status code of the page"
              },
              "pageError": {
                "type": "string",
                "nullable": true,
                "description": "The error message of the page"
              }
            }
          }
        }
      },
      "SearchResponse": {
        "type": "object",
        "properties": {
          "success": {
            "type": "boolean"
          },
          "data": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "url": {
                  "type": "string"
                },
                "markdown": {
                  "type": "string"
                },
                "content": {
                  "type": "string"
                },
                "metadata": {
                  "type": "object",
                  "properties": {
                    "title": {
                      "type": "string"
                    },
                    "description": {
                      "type": "string"
                    },
                    "language": {
                      "type": "string",
                      "nullable": true
                    },
                    "sourceURL": {
                      "type": "string",
                      "format": "uri"
                    }
                  }
                }
              }
            }
          }
        }
      },
      "CrawlResponse": {
        "type": "object",
        "properties": {
          "jobId": {
            "type": "string"
          }
        }
      }
    }
  },
  "security": [
    {
      "bearerAuth": []
    }
  ]
}
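For orientation, a request against this v0 spec might look like the following sketch (not part of the commit; the API key and target URL are placeholders):

// Sketch: POST /v0/scrape as described by the spec above.
const response = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    "Authorization": "Bearer YOUR_API_KEY", // placeholder
  },
  body: JSON.stringify({
    url: "https://example.com",
    pageOptions: { onlyMainContent: true },
  }),
});
const { success, data } = await response.json(); // ScrapeResponse
console.log(data.markdown);

The existing OpenAPI spec is then updated for v1 as follows: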
@@ -18,8 +18,8 @@
   "paths": {
     "/scrape": {
       "post": {
-        "summary": "Scrape a single URL and optionally extract information using an LLM",
-        "operationId": "scrapeAndExtractFromUrl",
+        "summary": "Scrape a single URL",
+        "operationId": "scrape",
         "tags": ["Scraping"],
         "security": [
           {
@@ -38,94 +38,47 @@
                   "format": "uri",
                   "description": "The URL to scrape"
                 },
-                "pageOptions": {
-                  "type": "object",
-                  "properties": {
+                "formats": {
+                  "type": "array",
+                  "items": {
+                    "type": "string",
+                    "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
+                  },
+                  "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
+                  "default": ["markdown"]
+                },
                 "headers": {
                   "type": "object",
                   "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
                 },
-                "includeHtml": {
-                  "type": "boolean",
-                  "description": "Include the HTML version of the content on page. Will output a html key in the response.",
-                  "default": false
-                },
-                "includeRawHtml": {
-                  "type": "boolean",
-                  "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
-                  "default": false
-                },
-                "onlyIncludeTags": {
+                "includeTags": {
                   "type": "array",
                   "items": {
                     "type": "string"
                   },
                   "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
                 },
-                "onlyMainContent": {
-                  "type": "boolean",
-                  "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
-                  "default": false
-                },
-                "removeTags": {
+                "excludeTags": {
                   "type": "array",
                   "items": {
                     "type": "string"
                   },
                   "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
                 },
-                "replaceAllPathsWithAbsolutePaths": {
+                "onlyMainContent": {
                   "type": "boolean",
-                  "description": "Replace all relative paths with absolute paths for images and links",
-                  "default": false
-                },
-                "screenshot": {
-                  "type": "boolean",
-                  "description": "Include a screenshot of the top of the page that you are scraping.",
-                  "default": false
-                },
-                "fullPageScreenshot": {
-                  "type": "boolean",
-                  "description": "Include a full page screenshot of the page that you are scraping.",
-                  "default": false
-                },
-                "waitFor": {
-                  "type": "integer",
-                  "description": "Wait x amount of milliseconds for the page to load to fetch content",
-                  "default": 0
-                }
-              }
-            },
-            "extractorOptions": {
-              "type": "object",
-              "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
-              "default": {},
-              "properties": {
-                "mode": {
-                  "type": "string",
-                  "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
-                  "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
-                },
-                "extractionPrompt": {
-                  "type": "string",
-                  "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
-                },
-                "extractionSchema": {
-                  "type": "object",
-                  "additionalProperties": true,
-                  "description": "The schema for the data to be extracted, required only for LLM extraction modes.",
-                  "required": [
-                    "company_mission",
-                    "supports_sso",
-                    "is_open_source"
-                  ]
-                }
-              }
-            },
+                  "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
+                  "default": true
+                },
                 "timeout": {
                   "type": "integer",
                   "description": "Timeout in milliseconds for the request",
                   "default": 30000
+                },
+                "waitFor": {
+                  "type": "integer",
+                  "description": "Wait x amount of milliseconds for the page to load to fetch content",
+                  "default": 0
                 }
               },
               "required": ["url"]
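The net effect of the hunk above: the v1 /scrape body drops the pageOptions wrapper and the extractorOptions block in favor of a flat body with a formats array; includeTags/excludeTags replace onlyIncludeTags/removeTags, and onlyMainContent now defaults to true. A request body under the new schema might look like this sketch (values illustrative):

const scrapeBody = {
  url: "https://example.com",
  formats: ["markdown", "html"],             // default is ["markdown"]
  excludeTags: ["script", ".ad", "#footer"],
  onlyMainContent: true,                     // default flipped from false to true
  timeout: 30000,
};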
@@ -741,24 +694,42 @@
           "success": {
             "type": "boolean"
           },
+          "warning": {
+            "type": "string",
+            "nullable": true,
+            "description": "Warning message to let you know of any issues."
+          },
           "data": {
             "type": "object",
             "properties": {
               "markdown": {
-                "type": "string"
-              },
-              "content": {
-                "type": "string"
+                "type": "string",
+                "nullable": true,
+                "description": "Markdown content of the page if the `markdown` format was specified (default)"
               },
               "html": {
                 "type": "string",
                 "nullable": true,
-                "description": "HTML version of the content on page if `includeHtml` is true"
+                "description": "HTML version of the content on page if the `html` format was specified"
               },
               "rawHtml": {
                 "type": "string",
                 "nullable": true,
-                "description": "Raw HTML content of the page if `includeRawHtml` is true"
+                "description": "Raw HTML content of the page if the `rawHtml` format was specified"
+              },
+              "links": {
+                "type": "array",
+                "items": {
+                  "type": "string",
+                  "format": "uri"
+                },
+                "nullable": true,
+                "description": "Links on the page if the `links` format was specified"
+              },
+              "screenshot": {
+                "type": "string",
+                "nullable": true,
+                "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
               },
               "metadata": {
                 "type": "object",
@@ -780,27 +751,16 @@
                   "<any other metadata> ": {
                     "type": "string"
                   },
-                  "pageStatusCode": {
+                  "statusCode": {
                     "type": "integer",
                     "description": "The status code of the page"
                   },
-                  "pageError": {
+                  "error": {
                     "type": "string",
                     "nullable": true,
                     "description": "The error message of the page"
                   }

                 }
-              },
-              "llm_extraction": {
-                "type": "object",
-                "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
-                "nullable": true
-              },
-              "warning": {
-                "type": "string",
-                "nullable": true,
-                "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
               }
             }
           }
@@ -810,24 +770,33 @@
         "type": "object",
         "properties": {
           "markdown": {
-            "type": "string"
-          },
-          "content": {
-            "type": "string"
+            "type": "string",
+            "nullable": true,
+            "description": "Markdown content of the page if the `markdown` format was specified (default)"
           },
           "html": {
             "type": "string",
             "nullable": true,
-            "description": "HTML version of the content on page if `includeHtml` is true"
+            "description": "HTML version of the content on page if the `html` format was specified"
           },
           "rawHtml": {
             "type": "string",
             "nullable": true,
-            "description": "Raw HTML content of the page if `includeRawHtml` is true"
+            "description": "Raw HTML content of the page if the `rawHtml` format was specified"
           },
-          "index": {
-            "type": "integer",
-            "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
+          "links": {
+            "type": "array",
+            "items": {
+              "type": "string",
+              "format": "uri"
+            },
+            "nullable": true,
+            "description": "Links on the page if the `links` format was specified"
+          },
+          "screenshot": {
+            "type": "string",
+            "nullable": true,
+            "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
           },
           "metadata": {
             "type": "object",
@@ -849,11 +818,11 @@
               "<any other metadata> ": {
                 "type": "string"
               },
-              "pageStatusCode": {
+              "statusCode": {
                 "type": "integer",
                 "description": "The status code of the page"
               },
-              "pageError": {
+              "error": {
                 "type": "string",
                 "nullable": true,
                 "description": "The error message of the page"
@@ -871,16 +840,34 @@
         "data": {
           "type": "array",
           "items": {
-            "type": "object",
-            "properties": {
-              "url": {
-                "type": "string"
-              },
             "markdown": {
-              "type": "string"
+              "type": "string",
+              "nullable": true,
+              "description": "Markdown content of the page if the `markdown` format was specified (default)"
             },
-            "content": {
-              "type": "string"
+            "html": {
+              "type": "string",
+              "nullable": true,
+              "description": "HTML version of the content on page if the `html` format was specified"
+            },
+            "rawHtml": {
+              "type": "string",
+              "nullable": true,
+              "description": "Raw HTML content of the page if the `rawHtml` format was specified"
+            },
+            "links": {
+              "type": "array",
+              "items": {
+                "type": "string",
+                "format": "uri"
+              },
+              "nullable": true,
+              "description": "Links on the page if the `links` format was specified"
+            },
+            "screenshot": {
+              "type": "string",
+              "nullable": true,
+              "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
             },
             "metadata": {
               "type": "object",
@@ -898,7 +885,18 @@
                 "sourceURL": {
                   "type": "string",
                   "format": "uri"
-                }
+                },
+                "<any other metadata> ": {
+                  "type": "string"
+                },
+                "statusCode": {
+                  "type": "integer",
+                  "description": "The status code of the page"
+                },
+                "error": {
+                  "type": "string",
+                  "nullable": true,
+                  "description": "The error message of the page"
+                }
               }
             }
           }
@@ -909,8 +907,15 @@
       "CrawlResponse": {
         "type": "object",
         "properties": {
-          "jobId": {
+          "success": {
+            "type": "boolean"
+          },
+          "id": {
             "type": "string"
+          },
+          "url": {
+            "type": "string",
+            "format": "uri"
           }
         }
       }
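With this change a successful POST /crawl no longer returns a bare jobId; it mirrors what crawlController returns later in this commit. An illustrative response (values are placeholders; the host reflects wherever the API is served):

const crawlResponse = {
  success: true,
  id: "00000000-0000-0000-0000-000000000000", // uuidv4 job id
  url: "https://api.firecrawl.dev/v1/crawl/00000000-0000-0000-0000-000000000000",
};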
@@ -106,7 +106,7 @@
     "uuid": "^10.0.0",
     "wordpos": "^2.1.0",
     "xml2js": "^0.6.2",
-    "zod": "^3.23.4",
+    "zod": "^3.23.8",
     "zod-to-json-schema": "^3.23.1"
   },
   "nodemonConfig": {

apps/api/pnpm-lock.yaml (generated, 2 lines changed)

@@ -189,7 +189,7 @@ importers:
         specifier: ^0.6.2
         version: 0.6.2
       zod:
-        specifier: ^3.23.4
+        specifier: ^3.23.8
         version: 3.23.8
       zod-to-json-schema:
         specifier: ^3.23.1
@ -1,22 +1,8 @@
|
|||||||
import { Request, Response } from "express";
|
import { Response } from "express";
|
||||||
import { authenticateUser } from "./auth";
|
|
||||||
import { RateLimiterMode } from "../../../src/types";
|
|
||||||
import { Logger } from "../../../src/lib/logger";
|
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
|
import { RequestWithAuth } from "./types";
|
||||||
|
|
||||||
export async function crawlStatusController(req: Request, res: Response) {
|
export async function crawlStatusController(req: RequestWithAuth, res: Response) {
|
||||||
// TODO: validate req.params.jobId
|
|
||||||
|
|
||||||
try {
|
|
||||||
const { success, team_id, error, status } = await authenticateUser(
|
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.CrawlStatus
|
|
||||||
);
|
|
||||||
if (!success) {
|
|
||||||
return res.status(status).json({ error });
|
|
||||||
}
|
|
||||||
|
|
||||||
// const job = await getWebScraperQueue().getJob(req.params.jobId);
|
// const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||||
// if (!job) {
|
// if (!job) {
|
||||||
// return res.status(404).json({ error: "Job not found" });
|
// return res.status(404).json({ error: "Job not found" });
|
||||||
@ -78,9 +64,5 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
res.status(200).json(result);
|
res.status(200).json(result);
|
||||||
} catch (error) {
|
|
||||||
Logger.error(error);
|
|
||||||
return res.status(500).json({ error: error.message });
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,140 +1,87 @@
-import { Request, Response } from "express";
-import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
-import { authenticateUser } from "./auth";
-import { RateLimiterMode } from "../../../src/types";
-import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
-import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
-import { createIdempotencyKey } from "../../../src/services/idempotency/create";
+import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../../src/lib/logger";
-import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
-import { CrawlRequest, CrawlResponse } from "./types";
+import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types";
+import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
+import { logCrawl } from "../../services/logging/crawl_log";
+import { getScrapeQueue } from "../../services/queue-service";
+import { addScrapeJob } from "../../services/queue-jobs";
+import { Logger } from "../../lib/logger";
 
-export async function crawlController(req: Request<{}, {}, CrawlRequest>, res: Response<CrawlResponse>) {
-  // expected req.body
+export async function crawlController(req: RequestWithAuth<CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
+  req.body = crawlRequestSchema.parse(req.body);
 
-  // req.body = {
-  //   url: string
-  //   crawlerOptions: {
-  //     includePaths: string[]
-  //     excludePaths: string[]
-  //     maxDepth: number
-  //     limit: number
-  //     allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
-  //     allowExternalLinks: boolean
-  //     ignoreSitemap: number
-  //   }
-  //   scrapeOptions: Exclude<Scrape, "url">
-  // }
-
-  try {
-    const { success, team_id, error, status } = await authenticateUser(
-      req,
-      res,
-      RateLimiterMode.Crawl
-    );
-    if (!success) {
-      return res.status(status).json({ success: false, error });
-    }
-
-    if (req.headers["x-idempotency-key"]) {
-      const isIdempotencyValid = await validateIdempotencyKey(req);
-      if (!isIdempotencyValid) {
-        return res.status(409).json({ success: false, error: "Idempotency key already used" });
-      }
-      try {
-        createIdempotencyKey(req);
-      } catch (error) {
-        Logger.error(error);
-        return res.status(500).json({ success: false, error: error.message });
-      }
-    }
-
-    const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-      await checkTeamCredits(team_id, 1);
-    if (!creditsCheckSuccess) {
-      return res.status(402).json({ success: false, error: "Insufficient credits" });
-    }
-
-    let url = req.body.url;
-    if (!url) {
-      return res.status(400).json({ success: false, error: "Url is required" });
-    }
-
-    if (isUrlBlocked(url)) {
-      return res
-        .status(403)
-        .json({
-          success: false,
-          error:
-            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-        });
-    }
-
-    try {
-      url = checkAndUpdateURL(url).url;
-    } catch (error) {
-      return res.status(400).json({ success: false, error: 'Invalid Url' });
-    }
-
-    // TODO: add job to queue
-
   const id = uuidv4();
 
+  await logCrawl(id, req.auth.team_id);
+
+  const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions),
+    pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+
+  const sc: StoredCrawl = {
+    originUrl: req.body.url,
+    crawlerOptions,
+    pageOptions,
+    team_id: req.auth.team_id,
+    createdAt: Date.now(),
+  };
+
+  const crawler = crawlToCrawler(id, sc);
+
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+  } catch (e) {
+    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
+  }
+
+  await saveCrawl(id, sc);
+
+  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
+
+  if (sitemap !== null) {
+    const jobs = sitemap.map(x => {
+      const url = x.url;
+      const uuid = uuidv4();
+      return {
+        name: uuid,
+        data: {
+          url,
+          mode: "single_urls",
+          team_id: req.auth.team_id,
+          crawlerOptions,
+          pageOptions,
+          origin: "api",
+          crawl_id: id,
+          sitemapped: true,
+        },
+        opts: {
+          jobId: uuid,
+          priority: 20,
+        }
+      };
+    })
+
+    await lockURLs(id, jobs.map(x => x.data.url));
+    await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+    await getScrapeQueue().addBulk(jobs);
+  } else {
+    await lockURL(id, sc, req.body.url);
+    const job = await addScrapeJob({
+      url: req.body.url,
+      mode: "single_urls",
+      crawlerOptions: crawlerOptions,
+      team_id: req.auth.team_id,
+      pageOptions: pageOptions,
+      origin: "api",
+      crawl_id: id,
+    }, {
+      priority: 15,
+    });
+    await addCrawlJob(id, job.id);
+  }
+
   return res.status(200).json({
     success: true,
     id,
     url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
   });
-
-    // const mode = req.body.mode ?? "crawl";
-
-    // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-    // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
-
-    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
-    //   try {
-    //     const a = new WebScraperDataProvider();
-    //     await a.setOptions({
-    //       jobId: uuidv4(),
-    //       mode: "single_urls",
-    //       urls: [url],
-    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
-    //       pageOptions: pageOptions,
-    //     });
-
-    //     const docs = await a.getDocuments(false, (progress) => {
-    //       job.progress({
-    //         current: progress.current,
-    //         total: progress.total,
-    //         current_step: "SCRAPING",
-    //         current_url: progress.currentDocumentUrl,
-    //       });
-    //     });
-    //     return res.json({
-    //       success: true,
-    //       documents: docs,
-    //     });
-    //   } catch (error) {
-    //     Logger.error(error);
-    //     return res.status(500).json({ error: error.message });
-    //   }
-    // }
-
-    // const job = await addWebScraperJob({
-    //   url: url,
-    //   mode: mode ?? "crawl", // fix for single urls not working
-    //   crawlerOptions: crawlerOptions,
-    //   team_id: team_id,
-    //   pageOptions: pageOptions,
-    //   origin: req.body.origin ?? defaultOrigin,
-    // });
-
-    // await logCrawl(job.id.toString(), team_id);
-
-    // res.json({ jobId: job.id });
-  } catch (error) {
-    Logger.error(error);
-    return res.status(500).json({ success: false, error: error.message });
-  }
 }
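crawlRequestSchema itself lives in ./types and is not shown in this diff. Based on the commented-out request shape removed above, a plausible sketch is the following (field optionality is an assumption; zod is the validator this commit adds):

import { z } from "zod";

// Hypothetical reconstruction of crawlRequestSchema from the removed comment.
const crawlerOptionsSchema = z.object({
  includePaths: z.string().array().optional(),
  excludePaths: z.string().array().optional(),
  maxDepth: z.number().optional(),
  limit: z.number().optional(),
  allowBackwardLinks: z.boolean().optional(), // ">> TODO: CHANGE THIS NAME???" in the removed comment
  allowExternalLinks: z.boolean().optional(),
  ignoreSitemap: z.boolean().optional(),
});

export const crawlRequestSchema = z.object({
  url: z.string().url(),
  crawlerOptions: crawlerOptionsSchema.optional(),
  scrapeOptions: z.object({}).passthrough().optional(), // "Exclude<Scrape, 'url'>" per the comment
});

export type CrawlRequest = z.infer<typeof crawlRequestSchema>;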
@@ -1,12 +1,12 @@
 import { Request, Response } from "express";
-import { authenticateUser } from "./auth";
-import { RateLimiterMode } from "../../../src/types";
-import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
 import { Logger } from "../../../src/lib/logger";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
-import { MapRequest, MapResponse } from "./types";
+import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
+import { checkTeamCredits } from "../../services/billing/credit_billing";

-export async function mapController(req: Request<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
+export async function mapController(req: RequestWithAuth<MapResponse, MapRequest>, res: Response<MapResponse>) {
+  req.body = mapRequestSchema.parse(req.body);
+  console.log(req.body);
   // expected req.body

   // req.body = {
@@ -14,55 +14,6 @@ export async function mapController(req: Request<{}, MapResponse, MapRequest>, r
   //   crawlerOptions:
   // }

-  try {
-    const { success, team_id, error, status } = await authenticateUser(
-      req,
-      res,
-      RateLimiterMode.Crawl
-    );
-    if (!success) {
-      return res.status(status).json({ success: false, error });
-    }
-
-    // if (req.headers["x-idempotency-key"]) {
-    //   const isIdempotencyValid = await validateIdempotencyKey(req);
-    //   if (!isIdempotencyValid) {
-    //     return res.status(409).json({ error: "Idempotency key already used" });
-    //   }
-    //   try {
-    //     createIdempotencyKey(req);
-    //   } catch (error) {
-    //     Logger.error(error);
-    //     return res.status(500).json({ error: error.message });
-    //   }
-    // }
-
-    // const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-    //   await checkTeamCredits(team_id, 1);
-    // if (!creditsCheckSuccess) {
-    //   return res.status(402).json({ error: "Insufficient credits" });
-    // }
-
-    let url = req.body.url;
-    if (!url) {
-      return res.status(400).json({ success: false, error: "Url is required" });
-    }
-
-    if (isUrlBlocked(url)) {
-      return res
-        .status(403)
-        .json({
-          success: false,
-          error:
-            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-        });
-    }
-
-    try {
-      url = checkAndUpdateURL(url).url;
-    } catch (error) {
-      return res.status(400).json({ success: false, error: 'Invalid Url' });
-    }
-
   return res.status(200).json({ success: true, links: [ "test1", "test2" ] });

@@ -112,8 +63,4 @@ export async function mapController(req: Request<{}, MapResponse, MapRequest>, r
   // await logCrawl(job.id.toString(), team_id);

   // res.json({ jobId: job.id });
-  } catch (error) {
-    Logger.error(error);
-    return res.status(500).json({ success: false, error: error.message });
-  }
 }
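With this change the map controller no longer hand-rolls URL checks: mapRequestSchema.parse (defined in the types module later in this commit) normalizes and validates the body up front, and any ZodError it throws is converted into a 400 by the router-level error handler added below. A minimal sketch of the parse behavior, assuming the schema as declared in types.ts; the sample URL and values in comments are illustrative only:

import { mapRequestSchema } from "./types";

// A bare domain gains an http:// scheme via the `url` preprocessor,
// and crawler options fall back to their declared zod defaults.
const parsed = mapRequestSchema.parse({ url: "firecrawl.dev" });
// parsed.url   === "http://firecrawl.dev"
// parsed.limit === 10000 (schema default)

// A non-HTTP scheme fails validation; the thrown ZodError is turned
// into a 400 "Bad Request" by the shared error middleware.
mapRequestSchema.parse({ url: "ftp://example.com" }); // throws ZodError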
@@ -1,26 +1,11 @@
 import { Request, Response } from "express";
-import { authenticateUser } from "./auth";
-import { RateLimiterMode } from "../../types";
-import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { Logger } from '../../lib/logger';
 import { checkAndUpdateURL } from '../../lib/validateUrl';
-import { ScrapeRequest, ScrapeResponse } from "./types";
+import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";

-export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
-  let url = req.body.url;
-  if (!url) {
-    return { success: false, error: "Url is required", returnCode: 400 };
-  }
-
-  if (isUrlBlocked(url)) {
-    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
-  }
-
-  try {
-    url = checkAndUpdateURL(url).url;
-  } catch (error) {
-    return { success: false, error: "Invalid URL", returnCode: 400 };
-  }
-
+export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
+  req.body = scrapeRequestSchema.parse(req.body);
+  console.log(req.body);
   // TODO: check req.body
   // mockup req.body
@@ -37,17 +22,8 @@ export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRe
   //   waitFor: number
   // }

-  try {
   let earlyReturn = false;
   // make sure to authenticate user first, Bearer <token>
-  const { success, team_id, error, status, plan } = await authenticateUser(
-    req,
-    res,
-    RateLimiterMode.Scrape
-  );
-  if (!success) {
-    return res.status(status).json({ success: false, error });
-  }

   // check credits

@@ -164,10 +140,6 @@ export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRe


   // return res.status(result.returnCode).json(result);
-  } catch (error) {
-    Logger.error(error);
-    return res.status(500).json({ success: false, error: error.message });
-  }
 }

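The scrape controller follows the same pattern: scrapeRequestSchema.parse replaces the manual checks for a missing URL, the blocklist, and the protocol fixup, so malformed bodies never reach the handler. A sketch of what the parsed body looks like, assuming the zod defaults declared in types.ts below:

import { scrapeRequestSchema } from "./types";

const body = scrapeRequestSchema.parse({ url: "https://firecrawl.dev" });
// body.formats         === ["markdown"] (default)
// body.onlyMainContent === true         (default)
// body.timeout         === 30000        (default)
// body.waitFor         === 0            (default)

// A missing url no longer needs a manual check in the controller:
scrapeRequestSchema.parse({}); // throws ZodError ("Required" on url)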
@@ -1,36 +1,96 @@
+import { Request } from "express";
+import { z } from "zod";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
+import { PageOptions } from "../../lib/entities";

 export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage";

-export type ScrapeRequest = {
-  url: string;
-  formats?: Format[];
-  headers?: { [K: string]: string };
-  includeTags?: string[];
-  excludeTags?: string[];
-  onlyMainContent?: boolean;
-  timeout?: number;
-  waitFor?: number;
-}
+const url = z.preprocess(x => {
+  if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
+    if (x.startsWith("://")) {
+      return "http" + x;
+    } else {
+      return "http://" + x;
+    }
+  } else {
+    return x;
+  }
+}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."));

-export type CrawlerOptions = {
-  includePaths?: string[];
-  excludePaths?: string[];
-  maxDepth?: number;
-  limit?: number;
-  allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
-  allowExternalLinks?: boolean;
-  ignoreSitemap?: boolean;
-};
+export const scrapeOptions = z.object({
+  formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
+    .array()
+    .optional()
+    .default(["markdown"]),
+  headers: z.record(z.string(), z.string()).optional(),
+  includeTags: z.string().array().optional(),
+  excludeTags: z.string().array().optional(),
+  onlyMainContent: z.boolean().default(true),
+  timeout: z.number().int().positive().finite().safe().default(30000), // default?
+  waitFor: z.number().int().nonnegative().finite().safe().default(0),
+});

-export type CrawlRequest = {
-  url: string;
-  crawlerOptions?: CrawlerOptions;
-  scrapeOptions?: Exclude<ScrapeRequest, "url">;
-};
+export type ScrapeOptions = z.infer<typeof scrapeOptions>;

-export type MapRequest = {
-  url: string;
-  crawlerOptions?: CrawlerOptions;
-};
+export const scrapeRequestSchema = scrapeOptions.extend({ url });
+
+// export type ScrapeRequest = {
+//   url: string;
+//   formats?: Format[];
+//   headers?: { [K: string]: string };
+//   includeTags?: string[];
+//   excludeTags?: string[];
+//   onlyMainContent?: boolean;
+//   timeout?: number;
+//   waitFor?: number;
+// }
+
+export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
+
+const crawlerOptions = z.object({
+  includePaths: z.string().array().default([]),
+  excludePaths: z.string().array().default([]),
+  maxDepth: z.number().default(10), // default?
+  limit: z.number().default(10000), // default?
+  allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
+  allowExternalLinks: z.boolean().default(false),
+  ignoreSitemap: z.boolean().default(true),
+});
+
+// export type CrawlerOptions = {
+//   includePaths?: string[];
+//   excludePaths?: string[];
+//   maxDepth?: number;
+//   limit?: number;
+//   allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
+//   allowExternalLinks?: boolean;
+//   ignoreSitemap?: boolean;
+// };
+
+export type CrawlerOptions = z.infer<typeof crawlerOptions>;
+
+export const crawlRequestSchema = z.object({
+  url,
+  crawlerOptions: crawlerOptions.default({}),
+  scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+});
+
+// export type CrawlRequest = {
+//   url: string;
+//   crawlerOptions?: CrawlerOptions;
+//   scrapeOptions?: Exclude<ScrapeRequest, "url">;
+// };
+
+export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
+
+export const mapRequestSchema = crawlerOptions.extend({ url });
+
+// export type MapRequest = {
+//   url: string;
+//   crawlerOptions?: CrawlerOptions;
+// };
+
+export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
   markdown?: string,
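The `url` preprocessor above deserves a note: it prepends a scheme before z.string().url() runs, so bare domains validate, while non-HTTP protocols are rejected by the trailing regex and blocked hosts by the refine step. Illustrative inputs and outputs (a sketch, not part of the commit):

// "example.com"         -> "http://example.com"  (scheme prepended)
// "://example.com"      -> "http://example.com"  ("http" prefixed)
// "https://example.com" -> unchanged
// "ftp://example.com"   -> passes preprocessing, then fails the
//                          /^https?:\/\// "URL uses unsupported protocol" check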
@@ -77,6 +137,7 @@ export type Document = {
 export type ErrorResponse = {
   success: false;
   error: string;
+  details?: any;
 };

 export type ScrapeResponse = ErrorResponse | {
@@ -95,3 +156,40 @@ export type MapResponse = ErrorResponse | {
   success: true;
   links: string[];
 }
+
+type AuthObject = {
+  team_id: string;
+  plan: string;
+}
+
+export interface RequestWithMaybeAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
+  auth?: AuthObject;
+}
+
+export interface RequestWithAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
+  auth: AuthObject;
+}
+
+export function legacyCrawlerOptions(x: CrawlerOptions) {
+  return {
+    includes: x.includePaths,
+    excludes: x.excludePaths,
+    maxCrawledLinks: x.limit,
+    maxCrawledDepth: x.maxDepth,
+    limit: x.limit,
+    generateImgAltText: false,
+    allowBackwardCrawling: x.allowBackwardLinks,
+    allowExternalContentLinks: x.allowExternalLinks,
+  };
+}
+
+export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
+  return {
+    includeHtml: x.formats.includes("html"),
+    includeRawHtml: x.formats.includes("rawHtml"),
+    onlyIncludeTags: x.includeTags,
+    removeTags: x.excludeTags,
+    onlyMainContent: x.onlyMainContent,
+    waitFor: x.waitFor,
+  };
+}
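The two legacy* helpers bridge the parsed v1 options onto the existing v0-era engine shapes (PageOptions and the crawler options object). Note that legacyCrawlerOptions feeds x.limit into both maxCrawledLinks and limit. A hedged usage sketch; the controllers' exact call sites are not part of this hunk:

import { crawlRequestSchema, scrapeRequestSchema, legacyCrawlerOptions, legacyScrapeOptions } from "./types";

// Parse a crawl body, then translate the validated options for the engine.
const crawl = crawlRequestSchema.parse({ url: "firecrawl.dev" });
const engineCrawlerOptions = legacyCrawlerOptions(crawl.crawlerOptions);
// engineCrawlerOptions.maxCrawledLinks === 10000 (from x.limit)
// engineCrawlerOptions.maxCrawledDepth === 10    (from x.maxDepth)

// Same idea for page options on a scrape body.
const scrape = scrapeRequestSchema.parse({ url: "firecrawl.dev" });
const pageOptions = legacyScrapeOptions(scrape);
// pageOptions.includeHtml === false ("html" is not in the default ["markdown"])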
@@ -1,9 +1,18 @@
-import express from "express";
+import express, { NextFunction, Request, Response } from "express";
 import { crawlController } from "../../src/controllers/v1/crawl";
 // import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
 import { scrapeController } from "../../src/controllers/v1/scrape";
 import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
 import { mapController } from "../../src/controllers/v1/map";
+import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
+import { RateLimiterMode } from "../types";
+import { authenticateUser } from "../controllers/v1/auth";
+import { Logger } from "../lib/logger";
+import { createIdempotencyKey } from "../services/idempotency/create";
+import { validateIdempotencyKey } from "../services/idempotency/validate";
+import { ZodError } from "zod";
+import { checkTeamCredits } from "../services/billing/credit_billing";
+import { v4 as uuidv4 } from "uuid";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -12,13 +21,96 @@ import { mapController } from "../../src/controllers/v1/map";
 // import { livenessController } from "../controllers/v1/liveness";
 // import { readinessController } from "../controllers/v1/readiness";

+function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
+  return (req, res, next) => {
+    (async () => {
+      if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
+        return res.status(402).json({ success: false, error: "Insufficient credits" });
+      }
+      next();
+    })()
+      .catch(err => next(err));
+  };
+}
+
+function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
+  return (req, res, next) => {
+    (async () => {
+      const { success, team_id, error, status, plan } = await authenticateUser(
+        req,
+        res,
+        rateLimiterMode,
+      );
+
+      if (!success) {
+        return res.status(status).json({ success: false, error });
+      }
+
+      req.auth = { team_id, plan };
+      next();
+    })()
+      .catch(err => next(err));
+  }
+}
+
+function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
+  (async () => {
+    if (req.headers["x-idempotency-key"]) {
+      const isIdempotencyValid = await validateIdempotencyKey(req);
+      if (!isIdempotencyValid) {
+        return res.status(409).json({ success: false, error: "Idempotency key already used" });
+      }
+      // try {
+      createIdempotencyKey(req);
+      // } catch (error) {
+      //   Logger.error(error);
+      //   return res.status(500).json({ success: false, error: error.message });
+      // }
+    }
+    next();
+  })()
+    .catch(err => next(err));
+}
+
+function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
+  return (req, res, next) => {
+    controller(req, res)
+      .catch(err => next(err))
+  }
+}
+
 export const v1Router = express.Router();

-v1Router.post("/v1/scrape", scrapeController);
-v1Router.post("/v1/crawl", crawlController);
-v1Router.get("/v1/crawl/:jobId", crawlStatusController);
+v1Router.post(
+  "/v1/scrape",
+  authMiddleware(RateLimiterMode.Scrape),
+  checkCreditsMiddleware(1),
+  wrap(scrapeController)
+);
+
+v1Router.post(
+  "/v1/crawl",
+  authMiddleware(RateLimiterMode.Crawl),
+  idempotencyMiddleware,
+  checkCreditsMiddleware(1),
+  wrap(crawlController)
+);
+
+v1Router.post(
+  "/v1/map",
+  authMiddleware(RateLimiterMode.Crawl),
+  checkCreditsMiddleware(1),
+  wrap(mapController)
+);
+
+v1Router.get(
+  "/v1/crawl/:jobId",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlStatusController)
+);
+
 // v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
-// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
+// v1Router.delete("/v1/crawl/:jobId", crawlCancelController);
 // v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);

 // // Auth route for key based authentication
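The route registrations make the middleware order explicit: authenticate first (attaching req.auth), then idempotency where relevant, then the credit check, and finally the controller wrapped by wrap, which forwards rejected promises to Express's error chain (Express 4 does not catch async rejections on its own). A sketch of the pattern a future endpoint could follow; the "/v1/search" path, RateLimiterMode.Search value, and searchController are hypothetical names, not part of this commit:

// Hypothetical: registering a new endpoint with the same middleware chain.
v1Router.post(
  "/v1/search",                            // hypothetical endpoint
  authMiddleware(RateLimiterMode.Search),  // authenticates, rate limits, sets req.auth
  checkCreditsMiddleware(1),               // 402s if the team has no credits
  wrap(searchController)                   // async rejections flow to the error handler
);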
@@ -31,4 +123,12 @@ v1Router.get("/v1/crawl/:jobId", crawlStatusController);
 // v1Router.get("/v1/health/liveness", livenessController);
 // v1Router.get("/v1/health/readiness", readinessController);

-v1Router.post("/v1/map", mapController);
+v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
+  if (err instanceof ZodError) {
+    res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
+  } else {
+    const id = uuidv4();
+    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + JSON.stringify(err));
+    res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
+  }
+});
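The closing error handler gives the v1 surface a single failure shape: ZodErrors become 400s carrying the zod issue list in details (the field added to ErrorResponse above), while everything else becomes a 500 tagged with a fresh exception ID for log correlation. Illustrative response shapes (contents are a sketch):

// Validation failure (ZodError) -> HTTP 400:
//   { "success": false, "error": "Bad Request", "details": [ ...zod issues... ] }
//
// Any other thrown error -> HTTP 500:
//   { "success": false, "error": "An unexpected error occurred. Please contact
//     hello@firecrawl.com for help. Your exception ID is <uuid>" }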