diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 5b1b9f69..41f21e71 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -132,7 +132,7 @@ jobs: working-directory: ./apps/python-sdk - name: Run E2E tests for Python SDK run: | - pytest firecrawl/__tests__/e2e_withAuth/test.py + pytest firecrawl/__tests__/v1/e2e_withAuth/test.py working-directory: ./apps/python-sdk js-sdk-tests: diff --git a/.gitignore b/.gitignore index 91b7ef48..9eb551a9 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ apps/test-suite/load-test-results/test-run-report.json apps/playwright-service-ts/node_modules/ apps/playwright-service-ts/package-lock.json +*.pyc +.rdb diff --git a/.gitmodules b/.gitmodules index d56adf88..b42c5d23 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "apps/go-sdk/firecrawl"] - path = apps/go-sdk/firecrawl +[submodule "apps/go-sdk/firecrawl-go"] + path = apps/go-sdk/firecrawl-go url = https://github.com/mendableai/firecrawl-go -[submodule "apps/go-sdk/examples"] - path = apps/go-sdk/examples +[submodule "apps/go-sdk/firecrawl-go-examples"] + path = apps/go-sdk/firecrawl-go-examples url = https://github.com/mendableai/firecrawl-go-examples diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cece879b..d0145a6b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -44,7 +44,6 @@ BULL_AUTH_KEY= @ LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs -SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs POSTHOG_HOST= # set if you'd like to send posthog events like job logs diff --git a/README.md b/README.md index 01324690..21f480cc 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom ## What is Firecrawl? -[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. +[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev). _Pst. hey, you, join our stargazers :)_ @@ -41,18 +41,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl. ```bash -curl -X POST https://api.firecrawl.dev/v0/crawl \ +curl -X POST https://api.firecrawl.dev/v1/crawl \ -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer YOUR_API_KEY' \ + -H 'Authorization: Bearer fc-YOUR_API_KEY' \ -d '{ - "url": "https://mendable.ai" + "url": "https://docs.firecrawl.dev", + "limit": 100, + "scrapeOptions": { + "formats": ["markdown", "html"] + } }' ``` -Returns a jobId +Returns a crawl job id and the url to check the status of the crawl. 
```json -{ "jobId": "1234-5678-9101" } +{ + "success": true, + "id": "123-456-789", + "url": "https://api.firecrawl.dev/v1/crawl/123-456-789" +} ``` ### Check Crawl Job @@ -60,7 +68,7 @@ Returns a jobId Used to check the status of a crawl job and get its result. ```bash -curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ +curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' ``` @@ -68,18 +76,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ ```json { "status": "completed", - "current": 22, - "total": 22, + "total": 36, + "creditsUsed": 36, + "expiresAt": "2024-00-00T00:00:00.000Z", "data": [ { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", + "markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...", + "html": "...", "metadata": { - "title": "Mendable | AI for CX and Sales", - "description": "AI for CX and Sales", - "language": null, - "sourceURL": "https://www.mendable.ai/" + "title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl", + "language": "en", + "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3", + "description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.", + "ogLocaleAlternate": [], + "statusCode": 200 } } ] @@ -88,14 +98,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ ### Scraping -Used to scrape a URL and get its content. +Used to scrape a URL and get its content in the specified formats. ```bash -curl -X POST https://api.firecrawl.dev/v0/scrape \ +curl -X POST https://api.firecrawl.dev/v1/scrape \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' \ -d '{ - "url": "https://mendable.ai" + "url": "https://docs.firecrawl.dev", + "formats" : ["markdown", "html"] }' ``` @@ -105,55 +116,83 @@ Response: { "success": true, "data": { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", + "markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...", + "html": " ": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + + } + }, + "llm_extraction": { + "type": "object", + "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", + "nullable": true + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." + } + } + } + } + }, + "CrawlStatusResponseObj": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + }, + "SearchResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } + } + } + } + } + } + } + }, + "CrawlResponse": { + "type": "object", + "properties": { + "jobId": { + "type": "string" + } + } + } + } + }, + "security": [ + { + "bearerAuth": [] + } + ] +} \ No newline at end of file diff --git a/apps/api/openapi.json b/apps/api/openapi.json index fb0c4305..5bd3e3d8 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -18,8 +18,8 @@ "paths": { "/scrape": { "post": { - "summary": "Scrape a single URL and optionally extract information using an LLM", - "operationId": "scrapeAndExtractFromUrl", + "summary": "Scrape a single URL", + "operationId": "scrape", "tags": ["Scraping"], "security": [ { @@ -38,94 +38,47 @@ "format": "uri", "description": "The URL to scrape" }, - "pageOptions": { - "type": "object", - "properties": { - "headers": { - "type": "object", - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." - }, - "includeHtml": { - "type": "boolean", - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "default": false - }, - "includeRawHtml": { - "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "default": false - }, - "onlyIncludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false - }, - "removeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" - }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, - "screenshot": { - "type": "boolean", - "description": "Include a screenshot of the top of the page that you are scraping.", - "default": false - }, - "fullPageScreenshot": { - "type": "boolean", - "description": "Include a full page screenshot of the page that you are scraping.", - "default": false - }, - "waitFor": { - "type": "integer", - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "default": 0 - } - } + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"] + }, + "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)", + "default": ["markdown"] }, - "extractorOptions": { + "headers": { "type": "object", - "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", - "default": {}, - "properties": { - "mode": { - "type": "string", - "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"], - "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM." - }, - "extractionPrompt": { - "type": "string", - "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes." - }, - "extractionSchema": { - "type": "object", - "additionalProperties": true, - "description": "The schema for the data to be extracted, required only for LLM extraction modes.", - "required": [ - "company_mission", - "supports_sso", - "is_open_source" - ] - } - } + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true }, "timeout": { "type": "integer", "description": "Timeout in milliseconds for the request", "default": 30000 + }, + "waitFor": { + "type": "integer", + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "default": 0 } }, "required": ["url"] @@ -741,24 +694,42 @@ "success": { "type": "boolean" }, + "warning": { + "type": "string", + "nullable": true, + "description": "Warning message to let you know of any issues." + }, "data": { "type": "object", "properties": { "markdown": { - "type": "string" - }, - "content": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if `includeHtml` is true" + "description": "HTML version of the content on page if the `html` format was specified" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeRawHtml` is true" + "description": "Raw HTML content of the page if the `rawHtml` format was specified" + }, + "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" }, "metadata": { "type": "object", @@ -780,27 +751,16 @@ " ": { "type": "string" }, - "pageStatusCode": { + "statusCode": { "type": "integer", "description": "The status code of the page" }, - "pageError": { + "error": { "type": "string", "nullable": true, "description": "The error message of the page" } - } - }, - "llm_extraction": { - "type": "object", - "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", - "nullable": true - }, - "warning": { - "type": "string", - "nullable": true, - "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." } } } @@ -810,24 +770,33 @@ "type": "object", "properties": { "markdown": { - "type": "string" - }, - "content": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if `includeHtml` is true" + "description": "HTML version of the content on page if the `html` format was specified" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeRawHtml` is true" + "description": "Raw HTML content of the page if the `rawHtml` format was specified" }, - "index": { - "type": "integer", - "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" }, "metadata": { "type": "object", @@ -849,11 +818,11 @@ " ": { "type": "string" }, - "pageStatusCode": { + "statusCode": { "type": "integer", "description": "The status code of the page" }, - "pageError": { + "error": { "type": "string", "nullable": true, "description": "The error message of the page" @@ -871,34 +840,63 @@ "data": { "type": "array", "items": { - "type": "object", - "properties": { - "url": { - "type": "string" + "markdown": { + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if the `html` format was specified" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if the `rawHtml` format was specified" + }, + "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" }, - "markdown": { - "type": "string" - }, - "content": { - "type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" - } + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } } @@ -909,8 +907,15 @@ "CrawlResponse": { "type": "object", "properties": { - "jobId": { + "success": { + "type": "boolean" + }, + "id": { "type": "string" + }, + "url": { + "type": "string", + "format": "uri" } } } diff --git a/apps/api/package.json b/apps/api/package.json index c9058943..0ece960a 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -61,6 +61,8 @@ "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", "@supabase/supabase-js": "^2.44.2", + "@types/express-ws": "^3.0.4", + "@types/ws": "^8.5.12", "ajv": "^8.16.0", "async": "^3.2.5", "async-mutex": "^0.5.0", @@ -76,6 +78,7 @@ "dotenv": "^16.3.1", "dotenv-cli": "^7.4.2", "express-rate-limit": "^7.3.1", + "express-ws": "^5.0.2", "form-data": "^4.0.0", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", @@ -110,8 +113,9 @@ "unstructured-client": "^0.11.3", "uuid": "^10.0.0", "wordpos": "^2.1.0", + "ws": "^8.18.0", "xml2js": "^0.6.2", - "zod": "^3.23.4", + "zod": "^3.23.8", "zod-to-json-schema": "^3.23.1" }, "nodemonConfig": { diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index efbe9d80..727c4ed4 100644 --- 
a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -47,6 +47,12 @@ importers: '@supabase/supabase-js': specifier: ^2.44.2 version: 2.44.2 + '@types/express-ws': + specifier: ^3.0.4 + version: 3.0.4 + '@types/ws': + specifier: ^8.5.12 + version: 8.5.12 ajv: specifier: ^8.16.0 version: 8.16.0 @@ -92,6 +98,9 @@ importers: express-rate-limit: specifier: ^7.3.1 version: 7.3.1(express@4.19.2) + express-ws: + specifier: ^5.0.2 + version: 5.0.2(express@4.19.2) form-data: specifier: ^4.0.0 version: 4.0.0 @@ -115,7 +124,7 @@ importers: version: 0.0.28 langchain: specifier: ^0.2.8 - version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1) + version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -194,11 +203,14 @@ importers: wordpos: specifier: ^2.1.0 version: 2.1.0 + ws: + specifier: ^8.18.0 + version: 8.18.0 xml2js: specifier: ^0.6.2 version: 0.6.2 zod: - specifier: ^3.23.4 + specifier: ^3.23.8 version: 3.23.8 zod-to-json-schema: specifier: ^3.23.1 @@ -1637,6 +1649,9 @@ packages: '@types/express-serve-static-core@4.19.3': resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==} + '@types/express-ws@3.0.4': + resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==} + '@types/express@4.17.21': resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==} @@ -1739,8 +1754,8 @@ packages: '@types/whatwg-url@11.0.5': resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==} - '@types/ws@8.5.10': - resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==} + '@types/ws@8.5.12': + resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==} '@types/yargs-parser@21.0.3': resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==} @@ -2506,6 +2521,12 @@ packages: peerDependencies: express: 4 || 5 || ^5.0.0-beta.1 + express-ws@5.0.2: + resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==} + engines: {node: '>=4.5.0'} + peerDependencies: + express: ^4.0.0 || ^5.0.0-alpha.1 + express@4.19.2: resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==} engines: {node: '>= 0.10.0'} @@ -4647,8 +4668,20 @@ packages: resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} - ws@8.17.1: - resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==} + ws@7.5.10: + resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==} + engines: {node: '>=8.3.0'} + 
peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: ^5.0.2 + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + + ws@8.18.0: + resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==} engines: {node: '>=10.0.0'} peerDependencies: bufferutil: ^4.0.1 @@ -5286,13 +5319,13 @@ snapshots: '@js-sdsl/ordered-map@4.4.2': {} - '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)': + '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)': dependencies: ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.12 - langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) ml-distance: 4.0.1 mustache: 4.2.0 p-queue: 6.6.2 @@ -5304,9 +5337,9 @@ snapshots: - langchain - openai - '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))': + '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + '@langchain/core': 
0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) js-tiktoken: 1.0.12 openai: 4.52.2 zod: 3.23.8 @@ -5315,9 +5348,9 @@ snapshots: - encoding - langchain - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)': + '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) js-tiktoken: 1.0.12 transitivePeerDependencies: - langchain @@ -6545,8 +6578,8 @@ snapshots: dependencies: '@supabase/node-fetch': 2.6.15 '@types/phoenix': 1.6.5 - '@types/ws': 8.5.10 - ws: 8.17.1 + '@types/ws': 8.5.12 + ws: 8.18.0 transitivePeerDependencies: - bufferutil - utf-8-validate @@ -6643,6 +6676,12 @@ snapshots: '@types/range-parser': 1.2.7 '@types/send': 0.17.4 + '@types/express-ws@3.0.4': + dependencies: + '@types/express': 4.17.21 + '@types/express-serve-static-core': 4.19.3 + '@types/ws': 8.5.12 + '@types/express@4.17.21': dependencies: '@types/body-parser': 1.19.5 @@ -6766,7 +6805,7 @@ snapshots: dependencies: '@types/webidl-conversions': 7.0.3 - '@types/ws@8.5.10': + '@types/ws@8.5.12': dependencies: '@types/node': 20.14.1 @@ -7521,6 +7560,14 @@ snapshots: dependencies: express: 4.19.2 + express-ws@5.0.2(express@4.19.2): + dependencies: + express: 4.19.2 + ws: 7.5.10 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + express@4.19.2: dependencies: accepts: 1.3.8 @@ -8440,17 +8487,17 @@ snapshots: kleur@3.0.3: {} - langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1): + langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: - '@langchain/core': 
0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) - '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) + '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) binary-extensions: 2.3.0 js-tiktoken: 1.0.12 js-yaml: 4.1.0 jsonpointer: 5.0.1 langchainhub: 0.0.11 - langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -8470,14 +8517,14 @@ snapshots: pdf-parse: 1.1.1 puppeteer: 22.12.1(typescript@5.4.5) redis: 4.6.14 - ws: 8.17.1 + ws: 8.18.0 transitivePeerDependencies: - encoding - openai langchainhub@0.0.11: {} - 
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2): + langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2): dependencies: '@types/uuid': 9.0.8 commander: 10.0.1 @@ -8486,8 +8533,8 @@ snapshots: p-retry: 4.6.2 uuid: 9.0.1 optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) - langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) openai: 4.52.2 languagedetect@2.0.0: {} @@ -9195,7 +9242,7 @@ snapshots: chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070) debug: 4.3.5 devtools-protocol: 0.0.1299070 - ws: 8.17.1 + ws: 8.18.0 transitivePeerDependencies: - bufferutil - supports-color @@ -9877,7 +9924,9 @@ snapshots: imurmurhash: 0.1.4 signal-exit: 4.1.0 - ws@8.17.1: {} + ws@7.5.10: {} + + ws@8.18.0: {} xml2js@0.6.2: dependencies: diff --git a/apps/api/requests.http b/apps/api/requests.http index 3a1a9902..5d55b481 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,12 +1,16 @@ ### Crawl Website POST http://localhost:3002/v0/scrape HTTP/1.1 -Authorization: Bearer fc +Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673 content-type: application/json { - "url":"firecrawl.dev" + "url":"corterix.com" } +### Check Job Status +GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 +Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673 + ### Check Job Status GET 
http://localhost:3002/v0/jobs/active HTTP/1.1 diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 019bc968..b1708abc 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); + .send({ url: 'https://docs.firecrawl.dev' }); expect(firstResponse.statusCode).toBe(200); @@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); + .send({ url: 'https://docs.firecrawl.dev' }); expect(secondResponse.statusCode).toBe(409); expect(secondResponse.body.error).toBe('Idempotency key already used'); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts new file mode 100644 index 00000000..dd7d4f16 --- /dev/null +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -0,0 +1,951 @@ +import request from "supertest"; +import dotenv from "dotenv"; +import { + ScrapeRequest, + ScrapeResponseRequestTest, +} from "../../controllers/v1/types"; + +dotenv.config(); +const TEST_URL = "http://127.0.0.1:3002"; + +describe("E2E Tests for v1 API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; + }); + + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; + }); + + describe("GET /is-production", () => { + it.concurrent("should return the production status", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL).get( + "/is-production" + ); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); + + describe("POST /v1/scrape", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL).post( + "/v1/scrape" + ); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://facebook.com/fake-test", + }; + + const response = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(403); + expect(response.body.error).toBe("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.metadata.error).toBeUndefined(); + expect(response.body.data.metadata.title).toBe("Roast My Website"); + expect(response.body.data.metadata.description).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(response.body.data.metadata.keywords).toBe( + "Roast My Website,Roast,Website,GitHub,Firecrawl" + ); + expect(response.body.data.metadata.robots).toBe("follow, index"); + expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); + expect(response.body.data.metadata.ogDescription).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️" + ); + expect(response.body.data.metadata.ogUrl).toBe( + "https://www.roastmywebsite.ai" + ); + expect(response.body.data.metadata.ogImage).toBe( + "https://www.roastmywebsite.ai/og.png" + ); + expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); + expect(response.body.data.metadata.sourceURL).toBe( + "https://roastmywebsite.ai" + ); + expect(response.body.data.metadata.statusCode).toBe(200); + }, + 30000 + ); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key and includeHtml set to true", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["markdown", "html"], + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.html).toContain(" { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" + // formats: ["markdown", "html"], + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, 60000); + + it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/pdf/astro-ph/9301001" + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, 60000); + + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const scrapeRequest: ScrapeRequest = { + url: 
"https://www.scrapethissite.com/", + onlyMainContent: false // default is true + }; + const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + + if (!("data" in responseWithoutRemoveTags.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav + expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer + + const scrapeRequestWithRemoveTags: ScrapeRequest = { + url: "https://www.scrapethissite.com/", + excludeTags: ['.nav', '#footer', 'strong'], + onlyMainContent: false // default is true + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequestWithRemoveTags); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); + expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // + }, 30000); + + it.concurrent('should return a successful response for a scrape with 400 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(400); + }, 60000); + + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(401); + }, 60000); + + it.concurrent('should return a successful response 
for a scrape with 403 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(403); + }, 60000); + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(404); + }, 60000); + + it.concurrent('should return a successful response for a scrape with 405 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/405' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(405); + }, 60000); + + it.concurrent('should return a successful response for a scrape with 500 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/500' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(500); + }, 60000); + + it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); + + expect(response.statusCode).toBe(408); + }, 3000); + + it.concurrent( + "should return a successful response with a valid API 
key and includeHtml set to true", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["html","rawHtml"], + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.html).toContain(" { + const scrapeRequest: ScrapeRequest = { + url: "https://ycombinator.com/companies", + formats: ["markdown"], + waitFor: 5000 + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("links"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain("PagerDuty"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + + }, + 30000 + ); + + it.concurrent( + "should return a successful response with a valid links on page", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["links"], + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("links"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.links).toContain("https://firecrawl.dev"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 30000 + ); + + + }); + +describe("POST /v1/map", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL).post( + "/v1/map" + ); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + 
expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return a successful response with a valid API key", async () => { + const mapRequest = { + url: "https://roastmywebsite.ai" + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + }); + + it.concurrent("should return a successful response with a valid API key and search", async () => { + const mapRequest = { + url: "https://usemotion.com", + search: "pricing" + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("usemotion.com/pricing"); + }); + + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => { + const mapRequest = { + url: "https://firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("docs.firecrawl.dev"); + }); + + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("docs.firecrawl.dev"); + }, 10000) + + it.concurrent("should return a successful response with a 
valid API key and search and not allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: false + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).not.toContain("docs.firecrawl.dev"); + }) + + it.concurrent("should return an error for invalid URL", async () => { + const mapRequest = { + url: "invalid-url", + includeSubdomains: true, + search: "test", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(400); + expect(response.body).toHaveProperty("success", false); + expect(response.body).toHaveProperty("error"); + }); +}); + + +describe("POST /v1/crawl", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL).post( + "/v1/crawl" + ); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://facebook.com/fake-test", + }; + + const response = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(403); + expect(response.body.error).toBe("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("id"); + expect(response.body.id).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("url"); + expect(response.body.url).toContain("/v1/crawl/"); + }); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 10, + includePaths: ["blog/*"], + }); + + let response; + let isFinished = false; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url).toContain("firecrawl.dev/blog"); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0 + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 10, + excludePaths: ["blog/*"], + }); + + let isFinished = false; + 
let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request( + TEST_URL + ) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(3); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); + }); + }, + 90000 + ); // 90 seconds + + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + maxDepth: 1, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request( + TEST_URL + ) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 
1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); +}) + +describe("GET /v1/crawl/:jobId", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).get("/v1/crawl/123"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + } + ); + + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.mendable.ai" }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.error + ).toBeUndefined(); + + const childrenLinks = completedResponse.body.data.filter( + (doc) => + doc.metadata && + doc.metadata.sourceURL + ); + + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "If someone cancels a crawl job, it should turn into cancelled status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.tatum.io", limit: 200 }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 10000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const 
completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("cancelled"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 60000 + ); // 60 seconds +}) +}); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 0771d10e..330f8130 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,11 +1,15 @@ import request from "supertest"; import dotenv from "dotenv"; -import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types"; +import { + FirecrawlCrawlResponse, + FirecrawlCrawlStatusResponse, + FirecrawlScrapeResponse, +} from "../../types"; dotenv.config(); const TEST_URL = "http://127.0.0.1:3002"; -describe("E2E Tests for API Routes", () => { +describe("E2E Tests for v0 API Routes", () => { beforeAll(() => { process.env.USE_DB_AUTHENTICATION = "true"; }); @@ -24,273 +28,365 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL).post("/v0/scrape"); + const response: FirecrawlScrapeResponse = await request(TEST_URL).post( + "/v0/scrape" + ); expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a successful response with a valid API key", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://roastmywebsite.ai" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.metadata.pageError).toBeUndefined(); - expect(response.body.data.metadata.title).toBe("Roast My Website"); - expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool 
for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); - expect(response.body.data.metadata.robots).toBe("follow, index"); - expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); - expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai"); - expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png"); - expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); - expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); - expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai"); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - }, 30000); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://roastmywebsite.ai" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.metadata.pageError).toBeUndefined(); + expect(response.body.data.metadata.title).toBe("Roast My Website"); + expect(response.body.data.metadata.description).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(response.body.data.metadata.keywords).toBe( + "Roast My Website,Roast,Website,GitHub,Firecrawl" + ); + expect(response.body.data.metadata.robots).toBe("follow, index"); + expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); + expect(response.body.data.metadata.ogDescription).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️" + ); + expect(response.body.data.metadata.ogUrl).toBe( + "https://www.roastmywebsite.ai" + ); + expect(response.body.data.metadata.ogImage).toBe( + "https://www.roastmywebsite.ai/og.png" + ); + expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); + expect(response.body.data.metadata.sourceURL).toBe( + "https://roastmywebsite.ai" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + }, + 30000 + ); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key and includeHtml set to true", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true }, + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.html).toContain(" { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("html"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.markdown).toContain("_Roast_"); - expect(response.body.data.html).toContain(" { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); - await new Promise((r) => setTimeout(r, 6000)); + it.concurrent( + "should return a successful response for a valid scrape with PDF file", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" }); + await new Promise((r) => setTimeout(r, 6000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - expect(response.body.data.metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds - - it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { - const response: 
FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); - await new Promise((r) => setTimeout(r, 6000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - expect(response.body.data.metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a valid scrape with PDF file without explicit .pdf extension", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" }); + await new Promise((r) => setTimeout(r, 6000)); - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { - const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com/" }); - expect(responseWithoutRemoveTags.statusCode).toBe(200); - expect(responseWithoutRemoveTags.body).toHaveProperty("data"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); - expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); - expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); - expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer - expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav - expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, + 60000 + ); // 60 seconds - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - 
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.content).toContain("Scrape This Site"); - expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer - expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav - expect(response.body.data.content).not.toContain("web scraping"); // strong - }, 30000); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key with removeTags option", + async () => { + const responseWithoutRemoveTags: FirecrawlScrapeResponse = + await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain( + "Scrape This Site" + ); + expect(responseWithoutRemoveTags.body.data.content).toContain( + "Lessons and Videos" + ); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain( + "[Sandbox](" + ); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain( + "web scraping" + ); // strong - it.concurrent('should return a successful response for a scrape with 400 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/400' }); - await new Promise((r) => setTimeout(r, 5000)); + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com/", + pageOptions: { removeTags: [".nav", "#footer", "strong"] }, + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, + 30000 + ); // 30 seconds timeout - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - 
expect(response.body.data.metadata.pageStatusCode).toBe(400); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 400 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/400" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 401 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/401' }); - await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "bad request" + ); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(401); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 401 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/401" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent("should return a successful response for a scrape with 403 page", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/403' }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "unauthorized" + ); + }, + 60000 + ); // 60 seconds - await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(403); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 403 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", 
"application/json") + .send({ url: "https://httpstat.us/403" }); - it.concurrent('should return a successful response for a scrape with 404 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/404' }); - await new Promise((r) => setTimeout(r, 5000)); + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "forbidden" + ); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(404); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 404 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/404" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 405 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/405' }); - await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(404); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(405); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 405 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/405" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 500 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/500' }); - await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + }, + 60000 + 
); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(500); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 500 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/500" }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + }, + 60000 + ); // 60 seconds }); describe("POST /v0/crawl", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl"); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return an error response with an invalid API key", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return a successful response with a valid API key for crawl", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + const response: FirecrawlCrawlResponse = await request(TEST_URL).post( + "/v0/crawl" ); + expect(response.statusCode).toBe(401); }); - - it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - limit: 10, - crawlerOptions: { - includes: ["blog/*"], - }, - }); - + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return a successful response with a valid API key for crawl", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + 
expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + } + ); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + includes: ["blog/*"], + }, + }); + let response: FirecrawlCrawlStatusResponse; let isFinished = false; @@ -310,153 +406,189 @@ describe("E2E Tests for API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["blog/*"], + }, + }); + + let isFinished = false; + let response: FirecrawlCrawlStatusResponse; + + while (!isFinished) { + response = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); - }); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); // 180 
seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; - it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - limit: 10, - crawlerOptions: { - excludes: ["blog/*"], - }, - }); - - let isFinished = false; - let response: FirecrawlCrawlStatusResponse; + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } - while (!isFinished) { - response = await request(TEST_URL) + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse: FirecrawlCrawlStatusResponse = await request( + TEST_URL + ) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); + }); + }, + 90000 + ); // 90 seconds + + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 1 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response: FirecrawlCrawlStatusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); - }); - }, 90000); // 90 seconds - - it.concurrent("should return a successful response 
with max depth option for a valid crawl job", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response: FirecrawlCrawlStatusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + const completedResponse: FirecrawlCrawlStatusResponse = await request( + TEST_URL + ) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); - // Check if all URLs have a maximum depth of 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, 180000); + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); }); describe("POST /v0/crawlWebsitePreview", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview"); + const response: FirecrawlCrawlResponse = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", timeout: 1000 }); + it.concurrent( + "should return a timeout error when scraping takes longer than the specified timeout", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); - expect(response.statusCode).toBe(408); - }, 3000); + expect(response.statusCode).toBe(408); + }, + 3000 + ); }); describe("POST /v0/search", () => { @@ -465,26 +597,33 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a successful response with a valid API key for search", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(200); - 
expect(response.body).toHaveProperty("success"); - expect(response.body.success).toBe(true); - expect(response.body).toHaveProperty("data"); - }, 60000); // 60 seconds timeout + it.concurrent( + "should return a successful response with a valid API key for search", + async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, + 60000 + ); // 60 seconds timeout }); describe("GET /v0/crawl/status/:jobId", () => { @@ -493,66 +632,83 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/123") - .set("Authorization", `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return Job not found for invalid job ID", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it.concurrent("should return a successful crawl status response for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://mendable.ai/blog" }); - expect(crawlResponse.statusCode).toBe(200); - - let isCompleted = false; - - while (!isCompleted) { + it.concurrent( + "should return an error response with an invalid API key", + async () => { const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + } + ); + + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://mendable.ai/blog" }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - 
expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - if (response.body.status === "completed") { - isCompleted = true; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const childrenLinks = completedResponse.body.data.filter( + (doc) => + doc.metadata && + doc.metadata.sourceURL && + doc.metadata.sourceURL.includes("mendable.ai/blog") + ); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 120 seconds - const childrenLinks = completedResponse.body.data.filter(doc => - doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") - ); - - expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 180000); // 120 seconds - // TODO: review the test below // it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { // const crawlResponse = await request(TEST_URL) @@ -599,97 +755,118 @@ describe("E2E Tests for API Routes", () => { // expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); // }, 180000); // 120 seconds - it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); + it.concurrent( + "If someone cancels a crawl job, it should turn into failed status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } }); - expect(crawlResponse.statusCode).toBe(200); + expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => 
setTimeout(r, 10000)); - const responseCancel = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); - expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); - let isNullOrEmptyArray = false; - if (completedResponse.body.data === null || completedResponse.body.data.length === 0) { - isNullOrEmptyArray = true; - } - expect(isNullOrEmptyArray).toBe(true); - expect(completedResponse.body.data).toEqual(expect.arrayContaining([])); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + let isNullOrEmptyArray = false; + if ( + completedResponse.body.data === null || + completedResponse.body.data.length === 0 + ) { + isNullOrEmptyArray = true; + } + expect(isNullOrEmptyArray).toBe(true); + expect(completedResponse.body.data).toEqual(expect.arrayContaining([])); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty( + "content" + ); + expect(completedResponse.body.partial_data[0]).toHaveProperty( + "markdown" + ); + expect(completedResponse.body.partial_data[0]).toHaveProperty( + "metadata" + ); + expect( + completedResponse.body.partial_data[0].metadata.pageStatusCode + ).toBe(200); + expect( + completedResponse.body.partial_data[0].metadata.pageError + ).toBeUndefined(); + }, + 60000 + ); // 60 seconds }); describe("POST /v0/scrape with LLM Extraction", () => { - it.concurrent("should extract data using LLM extraction mode", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - pageOptions: { - onlyMainContent: true, - }, - extractorOptions: { 
- mode: "llm-extraction", - extractionPrompt: - "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractionSchema: { - type: "object", - properties: { - company_mission: { - type: "string", - }, - supports_sso: { - type: "boolean", - }, - is_open_source: { - type: "boolean", - }, - }, - required: ["company_mission", "supports_sso", "is_open_source"], + it.concurrent( + "should extract data using LLM extraction mode", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true, }, - }, - }); + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string", + }, + supports_sso: { + type: "boolean", + }, + is_open_source: { + type: "boolean", + }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, + }); - // Ensure that the job was successfully created before proceeding with LLM extraction - expect(response.statusCode).toBe(200); + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); - // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` - let llmExtraction = response.body.data.llm_extraction; + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; - // Check if the llm_extraction object has the required properties with correct types and values - expect(llmExtraction).toHaveProperty("company_mission"); - expect(typeof llmExtraction.company_mission).toBe("string"); - expect(llmExtraction).toHaveProperty("supports_sso"); - expect(llmExtraction.supports_sso).toBe(true); - expect(typeof llmExtraction.supports_sso).toBe("boolean"); - expect(llmExtraction).toHaveProperty("is_open_source"); - expect(llmExtraction.is_open_source).toBe(false); - expect(typeof llmExtraction.is_open_source).toBe("boolean"); - }, 60000); // 60 secs + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, + 60000 + ); // 60 secs }); }); diff --git a/apps/api/src/controllers/__tests__/crawl.test.ts b/apps/api/src/controllers/__tests__/crawl.test.ts index 621c7436..e65523cb 100644 --- a/apps/api/src/controllers/__tests__/crawl.test.ts +++ b/apps/api/src/controllers/__tests__/crawl.test.ts @@ -1,4 +1,4 @@ -import { crawlController } from '../crawl' +import { crawlController } from '../v0/crawl' import { Request, Response } from 'express'; import { authenticateUser } from '../auth'; // Ensure this import is correct import { createIdempotencyKey } from 
'../../services/idempotency/create'; diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 1796acc2..0aee6db0 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -1,23 +1,36 @@ -import { parseApi } from "../../src/lib/parseApi"; -import { getRateLimiter } from "../../src/services/rate-limiter"; +import { parseApi } from "../lib/parseApi"; +import { getRateLimiter } from "../services/rate-limiter"; import { AuthResponse, NotificationType, PlanType, RateLimiterMode, -} from "../../src/types"; -import { supabase_service } from "../../src/services/supabase"; -import { withAuth } from "../../src/lib/withAuth"; +} from "../types"; +import { supabase_service } from "../services/supabase"; +import { withAuth } from "../lib/withAuth"; import { RateLimiterRedis } from "rate-limiter-flexible"; import { setTraceAttributes } from "@hyperdx/node-opentelemetry"; import { sendNotification } from "../services/notification/email_notification"; import { Logger } from "../lib/logger"; -import { redlock } from "../../src/services/redlock"; -import { getValue } from "../../src/services/redis"; -import { setValue } from "../../src/services/redis"; +import { redlock } from "../services/redlock"; +import { getValue } from "../services/redis"; +import { setValue } from "../services/redis"; import { validate } from "uuid"; import * as Sentry from "@sentry/node"; - +// const { data, error } = await supabase_service +// .from('api_keys') +// .select(` +// key, +// team_id, +// teams ( +// subscriptions ( +// price_id +// ) +// ) +// `) +// .eq('key', normalizedApi) +// .limit(1) +// .single(); function normalizedApiIsUuid(potentialUuid: string): boolean { // Check if the string is a valid UUID return validate(potentialUuid); @@ -119,7 +132,11 @@ export async function supaAuthenticateUser( let priceId: string | null = null; if (token == "this_is_just_a_preview_token") { - rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); + if (mode == RateLimiterMode.CrawlStatus) { + rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); + } else { + rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); + } teamId = "preview"; } else { normalizedApi = parseApi(token); @@ -155,7 +172,7 @@ export async function supaAuthenticateUser( await setValue( cacheKey, JSON.stringify({ team_id: teamId, price_id: priceId }), - 10 + 60 ); } } catch (error) { @@ -234,6 +251,13 @@ export async function supaAuthenticateUser( subscriptionData.plan ); break; + case RateLimiterMode.Map: + rateLimiter = getRateLimiter( + RateLimiterMode.Map, + token, + subscriptionData.plan + ); + break; case RateLimiterMode.CrawlStatus: rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); break; @@ -286,6 +310,9 @@ export async function supaAuthenticateUser( token === "this_is_just_a_preview_token" && (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || + mode === RateLimiterMode.Map || + mode === RateLimiterMode.Crawl || + mode === RateLimiterMode.CrawlStatus || mode === RateLimiterMode.Search) ) { return { success: true, team_id: "preview" }; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts deleted file mode 100644 index 1ff9a426..00000000 --- a/apps/api/src/controllers/scrape.ts +++ /dev/null @@ -1,234 +0,0 @@ - import { ExtractorOptions, PageOptions } from './../lib/entities'; -import { Request, Response } from "express"; -import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; 
-import { authenticateUser } from "./auth"; -import { PlanType, RateLimiterMode } from "../types"; -import { logJob } from "../services/logging/log_job"; -import { Document } from "../lib/entities"; -import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function -import { numTokensFromString } from '../lib/LLM-extraction/helpers'; -import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; -import { addScrapeJob } from '../services/queue-jobs'; -import { getScrapeQueue } from '../services/queue-service'; -import { v4 as uuidv4 } from "uuid"; -import { Logger } from '../lib/logger'; -import { getJobPriority } from '../lib/job-priority'; -import * as Sentry from "@sentry/node"; - -export async function scrapeHelper( - jobId: string, - req: Request, - team_id: string, - crawlerOptions: any, - pageOptions: PageOptions, - extractorOptions: ExtractorOptions, - timeout: number, - plan?: PlanType -): Promise<{ - success: boolean; - error?: string; - data?: Document; - returnCode: number; -}> { - const url = req.body.url; - if (!url) { - return { success: false, error: "Url is required", returnCode: 400 }; - } - - if (isUrlBlocked(url)) { - return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; - } - - const jobPriority = await getJobPriority({plan, team_id, basePriority: 10}) - - const job = await addScrapeJob({ - url, - mode: "single_urls", - crawlerOptions, - team_id, - pageOptions, - extractorOptions, - origin: req.body.origin ?? defaultOrigin, - }, {}, jobId, jobPriority); - - let doc; - - const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => { - try { - doc = (await new Promise((resolve, reject) => { - const start = Date.now(); - const int = setInterval(async () => { - if (Date.now() >= start + timeout) { - clearInterval(int); - reject(new Error("Job wait ")); - } else { - const state = await job.getState(); - if (state === "completed") { - clearInterval(int); - resolve((await getScrapeQueue().getJob(job.id)).returnvalue); - } else if (state === "failed") { - clearInterval(int); - reject((await getScrapeQueue().getJob(job.id)).failedReason); - } - } - }, 1000); - }))[0] - } catch (e) { - if (e instanceof Error && e.message.startsWith("Job wait")) { - span.setAttribute("timedOut", true); - return { - success: false, - error: "Request timed out", - returnCode: 408, - } - } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) { - return { - success: false, - error: e, - returnCode: 500, - }; - } else { - throw e; - } - } - span.setAttribute("result", JSON.stringify(doc)); - return null; - }); - - if (err !== null) { - return err; - } - - await job.remove(); - - if (!doc) { - console.error("!!! 
PANIC DOC IS", doc, job); - return { success: true, error: "No page found", returnCode: 200, data: doc }; - } - - delete doc.index; - delete doc.provider; - - // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html - if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") { - delete doc.rawHtml; - } - - return { - success: true, - data: doc, - returnCode: 200, - }; -} - -export async function scrapeController(req: Request, res: Response) { - try { - let earlyReturn = false; - // make sure to authenticate user first, Bearer - const { success, team_id, error, status, plan } = await authenticateUser( - req, - res, - RateLimiterMode.Scrape - ); - if (!success) { - return res.status(status).json({ error }); - } - - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; - const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; - const origin = req.body.origin ?? defaultOrigin; - let timeout = req.body.timeout ?? defaultTimeout; - - if (extractorOptions.mode.includes("llm-extraction")) { - if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) { - return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" }); - } - - pageOptions.onlyMainContent = true; - timeout = req.body.timeout ?? 90000; - } - - // checkCredits - try { - const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - earlyReturn = true; - return res.status(402).json({ error: "Insufficient credits" }); - } - } catch (error) { - Logger.error(error); - earlyReturn = true; - return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." }); - } - - const jobId = uuidv4(); - - const startTime = new Date().getTime(); - const result = await scrapeHelper( - jobId, - req, - team_id, - crawlerOptions, - pageOptions, - extractorOptions, - timeout, - plan - ); - const endTime = new Date().getTime(); - const timeTakenInSeconds = (endTime - startTime) / 1000; - const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0; - - if (result.success) { - let creditsToBeBilled = 0; // billing for doc done on queue end - const creditsPerLLMExtract = 50; - - if (extractorOptions.mode.includes("llm-extraction")) { - // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); - creditsToBeBilled += creditsPerLLMExtract; - } - - let startTimeBilling = new Date().getTime(); - - if (earlyReturn) { - // Don't bill if we're early returning - return; - } - const billingResult = await billTeam( - team_id, - creditsToBeBilled - ); - if (!billingResult.success) { - return res.status(402).json({ - success: false, - error: "Failed to bill team. 
Insufficient credits or subscription not found.", - }); - } - } - - logJob({ - job_id: jobId, - success: result.success, - message: result.error, - num_docs: 1, - docs: [result.data], - time_taken: timeTakenInSeconds, - team_id: team_id, - mode: "scrape", - url: req.body.url, - crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - origin: origin, - extractor_options: extractorOptions, - num_tokens: numTokens, - }); - - - - return res.status(result.returnCode).json(result); - } catch (error) { - Sentry.captureException(error); - Logger.error(error); - return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? "Internal Server Error") }); - } -} diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/v0/admin/queue.ts similarity index 62% rename from apps/api/src/controllers/admin/queue.ts rename to apps/api/src/controllers/v0/admin/queue.ts index 06844bea..71748002 100644 --- a/apps/api/src/controllers/admin/queue.ts +++ b/apps/api/src/controllers/v0/admin/queue.ts @@ -1,11 +1,10 @@ import { Request, Response } from "express"; import { Job } from "bullmq"; -import { Logger } from "../../lib/logger"; -import { getScrapeQueue } from "../../services/queue-service"; -import { checkAlerts } from "../../services/alerts"; -import { exec } from "node:child_process"; -import { sendSlackWebhook } from "../../services/alerts/slack"; +import { Logger } from "../../../lib/logger"; +import { getScrapeQueue } from "../../../services/queue-service"; +import { checkAlerts } from "../../../services/alerts"; +import { sendSlackWebhook } from "../../../services/alerts/slack"; export async function cleanBefore24hCompleteJobsController( req: Request, @@ -94,26 +93,34 @@ export async function autoscalerController(req: Request, res: Response) { const scrapeQueue = getScrapeQueue(); - const [webScraperActive, webScraperWaiting, webScraperPriority] = await Promise.all([ - scrapeQueue.getActiveCount(), - scrapeQueue.getWaitingCount(), - scrapeQueue.getPrioritizedCount(), - ]); + const [webScraperActive, webScraperWaiting, webScraperPriority] = + await Promise.all([ + scrapeQueue.getActiveCount(), + scrapeQueue.getWaitingCount(), + scrapeQueue.getPrioritizedCount(), + ]); let waitingAndPriorityCount = webScraperWaiting + webScraperPriority; // get number of machines active - const request = await fetch('https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines', + const request = await fetch( + "https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines", { headers: { - 'Authorization': `Bearer ${process.env.FLY_API_TOKEN}` - } + Authorization: `Bearer ${process.env.FLY_API_TOKEN}`, + }, } - ) + ); const machines = await request.json(); // Only worker machines - const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting" || machine.state === "replacing") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length; + const activeMachines = machines.filter( + (machine) => + (machine.state === "started" || + machine.state === "starting" || + machine.state === "replacing") && + machine.config.env["FLY_PROCESS_GROUP"] === "worker" + ).length; let targetMachineCount = activeMachines; @@ -123,29 +130,57 @@ export async function autoscalerController(req: Request, res: Response) { // Scale up logic if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) { - targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 3)); + targetMachineCount = Math.min( + 
maxNumberOfMachines, + activeMachines + baseScaleUp * 3 + ); } else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) { - targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 2)); + targetMachineCount = Math.min( + maxNumberOfMachines, + activeMachines + baseScaleUp * 2 + ); } else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) { - targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + baseScaleUp); + targetMachineCount = Math.min( + maxNumberOfMachines, + activeMachines + baseScaleUp + ); } // Scale down logic if (webScraperActive < 100 && waitingAndPriorityCount < 50) { - targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 3)); + targetMachineCount = Math.max( + minNumberOfMachines, + activeMachines - baseScaleDown * 3 + ); } else if (webScraperActive < 500 && waitingAndPriorityCount < 200) { - targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 2)); + targetMachineCount = Math.max( + minNumberOfMachines, + activeMachines - baseScaleDown * 2 + ); } else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) { - targetMachineCount = Math.max(minNumberOfMachines, activeMachines - baseScaleDown); + targetMachineCount = Math.max( + minNumberOfMachines, + activeMachines - baseScaleDown + ); } if (targetMachineCount !== activeMachines) { - Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`); + Logger.info( + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting` + ); - if(targetMachineCount > activeMachines) { - sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? ""); + if (targetMachineCount > activeMachines) { + sendSlackWebhook( + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, + false, + process.env.SLACK_AUTOSCALER ?? "" + ); } else { - sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? ""); + sendSlackWebhook( + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, + false, + process.env.SLACK_AUTOSCALER ?? 
"" + ); } return res.status(200).json({ mode: "scale-descale", diff --git a/apps/api/src/controllers/admin/redis-health.ts b/apps/api/src/controllers/v0/admin/redis-health.ts similarity index 95% rename from apps/api/src/controllers/admin/redis-health.ts rename to apps/api/src/controllers/v0/admin/redis-health.ts index 3b1e2518..dc58d745 100644 --- a/apps/api/src/controllers/admin/redis-health.ts +++ b/apps/api/src/controllers/v0/admin/redis-health.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import Redis from "ioredis"; -import { Logger } from "../../lib/logger"; -import { redisRateLimitClient } from "../../services/rate-limiter"; +import { Logger } from "../../../lib/logger"; +import { redisRateLimitClient } from "../../../services/rate-limiter"; export async function redisHealthController(req: Request, res: Response) { const retryOperation = async (operation, retries = 3) => { diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts new file mode 100644 index 00000000..bf1c2d0a --- /dev/null +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -0,0 +1,58 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { supabase_service } from "../../../src/services/supabase"; +import { Logger } from "../../../src/lib/logger"; +import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; +import * as Sentry from "@sentry/node"; + +export async function crawlCancelController(req: Request, res: Response) { + try { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.CrawlStatus + ); + if (!success) { + return res.status(status).json({ error }); + } + + const sc = await getCrawl(req.params.jobId); + if (!sc) { + return res.status(404).json({ error: "Job not found" }); + } + + // check if the job belongs to the team + if (useDbAuthentication) { + const { data, error: supaError } = await supabase_service + .from("bulljobs_teams") + .select("*") + .eq("job_id", req.params.jobId) + .eq("team_id", team_id); + if (supaError) { + return res.status(500).json({ error: supaError.message }); + } + + if (data.length === 0) { + return res.status(403).json({ error: "Unauthorized" }); + } + } + + try { + sc.cancelled = true; + await saveCrawl(req.params.jobId, sc); + } catch (error) { + Logger.error(error); + } + + res.json({ + status: "cancelled" + }); + } catch (error) { + Sentry.captureException(error); + Logger.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts similarity index 85% rename from apps/api/src/controllers/crawl-status.ts rename to apps/api/src/controllers/v0/crawl-status.ts index 76147263..bda1af70 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -1,10 +1,10 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { getScrapeQueue } from "../../src/services/queue-service"; -import { Logger } from "../../src/lib/logger"; -import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis"; -import { supabaseGetJobsById } from "../../src/lib/supabase-jobs"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } 
from "../../../src/types"; +import { getScrapeQueue } from "../../../src/services/queue-service"; +import { Logger } from "../../../src/lib/logger"; +import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; +import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs"; import * as Sentry from "@sentry/node"; export async function getJobs(ids: string[]) { diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/v0/crawl.ts similarity index 85% rename from apps/api/src/controllers/crawl.ts rename to apps/api/src/controllers/v0/crawl.ts index d2123d82..aefdb5e5 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -1,32 +1,20 @@ import { Request, Response } from "express"; -import { checkTeamCredits } from "../../src/services/billing/credit_billing"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { addScrapeJob } from "../../src/services/queue-jobs"; -import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; -import { logCrawl } from "../../src/services/logging/crawl_log"; -import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; -import { createIdempotencyKey } from "../../src/services/idempotency/create"; -import { - defaultCrawlPageOptions, - defaultCrawlerOptions, - defaultOrigin, -} from "../../src/lib/default-values"; +import { checkTeamCredits } from "../../../src/services/billing/credit_billing"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { addScrapeJob } from "../../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; +import { logCrawl } from "../../../src/services/logging/crawl_log"; +import { validateIdempotencyKey } from "../../../src/services/idempotency/validate"; +import { createIdempotencyKey } from "../../../src/services/idempotency/create"; +import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../src/lib/logger"; -import { - addCrawlJob, - addCrawlJobs, - crawlToCrawler, - lockURL, - lockURLs, - saveCrawl, - StoredCrawl, -} from "../../src/lib/crawl-redis"; -import { getScrapeQueue } from "../../src/services/queue-service"; -import { checkAndUpdateURL } from "../../src/lib/validateUrl"; -import { getJobPriority } from "../../src/lib/job-priority"; +import { Logger } from "../../../src/lib/logger"; +import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; +import { getScrapeQueue } from "../../../src/services/queue-service"; +import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; +import { getJobPriority } from "../../lib/job-priority"; export async function crawlController(req: Request, res: Response) { try { diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts similarity index 90% rename from apps/api/src/controllers/crawlPreview.ts rename to apps/api/src/controllers/v0/crawlPreview.ts index 3e43f07f..f8706867 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -1,12 +1,12 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { 
isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../src/lib/logger"; -import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis"; -import { addScrapeJob } from "../../src/services/queue-jobs"; -import { checkAndUpdateURL } from "../../src/lib/validateUrl"; +import { Logger } from "../../../src/lib/logger"; +import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; +import { addScrapeJob } from "../../../src/services/queue-jobs"; +import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; export async function crawlPreviewController(req: Request, res: Response) { diff --git a/apps/api/src/controllers/keyAuth.ts b/apps/api/src/controllers/v0/keyAuth.ts similarity index 83% rename from apps/api/src/controllers/keyAuth.ts rename to apps/api/src/controllers/v0/keyAuth.ts index 351edd18..b70d672a 100644 --- a/apps/api/src/controllers/keyAuth.ts +++ b/apps/api/src/controllers/v0/keyAuth.ts @@ -1,8 +1,8 @@ -import { AuthResponse, RateLimiterMode } from "../types"; +import { AuthResponse, RateLimiterMode } from "../../types"; import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; +import { authenticateUser } from "../auth"; export const keyAuthController = async (req: Request, res: Response) => { diff --git a/apps/api/src/controllers/liveness.ts b/apps/api/src/controllers/v0/liveness.ts similarity index 100% rename from apps/api/src/controllers/liveness.ts rename to apps/api/src/controllers/v0/liveness.ts diff --git a/apps/api/src/controllers/readiness.ts b/apps/api/src/controllers/v0/readiness.ts similarity index 100% rename from apps/api/src/controllers/readiness.ts rename to apps/api/src/controllers/v0/readiness.ts diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts new file mode 100644 index 00000000..8cb09cf0 --- /dev/null +++ b/apps/api/src/controllers/v0/scrape.ts @@ -0,0 +1,288 @@ +import { ExtractorOptions, PageOptions } from "./../../lib/entities"; +import { Request, Response } from "express"; +import { + billTeam, + checkTeamCredits, +} from "../../services/billing/credit_billing"; +import { authenticateUser } from "../auth"; +import { PlanType, RateLimiterMode } from "../../types"; +import { logJob } from "../../services/logging/log_job"; +import { Document } from "../../lib/entities"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function +import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; +import { + defaultPageOptions, + defaultExtractorOptions, + defaultTimeout, + defaultOrigin, +} from "../../lib/default-values"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; +import { getScrapeQueue } from "../../services/queue-service"; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from "../../lib/logger"; +import * as Sentry from "@sentry/node"; +import { getJobPriority } from "../../lib/job-priority"; + +export async function scrapeHelper( + jobId: string, + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: PageOptions, + extractorOptions: ExtractorOptions, + timeout: number, + plan?: 
PlanType +): Promise<{ + success: boolean; + error?: string; + data?: Document; + returnCode: number; +}> { + const url = req.body.url; + if (!url) { + return { success: false, error: "Url is required", returnCode: 400 }; + } + + if (isUrlBlocked(url)) { + return { + success: false, + error: + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + returnCode: 403, + }; + } + + const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); + + const job = await addScrapeJob( + { + url, + mode: "single_urls", + crawlerOptions, + team_id, + pageOptions, + extractorOptions, + origin: req.body.origin ?? defaultOrigin, + }, + {}, + jobId, + jobPriority + ); + + let doc; + + const err = await Sentry.startSpan( + { + name: "Wait for job to finish", + op: "bullmq.wait", + attributes: { job: jobId }, + }, + async (span) => { + try { + doc = (await waitForJob(job.id, timeout))[0]; + } catch (e) { + if (e instanceof Error && e.message.startsWith("Job wait")) { + span.setAttribute("timedOut", true); + return { + success: false, + error: "Request timed out", + returnCode: 408, + }; + } else if ( + typeof e === "string" && + (e.includes("Error generating completions: ") || + e.includes("Invalid schema for function") || + e.includes( + "LLM extraction did not match the extraction schema you provided." + )) + ) { + return { + success: false, + error: e, + returnCode: 500, + }; + } else { + throw e; + } + } + span.setAttribute("result", JSON.stringify(doc)); + return null; + } + ); + + if (err !== null) { + return err; + } + + await job.remove(); + + if (!doc) { + console.error("!!! PANIC DOC IS", doc, job); + return { + success: true, + error: "No page found", + returnCode: 200, + data: doc, + }; + } + + delete doc.index; + delete doc.provider; + + // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html + if ( + !pageOptions.includeRawHtml && + extractorOptions.mode == "llm-extraction-from-raw-html" + ) { + if (doc.rawHtml) { + delete doc.rawHtml; + } + } + + if (!pageOptions.includeHtml) { + if (doc.html) { + delete doc.html; + } + } + + return { + success: true, + data: doc, + returnCode: 200, + }; +} + +export async function scrapeController(req: Request, res: Response) { + try { + let earlyReturn = false; + // make sure to authenticate user first, Bearer + const { success, team_id, error, status, plan } = await authenticateUser( + req, + res, + RateLimiterMode.Scrape + ); + if (!success) { + return res.status(status).json({ error }); + } + + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; + const extractorOptions = { + ...defaultExtractorOptions, + ...req.body.extractorOptions, + }; + const origin = req.body.origin ?? defaultOrigin; + let timeout = req.body.timeout ?? defaultTimeout; + + if (extractorOptions.mode.includes("llm-extraction")) { + if ( + typeof extractorOptions.extractionSchema !== "object" || + extractorOptions.extractionSchema === null + ) { + return res + .status(400) + .json({ + error: + "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified", + }); + } + + pageOptions.onlyMainContent = true; + timeout = req.body.timeout ?? 
90000; + } + + // checkCredits + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + earlyReturn = true; + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + Logger.error(error); + earlyReturn = true; + return res + .status(500) + .json({ + error: + "Error checking team credits. Please contact hello@firecrawl.com for help.", + }); + } + + const jobId = uuidv4(); + + const startTime = new Date().getTime(); + const result = await scrapeHelper( + jobId, + req, + team_id, + crawlerOptions, + pageOptions, + extractorOptions, + timeout, + plan + ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + const numTokens = + result.data && result.data.markdown + ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") + : 0; + + if (result.success) { + let creditsToBeBilled = 0; // billing for doc done on queue end + const creditsPerLLMExtract = 50; + + if (extractorOptions.mode.includes("llm-extraction")) { + // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); + creditsToBeBilled += creditsPerLLMExtract; + } + + let startTimeBilling = new Date().getTime(); + + if (earlyReturn) { + // Don't bill if we're early returning + return; + } + const billingResult = await billTeam(team_id, creditsToBeBilled); + if (!billingResult.success) { + return res.status(402).json({ + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + }); + } + } + + logJob({ + job_id: jobId, + success: result.success, + message: result.error, + num_docs: 1, + docs: [result.data], + time_taken: timeTakenInSeconds, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + origin: origin, + extractor_options: extractorOptions, + num_tokens: numTokens, + }); + + return res.status(result.returnCode).json(result); + } catch (error) { + Sentry.captureException(error); + Logger.error(error); + return res + .status(500) + .json({ + error: + typeof error === "string" + ? error + : error?.message ?? 
"Internal Server Error", + }); + } +} diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/v0/search.ts similarity index 77% rename from apps/api/src/controllers/search.ts rename to apps/api/src/controllers/v0/search.ts index 92efe1df..825abbe1 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -1,18 +1,18 @@ import { Request, Response } from "express"; -import { WebScraperDataProvider } from "../scraper/WebScraper"; -import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; -import { authenticateUser } from "./auth"; -import { PlanType, RateLimiterMode } from "../types"; -import { logJob } from "../services/logging/log_job"; -import { PageOptions, SearchOptions } from "../lib/entities"; -import { search } from "../search"; -import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { WebScraperDataProvider } from "../../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; +import { authenticateUser } from "../auth"; +import { PlanType, RateLimiterMode } from "../../types"; +import { logJob } from "../../services/logging/log_job"; +import { PageOptions, SearchOptions } from "../../lib/entities"; +import { search } from "../../search"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../lib/logger"; -import { getJobPriority } from "../lib/job-priority"; -import { getScrapeQueue } from "../services/queue-service"; +import { Logger } from "../../lib/logger"; +import { getScrapeQueue } from "../../services/queue-service"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import * as Sentry from "@sentry/node"; -import { addScrapeJob } from "../services/queue-jobs"; +import { getJobPriority } from "../../lib/job-priority"; export async function searchHelper( jobId: string, @@ -112,24 +112,7 @@ export async function searchHelper( await getScrapeQueue().addBulk(jobs); } - const docs = (await Promise.all(jobs.map(x => new Promise((resolve, reject) => { - const start = Date.now(); - const int = setInterval(async () => { - if (Date.now() >= start + 60000) { - clearInterval(int); - reject(new Error("Job wait ")); - } else { - const state = await x.getState(); - if (state === "completed") { - clearInterval(int); - resolve((await getScrapeQueue().getJob(x.id)).returnvalue); - } else if (state === "failed") { - clearInterval(int); - reject((await getScrapeQueue().getJob(x.id)).failedReason); - } - } - }, 1000); - })))).map(x => x[0]); + const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]); if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; @@ -166,17 +149,16 @@ export async function searchController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { - includeHtml: false, - onlyMainContent: true, - fetchPageContent: true, - removeTags: [], - fallback: false, + includeHtml: req.body.pageOptions?.includeHtml ?? false, + onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false, + fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true, + removeTags: req.body.pageOptions?.removeTags ?? [], + fallback: req.body.pageOptions?.fallback ?? false, }; const origin = req.body.origin ?? "api"; const searchOptions = req.body.searchOptions ?? 
{ limit: 5 }; - const jobId = uuidv4(); try { diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/v0/status.ts similarity index 92% rename from apps/api/src/controllers/status.ts rename to apps/api/src/controllers/v0/status.ts index 362f1f24..34ebb3c6 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/v0/status.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; -import { Logger } from "../../src/lib/logger"; -import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis"; +import { Logger } from "../../../src/lib/logger"; +import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; diff --git a/apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP b/apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP new file mode 100644 index 00000000..621c7436 --- /dev/null +++ b/apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP @@ -0,0 +1,47 @@ +import { crawlController } from '../crawl' +import { Request, Response } from 'express'; +import { authenticateUser } from '../auth'; // Ensure this import is correct +import { createIdempotencyKey } from '../../services/idempotency/create'; +import { validateIdempotencyKey } from '../../services/idempotency/validate'; +import { v4 as uuidv4 } from 'uuid'; + +jest.mock('../auth', () => ({ + authenticateUser: jest.fn().mockResolvedValue({ + success: true, + team_id: 'team123', + error: null, + status: 200 + }), + reduce: jest.fn() +})); +jest.mock('../../services/idempotency/validate'); + +describe('crawlController', () => { + it('should prevent duplicate requests using the same idempotency key', async () => { + const req = { + headers: { + 'x-idempotency-key': await uuidv4(), + 'Authorization': `Bearer ${process.env.TEST_API_KEY}` + }, + body: { + url: 'https://mendable.ai' + } + } as unknown as Request; + const res = { + status: jest.fn().mockReturnThis(), + json: jest.fn() + } as unknown as Response; + + // Mock the idempotency key validation to return false for the second call + (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false); + + // First request should succeed + await crawlController(req, res); + expect(res.status).not.toHaveBeenCalledWith(409); + + // Second request with the same key should fail + await crawlController(req, res); + expect(res.status).toHaveBeenCalledWith(409); + expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts new file mode 100644 index 00000000..0a9931d3 --- /dev/null +++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -0,0 +1,64 @@ +import { url } from "../types"; + +describe("URL Schema Validation", () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + it("should prepend http:// to URLs without a protocol", () => { + const result = url.parse("example.com"); + expect(result).toBe("http://example.com"); + }); + + it("should allow valid URLs with http or https", () => { + expect(() => url.parse("http://example.com")).not.toThrow(); + expect(() => url.parse("https://example.com")).not.toThrow(); + }); + + it("should allow valid URLs with http or https", () => { + expect(() => url.parse("example.com")).not.toThrow(); + }); + + it("should reject URLs with unsupported protocols", () => { + expect(() 
=> url.parse("ftp://example.com")).toThrow("Invalid URL"); + }); + + it("should reject URLs without a valid top-level domain", () => { + expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path"); + }); + + it("should reject blocked URLs", () => { + expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should handle URLs with subdomains correctly", () => { + expect(() => url.parse("http://sub.example.com")).not.toThrow(); + expect(() => url.parse("https://blog.example.com")).not.toThrow(); + }); + + it("should handle URLs with paths correctly", () => { + expect(() => url.parse("http://example.com/path")).not.toThrow(); + expect(() => url.parse("https://example.com/another/path")).not.toThrow(); + }); + + it("should handle URLs with subdomains that are blocked", () => { + expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should handle URLs with paths that are blocked", () => { + expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should reject malformed URLs starting with 'http://http'", () => { + expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. 
Invalid protocol."); + }); + + it("should allow URLs that contain 'http://' within the path", () => { + expect(() => url.parse("http://example.com/http://example.com")).not.toThrow(); + }); + + it("should reject malformed URLs containing spaces", () => { + expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL"); + }); +}) \ No newline at end of file diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts similarity index 83% rename from apps/api/src/controllers/crawl-cancel.ts rename to apps/api/src/controllers/v1/crawl-cancel.ts index 1de9af60..06a5b26e 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -1,9 +1,9 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { supabase_service } from "../../src/services/supabase"; -import { Logger } from "../../src/lib/logger"; -import { getCrawl, saveCrawl } from "../../src/lib/crawl-redis"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../types"; +import { supabase_service } from "../../services/supabase"; +import { Logger } from "../../lib/logger"; +import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; import * as Sentry from "@sentry/node"; export async function crawlCancelController(req: Request, res: Response) { diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts new file mode 100644 index 00000000..551948de --- /dev/null +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -0,0 +1,159 @@ +import { authMiddleware } from "../../routes/v1"; +import { RateLimiterMode } from "../../types"; +import { authenticateUser } from "../auth"; +import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; +import { WebSocket } from "ws"; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from "../../lib/logger"; +import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis"; +import { getScrapeQueue } from "../../services/queue-service"; +import { getJob, getJobs } from "./crawl-status"; +import * as Sentry from "@sentry/node"; + +type ErrorMessage = { + type: "error", + error: string, +} + +type CatchupMessage = { + type: "catchup", + data: CrawlStatusResponse, +} + +type DocumentMessage = { + type: "document", + data: Document, +} + +type DoneMessage = { type: "done" } + +type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage; + +function send(ws: WebSocket, msg: Message) { + if (ws.readyState === 1) { + return new Promise((resolve, reject) => { + ws.send(JSON.stringify(msg), (err) => { + if (err) reject(err); + else resolve(null); + }); + }); + } +} + +function close(ws: WebSocket, code: number, msg: Message) { + if (ws.readyState <= 1) { + ws.close(code, JSON.stringify(msg)); + } +} + +async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth) { + const sc = await getCrawl(req.params.jobId); + if (!sc) { + return close(ws, 1008, { type: "error", error: "Job not found" }); + } + + if (sc.team_id !== req.auth.team_id) { + return close(ws, 3003, { type: "error", error: "Forbidden" }); + } + + let doneJobIDs = []; + let finished = false; + + const loop = async () => { + if (finished) return; + + const jobIDs = await
getCrawlJobs(req.params.jobId); + + if (jobIDs.length === doneJobIDs.length) { + return close(ws, 1000, { type: "done" }); + } + + const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x)); + const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)])); + const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]); + + for (const jobID of newlyDoneJobIDs) { + const job = await getJob(jobID); + + if (job.returnvalue) { + send(ws, { + type: "document", + data: legacyDocumentConverter(job.returnvalue), + }) + } else { + return close(ws, 3000, { type: "error", error: job.failedReason }); + } + } + + setTimeout(loop, 1000); + }; + + setTimeout(loop, 1000); + + doneJobIDs = await getDoneJobsOrdered(req.params.jobId); + + const jobIDs = await getCrawlJobs(req.params.jobId); + const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x))); + const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping"; + const doneJobs = await getJobs(doneJobIDs); + const data = doneJobs.map(x => x.returnvalue); + + send(ws, { + type: "catchup", + data: { + status, + total: jobIDs.length, + completed: doneJobIDs.length, + creditsUsed: jobIDs.length, + expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), + data: data.map(x => legacyDocumentConverter(x)), + } + }); + + if (status !== "scraping") { + finished = true; + return close(ws, 1000, { type: "done" }); + } +} + +// Basically just middleware and error wrapping +export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth) { + try { + const { success, team_id, error, status, plan } = await authenticateUser( + req, + null, + RateLimiterMode.CrawlStatus, + ); + + if (!success) { + return close(ws, 3000, { + type: "error", + error, + }); + } + + req.auth = { team_id, plan }; + + await crawlStatusWS(ws, req); + } catch (err) { + Sentry.captureException(err); + + const id = uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); + return close(ws, 1011, { + type: "error", + error: "An unexpected error occurred. Please contact hello@firecrawl.com for help.
Your exception ID is " + id + }); + } +} diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts new file mode 100644 index 00000000..1fe2fd9a --- /dev/null +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -0,0 +1,116 @@ +import { Response } from "express"; +import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; +import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis"; +import { getScrapeQueue } from "../../services/queue-service"; +import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; + +export async function getJob(id: string) { + const job = await getScrapeQueue().getJob(id); + if (!job) return job; + + if (process.env.USE_DB_AUTHENTICATION === "true") { + const supabaseData = await supabaseGetJobById(id); + + if (supabaseData) { + job.returnvalue = supabaseData.docs; + } + } + + job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + + return job; +} + +export async function getJobs(ids: string[]) { + const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); + + if (process.env.USE_DB_AUTHENTICATION === "true") { + const supabaseData = await supabaseGetJobsById(ids); + + supabaseData.forEach(x => { + const job = jobs.find(y => y.id === x.job_id); + if (job) { + job.returnvalue = x.docs; + } + }) + } + + jobs.forEach(job => { + job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + }); + + return jobs; +} + +export async function crawlStatusController(req: RequestWithAuth, res: Response) { + const sc = await getCrawl(req.params.jobId); + if (!sc) { + return res.status(404).json({ success: false, error: "Job not found" }); + } + + if (sc.team_id !== req.auth.team_id) { + return res.status(403).json({ success: false, error: "Forbidden" }); + } + + const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0; + const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined; + + const jobIDs = await getCrawlJobs(req.params.jobId); + const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x))); + const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping"; + const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); + const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? 
-1); + + let doneJobs = []; + + if (end === undefined) { // determine 10 megabyte limit + let bytes = 0; + const bytesLimit = 10485760; // 10 MiB in bytes + const factor = 100; // chunking for faster retrieval + + for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) { + // get current chunk and retrieve jobs + const currentIDs = doneJobsOrder.slice(i, i+factor); + const jobs = await getJobs(currentIDs); + + // iterate through jobs and add them one them one to the byte counter + // both loops will break once we cross the byte counter + for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) { + const job = jobs[ii]; + doneJobs.push(job); + bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length; + } + } + + // if we ran over the bytes limit, remove the last document + if (bytes > bytesLimit) { + doneJobs.splice(doneJobs.length - 1, 1); + } + } else { + doneJobs = await getJobs(doneJobsOrder); + } + + const data = doneJobs.map(x => x.returnvalue); + + const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); + + nextURL.searchParams.set("skip", (start + data.length).toString()); + + if (typeof req.query.limit === "string") { + nextURL.searchParams.set("limit", req.query.limit); + } + + res.status(200).json({ + status, + completed: doneJobsLength, + total: jobIDs.length, + creditsUsed: jobIDs.length, + expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), + next: + status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this + ? undefined + : nextURL.href, + data: data.map(x => legacyDocumentConverter(x)), + }); +} + diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts new file mode 100644 index 00000000..f4c4586f --- /dev/null +++ b/apps/api/src/controllers/v1/crawl.ts @@ -0,0 +1,157 @@ +import { Response } from "express"; +import { v4 as uuidv4 } from "uuid"; +import { + CrawlRequest, + crawlRequestSchema, + CrawlResponse, + legacyCrawlerOptions, + legacyScrapeOptions, + RequestWithAuth, +} from "./types"; +import { + addCrawlJob, + addCrawlJobs, + crawlToCrawler, + lockURL, + lockURLs, + saveCrawl, + StoredCrawl, +} from "../../lib/crawl-redis"; +import { logCrawl } from "../../services/logging/crawl_log"; +import { getScrapeQueue } from "../../services/queue-service"; +import { addScrapeJob } from "../../services/queue-jobs"; +import { Logger } from "../../lib/logger"; +import { getJobPriority } from "../../lib/job-priority"; + +export async function crawlController( + req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, + res: Response +) { + req.body = crawlRequestSchema.parse(req.body); + + const id = uuidv4(); + + await logCrawl(id, req.auth.team_id); + + const { remainingCredits } = req.account; + + const crawlerOptions = legacyCrawlerOptions(req.body); + const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); + + // TODO: @rafa, is this right? 
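
For context (a client-side sketch, not part of this PR): because a single status page is capped at roughly 10 MiB of serialized documents, callers are expected to keep following `next` until it is absent. Assuming a fetch-capable runtime and a crawl that has already completed:

```ts
async function fetchAllCrawlDocuments(statusUrl: string, apiKey: string) {
  const documents: unknown[] = [];
  let url: string | undefined = statusUrl; // e.g. https://api.firecrawl.dev/v1/crawl/<id>

  while (url) {
    const res = await fetch(url, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    const body = await res.json();
    if (body.success === false) throw new Error(body.error);

    documents.push(...body.data);
    url = body.next; // undefined once every completed document has been returned
  }

  return documents;
}
```
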
copied from v0 + if (Array.isArray(crawlerOptions.includes)) { + for (const x of crawlerOptions.includes) { + try { + new RegExp(x); + } catch (e) { + return res.status(400).json({ success: false, error: e.message }); + } + } + } + + if (Array.isArray(crawlerOptions.excludes)) { + for (const x of crawlerOptions.excludes) { + try { + new RegExp(x); + } catch (e) { + return res.status(400).json({ success: false, error: e.message }); + } + } + } + + crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); + + const sc: StoredCrawl = { + originUrl: req.body.url, + crawlerOptions, + pageOptions, + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan, + }; + + const crawler = crawlToCrawler(id, sc); + + try { + sc.robots = await crawler.getRobotsTxt(); + } catch (e) { + Logger.debug( + `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( + e + )}` + ); + } + + await saveCrawl(id, sc); + + const sitemap = sc.crawlerOptions.ignoreSitemap + ? null + : await crawler.tryGetSitemap(); + + if (sitemap !== null && sitemap.length > 0) { + let jobPriority = 20; + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(sitemap.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + } + const jobs = sitemap.map((x) => { + const url = x.url; + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls", + team_id: req.auth.team_id, + crawlerOptions, + pageOptions, + origin: "api", + crawl_id: id, + sitemapped: true, + v1: true, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + await lockURLs( + id, + jobs.map((x) => x.data.url) + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId) + ); + await getScrapeQueue().addBulk(jobs); + } else { + await lockURL(id, sc, req.body.url); + const job = await addScrapeJob( + { + url: req.body.url, + mode: "single_urls", + crawlerOptions: crawlerOptions, + team_id: req.auth.team_id, + pageOptions: pageOptions, + origin: "api", + crawl_id: id, + webhook: req.body.webhook, + v1: true, + }, + { + priority: 15, + } + ); + await addCrawlJob(id, job.id); + } + + return res.status(200).json({ + success: true, + id, + url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`, + }); +} diff --git a/apps/api/src/controllers/v1/liveness.ts b/apps/api/src/controllers/v1/liveness.ts new file mode 100644 index 00000000..8ff1a96f --- /dev/null +++ b/apps/api/src/controllers/v1/liveness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function livenessController(req: Request, res: Response) { + //TODO: add checks if the application is live and healthy like checking the redis connection + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts new file mode 100644 index 00000000..19dc4165 --- /dev/null +++ b/apps/api/src/controllers/v1/map.ts @@ -0,0 +1,122 @@ +import { Response } from "express"; +import { v4 as uuidv4 } from "uuid"; +import { + legacyCrawlerOptions, + mapRequestSchema, + RequestWithAuth, +} from "./types"; +import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; +import { MapResponse, MapRequest } from "./types"; +import { configDotenv } from "dotenv"; +import { + checkAndUpdateURLForMap, + isSameDomain, + isSameSubdomain, + removeDuplicateUrls, +} from "../../lib/validateUrl"; +import { fireEngineMap 
} from "../../search/fireEngine"; +import { billTeam } from "../../services/billing/credit_billing"; +import { logJob } from "../../services/logging/log_job"; + +configDotenv(); + +export async function mapController( + req: RequestWithAuth<{}, MapResponse, MapRequest>, + res: Response +) { + const startTime = new Date().getTime(); + + req.body = mapRequestSchema.parse(req.body); + + + const limit = req.body.limit; + const id = uuidv4(); + let links: string[] = [req.body.url]; + + const sc: StoredCrawl = { + originUrl: req.body.url, + crawlerOptions: legacyCrawlerOptions(req.body), + pageOptions: {}, + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan, + }; + + const crawler = crawlToCrawler(id, sc); + + const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap(); + + if (sitemap !== null) { + sitemap.map((x) => { + links.push(x.url); + }); + } + + let urlWithoutWww = req.body.url.replace("www.", ""); + + let mapUrl = req.body.search + ? `"${req.body.search}" site:${urlWithoutWww}` + : `site:${req.body.url}`; + // www. seems to exclude subdomains in some cases + const mapResults = await fireEngineMap(mapUrl, { + // limit to 50 results (beta) + numResults: Math.min(limit, 50), + }); + + if (mapResults.length > 0) { + if (req.body.search) { + // Ensure all map results are first, maintaining their order + links = [ + mapResults[0].url, + ...mapResults.slice(1).map((x) => x.url), + ...links, + ]; + } else { + mapResults.map((x) => { + links.push(x.url); + }); + } + } + + links = links.map((x) => checkAndUpdateURLForMap(x).url.trim()); + + // allows for subdomains to be included + links = links.filter((x) => isSameDomain(x, req.body.url)); + + // if includeSubdomains is false, filter out subdomains + if (!req.body.includeSubdomains) { + links = links.filter((x) => isSameSubdomain(x, req.body.url)); + } + + // remove duplicates that could be due to http/https or www + links = removeDuplicateUrls(links); + + await billTeam(req.auth.team_id, 1); + + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + + const linksToReturn = links.slice(0, limit); + + logJob({ + job_id: id, + success: links.length > 0, + message: "Map completed", + num_docs: links.length, + docs: linksToReturn, + time_taken: timeTakenInSeconds, + team_id: req.auth.team_id, + mode: "map", + url: req.body.url, + crawlerOptions: {}, + pageOptions: {}, + origin: req.body.origin, + extractor_options: { mode: "markdown" }, + num_tokens: 0, + }); + + return res.status(200).json({ + success: true, + links: linksToReturn, + }); +} diff --git a/apps/api/src/controllers/v1/readiness.ts b/apps/api/src/controllers/v1/readiness.ts new file mode 100644 index 00000000..cdb1f02c --- /dev/null +++ b/apps/api/src/controllers/v1/readiness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function readinessController(req: Request, res: Response) { + // TODO: add checks when the application is ready to serve traffic + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts new file mode 100644 index 00000000..940296bf --- /dev/null +++ b/apps/api/src/controllers/v1/scrape.ts @@ -0,0 +1,108 @@ +import { Request, Response } from "express"; +import { Logger } from '../../lib/logger'; +import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types"; +import { billTeam } from 
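
A hypothetical call against the new /v1/map controller above (field names follow `mapRequestSchema` later in this diff; the host and API key are placeholders):

```ts
async function mapSite() {
  const res = await fetch("https://api.firecrawl.dev/v1/map", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR_API_KEY",
    },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      search: "docs",           // optional: puts matching search results ahead of sitemap links
      includeSubdomains: false, // drop links on other subdomains
      limit: 50,                // fire-engine results are capped at 50 during the beta
    }),
  });
  const { success, links } = await res.json();
  return success ? links : [];
}
```
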
"../../services/billing/credit_billing"; +import { v4 as uuidv4 } from 'uuid'; +import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; +import { logJob } from "../../services/logging/log_job"; +import { getJobPriority } from "../../lib/job-priority"; +import { PlanType } from "../../types"; + +export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response) { + req.body = scrapeRequestSchema.parse(req.body); + let earlyReturn = false; + + const origin = req.body.origin; + const timeout = req.body.timeout; + const pageOptions = legacyScrapeOptions(req.body); + const jobId = uuidv4(); + + const startTime = new Date().getTime(); + const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10}) + + const job = await addScrapeJob({ + url: req.body.url, + mode: "single_urls", + crawlerOptions: {}, + team_id: req.auth.team_id, + pageOptions, + extractorOptions: {}, + origin: req.body.origin, + }, {}, jobId, jobPriority); + + let doc: any | undefined; + try { + doc = (await waitForJob(job.id, timeout))[0]; + } catch (e) { + Logger.error(`Error in scrapeController: ${e}`); + if (e instanceof Error && e.message.startsWith("Job wait")) { + return res.status(408).json({ + success: false, + error: "Request timed out", + }); + } else { + return res.status(500).json({ + success: false, + error: "Internal server error", + }); + } + } + + await job.remove(); + + if (!doc) { + console.error("!!! PANIC DOC IS", doc, job); + return res.status(200).json({ + success: true, + warning: "No page found", + data: doc + }); + } + + delete doc.index; + delete doc.provider; + + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0; + + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (earlyReturn) { + // Don't bill if we're early returning + return; + } + + const billingResult = await billTeam( + req.auth.team_id, + creditsToBeBilled + ); + if (!billingResult.success) { + return res.status(402).json({ + success: false, + error: "Failed to bill team. 
Insufficient credits or subscription not found.", + }); + } + + logJob({ + job_id: jobId, + success: true, + message: "Scrape completed", + num_docs: 1, + docs: [doc], + time_taken: timeTakenInSeconds, + team_id: req.auth.team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: {}, + pageOptions: pageOptions, + origin: origin, + extractor_options: { mode: "markdown" }, + num_tokens: numTokens, + }); + + return res.status(200).json({ + success: true, + data: legacyDocumentConverter(doc), + }); +} \ No newline at end of file diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts new file mode 100644 index 00000000..20f1b775 --- /dev/null +++ b/apps/api/src/controllers/v1/types.ts @@ -0,0 +1,321 @@ +import { Request, Response } from "express"; +import { z } from "zod"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; +import { PageOptions } from "../../lib/entities"; +import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; +import { PlanType } from "../../types"; + +export type Format = + | "markdown" + | "html" + | "rawHtml" + | "links" + | "screenshot" + | "screenshot@fullPage"; + +export const url = z.preprocess( + (x) => { + if (!protocolIncluded(x as string)) { + return `http://${x}`; + } + return x; + }, + z + .string() + .url() + .regex(/^https?:\/\//, "URL uses unsupported protocol") + .refine( + (x) => /\.[a-z]{2,}(\/|$)/i.test(x), + "URL must have a valid top-level domain or be a valid path" + ) + .refine( + (x) => checkUrl(x as string), + "Invalid URL" + ) + .refine( + (x) => !isUrlBlocked(x as string), + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ) +); + +const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes"; + +export const scrapeOptions = z.object({ + formats: z + .enum([ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + ]) + .array() + .optional() + .default(["markdown"]), + headers: z.record(z.string(), z.string()).optional(), + includeTags: z.string().array().optional(), + excludeTags: z.string().array().optional(), + onlyMainContent: z.boolean().default(true), + timeout: z.number().int().positive().finite().safe().default(30000), // default? + waitFor: z.number().int().nonnegative().finite().safe().default(0), + parsePDF: z.boolean().default(true), +}).strict(strictMessage); + +export type ScrapeOptions = z.infer; + +export const scrapeRequestSchema = scrapeOptions.extend({ + url, + origin: z.string().optional().default("api"), +}).strict(strictMessage); + +// export type ScrapeRequest = { +// url: string; +// formats?: Format[]; +// headers?: { [K: string]: string }; +// includeTags?: string[]; +// excludeTags?: string[]; +// onlyMainContent?: boolean; +// timeout?: number; +// waitFor?: number; +// } + +export type ScrapeRequest = z.infer; + +const crawlerOptions = z.object({ + includePaths: z.string().array().default([]), + excludePaths: z.string().array().default([]), + maxDepth: z.number().default(10), // default? + limit: z.number().default(10000), // default? + allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? 
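
As an aside (illustration only), the net effect of the `url` preprocessing plus the `.strict()` schemas above is easiest to see by parsing a minimal body:

```ts
import { scrapeRequestSchema } from "./types"; // as the v1 controllers import it

const parsed = scrapeRequestSchema.parse({ url: "docs.firecrawl.dev" });
// parsed.url             -> "http://docs.firecrawl.dev"  (protocol added by the preprocess step)
// parsed.formats         -> ["markdown"]                 (default format)
// parsed.onlyMainContent -> true, parsed.timeout -> 30000, parsed.origin -> "api"

// Unknown keys are rejected because the schemas are .strict():
// scrapeRequestSchema.parse({ url: "https://docs.firecrawl.dev", pageOptions: {} })
//   -> throws ZodError: "Unrecognized key in body -- please review the v1 API documentation for request body changes"
```
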
+ allowExternalLinks: z.boolean().default(false), + ignoreSitemap: z.boolean().default(true), +}).strict(strictMessage); + +// export type CrawlerOptions = { +// includePaths?: string[]; +// excludePaths?: string[]; +// maxDepth?: number; +// limit?: number; +// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME??? +// allowExternalLinks?: boolean; +// ignoreSitemap?: boolean; +// }; + +export type CrawlerOptions = z.infer; + +export const crawlRequestSchema = crawlerOptions.extend({ + url, + origin: z.string().optional().default("api"), + scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), + webhook: z.string().url().optional(), + limit: z.number().default(10000), +}).strict(strictMessage); + +// export type CrawlRequest = { +// url: string; +// crawlerOptions?: CrawlerOptions; +// scrapeOptions?: Exclude; +// }; + +export type CrawlRequest = z.infer; + +export const mapRequestSchema = crawlerOptions.extend({ + url, + origin: z.string().optional().default("api"), + includeSubdomains: z.boolean().default(true), + search: z.string().optional(), + ignoreSitemap: z.boolean().default(false), + limit: z.number().min(1).max(50).default(5000).optional(), +}).strict(strictMessage); + +// export type MapRequest = { +// url: string; +// crawlerOptions?: CrawlerOptions; +// }; + +export type MapRequest = z.infer; + +export type Document = { + markdown?: string; + html?: string; + rawHtml?: string; + links?: string[]; + screenshot?: string; + metadata: { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dcTermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dcTermsType?: string; + dcType?: string; + dcTermsAudience?: string; + dcTermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dcTermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + statusCode?: number; + error?: string; + }; +}; + +export type ErrorResponse = { + success: false; + error: string; + details?: any; +}; + +export type ScrapeResponse = + | ErrorResponse + | { + success: true; + warning?: string; + data: Document; + }; + +export interface ScrapeResponseRequestTest { + statusCode: number; + body: ScrapeResponse; + error?: string; +} + +export type CrawlResponse = + | ErrorResponse + | { + success: true; + id: string; + url: string; + }; + +export type MapResponse = + | ErrorResponse + | { + success: true; + links: string[]; + }; + +export type CrawlStatusParams = { + jobId: string; +}; + +export type CrawlStatusResponse = + | ErrorResponse + | { + status: "scraping" | "completed" | "failed" | "cancelled"; + completed: number; + total: number; + creditsUsed: number; + expiresAt: string; + next?: string; + data: Document[]; + }; + +type AuthObject = { + team_id: string; + plan: PlanType; +}; + +type Account = { + remainingCredits: number; +}; + +export interface RequestWithMaybeAuth< + ReqParams = {}, + ReqBody = undefined, + ResBody = undefined +> extends Request { + auth?: AuthObject; + account?: Account; +} + +export interface RequestWithAuth< + ReqParams = {}, + ReqBody = undefined, + ResBody = undefined, +> extends Request { + auth: AuthObject; + account?: Account; +} + +export interface ResponseWithSentry< + 
ResBody = undefined, +> extends Response { + sentry?: string, +} + +export function legacyCrawlerOptions(x: CrawlerOptions) { + return { + includes: x.includePaths, + excludes: x.excludePaths, + maxCrawledLinks: x.limit, + maxCrawledDepth: x.maxDepth, + limit: x.limit, + generateImgAltText: false, + allowBackwardCrawling: x.allowBackwardLinks, + allowExternalContentLinks: x.allowExternalLinks, + }; +} + +export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { + return { + includeMarkdown: x.formats.includes("markdown"), + includeHtml: x.formats.includes("html"), + includeRawHtml: x.formats.includes("rawHtml"), + onlyIncludeTags: x.includeTags, + removeTags: x.excludeTags, + onlyMainContent: x.onlyMainContent, + waitFor: x.waitFor, + includeLinks: x.formats.includes("links"), + screenshot: x.formats.includes("screenshot"), + fullPageScreenshot: x.formats.includes("screenshot@fullPage"), + parsePDF: x.parsePDF, + }; +} + +export function legacyDocumentConverter(doc: any): Document { + if (doc.metadata) { + if (doc.metadata.screenshot) { + doc.screenshot = doc.metadata.screenshot; + delete doc.metadata.screenshot; + } + + if (doc.metadata.fullPageScreenshot) { + doc.fullPageScreenshot = doc.metadata.fullPageScreenshot; + delete doc.metadata.fullPageScreenshot; + } + } + + return { + markdown: doc.markdown, + links: doc.linksOnPage, + rawHtml: doc.rawHtml, + html: doc.html, + screenshot: doc.screenshot ?? doc.fullPageScreenshot, + metadata: { + ...doc.metadata, + pageError: undefined, + pageStatusCode: undefined, + error: doc.metadata.pageError, + statusCode: doc.metadata.pageStatusCode, + }, + }; +} diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 4d096894..a30005c4 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,7 +1,7 @@ import "dotenv/config"; import "./services/sentry" import * as Sentry from "@sentry/node"; -import express from "express"; +import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; import cors from "cors"; import { getScrapeQueue } from "./services/queue-service"; @@ -15,8 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events"; import http from 'node:http'; import https from 'node:https'; import CacheableLookup from 'cacheable-lookup'; - - +import { v1Router } from "./routes/v1"; +import expressWs from "express-ws"; +import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; +import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; +import { ZodError } from "zod"; +import { v4 as uuidv4 } from "uuid"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -49,7 +53,8 @@ if (cluster.isMaster) { } }); } else { - const app = express(); + const ws = expressWs(express()); + const app = ws.app; global.isProduction = process.env.IS_PRODUCTION === "true"; @@ -82,6 +87,7 @@ if (cluster.isMaster) { // register router app.use(v0Router); + app.use("/v1", v1Router); app.use(adminRouter); const DEFAULT_PORT = process.env.PORT ?? 3002; @@ -186,6 +192,27 @@ if (cluster.isMaster) { Sentry.setupExpressErrorHandler(app); + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { + if (err instanceof ZodError) { + res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); + } else { + const id = res.sentry ?? 
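
To make the v1-to-v0 bridging above concrete (illustration only, not part of the diff), `legacyScrapeOptions` translates the v1 `formats` array back into the old boolean `PageOptions` flags:

```ts
import { legacyScrapeOptions, scrapeOptions } from "./types"; // as the v1 controllers import them

const pageOptions = legacyScrapeOptions(
  scrapeOptions.parse({ formats: ["markdown", "links", "screenshot"] })
);
// pageOptions.includeMarkdown -> true
// pageOptions.includeLinks    -> true
// pageOptions.screenshot      -> true
// pageOptions.includeHtml, includeRawHtml, fullPageScreenshot -> false
// defaults carried over: onlyMainContent -> true, waitFor -> 0, parsePDF -> true
```
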
uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); + } + }); + Logger.info(`Worker ${process.pid} started`); } diff --git a/apps/api/src/lib/checkCredits.ts b/apps/api/src/lib/checkCredits.ts new file mode 100644 index 00000000..7e9d988d --- /dev/null +++ b/apps/api/src/lib/checkCredits.ts @@ -0,0 +1,32 @@ +import { checkTeamCredits } from "../services/billing/credit_billing"; +import { Logger } from "./logger"; + +type checkCreditsResponse = { + status: number; + error: string | null; +} + +export const checkCredits = async (team_id: string): Promise => { + try { + const { + success: creditsCheckSuccess, + message: creditsCheckMessage + } = await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return { + status: 402, + error: "Insufficient credits" + }; + } + } catch (error) { + Logger.error(error); + return { + status: 500, + error: "Error checking team credits. Please contact hello@firecrawl.com for help." + }; + } + return { + status: 200, + error: null + } +}; \ No newline at end of file diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index b8c1c151..9240018e 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -27,6 +27,14 @@ export async function getCrawl(id: string): Promise { return JSON.parse(x); } +export async function getCrawlExpiry(id: string): Promise { + const d = new Date(); + const ttl = await redisConnection.pttl("crawl:" + id); + d.setMilliseconds(d.getMilliseconds() + ttl); + d.setMilliseconds(0); + return d; +} + export async function addCrawlJob(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs", job_id); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); @@ -39,13 +47,27 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobDone(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); + await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id); await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX"); +} + +export async function getDoneJobsOrderedLength(id: string): Promise { + return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); +} + +export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise { + return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end); } export async function isCrawlFinished(id: string) { return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs")); } +export async function isCrawlFinishedLocked(id: string) { + return (await redisConnection.exists("crawl:" + id + ":finish")); +} + export async function finishCrawl(id: string) { if (await isCrawlFinished(id)) { const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index d833bda0..b4b26040 100644 --- 
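
A brief note on the new ordered list above (sketch only): `addCrawlJobDone` now also LPUSHes each finished job id onto `crawl:<id>:jobs_done_ordered`, which is what lets the v1 status endpoints page through results with plain LRANGE/LLEN calls, e.g.:

```ts
import { getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis"; // path as used by the v1 controllers

async function firstPage(crawlId: string) {
  // LRANGE crawl:<id>:jobs_done_ordered 0 99 -- finished job ids, most recently pushed first
  const jobIds = await getDoneJobsOrdered(crawlId, 0, 99);
  // LLEN of the same list, surfaced as `completed` by the status controller
  const completed = await getDoneJobsOrderedLength(crawlId);
  return { jobIds, completed };
}
```
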
a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -11,6 +11,7 @@ export interface Progress { } export type PageOptions = { + includeMarkdown?: boolean; onlyMainContent?: boolean; includeHtml?: boolean; includeRawHtml?: boolean; @@ -24,6 +25,7 @@ export type PageOptions = { parsePDF?: boolean; removeTags?: string | string[]; onlyIncludeTags?: string | string[]; + includeLinks?: boolean; useFastMode?: boolean; // beta disableJSDom?: boolean; // beta atsv?: boolean; // beta diff --git a/apps/api/src/lib/validateUrl.test.ts b/apps/api/src/lib/validateUrl.test.ts new file mode 100644 index 00000000..eec39f97 --- /dev/null +++ b/apps/api/src/lib/validateUrl.test.ts @@ -0,0 +1,159 @@ +import { isSameDomain, removeDuplicateUrls } from "./validateUrl"; +import { isSameSubdomain } from "./validateUrl"; + +describe("isSameDomain", () => { + it("should return true for a subdomain", () => { + const result = isSameDomain("http://sub.example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return true for the same domain", () => { + const result = isSameDomain("http://example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return false for different domains", () => { + const result = isSameDomain("http://example.com", "http://another.com"); + expect(result).toBe(false); + }); + + it("should return true for a subdomain with different protocols", () => { + const result = isSameDomain("https://sub.example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return false for invalid URLs", () => { + const result = isSameDomain("invalid-url", "http://example.com"); + expect(result).toBe(false); + const result2 = isSameDomain("http://example.com", "invalid-url"); + expect(result2).toBe(false); + }); + + it("should return true for a subdomain with www prefix", () => { + const result = isSameDomain("http://www.sub.example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return true for the same domain with www prefix", () => { + const result = isSameDomain("http://docs.s.s.example.com", "http://example.com"); + expect(result).toBe(true); + }); +}); + + + + +describe("isSameSubdomain", () => { + it("should return false for a subdomain", () => { + const result = isSameSubdomain("http://example.com", "http://docs.example.com"); + expect(result).toBe(false); + }); + + it("should return true for the same subdomain", () => { + const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com"); + expect(result).toBe(true); + }); + + it("should return false for different subdomains", () => { + const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com"); + expect(result).toBe(false); + }); + + it("should return false for different domains", () => { + const result = isSameSubdomain("http://example.com", "http://another.com"); + expect(result).toBe(false); + }); + + it("should return false for invalid URLs", () => { + const result = isSameSubdomain("invalid-url", "http://example.com"); + expect(result).toBe(false); + const result2 = isSameSubdomain("http://example.com", "invalid-url"); + expect(result2).toBe(false); + }); + + it("should return true for the same subdomain with different protocols", () => { + const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com"); + expect(result).toBe(true); + }); + + it("should return true for the same subdomain with www prefix", () => { + const result = 
isSameSubdomain("http://www.docs.example.com", "http://docs.example.com"); + expect(result).toBe(true); + }); + + it("should return false for a subdomain with www prefix and different subdomain", () => { + const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com"); + expect(result).toBe(false); + }); +}); + +describe("removeDuplicateUrls", () => { + it("should remove duplicate URLs with different protocols", () => { + const urls = [ + "http://example.com", + "https://example.com", + "http://www.example.com", + "https://www.example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); + + it("should keep URLs with different paths", () => { + const urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page1?param=1", + "https://example.com/page1#section1" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual([ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page1?param=1", + "https://example.com/page1#section1" + ]); + }); + + it("should prefer https over http", () => { + const urls = [ + "http://example.com", + "https://example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); + + it("should prefer non-www over www", () => { + const urls = [ + "https://www.example.com", + "https://example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); + + it("should handle empty input", () => { + const urls: string[] = []; + const result = removeDuplicateUrls(urls); + expect(result).toEqual([]); + }); + + it("should handle URLs with different cases", () => { + const urls = [ + "https://EXAMPLE.com", + "https://example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://EXAMPLE.com"]); + }); + + it("should handle URLs with trailing slashes", () => { + const urls = [ + "https://example.com", + "https://example.com/" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); +}); diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts index 2d2111c8..14a74de8 100644 --- a/apps/api/src/lib/validateUrl.ts +++ b/apps/api/src/lib/validateUrl.ts @@ -1,9 +1,8 @@ - -const protocolIncluded = (url: string) => { +export const protocolIncluded = (url: string) => { // if :// not in the start of the url assume http (maybe https?) // regex checks if :// appears before any . 
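
A few illustrative inputs for the regex this helper tests (example values only):

```ts
import { protocolIncluded } from "./validateUrl"; // sibling module inside apps/api/src/lib

console.log(protocolIncluded("https://example.com")); // true  -> URL is used as-is
console.log(protocolIncluded("example.com"));         // false -> callers prepend "http://"
console.log(protocolIncluded("example.com://odd"));   // false -> "://" only counts before the first "."
```
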
- return(/^([^.:]+:\/\/)/.test(url)); -} + return /^([^.:]+:\/\/)/.test(url); +}; const getURLobj = (s: string) => { // URL fails if we dont include the protocol ie google.com @@ -18,7 +17,6 @@ const getURLobj = (s: string) => { }; export const checkAndUpdateURL = (url: string) => { - if (!protocolIncluded(url)) { url = `http://${url}`; } @@ -30,9 +28,143 @@ export const checkAndUpdateURL = (url: string) => { const typedUrlObj = urlObj as URL; - if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { + if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { throw new Error("Invalid URL"); } return { urlObj: typedUrlObj, url: url }; +}; + +export const checkUrl = (url: string) => { + const { error, urlObj } = getURLobj(url); + if (error) { + throw new Error("Invalid URL"); + } + + const typedUrlObj = urlObj as URL; + + if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { + throw new Error("Invalid URL"); + } + + if ((url.split(".")[0].match(/:/g) || []).length !== 1) { + throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com + } + + return url; +}; + +/** + * Same domain check + * It checks if the domain of the url is the same as the base url + * It accounts true for subdomains and www.subdomains + * @param url + * @param baseUrl + * @returns + */ +export function isSameDomain(url: string, baseUrl: string) { + const { urlObj: urlObj1, error: error1 } = getURLobj(url); + const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl); + + if (error1 || error2) { + return false; + } + + const typedUrlObj1 = urlObj1 as URL; + const typedUrlObj2 = urlObj2 as URL; + + const cleanHostname = (hostname: string) => { + return hostname.startsWith('www.') ? hostname.slice(4) : hostname; + }; + + const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.'); + const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.'); + + return domain1 === domain2; +} + + +export function isSameSubdomain(url: string, baseUrl: string) { + const { urlObj: urlObj1, error: error1 } = getURLobj(url); + const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl); + + if (error1 || error2) { + return false; + } + + const typedUrlObj1 = urlObj1 as URL; + const typedUrlObj2 = urlObj2 as URL; + + const cleanHostname = (hostname: string) => { + return hostname.startsWith('www.') ? 
hostname.slice(4) : hostname; + }; + + const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.'); + const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.'); + + const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.'); + const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.'); + + // Check if the domains are the same and the subdomains are the same + return domain1 === domain2 && subdomain1 === subdomain2; +} + + +export const checkAndUpdateURLForMap = (url: string) => { + if (!protocolIncluded(url)) { + url = `http://${url}`; + } + // remove last slash if present + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + + + const { error, urlObj } = getURLobj(url); + if (error) { + throw new Error("Invalid URL"); + } + + const typedUrlObj = urlObj as URL; + + if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { + throw new Error("Invalid URL"); + } + + // remove any query params + url = url.split("?")[0].trim(); + + return { urlObj: typedUrlObj, url: url }; +}; + + + + + +export function removeDuplicateUrls(urls: string[]): string[] { + const urlMap = new Map(); + + for (const url of urls) { + const parsedUrl = new URL(url); + const protocol = parsedUrl.protocol; + const hostname = parsedUrl.hostname.replace(/^www\./, ''); + const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash; + + const key = `${hostname}${path}`; + + if (!urlMap.has(key)) { + urlMap.set(key, url); + } else { + const existingUrl = new URL(urlMap.get(key)!); + const existingProtocol = existingUrl.protocol; + + if (protocol === 'https:' && existingProtocol === 'http:') { + urlMap.set(key, url); + } else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) { + urlMap.set(key, url); + } + } + } + + return [...new Set(Array.from(urlMap.values()))]; } \ No newline at end of file diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index aea7876e..84d6e99a 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -26,7 +26,12 @@ export async function startWebScraperPipeline({ mode: job.data.mode, crawlerOptions: job.data.crawlerOptions, extractorOptions: job.data.extractorOptions, - pageOptions: job.data.pageOptions, + pageOptions: { + ...job.data.pageOptions, + ...(job.data.crawl_id ? 
({ + includeRawHtml: true, + }): {}), + }, inProgress: (progress) => { Logger.debug(`🐂 Job in progress ${job.id}`); if (progress.currentDocument) { @@ -39,6 +44,9 @@ export async function startWebScraperPipeline({ }, onSuccess: (result, mode) => { Logger.debug(`🐂 Job completed ${job.id}`); + if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { + delete result[0].rawHtml; + } saveJob(job, result, token, mode); }, onError: (error) => { diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index d32808ce..38611eac 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -1,11 +1,11 @@ import express from "express"; -import { redisHealthController } from "../controllers/admin/redis-health"; +import { redisHealthController } from "../controllers/v0/admin/redis-health"; import { autoscalerController, checkQueuesController, cleanBefore24hCompleteJobsController, queuesController, -} from "../controllers/admin/queue"; +} from "../controllers/v0/admin/queue"; export const adminRouter = express.Router(); diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 9c68d9bb..3a7bda65 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -1,14 +1,14 @@ import express from "express"; -import { crawlController } from "../../src/controllers/crawl"; -import { crawlStatusController } from "../../src/controllers/crawl-status"; -import { scrapeController } from "../../src/controllers/scrape"; -import { crawlPreviewController } from "../../src/controllers/crawlPreview"; -import { crawlJobStatusPreviewController } from "../../src/controllers/status"; -import { searchController } from "../../src/controllers/search"; -import { crawlCancelController } from "../../src/controllers/crawl-cancel"; -import { keyAuthController } from "../../src/controllers/keyAuth"; -import { livenessController } from "../controllers/liveness"; -import { readinessController } from "../controllers/readiness"; +import { crawlController } from "../../src/controllers/v0/crawl"; +import { crawlStatusController } from "../../src/controllers/v0/crawl-status"; +import { scrapeController } from "../../src/controllers/v0/scrape"; +import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview"; +import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status"; +import { searchController } from "../../src/controllers/v0/search"; +import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel"; +import { keyAuthController } from "../../src/controllers/v0/keyAuth"; +import { livenessController } from "../controllers/v0/liveness"; +import { readinessController } from "../controllers/v0/readiness"; export const v0Router = express.Router(); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts new file mode 100644 index 00000000..27da0a1a --- /dev/null +++ b/apps/api/src/routes/v1.ts @@ -0,0 +1,150 @@ +import express, { NextFunction, Request, Response } from "express"; +import { crawlController } from "../controllers/v1/crawl"; +// import { crawlStatusController } from "../../src/controllers/v1/crawl-status"; +import { scrapeController } from "../../src/controllers/v1/scrape"; +import { crawlStatusController } from "../controllers/v1/crawl-status"; +import { mapController } from "../controllers/v1/map"; +import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types"; +import { RateLimiterMode } from "../types"; +import { authenticateUser } from 
"../controllers/auth"; +import { createIdempotencyKey } from "../services/idempotency/create"; +import { validateIdempotencyKey } from "../services/idempotency/validate"; +import { checkTeamCredits } from "../services/billing/credit_billing"; +import expressWs from "express-ws"; +import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { crawlCancelController } from "../controllers/v1/crawl-cancel"; +import { Logger } from "../lib/logger"; +// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; +// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; +// import { searchController } from "../../src/controllers/v1/search"; +// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel"; +// import { keyAuthController } from "../../src/controllers/v1/keyAuth"; +// import { livenessController } from "../controllers/v1/liveness"; +// import { readinessController } from "../controllers/v1/readiness"; + +function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void { + return (req, res, next) => { + (async () => { + if (!minimum && req.body) { + minimum = (req.body as any)?.limit ?? 1; + } + const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum); + if (!success) { + Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); + return res.status(402).json({ success: false, error: "Insufficient credits" }); + } + req.account = { remainingCredits } + next(); + })() + .catch(err => next(err)); + }; +} + +export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { + return (req, res, next) => { + (async () => { + const { success, team_id, error, status, plan } = await authenticateUser( + req, + res, + rateLimiterMode, + ); + + if (!success) { + return res.status(status).json({ success: false, error }); + } + + req.auth = { team_id, plan }; + next(); + })() + .catch(err => next(err)); + } +} + +function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) { + (async () => { + if (req.headers["x-idempotency-key"]) { + const isIdempotencyValid = await validateIdempotencyKey(req); + if (!isIdempotencyValid) { + return res.status(409).json({ success: false, error: "Idempotency key already used" }); + } + createIdempotencyKey(req); + } + next(); + })() + .catch(err => next(err)); +} + +function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { + if (req.body.url && isUrlBlocked(req.body.url)) { + return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
}); + } + next(); +} + +function wrap(controller: (req: Request, res: Response) => Promise): (req: Request, res: Response, next: NextFunction) => any { + return (req, res, next) => { + controller(req, res) + .catch(err => next(err)) + } +} + +expressWs(express()); + +export const v1Router = express.Router(); + +v1Router.post( + "/scrape", + blocklistMiddleware, + authMiddleware(RateLimiterMode.Scrape), + checkCreditsMiddleware(1), + wrap(scrapeController) +); + +v1Router.post( + "/crawl", + blocklistMiddleware, + authMiddleware(RateLimiterMode.Crawl), + idempotencyMiddleware, + checkCreditsMiddleware(), + wrap(crawlController) +); + +v1Router.post( + "/map", + blocklistMiddleware, + authMiddleware(RateLimiterMode.Map), + checkCreditsMiddleware(1), + wrap(mapController) +); + +v1Router.get( + "/crawl/:jobId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(crawlStatusController) +); + +v1Router.ws( + "/crawl/:jobId", + crawlStatusWSController +); + +// v1Router.post("/crawlWebsitePreview", crawlPreviewController); + + +v1Router.delete( + "/crawl/:jobId", + authMiddleware(RateLimiterMode.Crawl), + crawlCancelController +); +// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController); + +// // Auth route for key based authentication +// v1Router.get("/keyAuth", keyAuthController); + +// // Search routes +// v0Router.post("/search", searchController); + +// Health/Probe routes +// v1Router.get("/health/liveness", livenessController); +// v1Router.get("/health/readiness", readinessController); diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index da66830b..c4c7de65 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -31,7 +31,8 @@ it('should return a list of links on the firecrawl.ai page', async () => { // Check if the result contains a list of links expect(result.linksOnPage).toBeDefined(); + console.log({result}); expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); expect(result.linksOnPage).toContain('https://flutterbricks.com/features') -}, 10000); +}, 15000); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 92b9ae40..d5dadaf8 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -309,6 +309,23 @@ export class WebCrawler { return null; } + public extractLinksFromHTML(html: string, url: string) { + let links: string[] = []; + + const $ = load(html); + $("a").each((_, element) => { + const href = $(element).attr("href"); + if (href) { + const u = this.filterURL(href, url); + if (u !== null) { + links.push(u); + } + } + }); + + return links; + } + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; @@ -352,15 +369,7 @@ export class WebCrawler { links.push({ url, html: content, pageStatusCode, pageError }); } - $("a").each((_, element) => { - const href = $(element).attr("href"); - if (href) { - const u = this.filterURL(href, url); - if (u !== null) { - links.push({ url: u, html: content, pageStatusCode, pageError }); - } - } - }); + links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError }))); if 
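
An aside on the idempotency middleware wired into POST /v1/crawl above (hypothetical sketch; only the 409 error message is taken from this diff):

```ts
async function demoIdempotency() {
  const headers = {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY",
    "x-idempotency-key": "d1c9a2f0-1111-4a2b-9c3d-000000000000", // any unique value, typically a UUID
  };
  const body = JSON.stringify({ url: "https://docs.firecrawl.dev", limit: 10 });

  const first = await fetch("https://api.firecrawl.dev/v1/crawl", { method: "POST", headers, body });
  // -> 200 { success: true, id, url }

  const retry = await fetch("https://api.firecrawl.dev/v1/crawl", { method: "POST", headers, body });
  // -> 409 { success: false, error: "Idempotency key already used" }

  return { first: await first.json(), retry: await retry.json() };
}
```
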
(this.visited.size === 1) { return links; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 38d0cc32..44a90b85 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -294,7 +294,16 @@ export class WebScraperDataProvider { documents = await this.getSitemapData(this.urls[0], documents); } - documents = this.applyPathReplacements(documents); + if (this.pageOptions.includeMarkdown) { + documents = this.applyPathReplacements(documents); + } + + if (!this.pageOptions.includeHtml) { + for (let document of documents) { + delete document.html; + } + } + // documents = await this.applyImgAltText(documents); if ( (this.extractorOptions.mode === "llm-extraction" || @@ -347,6 +356,7 @@ export class WebScraperDataProvider { }); return { content: content, + markdown: content, metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, provider: "web-scraper", }; @@ -569,12 +579,20 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { - onlyMainContent: false, - includeHtml: false, - replaceAllPathsWithAbsolutePaths: false, - parsePDF: true, - removeTags: [], + this.pageOptions = { + onlyMainContent: options.pageOptions?.onlyMainContent ?? false, + includeHtml: options.pageOptions?.includeHtml ?? false, + replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true, + parsePDF: options.pageOptions?.parsePDF ?? true, + onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [], + removeTags: options.pageOptions?.removeTags ?? [], + includeMarkdown: options.pageOptions?.includeMarkdown ?? true, + includeRawHtml: options.pageOptions?.includeRawHtml ?? false, + waitFor: options.pageOptions?.waitFor ?? undefined, + headers: options.pageOptions?.headers ?? undefined, + includeLinks: options.pageOptions?.includeLinks ?? true, + fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, + screenshot: options.pageOptions?.screenshot ?? false, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 6998a665..bdcd62cd 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -122,22 +122,38 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( jobId: string, urlToScrap: string, - pageOptions: PageOptions = { - onlyMainContent: true, - includeHtml: false, - includeRawHtml: false, - waitFor: 0, - screenshot: false, - fullPageScreenshot: false, - headers: undefined, - }, - extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown", - }, - existingHtml: string = "", + pageOptions: PageOptions, + extractorOptions?: ExtractorOptions, + existingHtml?: string, priority?: number, teamId?: string ): Promise { + pageOptions = { + includeMarkdown: pageOptions.includeMarkdown ?? true, + onlyMainContent: pageOptions.onlyMainContent ?? false, + includeHtml: pageOptions.includeHtml ?? false, + includeRawHtml: pageOptions.includeRawHtml ?? false, + waitFor: pageOptions.waitFor ?? undefined, + screenshot: pageOptions.screenshot ?? false, + fullPageScreenshot: pageOptions.fullPageScreenshot ?? false, + headers: pageOptions.headers ?? 
undefined, + includeLinks: pageOptions.includeLinks ?? true, + replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true, + parsePDF: pageOptions.parsePDF ?? true, + removeTags: pageOptions.removeTags ?? [], + onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], + } + + if (extractorOptions) { + extractorOptions = { + mode: extractorOptions?.mode ?? "llm-extraction-from-markdown", + } + } + + if (!existingHtml) { + existingHtml = ""; + } + urlToScrap = urlToScrap.trim(); const attemptScraping = async ( @@ -341,8 +357,8 @@ export async function scrapSingleUrl( pageError = undefined; } - if (text && text.trim().length >= 100) { - Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); + if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; } if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) { @@ -364,20 +380,22 @@ export async function scrapSingleUrl( let linksOnPage: string[] | undefined; - linksOnPage = extractLinks(rawHtml, urlToScrap); + if (pageOptions.includeLinks) { + linksOnPage = extractLinks(rawHtml, urlToScrap); + } let document: Document; if (screenshot && screenshot.length > 0) { document = { content: text, - markdown: text, + markdown: pageOptions.includeMarkdown ? text : undefined, html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions?.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, - linksOnPage, + linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, metadata: { ...metadata, screenshot: screenshot, @@ -389,11 +407,11 @@ export async function scrapSingleUrl( } else { document = { content: text, - markdown: text, + markdown: pageOptions.includeMarkdown ? text : undefined, html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions?.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, metadata: { @@ -402,7 +420,7 @@ export async function scrapSingleUrl( pageStatusCode: pageStatusCode, pageError: pageError, }, - linksOnPage, + linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, }; } @@ -416,9 +434,9 @@ export async function scrapSingleUrl( }); return { content: "", - markdown: "", + markdown: pageOptions.includeMarkdown ? "" : undefined, html: "", - linksOnPage: [], + linksOnPage: pageOptions.includeLinks ? [] : undefined, metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode, diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 9496d569..fac53b38 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { description = soup('meta[name="description"]').attr("content") || null; // Assuming the language is part of the URL as per the regex pattern - const pattern = /([a-zA-Z]+-[A-Z]{2})/; - const match = pattern.exec(url); - language = match ? 
match[1] : null; + language = soup('html').attr('lang') || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts new file mode 100644 index 00000000..7c6d8a4d --- /dev/null +++ b/apps/api/src/search/fireEngine.ts @@ -0,0 +1,45 @@ +import axios from "axios"; +import dotenv from "dotenv"; +import { SearchResult } from "../../src/lib/entities"; + +dotenv.config(); + +export async function fireEngineMap(q: string, options: { + tbs?: string; + filter?: string; + lang?: string; + country?: string; + location?: string; + numResults: number; + page?: number; +}): Promise { + let data = JSON.stringify({ + query: q, + lang: options.lang, + country: options.country, + location: options.location, + tbs: options.tbs, + numResults: options.numResults, + page: options.page ?? 1, + }); + + if (!process.env.FIRE_ENGINE_BETA_URL) { + console.warn("(v1/map Beta) Results might differ from cloud offering currently."); + return []; + } + + let config = { + method: "POST", + url: `${process.env.FIRE_ENGINE_BETA_URL}/search`, + headers: { + "Content-Type": "application/json", + }, + data: data, + }; + const response = await axios(config); + if (response && response) { + return response.data + } else { + return []; + } +} diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 060f4bd8..0e247702 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string -export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { +export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { let proxies = null; if (proxy) { if (proxy.startsWith("https")) { diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index f5bc06e3..f4c5b6d0 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -1,11 +1,9 @@ import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; -import { google_search } from "./googlesearch"; +import { googleSearch } from "./googlesearch"; +import { fireEngineMap } from "./fireEngine"; import { serper_search } from "./serper"; - - - export async function search({ query, advanced = false, @@ -30,12 +28,20 @@ export async function search({ proxy?: string; sleep_interval?: number; timeout?: number; -}) : Promise { +}): Promise { try { - if (process.env.SERPER_API_KEY ) { - return await serper_search(query, {num_results, tbs, filter, lang, country, location}); + + if (process.env.SERPER_API_KEY) { + return await serper_search(query, { + num_results, + tbs, + filter, + lang, + country, + location, + }); } - return await google_search( + return await googleSearch( query, advanced, num_results, @@ -49,7 +55,6 @@ export async function search({ ); } catch (error) { Logger.error(`Error in search function: ${error}`); - return [] + return []; } - // if process.env.SERPER_API_KEY is set, use serper } diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 740f48a2..2b476f52 100644 --- 
a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -49,5 +49,23 @@ export async function addScrapeJob( } } - - +export function waitForJob(jobId: string, timeout: number) { + return new Promise((resolve, reject) => { + const start = Date.now(); + const int = setInterval(async () => { + if (Date.now() >= start + timeout) { + clearInterval(int); + reject(new Error("Job wait ")); + } else { + const state = await getScrapeQueue().getJobState(jobId); + if (state === "completed") { + clearInterval(int); + resolve((await getScrapeQueue().getJob(jobId)).returnvalue); + } else if (state === "failed") { + clearInterval(int); + reject((await getScrapeQueue().getJob(jobId)).failedReason); + } + } + }, 1000); + }) +} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index ca1a4cbd..31d70a0b 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -195,6 +195,14 @@ async function processJob(job: Job, token: string) { const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; + const rawHtml = docs[0] ? docs[0].rawHtml : ""; + + if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { + if (docs[0] && docs[0].rawHtml) { + delete docs[0].rawHtml; + } + } + const data = { success, result: { @@ -211,7 +219,7 @@ async function processJob(job: Job, token: string) { }; if (job.data.mode === "crawl") { - await callWebhook(job.data.team_id, job.id as string, data); + await callWebhook(job.data.team_id, job.id as string, data, job.data.webhook, job.data.v1); } if (job.data.crawl_id) { @@ -238,15 +246,9 @@ async function processJob(job: Job, token: string) { if (!job.data.sitemapped) { if (!sc.cancelled) { const crawler = crawlToCrawler(job.data.crawl_id, sc); - let linksOnPage = []; - try{ - linksOnPage = data.docs[0]?.linksOnPage ?? []; - }catch(e){ - linksOnPage = [] - } + const links = crawler.filterLinks( - linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl)) - .filter(x => x !== null), + crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl), Infinity, sc.crawlerOptions?.maxDepth ?? 10 ) @@ -271,6 +273,7 @@ async function processJob(job: Job, token: string) { pageOptions: sc.pageOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, + v1: job.data.v1, }, {}, jobId, jobPriority); await addCrawlJob(job.data.crawl_id, newJob.id); @@ -340,7 +343,7 @@ async function processJob(job: Job, token: string) { docs: fullDocs, }; - await callWebhook(job.data.team_id, job.data.crawl_id, data); + await callWebhook(job.data.team_id, job.data.crawl_id, data, job.data.webhook, job.data.v1); } } @@ -384,7 +387,7 @@ async function processJob(job: Job, token: string) { }; if (job.data.mode === "crawl" || job.data.crawl_id) { - await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data); + await callWebhook(job.data.team_id, job.data.crawl_id ?? 
job.id as string, data, job.data.webhook, job.data.v1); } if (job.data.crawl_id) { diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts index e1c81e08..3e252301 100644 --- a/apps/api/src/services/rate-limiter.test.ts +++ b/apps/api/src/services/rate-limiter.test.ts @@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => { "test-prefix:someToken", "growth" ); - expect(limiter4.points).toBe(150); + expect(limiter4.points).toBe(250); }); it("should return the default rate limiter if plan is not provided", () => { @@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => { "crawlStatus" as RateLimiterMode, "test-prefix:someToken" ); - expect(limiter2.points).toBe(150); + expect(limiter2.points).toBe(250); }); it("should consume points correctly for 'crawl' mode", async () => { @@ -315,7 +315,7 @@ describe("Rate Limiter Service", () => { "crawlStatus" as RateLimiterMode, "test-prefix:someToken" ); - expect(limiter2.points).toBe(150); + expect(limiter2.points).toBe(250); }); it("should return the correct rate limiter for 'testSuite' mode", () => { diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 75c4d6af..1798b23a 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -65,7 +65,7 @@ const RATE_LIMITS = { }, crawlStatus: { free: 150, - default: 150, + default: 250, }, testSuite: { free: 10000, diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index b0222ea3..b60774e0 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,15 +1,16 @@ +import { legacyDocumentConverter } from "../../src/controllers/v1/types"; import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; -export const callWebhook = async (teamId: string, jobId: string,data: any) => { +export const callWebhook = async (teamId: string, jobId: string, data: any, specified?: string, v1 = false) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - let webhookUrl = selfHostedUrl; + let webhookUrl = specified ?? 
selfHostedUrl; - // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set // and the USE_DB_AUTHENTICATION environment variable is set to true - if (!selfHostedUrl && useDbAuthentication) { + if (!webhookUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") @@ -30,11 +31,15 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { let dataToSend = []; if (data.result.links && data.result.links.length !== 0) { for (let i = 0; i < data.result.links.length; i++) { - dataToSend.push({ - content: data.result.links[i].content.content, - markdown: data.result.links[i].content.markdown, - metadata: data.result.links[i].content.metadata, - }); + if (v1) { + dataToSend.push(legacyDocumentConverter(data.result.links[i].content)) + } else { + dataToSend.push({ + content: data.result.links[i].content.content, + markdown: data.result.links[i].content.markdown, + metadata: data.result.links[i].content.metadata, + }); + } } } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index f2932b5a..c57969f2 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -30,6 +30,8 @@ export interface WebScraperOptions { origin?: string; crawl_id?: string; sitemapped?: boolean; + webhook?: string; + v1?: boolean; } export interface RunWebScraperParams { @@ -105,6 +107,7 @@ export enum RateLimiterMode { Scrape = "scrape", Preview = "preview", Search = "search", + Map = "map", } @@ -113,6 +116,7 @@ export interface AuthResponse { team_id?: string; error?: string; status?: number; + api_key?: string; plan?: PlanType; } diff --git a/apps/go-sdk/examples/.gitignore b/apps/go-sdk/examples/.gitignore deleted file mode 100644 index 6f72f892..00000000 --- a/apps/go-sdk/examples/.gitignore +++ /dev/null @@ -1,25 +0,0 @@ -# If you prefer the allow list template instead of the deny list, see community template: -# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore -# -# Binaries for programs and plugins -*.exe -*.exe~ -*.dll -*.so -*.dylib - -# Test binary, built with `go test -c` -*.test - -# Output of the go coverage tool, specifically when used with LiteIDE -*.out - -# Dependency directories (remove the comment below to include it) -# vendor/ - -# Go workspace file -go.work -go.work.sum - -# env file -.env diff --git a/apps/go-sdk/examples/LICENSE b/apps/go-sdk/examples/LICENSE deleted file mode 100644 index 25800a2e..00000000 --- a/apps/go-sdk/examples/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 Mendable - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/apps/go-sdk/examples/example.go b/apps/go-sdk/examples/example.go deleted file mode 100644 index 48ba49da..00000000 --- a/apps/go-sdk/examples/example.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "log" - - "github.com/google/uuid" - "github.com/mendableai/firecrawl-go" -) - -func main() { - app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev") - if err != nil { - log.Fatalf("Failed to create FirecrawlApp: %v", err) - } - - // Scrape a website - scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil) - if err != nil { - log.Fatalf("Failed to scrape URL: %v", err) - } - fmt.Println(scrapeResult.Markdown) - - // Crawl a website - idempotencyKey := uuid.New().String() // optional idempotency key - crawlParams := map[string]any{ - "crawlerOptions": map[string]any{ - "excludes": []string{"blog/*"}, - }, - } - crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey) - if err != nil { - log.Fatalf("Failed to crawl URL: %v", err) - } - jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ") - if err != nil { - log.Fatalf("Failed to marshal crawl result: %v", err) - } - fmt.Println(string(jsonCrawlResult)) - - // LLM Extraction using JSON schema - jsonSchema := map[string]any{ - "type": "object", - "properties": map[string]any{ - "top": map[string]any{ - "type": "array", - "items": map[string]any{ - "type": "object", - "properties": map[string]any{ - "title": map[string]string{"type": "string"}, - "points": map[string]string{"type": "number"}, - "by": map[string]string{"type": "string"}, - "commentsURL": map[string]string{"type": "string"}, - }, - "required": []string{"title", "points", "by", "commentsURL"}, - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News", - }, - }, - "required": []string{"top"}, - } - - llmExtractionParams := map[string]any{ - "extractorOptions": firecrawl.ExtractorOptions{ - ExtractionSchema: jsonSchema, - Mode: "llm-extraction", - }, - "pageOptions": map[string]any{ - "onlyMainContent": true, - }, - } - - llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams) - if err != nil { - log.Fatalf("Failed to perform LLM extraction: %v", err) - } - - // Pretty print the LLM extraction result - jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", " ") - if err != nil { - log.Fatalf("Failed to marshal LLM extraction result: %v", err) - } - fmt.Println(string(jsonResult)) -} diff --git a/apps/go-sdk/examples/go.mod b/apps/go-sdk/examples/go.mod deleted file mode 100644 index 3ea9b92f..00000000 --- a/apps/go-sdk/examples/go.mod +++ /dev/null @@ -1,9 +0,0 @@ -module github.com/mendableai/firecrawl-go-examples - -go 1.22.5 - -replace github.com/mendableai/firecrawl => ../ - -require github.com/google/uuid v1.6.0 - -require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect diff --git a/apps/go-sdk/examples/go.sum b/apps/go-sdk/examples/go.sum deleted file mode 100644 index 760ca553..00000000 --- a/apps/go-sdk/examples/go.sum +++ /dev/null @@ -1,14 +0,0 @@ -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= -github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= -github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs= -github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/apps/go-sdk/firecrawl/.env.example b/apps/go-sdk/firecrawl/.env.example deleted file mode 100644 index 772a6243..00000000 --- a/apps/go-sdk/firecrawl/.env.example +++ /dev/null @@ -1,2 +0,0 @@ -API_URL=http://localhost:3002 -TEST_API_KEY=fc-YOUR-API-KEY diff --git a/apps/go-sdk/firecrawl/.gitignore b/apps/go-sdk/firecrawl/.gitignore deleted file mode 100644 index db27dc80..00000000 --- a/apps/go-sdk/firecrawl/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.env -vendor \ No newline at end of file diff --git a/apps/go-sdk/firecrawl/LICENSE b/apps/go-sdk/firecrawl/LICENSE deleted file mode 100644 index 2635155f..00000000 --- a/apps/go-sdk/firecrawl/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 Sideguide Technologies Inc. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/apps/go-sdk/firecrawl/README.md b/apps/go-sdk/firecrawl/README.md deleted file mode 100644 index 7e17c10f..00000000 --- a/apps/go-sdk/firecrawl/README.md +++ /dev/null @@ -1,189 +0,0 @@ -# Firecrawl Go SDK - -The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. - -## Installation - -To install the Firecrawl Go SDK, you can - -```bash -go get github.com/mendableai/firecrawl -``` - -## Usage - -1. 
Get an API key from [firecrawl.dev](https://firecrawl.dev) -2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - - -Here's an example of how to use the SDK with error handling: - -```go -import ( - "fmt" - "log" - - "github.com/mendableai/firecrawl/firecrawl" -) - -func main() { - // Initialize the FirecrawlApp with your API key - app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY") - if err != nil { - log.Fatalf("Failed to initialize FirecrawlApp: %v", err) - } - - // Scrape a single URL - url := "https://mendable.ai" - scrapedData, err := app.ScrapeURL(url, nil) - if err != nil { - log.Fatalf("Error occurred while scraping: %v", err) - } - fmt.Println(scrapedData) - - // Crawl a website - crawlUrl := "https://mendable.ai" - params := map[string]any{ - "pageOptions": map[string]any{ - "onlyMainContent": true, - }, - } - - crawlResult, err := app.CrawlURL(crawlUrl, params) - if err != nil { - log.Fatalf("Error occurred while crawling: %v", err) - } - fmt.Println(crawlResult) -} -``` - -### Scraping a URL - -To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data as a dictionary. - -```go -url := "https://mendable.ai" -scrapedData, err := app.ScrapeURL(url, nil) -if err != nil { - log.Fatalf("Failed to scrape URL: %v", err) -} -fmt.Println(scrapedData) -``` - -### Extracting structured data from a URL - -With LLM extraction, you can easily extract structured data from any URL. Here is how you to use it: - -```go -jsonSchema := map[string]any{ - "type": "object", - "properties": map[string]any{ - "top": map[string]any{ - "type": "array", - "items": map[string]any{ - "type": "object", - "properties": map[string]any{ - "title": map[string]string{"type": "string"}, - "points": map[string]string{"type": "number"}, - "by": map[string]string{"type": "string"}, - "commentsURL": map[string]string{"type": "string"}, - }, - "required": []string{"title", "points", "by", "commentsURL"}, - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News", - }, - }, - "required": []string{"top"}, -} - -llmExtractionParams := map[string]any{ - "extractorOptions": firecrawl.ExtractorOptions{ - ExtractionSchema: jsonSchema, - }, -} - -scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams) -if err != nil { - log.Fatalf("Failed to perform LLM extraction: %v", err) -} -fmt.Println(scrapeResult) -``` - -### Search for a query - -To search the web, get the most relevant results, scrap each page and return the markdown, use the `Search` method. The method takes the query as a parameter and returns the search results. - - -```go -query := "what is mendable?" -searchResult, err := app.Search(query) -if err != nil { - log.Fatalf("Failed to search: %v", err) -} -fmt.Println(searchResult) -``` - -### Crawling a Website - -To crawl a website, use the `CrawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. 
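
For reference alongside the removed Go snippet above, here is a rough TypeScript sketch of the same kind of crawl against the v1 JS SDK surface that appears later in this diff (`crawlUrl` with `excludePaths`/`limit`, `checkCrawlStatus`, `jobId` indexing). It is illustrative only, not the SDK's documented API:

```ts
// Illustrative only: the names used here (crawlUrl, excludePaths, limit,
// checkCrawlStatus, jobId) are taken from the updated example.ts later in this
// diff, not from the Go SDK documentation being removed in this hunk.
import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: 'fc-YOUR_API_KEY' });

// Submit the crawl without waiting for completion (third argument false).
const crawl = await app.crawlUrl('https://mendable.ai', {
  crawlerOptions: { excludePaths: ['blog/*'], limit: 5 },
}, false);

const jobId: string = crawl['jobId'];
console.log(await app.checkCrawlStatus(jobId));
```
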
- -```go -crawlParams := map[string]any{ - "crawlerOptions": map[string]any{ - "excludes": []string{"blog/*"}, - "includes": []string{}, // leave empty for all pages - "limit": 1000, - }, - "pageOptions": map[string]any{ - "onlyMainContent": true, - }, -} -crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey) -if err != nil { - log.Fatalf("Failed to crawl URL: %v", err) -} -fmt.Println(crawlResult) -``` - -### Checking Crawl Status - -To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job. - -```go -status, err := app.CheckCrawlStatus(jobId) -if err != nil { - log.Fatalf("Failed to check crawl status: %v", err) -} -fmt.Println(status) -``` - -### Canceling a Crawl Job -To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job. - -```go -canceled, err := app.CancelCrawlJob(jobId) -if err != nil { - log.Fatalf("Failed to cancel crawl job: %v", err) -} -fmt.Println(canceled) -``` - -## Error Handling - -The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. - -## Contributing - -Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. - -## License - -The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions: - -- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details. diff --git a/apps/go-sdk/firecrawl/firecrawl.go b/apps/go-sdk/firecrawl/firecrawl.go deleted file mode 100644 index 9a9dcfef..00000000 --- a/apps/go-sdk/firecrawl/firecrawl.go +++ /dev/null @@ -1,584 +0,0 @@ -// Package firecrawl provides a client for interacting with the Firecrawl API. 
-package firecrawl - -import ( - "bytes" - "encoding/json" - "fmt" - "io" - "math" - "net/http" - "os" - "time" -) - -// FirecrawlDocumentMetadata represents metadata for a Firecrawl document -type FirecrawlDocumentMetadata struct { - Title string `json:"title,omitempty"` - Description string `json:"description,omitempty"` - Language string `json:"language,omitempty"` - Keywords string `json:"keywords,omitempty"` - Robots string `json:"robots,omitempty"` - OGTitle string `json:"ogTitle,omitempty"` - OGDescription string `json:"ogDescription,omitempty"` - OGURL string `json:"ogUrl,omitempty"` - OGImage string `json:"ogImage,omitempty"` - OGAudio string `json:"ogAudio,omitempty"` - OGDeterminer string `json:"ogDeterminer,omitempty"` - OGLocale string `json:"ogLocale,omitempty"` - OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"` - OGSiteName string `json:"ogSiteName,omitempty"` - OGVideo string `json:"ogVideo,omitempty"` - DCTermsCreated string `json:"dctermsCreated,omitempty"` - DCDateCreated string `json:"dcDateCreated,omitempty"` - DCDate string `json:"dcDate,omitempty"` - DCTermsType string `json:"dctermsType,omitempty"` - DCType string `json:"dcType,omitempty"` - DCTermsAudience string `json:"dctermsAudience,omitempty"` - DCTermsSubject string `json:"dctermsSubject,omitempty"` - DCSubject string `json:"dcSubject,omitempty"` - DCDescription string `json:"dcDescription,omitempty"` - DCTermsKeywords string `json:"dctermsKeywords,omitempty"` - ModifiedTime string `json:"modifiedTime,omitempty"` - PublishedTime string `json:"publishedTime,omitempty"` - ArticleTag string `json:"articleTag,omitempty"` - ArticleSection string `json:"articleSection,omitempty"` - SourceURL string `json:"sourceURL,omitempty"` - PageStatusCode int `json:"pageStatusCode,omitempty"` - PageError string `json:"pageError,omitempty"` -} - -// FirecrawlDocument represents a document in Firecrawl -type FirecrawlDocument struct { - ID string `json:"id,omitempty"` - URL string `json:"url,omitempty"` - Content string `json:"content"` - Markdown string `json:"markdown,omitempty"` - HTML string `json:"html,omitempty"` - LLMExtraction map[string]any `json:"llm_extraction,omitempty"` - CreatedAt *time.Time `json:"createdAt,omitempty"` - UpdatedAt *time.Time `json:"updatedAt,omitempty"` - Type string `json:"type,omitempty"` - Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` - ChildrenLinks []string `json:"childrenLinks,omitempty"` - Provider string `json:"provider,omitempty"` - Warning string `json:"warning,omitempty"` - Index int `json:"index,omitempty"` -} - -// ExtractorOptions represents options for extraction. 
-type ExtractorOptions struct { - Mode string `json:"mode,omitempty"` - ExtractionPrompt string `json:"extractionPrompt,omitempty"` - ExtractionSchema any `json:"extractionSchema,omitempty"` -} - -// ScrapeResponse represents the response for scraping operations -type ScrapeResponse struct { - Success bool `json:"success"` - Data *FirecrawlDocument `json:"data,omitempty"` -} - -// SearchResponse represents the response for searching operations -type SearchResponse struct { - Success bool `json:"success"` - Data []*FirecrawlDocument `json:"data,omitempty"` -} - -// CrawlResponse represents the response for crawling operations -type CrawlResponse struct { - Success bool `json:"success"` - JobID string `json:"jobId,omitempty"` - Data []*FirecrawlDocument `json:"data,omitempty"` -} - -// JobStatusResponse represents the response for checking crawl job status -type JobStatusResponse struct { - Success bool `json:"success"` - Status string `json:"status"` - Current int `json:"current,omitempty"` - CurrentURL string `json:"current_url,omitempty"` - CurrentStep string `json:"current_step,omitempty"` - Total int `json:"total,omitempty"` - JobID string `json:"jobId,omitempty"` - Data []*FirecrawlDocument `json:"data,omitempty"` - PartialData []*FirecrawlDocument `json:"partial_data,omitempty"` -} - -// CancelCrawlJobResponse represents the response for canceling a crawl job -type CancelCrawlJobResponse struct { - Success bool `json:"success"` - Status string `json:"status"` -} - -// requestOptions represents options for making requests. -type requestOptions struct { - retries int - backoff int -} - -// requestOption is a functional option type for requestOptions. -type requestOption func(*requestOptions) - -// newRequestOptions creates a new requestOptions instance with the provided options. -// -// Parameters: -// - opts: Optional request options. -// -// Returns: -// - *requestOptions: A new instance of requestOptions with the provided options. -func newRequestOptions(opts ...requestOption) *requestOptions { - options := &requestOptions{retries: 1} - for _, opt := range opts { - opt(options) - } - return options -} - -// withRetries sets the number of retries for a request. -// -// Parameters: -// - retries: The number of retries to be performed. -// -// Returns: -// - requestOption: A functional option that sets the number of retries for a request. -func withRetries(retries int) requestOption { - return func(opts *requestOptions) { - opts.retries = retries - } -} - -// withBackoff sets the backoff interval for a request. -// -// Parameters: -// - backoff: The backoff interval (in milliseconds) to be used for retries. -// -// Returns: -// - requestOption: A functional option that sets the backoff interval for a request. -func withBackoff(backoff int) requestOption { - return func(opts *requestOptions) { - opts.backoff = backoff - } -} - -// FirecrawlApp represents a client for the Firecrawl API. -type FirecrawlApp struct { - APIKey string - APIURL string - Client *http.Client -} - -// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL. -// If the API key or API URL is not provided, it attempts to retrieve them from environment variables. -// If the API key is still not found, it returns an error. -// -// Parameters: -// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable. -// - apiURL: The base URL for the Firecrawl API. 
If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev". -// -// Returns: -// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL. -// - error: An error if the API key is not provided or retrieved. -func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) { - if apiKey == "" { - apiKey = os.Getenv("FIRECRAWL_API_KEY") - if apiKey == "" { - return nil, fmt.Errorf("no API key provided") - } - } - - if apiURL == "" { - apiURL = os.Getenv("FIRECRAWL_API_URL") - if apiURL == "" { - apiURL = "https://api.firecrawl.dev" - } - } - - client := &http.Client{ - Timeout: 60 * time.Second, - } - - return &FirecrawlApp{ - APIKey: apiKey, - APIURL: apiURL, - Client: client, - }, nil -} - -// ScrapeURL scrapes the content of the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to be scraped. -// - params: Optional parameters for the scrape request, including extractor options for LLM extraction. -// -// Returns: -// - *FirecrawlDocument: The scraped document data. -// - error: An error if the scrape request fails. -func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) { - headers := app.prepareHeaders("") - scrapeBody := map[string]any{"url": url} - - if params != nil { - if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok { - if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok { - extractorOptions.ExtractionSchema = schema.schema() - } - if extractorOptions.Mode == "" { - extractorOptions.Mode = "llm-extraction" - } - scrapeBody["extractorOptions"] = extractorOptions - } - - for key, value := range params { - if key != "extractorOptions" { - scrapeBody[key] = value - } - } - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v0/scrape", app.APIURL), - scrapeBody, - headers, - "scrape URL", - ) - if err != nil { - return nil, err - } - - var scrapeResponse ScrapeResponse - err = json.Unmarshal(resp, &scrapeResponse) - if err != nil { - return nil, err - } - - if scrapeResponse.Success { - return scrapeResponse.Data, nil - } - - return nil, fmt.Errorf("failed to scrape URL") -} - -// Search performs a search query using the Firecrawl API and returns the search results. -// -// Parameters: -// - query: The search query string. -// - params: Optional parameters for the search request. -// -// Returns: -// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results. -// - error: An error if the search request fails. -func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) { - headers := app.prepareHeaders("") - searchBody := map[string]any{"query": query} - for k, v := range params { - searchBody[k] = v - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v0/search", app.APIURL), - searchBody, - headers, - "search", - ) - if err != nil { - return nil, err - } - - var searchResponse SearchResponse - err = json.Unmarshal(resp, &searchResponse) - if err != nil { - return nil, err - } - - if searchResponse.Success { - return searchResponse.Data, nil - } - - return nil, fmt.Errorf("failed to search") -} - -// CrawlURL starts a crawl job for the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to crawl. -// - params: Optional parameters for the crawl request. 
-// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning. -// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true. -// - idempotencyKey: An optional idempotency key to ensure the request is idempotent. -// -// Returns: -// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true. -// - error: An error if the crawl request fails. -func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) { - headers := app.prepareHeaders(idempotencyKey) - crawlBody := map[string]any{"url": url} - for k, v := range params { - crawlBody[k] = v - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v0/crawl", app.APIURL), - crawlBody, - headers, - "start crawl job", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var crawlResponse CrawlResponse - err = json.Unmarshal(resp, &crawlResponse) - if err != nil { - return nil, err - } - - if waitUntilDone { - return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval) - } - - if crawlResponse.JobID == "" { - return nil, fmt.Errorf("failed to get job ID") - } - - return crawlResponse.JobID, nil -} - -// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API. -// -// Parameters: -// - jobID: The ID of the crawl job to check. -// -// Returns: -// - *JobStatusResponse: The status of the crawl job. -// - error: An error if the crawl status check request fails. -func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) { - headers := app.prepareHeaders("") - resp, err := app.makeRequest( - http.MethodGet, - fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID), - nil, - headers, - "check crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var jobStatusResponse JobStatusResponse - err = json.Unmarshal(resp, &jobStatusResponse) - if err != nil { - return nil, err - } - - return &jobStatusResponse, nil -} - -// CancelCrawlJob cancels a crawl job using the Firecrawl API. -// -// Parameters: -// - jobID: The ID of the crawl job to cancel. -// -// Returns: -// - string: The status of the crawl job after cancellation. -// - error: An error if the crawl job cancellation request fails. -func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) { - headers := app.prepareHeaders("") - resp, err := app.makeRequest( - http.MethodDelete, - fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID), - nil, - headers, - "cancel crawl job", - ) - if err != nil { - return "", err - } - - var cancelCrawlJobResponse CancelCrawlJobResponse - err = json.Unmarshal(resp, &cancelCrawlJobResponse) - if err != nil { - return "", err - } - - return cancelCrawlJobResponse.Status, nil -} - -// prepareHeaders prepares the headers for an HTTP request. -// -// Parameters: -// - idempotencyKey: A string representing the idempotency key to be included in the headers. -// If the idempotency key is an empty string, it will not be included in the headers. -// -// Returns: -// - map[string]string: A map containing the headers for the HTTP request. 
-func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string { - headers := map[string]string{ - "Content-Type": "application/json", - "Authorization": fmt.Sprintf("Bearer %s", app.APIKey), - } - if idempotencyKey != "" { - headers["x-idempotency-key"] = idempotencyKey - } - return headers -} - -// makeRequest makes a request to the specified URL with the provided method, data, headers, and options. -// -// Parameters: -// - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE"). -// - url: The URL to send the request to. -// - data: The data to be sent in the request body. -// - headers: The headers to be included in the request. -// - action: A string describing the action being performed. -// - opts: Optional request options. -// -// Returns: -// - []byte: The response body from the request. -// - error: An error if the request fails. -func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) { - var body []byte - var err error - if data != nil { - body, err = json.Marshal(data) - if err != nil { - return nil, err - } - } - - req, err := http.NewRequest(method, url, bytes.NewBuffer(body)) - if err != nil { - return nil, err - } - - for key, value := range headers { - req.Header.Set(key, value) - } - - var resp *http.Response - options := newRequestOptions(opts...) - for i := 0; i < options.retries; i++ { - resp, err = app.Client.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - if resp.StatusCode != 502 { - break - } - - time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond) - } - - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return nil, err - } - - statusCode := resp.StatusCode - if statusCode != 200 { - return nil, app.handleError(statusCode, respBody, action) - } - - return respBody, nil -} - -// monitorJobStatus monitors the status of a crawl job using the Firecrawl API. -// -// Parameters: -// - jobID: The ID of the crawl job to monitor. -// - headers: The headers to be included in the request. -// - pollInterval: The interval (in seconds) at which to poll the job status. -// -// Returns: -// - []*FirecrawlDocument: The crawl result if the job is completed. -// - error: An error if the crawl status check request fails. -func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) { - attempts := 0 - for { - resp, err := app.makeRequest( - http.MethodGet, - fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID), - nil, - headers, - "check crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var statusData JobStatusResponse - err = json.Unmarshal(resp, &statusData) - if err != nil { - return nil, err - } - - status := statusData.Status - if status == "" { - return nil, fmt.Errorf("invalid status in response") - } - - if status == "completed" { - if statusData.Data != nil { - return statusData.Data, nil - } - attempts++ - if attempts > 3 { - return nil, fmt.Errorf("crawl job completed but no data was returned") - } - } else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" { - pollInterval = max(pollInterval, 2) - time.Sleep(time.Duration(pollInterval) * time.Second) - } else { - return nil, fmt.Errorf("crawl job failed or was stopped. 
Status: %s", status) - } - } -} - -// handleError handles errors returned by the Firecrawl API. -// -// Parameters: -// - resp: The HTTP response object. -// - body: The response body from the HTTP response. -// - action: A string describing the action being performed. -// -// Returns: -// - error: An error describing the failure reason. -func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error { - var errorData map[string]any - err := json.Unmarshal(body, &errorData) - if err != nil { - return fmt.Errorf("failed to parse error response: %v", err) - } - - errorMessage, _ := errorData["error"].(string) - if errorMessage == "" { - errorMessage = "No additional error details provided." - } - - var message string - switch statusCode { - case 402: - message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage) - case 408: - message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage) - case 409: - message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage) - case 500: - message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage) - default: - message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage) - } - - return fmt.Errorf(message) -} diff --git a/apps/go-sdk/firecrawl/firecrawl_test.go b/apps/go-sdk/firecrawl/firecrawl_test.go deleted file mode 100644 index 9d56c7ac..00000000 --- a/apps/go-sdk/firecrawl/firecrawl_test.go +++ /dev/null @@ -1,292 +0,0 @@ -package firecrawl - -import ( - "log" - "os" - "testing" - "time" - - "github.com/google/uuid" - "github.com/joho/godotenv" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var API_URL string -var TEST_API_KEY string - -func init() { - err := godotenv.Load("../.env") - if err != nil { - log.Fatalf("Error loading .env file: %v", err) - } - API_URL = os.Getenv("API_URL") - TEST_API_KEY = os.Getenv("TEST_API_KEY") -} - -func TestNoAPIKey(t *testing.T) { - _, err := NewFirecrawlApp("", API_URL) - assert.Error(t, err) - assert.Contains(t, err.Error(), "no API key provided") -} - -func TestScrapeURLInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL) - require.NoError(t, err) - - _, err = app.ScrapeURL("https://firecrawl.dev", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") -} - -func TestBlocklistedURL(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) - require.NoError(t, err) - - _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. 
Firecrawl currently does not support social media scraping due to policy restrictions.") -} - -func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { - app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL) - require.NoError(t, err) - - response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) - require.NoError(t, err) - assert.NotNil(t, response) - - assert.Contains(t, response.Content, "_Roast_") -} - -func TestScrapeURLE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) - require.NoError(t, err) - - response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) - require.NoError(t, err) - assert.NotNil(t, response) - - assert.Contains(t, response.Content, "_Roast_") - assert.NotEqual(t, response.Markdown, "") - assert.NotNil(t, response.Metadata) - assert.Equal(t, response.HTML, "") -} - -func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) - require.NoError(t, err) - - params := map[string]any{ - "pageOptions": map[string]any{ - "includeHtml": true, - }, - } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) - require.NoError(t, err) - assert.NotNil(t, response) - - assert.Contains(t, response.Content, "_Roast_") - assert.Contains(t, response.Markdown, "_Roast_") - assert.Contains(t, response.HTML, " setTimeout(resolve, 1000)); // wait 1 second } -console.log(job.data[0].content); - -// Search for a query: -const query = 'what is mendable?' -const searchResult = await app.search(query) -console.log(searchResult) - -// LLM Extraction: -// Define schema to extract contents into using zod schema -const zodSchema = z.object({ - top: z - .array( - z.object({ - title: z.string(), - points: z.number(), - by: z.string(), - commentsURL: z.string(), - }) - ) - .length(5) - .describe("Top 5 stories on Hacker News"), -}); - -let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: zodSchema }, -}); - -console.log(llmExtractionResult.data.llm_extraction); - -// Define schema to extract contents into using json schema -const jsonSchema = { - "type": "object", - "properties": { - "top": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "points": {"type": "number"}, - "by": {"type": "string"}, - "commentsURL": {"type": "string"} - }, - "required": ["title", "points", "by", "commentsURL"] - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News" - } - }, - "required": ["top"] +if (job.data) { + console.log(job.data[0].markdown); } -llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: jsonSchema }, -}); - -console.log(llmExtractionResult.data.llm_extraction); \ No newline at end of file +const mapResult = await app.map('https://firecrawl.dev'); +console.log(mapResult) diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts index f314c080..9a4d840c 100644 --- a/apps/js-sdk/example.ts +++ b/apps/js-sdk/example.ts @@ -1,25 +1,25 @@ -import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js'; -import { z } from "zod"; +import FirecrawlApp, { ScrapeResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js'; +import { CrawlStatusResponse } from './firecrawl/src/index'; const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); // Scrape a website: const scrapeResult = await app.scrapeUrl('firecrawl.dev'); 
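
The v1 example below logs `scrapeResult.markdown` directly; a slightly more defensive sketch of the same call, assuming nothing beyond the `scrapeUrl`/`markdown` shape used in this hunk, could look like this:

```ts
// Sketch only: wraps the v1 scrapeUrl call from example.ts in basic error
// handling. Assumes the shape shown in this diff (scrapeUrl resolving to an
// object with an optional `markdown` field); other fields are not assumed.
import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: 'fc-YOUR_API_KEY' });

try {
  const scrapeResult = await app.scrapeUrl('https://firecrawl.dev');
  if (scrapeResult && scrapeResult.markdown) {
    console.log(scrapeResult.markdown);
  } else {
    console.warn('Scrape returned no markdown');
  }
} catch (err) {
  console.error('Scrape failed:', err);
}
```
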
-if (scrapeResult.data) { - console.log(scrapeResult.data.content) +if (scrapeResult) { + console.log(scrapeResult.markdown) } // Crawl a website: -const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); +const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludePaths: ['blog/*'], limit: 5}}, false); console.log(crawlResult) const jobId: string = await crawlResult['jobId']; console.log(jobId); -let job: JobStatusResponse; +let job: CrawlStatusResponse; while (true) { - job = await app.checkCrawlStatus(jobId); + job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse; if (job.status === 'completed') { break; } @@ -27,66 +27,8 @@ while (true) { } if (job.data) { - console.log(job.data[0].content); -} - -// Search for a query: -const query = 'what is mendable?' -const searchResult = await app.search(query) - -// LLM Extraction: -// Define schema to extract contents into using zod schema -const zodSchema = z.object({ - top: z - .array( - z.object({ - title: z.string(), - points: z.number(), - by: z.string(), - commentsURL: z.string(), - }) - ) - .length(5) - .describe("Top 5 stories on Hacker News"), -}); - -let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: zodSchema }, -}); - -if (llmExtractionResult.data) { - console.log(llmExtractionResult.data.llm_extraction); -} - -// Define schema to extract contents into using json schema -const jsonSchema = { - "type": "object", - "properties": { - "top": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "points": {"type": "number"}, - "by": {"type": "string"}, - "commentsURL": {"type": "string"} - }, - "required": ["title", "points", "by", "commentsURL"] - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News" - } - }, - "required": ["top"] -} - -llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: jsonSchema }, -}); - -if (llmExtractionResult.data) { - console.log(llmExtractionResult.data.llm_extraction); + console.log(job.data[0].markdown); } +const mapResult = await app.mapUrl('https://firecrawl.dev'); +console.log(mapResult) diff --git a/apps/js-sdk/exampleV0.js b/apps/js-sdk/exampleV0.js new file mode 100644 index 00000000..7f198598 --- /dev/null +++ b/apps/js-sdk/exampleV0.js @@ -0,0 +1,85 @@ +import { v4 as uuidv4 } from 'uuid'; +import FirecrawlApp from '@mendable/firecrawl-js'; +import { z } from "zod"; + +const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); + +// Scrape a website: +const scrapeResult = await app.scrapeUrl('firecrawl.dev'); +console.log(scrapeResult.data.content) + +// Crawl a website: +const idempotencyKey = uuidv4(); // optional +const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey); +console.log(crawlResult) + +const jobId = await crawlResult['jobId']; +console.log(jobId); + +let job; +while (true) { + job = await app.checkCrawlStatus(jobId); + if (job.status == 'completed') { + break; + } + await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second +} + +console.log(job.data[0].content); + +// Search for a query: +const query = 'what is mendable?' 
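
The example files above and below poll `checkCrawlStatus` in an unbounded `while (true)` loop. A bounded variant, loosely mirroring the timeout in the `waitForJob` helper added to queue-jobs.ts earlier in this diff, is sketched here; the type name, the timeout value, and the `'failed'` status check are assumptions drawn from other hunks in this diff rather than guarantees:

```ts
// Sketch only: bounded polling of checkCrawlStatus instead of looping forever.
import FirecrawlApp, { CrawlStatusResponse } from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: 'fc-YOUR_API_KEY' });

async function crawlWithTimeout(url: string, timeoutMs = 120_000) {
  // Submit the crawl without waiting (third argument false, as in example.ts).
  const crawl = await app.crawlUrl(url, { crawlerOptions: { limit: 5 } }, false);
  const jobId: string = crawl['jobId'];
  const start = Date.now();

  while (Date.now() - start < timeoutMs) {
    const job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
    if (job.status === 'completed') {
      return job.data;
    }
    if (job.status === 'failed') {
      // 'failed' as a terminal state is assumed by analogy with waitForJob.
      throw new Error(`Crawl ${jobId} failed`);
    }
    await new Promise((resolve) => setTimeout(resolve, 1000)); // wait 1 second
  }
  throw new Error(`Crawl ${jobId} did not complete within ${timeoutMs} ms`);
}

console.log(await crawlWithTimeout('https://mendable.ai'));
```
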
+const searchResult = await app.search(query) +console.log(searchResult) + +// LLM Extraction: +// Define schema to extract contents into using zod schema +const zodSchema = z.object({ + top: z + .array( + z.object({ + title: z.string(), + points: z.number(), + by: z.string(), + commentsURL: z.string(), + }) + ) + .length(5) + .describe("Top 5 stories on Hacker News"), +}); + +let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: zodSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); + +// Define schema to extract contents into using json schema +const jsonSchema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +} + +llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: jsonSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); \ No newline at end of file diff --git a/apps/js-sdk/exampleV0.ts b/apps/js-sdk/exampleV0.ts new file mode 100644 index 00000000..cecaaf24 --- /dev/null +++ b/apps/js-sdk/exampleV0.ts @@ -0,0 +1,93 @@ +import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js'; +import { z } from "zod"; + +const app = new FirecrawlApp<"v0">({apiKey: "fc-YOUR_API_KEY", version: "v0"}) + +// Scrape a website: +const scrapeResult = await app.scrapeUrl('firecrawl.dev'); + +if (scrapeResult.data) { + console.log(scrapeResult.data.content) +} + +// Crawl a website: +const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); +console.log(crawlResult) + +const jobId: string = await crawlResult['jobId']; +console.log(jobId); + +let job: CrawlStatusResponseV0; +while (true) { + job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0; + if (job.status === 'completed') { + break; + } + await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second +} + +if (job.data) { + console.log(job.data[0].content); +} + +// Search for a query: +const query = 'what is mendable?' 
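
To summarize the response-shape change these example files illustrate, a side-by-side sketch using only the shapes shown in example.ts and exampleV0.ts in this diff:

```ts
// Sketch only: v0 vs v1 handling of the same scrape. The generic parameter,
// `version` option, and response fields are taken from example.ts and
// exampleV0.ts in this diff; nothing else about the typings is assumed.
import FirecrawlApp from '@mendable/firecrawl-js';

// v0: content lives under `data.content`, as in exampleV0.ts.
const v0App = new FirecrawlApp<'v0'>({ apiKey: 'fc-YOUR_API_KEY', version: 'v0' });
const v0Result = await v0App.scrapeUrl('https://firecrawl.dev');
if (v0Result.data) {
  console.log(v0Result.data.content);
}

// v1: markdown is returned directly on the result, as in example.ts.
const v1App = new FirecrawlApp({ apiKey: 'fc-YOUR_API_KEY' });
const v1Result = await v1App.scrapeUrl('https://firecrawl.dev');
if (v1Result) {
  console.log(v1Result.markdown);
}
```
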
+const searchResult = await app.search(query) as SearchResponseV0; +if (searchResult.data) { + console.log(searchResult.data[0].content) +} + +// LLM Extraction: +// Define schema to extract contents into using zod schema +const zodSchema = z.object({ + top: z + .array( + z.object({ + title: z.string(), + points: z.number(), + by: z.string(), + commentsURL: z.string(), + }) + ) + .length(5) + .describe("Top 5 stories on Hacker News"), +}); + +let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com"); + +if (llmExtractionResult.data) { + console.log(llmExtractionResult.data[0].llm_extraction); +} + +// Define schema to extract contents into using json schema +const jsonSchema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +} + +llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: jsonSchema }, +}); + +if (llmExtractionResult.data) { + console.log(llmExtractionResult.data[0].llm_extraction); +} + diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 4b857b65..58b125e4 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.36", + "version": "1.0.0", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", @@ -19,7 +19,7 @@ "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", - "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" + "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/v1/**/*.test.ts" }, "repository": { "type": "git", diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index ad917de4..d71688b5 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -1,160 +1,331 @@ -import FirecrawlApp from '../../index'; -import { v4 as uuidv4 } from 'uuid'; -import dotenv from 'dotenv'; -import { describe, test, expect } from '@jest/globals'; +import FirecrawlApp, { + CrawlResponseV0, + CrawlStatusResponse, + CrawlStatusResponseV0, + FirecrawlDocumentV0, + ScrapeResponseV0, + SearchResponseV0, +} from "../../index"; +import { v4 as uuidv4 } from "uuid"; +import dotenv from "dotenv"; +import { describe, test, expect } from "@jest/globals"; dotenv.config(); const TEST_API_KEY = process.env.TEST_API_KEY; const API_URL = "http://127.0.0.1:3002"; -describe('FirecrawlApp E2E Tests', () => { - test.concurrent('should throw error for no API key', async () => { +describe('FirecrawlApp<"v0"> E2E Tests', () => { + test.concurrent("should throw error for no API 
key", async () => { expect(() => { - new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + new FirecrawlApp<"v0">({ apiKey: null, apiUrl: API_URL, version: "v0" }); }).toThrow("No API key provided"); }); - test.concurrent('should throw error for invalid API key on scrape', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); - }); - - test.concurrent('should throw error for blocklisted URL on scrape', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const blocklistedUrl = "https://facebook.com/fake-test"; - await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); - }); - - test.concurrent('should return successful response with valid preview token', async () => { - const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain("_Roast_"); - }, 30000); // 30 seconds timeout - - test.concurrent('should return successful response for valid scrape', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain("_Roast_"); - expect(response.data).toHaveProperty('markdown'); - expect(response.data).toHaveProperty('metadata'); - expect(response.data).not.toHaveProperty('html'); - }, 30000); // 30 seconds timeout - - test.concurrent('should return successful response with valid API key and include HTML', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain("_Roast_"); - expect(response.data?.markdown).toContain("_Roast_"); - expect(response.data?.html).toContain(" { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - }, 30000); // 30 seconds timeout - - test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - }, 30000); // 30 seconds timeout - - test.concurrent('should throw error for invalid API key on crawl', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); - }); - - test.concurrent('should throw error for blocklisted URL on crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const blocklistedUrl = "https://twitter.com/fake-test"; 
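// A minimal usage sketch of the version-pinned client these tests exercise, assuming only
// the constructor options (`apiKey`, `apiUrl`, `version`) and the response types introduced
// in this changeset; the API key and target URL are placeholders.
import FirecrawlApp, { ScrapeResponse, ScrapeResponseV0 } from "@mendable/firecrawl-js";

async function scrapeWithBothVersions(): Promise<void> {
  // Pinning to v0 keeps the legacy envelope, where the document sits under `data`.
  const v0App = new FirecrawlApp<"v0">({ apiKey: "fc-YOUR_API_KEY", version: "v0" });
  const v0Result = (await v0App.scrapeUrl("https://firecrawl.dev")) as ScrapeResponseV0;
  console.log(v0Result.data?.markdown);

  // Omitting `version` defaults to v1, where document fields are flattened onto the response.
  const v1App = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });
  const v1Result = (await v1App.scrapeUrl("https://firecrawl.dev")) as ScrapeResponse;
  console.log(v1Result.markdown);
}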
- await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); - }); - - test.concurrent('should return successful response for crawl and wait for completion', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30); - expect(response).not.toBeNull(); - expect(response[0].content).toContain("_Roast_"); - }, 60000); // 60 seconds timeout - - test.concurrent('should handle idempotency key for crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const uniqueIdempotencyKey = uuidv4(); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey); - expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); - - await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); - }); - - test.concurrent('should check crawl status', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false); - expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); - - let statusResponse = await app.checkCrawlStatus(response.jobId); - const maxChecks = 15; - let checks = 0; - - while (statusResponse.status === 'active' && checks < maxChecks) { - await new Promise(resolve => setTimeout(resolve, 1000)); - expect(statusResponse.partial_data).not.toBeNull(); - expect(statusResponse.current).toBeGreaterThanOrEqual(1); - statusResponse = await app.checkCrawlStatus(response.jobId); - checks++; + test.concurrent( + "should throw error for invalid API key on scrape", + async () => { + const invalidApp = new FirecrawlApp<"v0">({ + apiKey: "invalid_api_key", + apiUrl: API_URL, + version: "v0", + }); + await expect( + invalidApp.scrapeUrl("https://roastmywebsite.ai") + ).rejects.toThrow("Request failed with status code 401"); } + ); - expect(statusResponse).not.toBeNull(); - expect(statusResponse.success).toBe(true); - expect(statusResponse.status).toBe('completed'); - expect(statusResponse.total).toEqual(statusResponse.current); - expect(statusResponse.current_step).not.toBeNull(); - expect(statusResponse?.data?.length).toBeGreaterThan(0); - }, 35000); // 35 seconds timeout + test.concurrent( + "should throw error for blocklisted URL on scrape", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow( + "Request failed with status code 403" + ); + } + ); - test.concurrent('should return successful response for search', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.search("test query"); + test.concurrent( + "should return successful response with valid preview token", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: "this_is_just_a_preview_token", + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://roastmywebsite.ai" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + 
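// A minimal sketch of the asynchronous crawl flow covered by these tests on v1, assuming
// only the crawlUrl/checkCrawlStatus behaviour introduced in this changeset (waitUntilDone
// set to false returns { success, id }; polled status reads "scraping" until it becomes
// "completed" or "failed"). The API key, target URL, and poll delay are placeholders.
import FirecrawlApp, { CrawlResponse, CrawlStatusResponse } from "@mendable/firecrawl-js";

async function crawlAndPoll(): Promise<CrawlStatusResponse> {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" }); // v1 is the default
  const started = (await app.crawlUrl("https://firecrawl.dev", {}, false)) as CrawlResponse;
  if (!started.success || !started.id) {
    throw new Error(started.error ?? "Failed to start crawl");
  }

  let status = (await app.checkCrawlStatus(started.id)) as CrawlStatusResponse;
  while (status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 2000));
    status = (await app.checkCrawlStatus(started.id)) as CrawlStatusResponse;
  }
  if (status.status !== "completed") {
    throw new Error(`Crawl ended with status: ${status.status}`);
  }
  return status;
}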
expect(response.data?.content).toContain("_Roast_"); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should return successful response for valid scrape", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://roastmywebsite.ai" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + expect(response.data).toHaveProperty("markdown"); + expect(response.data).toHaveProperty("metadata"); + expect(response.data).not.toHaveProperty("html"); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should return successful response with valid API key and include HTML", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl("https://roastmywebsite.ai", { + pageOptions: { includeHtml: true }, + })) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + expect(response.data?.markdown).toContain("_Roast_"); + expect(response.data?.html).toContain(" { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://arxiv.org/pdf/astro-ph/9301001.pdf" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should return successful response for valid scrape with PDF file without explicit extension", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://arxiv.org/pdf/astro-ph/9301001" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should throw error for invalid API key on crawl", + async () => { + const invalidApp = new FirecrawlApp<"v0">({ + apiKey: "invalid_api_key", + apiUrl: API_URL, + version: "v0", + }); + await expect( + invalidApp.crawlUrl("https://roastmywebsite.ai") + ).rejects.toThrow("Request failed with status code 401"); + } + ); + + test.concurrent( + "should throw error for blocklisted URL on crawl", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const blocklistedUrl = "https://twitter.com/fake-test"; + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow( + "Request failed with status code 403" + ); + } + ); + + test.concurrent( + "should return successful response for crawl and wait for completion", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + true, + 10 + )) as FirecrawlDocumentV0[]; + expect(response).not.toBeNull(); + console.log({ response }); + expect(response[0].content).toContain("_Roast_"); + }, + 60000 + ); // 60 seconds timeout + + test.concurrent("should handle idempotency key for crawl", async () => { + const app = new FirecrawlApp<"v0">({ + 
apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const uniqueIdempotencyKey = uuidv4(); + const response = (await app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + false, + 2, + uniqueIdempotencyKey + )) as CrawlResponseV0; expect(response).not.toBeNull(); - expect(response?.data?.[0]?.content).toBeDefined(); - expect(response?.data?.length).toBeGreaterThan(2); - }, 30000); // 30 seconds timeout + expect(response.jobId).toBeDefined(); - test.concurrent('should throw error for invalid API key on search', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401"); + await expect( + app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + true, + 2, + uniqueIdempotencyKey + ) + ).rejects.toThrow("Request failed with status code 409"); }); - test.concurrent('should perform LLM extraction', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl("https://mendable.ai", { - extractorOptions: { - mode: 'llm-extraction', - extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractionSchema: { - type: 'object', - properties: { - company_mission: { type: 'string' }, - supports_sso: { type: 'boolean' }, - is_open_source: { type: 'boolean' } - }, - required: ['company_mission', 'supports_sso', 'is_open_source'] - } + test.concurrent( + "should check crawl status", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response: any = (await app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + false + )) as CrawlResponseV0; + expect(response).not.toBeNull(); + expect(response.jobId).toBeDefined(); + + let statusResponse = await app.checkCrawlStatus(response.jobId); + const maxChecks = 15; + let checks = 0; + + while (statusResponse.status === "active" && checks < maxChecks) { + await new Promise((resolve) => setTimeout(resolve, 5000)); + expect(statusResponse.partial_data).not.toBeNull(); + // expect(statusResponse.current).toBeGreaterThanOrEqual(1); + statusResponse = (await app.checkCrawlStatus( + response.jobId + )) as CrawlStatusResponseV0; + checks++; } - }); - expect(response).not.toBeNull(); - expect(response.data?.llm_extraction).toBeDefined(); - const llmExtraction = response.data?.llm_extraction; - expect(llmExtraction?.company_mission).toBeDefined(); - expect(typeof llmExtraction?.supports_sso).toBe('boolean'); - expect(typeof llmExtraction?.is_open_source).toBe('boolean'); - }, 30000); // 30 seconds timeout + + expect(statusResponse).not.toBeNull(); + expect(statusResponse.success).toBe(true); + expect(statusResponse.status).toBe("completed"); + expect(statusResponse.total).toEqual(statusResponse.current); + expect(statusResponse.current_step).not.toBeNull(); + expect(statusResponse.current).toBeGreaterThanOrEqual(1); + + expect(statusResponse?.data?.length).toBeGreaterThan(0); + }, + 35000 + ); // 35 seconds timeout + + test.concurrent( + "should return successful response for search", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.search("test query")) as 
SearchResponseV0; + expect(response).not.toBeNull(); + expect(response?.data?.[0]?.content).toBeDefined(); + expect(response?.data?.length).toBeGreaterThan(2); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should throw error for invalid API key on search", + async () => { + const invalidApp = new FirecrawlApp<"v0">({ + apiKey: "invalid_api_key", + apiUrl: API_URL, + version: "v0", + }); + await expect(invalidApp.search("test query")).rejects.toThrow( + "Request failed with status code 401" + ); + } + ); + + test.concurrent( + "should perform LLM extraction", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl("https://mendable.ai", { + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { type: "string" }, + supports_sso: { type: "boolean" }, + is_open_source: { type: "boolean" }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, + })) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.llm_extraction).toBeDefined(); + const llmExtraction = response.data?.llm_extraction; + expect(llmExtraction?.company_mission).toBeDefined(); + expect(typeof llmExtraction?.supports_sso).toBe("boolean"); + expect(typeof llmExtraction?.is_open_source).toBe("boolean"); + }, + 30000 + ); // 30 seconds timeout }); diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts index dcda96f7..92951237 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts @@ -31,7 +31,7 @@ describe('the firecrawl JS SDK', () => { }); const apiKey = 'YOUR_API_KEY' - const app = new FirecrawlApp({ apiKey }); + const app = new FirecrawlApp<"v0">({ apiKey }); // Scrape a single URL const url = 'https://mendable.ai'; const scrapedData = await app.scrapeUrl(url); diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts new file mode 100644 index 00000000..9f6c6462 --- /dev/null +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -0,0 +1,312 @@ +import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index'; +import { v4 as uuidv4 } from 'uuid'; +import dotenv from 'dotenv'; +import { describe, test, expect } from '@jest/globals'; + +dotenv.config(); + +const TEST_API_KEY = process.env.TEST_API_KEY; +const API_URL = "http://127.0.0.1:3002"; + +describe('FirecrawlApp E2E Tests', () => { + test.concurrent('should throw error for no API key', async () => { + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).toThrow("No API key provided"); + }); + + test.concurrent('should throw error for invalid API key on scrape', async () => { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on scrape', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const 
blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response?.markdown).toContain("_Roast_"); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response for valid scrape', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response).not.toHaveProperty('content'); // v0 + expect(response).not.toHaveProperty('html'); + expect(response).not.toHaveProperty('rawHtml'); + expect(response).not.toHaveProperty('screenshot'); + expect(response).not.toHaveProperty('links'); + + expect(response).toHaveProperty('markdown'); + expect(response).toHaveProperty('metadata'); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response with valid API key and options', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl( + 'https://roastmywebsite.ai', { + formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + headers: { "x-key": "test" }, + includeTags: ['h1'], + excludeTags: ['h2'], + onlyMainContent: true, + timeout: 30000, + waitFor: 1000 + }) as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response).not.toHaveProperty('content'); // v0 + expect(response.markdown).toContain("_Roast_"); + expect(response.html).toContain(" { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 30000); // 30 seconds timeout + + test.concurrent('should throw error for invalid API key on crawl', async () => { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on crawl', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://twitter.com/fake-test"; + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); + }); + + test.concurrent('should return successful response for crawl and wait for completion', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse; + expect(response).not.toBeNull(); + expect(response).toHaveProperty("total"); + expect(response.total).toBeGreaterThan(0); + expect(response).toHaveProperty("creditsUsed"); + expect(response.creditsUsed).toBeGreaterThan(0); + expect(response).toHaveProperty("expiresAt"); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); + expect(response).toHaveProperty("status"); + expect(response.status).toBe("completed"); + expect(response).not.toHaveProperty("next"); // wait until done + expect(response.data?.length).toBeGreaterThan(0); + expect(response.data?.[0]).toHaveProperty("markdown"); + expect(response.data?.[0].markdown).toContain("_Roast_"); + expect(response.data?.[0]).not.toHaveProperty('content'); // v0 + expect(response.data?.[0]).not.toHaveProperty("html"); + expect(response.data?.[0]).not.toHaveProperty("rawHtml"); + expect(response.data?.[0]).not.toHaveProperty("screenshot"); + expect(response.data?.[0]).not.toHaveProperty("links"); + expect(response.data?.[0]).toHaveProperty("metadata"); + expect(response.data?.[0].metadata).toHaveProperty("title"); + expect(response.data?.[0].metadata).toHaveProperty("description"); + expect(response.data?.[0].metadata).toHaveProperty("language"); + expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); + expect(response.data?.[0].metadata).toHaveProperty("statusCode"); + expect(response.data?.[0].metadata).not.toHaveProperty("error"); + }, 60000); // 60 seconds timeout + + test.concurrent('should return successful response for crawl with options and wait for completion', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.crawlUrl('https://roastmywebsite.ai', { + excludePaths: ['blog/*'], + includePaths: ['/'], + maxDepth: 2, + ignoreSitemap: true, + limit: 10, + allowBackwardLinks: true, + allowExternalLinks: true, + scrapeOptions: { + formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + headers: { "x-key": "test" }, + includeTags: ['h1'], + excludeTags: ['h2'], + onlyMainContent: true, + waitFor: 1000 + } + } as CrawlParams, true, 30) as CrawlStatusResponse; + expect(response).not.toBeNull(); + expect(response).toHaveProperty("total"); + expect(response.total).toBeGreaterThan(0); + expect(response).toHaveProperty("creditsUsed"); + expect(response.creditsUsed).toBeGreaterThan(0); + expect(response).toHaveProperty("expiresAt"); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); + expect(response).toHaveProperty("status"); + expect(response.status).toBe("completed"); + expect(response).not.toHaveProperty("next"); + expect(response.data?.length).toBeGreaterThan(0); + expect(response.data?.[0]).toHaveProperty("markdown"); + expect(response.data?.[0].markdown).toContain("_Roast_"); + expect(response.data?.[0]).not.toHaveProperty('content'); // v0 + expect(response.data?.[0]).toHaveProperty("html"); + expect(response.data?.[0].html).toContain(" { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const uniqueIdempotencyKey = uuidv4(); + const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 
2, uniqueIdempotencyKey) as CrawlResponse; + expect(response).not.toBeNull(); + expect(response.id).toBeDefined(); + + await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); + }); + + test.concurrent('should check crawl status', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse; + expect(response).not.toBeNull(); + expect(response.id).toBeDefined(); + + let statusResponse = await app.checkCrawlStatus(response.id); + const maxChecks = 15; + let checks = 0; + + while (statusResponse.status === 'scraping' && checks < maxChecks) { + await new Promise(resolve => setTimeout(resolve, 5000)); + expect(statusResponse).not.toHaveProperty("partial_data"); // v0 + expect(statusResponse).not.toHaveProperty("current"); // v0 + expect(statusResponse).toHaveProperty("data"); + expect(statusResponse).toHaveProperty("total"); + expect(statusResponse).toHaveProperty("creditsUsed"); + expect(statusResponse).toHaveProperty("expiresAt"); + expect(statusResponse).toHaveProperty("status"); + expect(statusResponse).toHaveProperty("next"); + expect(statusResponse.total).toBeGreaterThan(0); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse.status).toBe("scraping"); + expect(statusResponse.next).toContain("/v1/crawl/"); + statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; + checks++; + } + + expect(statusResponse).not.toBeNull(); + expect(statusResponse).toHaveProperty("total"); + expect(statusResponse.total).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("creditsUsed"); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("expiresAt"); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse).toHaveProperty("status"); + expect(statusResponse.status).toBe("completed"); + expect(statusResponse.data?.length).toBeGreaterThan(0); + expect(statusResponse.data?.[0]).toHaveProperty("markdown"); + expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10); + expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0 + expect(statusResponse.data?.[0]).toHaveProperty("html"); + expect(statusResponse.data?.[0].html).toContain(" { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on map', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + expect(response).not.toBeNull(); + expect(response.links?.length).toBeGreaterThan(0); + }, 30000); // 30 seconds timeout + + 
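// A minimal sketch of the v1-only map endpoint covered by the surrounding tests, assuming
// only the mapUrl signature and the MapParams/MapResponse shapes added in this changeset;
// the API key, URL, and limit are placeholders.
import FirecrawlApp, { MapResponse } from "@mendable/firecrawl-js";

async function listSiteLinks(): Promise<string[]> {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });
  // MapParams lets the caller bound discovery, e.g. to the first 50 links found.
  const result = (await app.mapUrl("https://firecrawl.dev", { limit: 50 })) as MapResponse;
  if (!result.success || !result.links) {
    throw new Error(result.error ?? "Map request failed");
  }
  return result.links;
}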
test.concurrent('should return successful response for valid map', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + expect(response).not.toBeNull(); + + expect(response.links?.length).toBeGreaterThan(0); + expect(response.links?.[0]).toContain("https://"); + const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai")); + expect(filteredLinks?.length).toBeGreaterThan(0); + }, 30000); // 30 seconds timeout + + test('should throw NotImplementedError for search on v1', async () => { + const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); + await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1"); + }); +}); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index a42d4618..4f3f820f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,16 +1,22 @@ import axios, { AxiosResponse, AxiosRequestHeaders } from "axios"; import { z } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; + /** * Configuration interface for FirecrawlApp. + * @param apiKey - Optional API key for authentication. + * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'. + * @param version - API version, either 'v0' or 'v1'. */ export interface FirecrawlAppConfig { apiKey?: string | null; apiUrl?: string | null; + version?: "v0" | "v1"; } /** * Metadata for a Firecrawl document. + * Includes various optional properties for document metadata. */ export interface FirecrawlDocumentMetadata { title?: string; @@ -43,6 +49,17 @@ export interface FirecrawlDocumentMetadata { articleTag?: string; articleSection?: string; sourceURL?: string; + statusCode?: number; + error?: string; + [key: string]: any; // Allows for additional metadata properties not explicitly defined. +} + +/** + * Metadata for a Firecrawl document on v0. + * Similar to FirecrawlDocumentMetadata but includes properties specific to API version v0. + */ +export interface FirecrawlDocumentMetadataV0 { + // Similar properties as FirecrawlDocumentMetadata with additional v0 specific adjustments pageStatusCode?: number; pageError?: string; [key: string]: any; @@ -50,8 +67,23 @@ export interface FirecrawlDocumentMetadata { /** * Document interface for Firecrawl. + * Represents a document retrieved or processed by Firecrawl. */ export interface FirecrawlDocument { + url?: string; + markdown?: string; + html?: string; + rawHtml?: string; + links?: string[]; + screenshot?: string; + metadata: FirecrawlDocumentMetadata; +} + +/** + * Document interface for Firecrawl on v0. + * Represents a document specifically for API version v0 with additional properties. + */ +export interface FirecrawlDocumentV0 { id?: string; url?: string; content: string; @@ -61,79 +93,242 @@ export interface FirecrawlDocument { createdAt?: Date; updatedAt?: Date; type?: string; - metadata: FirecrawlDocumentMetadata; + metadata: FirecrawlDocumentMetadataV0; childrenLinks?: string[]; provider?: string; warning?: string; - index?: number; } /** - * Response interface for scraping operations. + * Parameters for scraping operations. + * Defines the options and configurations available for scraping web content. 
*/ -export interface ScrapeResponse { - success: boolean; - data?: FirecrawlDocument; - error?: string; +export interface ScrapeParams { + formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[]; + headers?: Record; + includeTags?: string[]; + excludeTags?: string[]; + onlyMainContent?: boolean; + screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile"; + waitFor?: number; + timeout?: number; } + /** - * Response interface for searching operations. + * Parameters for scraping operations on v0. + * Includes page and extractor options specific to API version v0. */ -export interface SearchResponse { +export interface ScrapeParamsV0 { + pageOptions?: { + headers?: Record; + includeHtml?: boolean; + includeRawHtml?: boolean; + onlyIncludeTags?: string[]; + onlyMainContent?: boolean; + removeTags?: string[]; + replaceAllPathsWithAbsolutePaths?: boolean; + screenshot?: boolean; + fullPageScreenshot?: boolean; + waitFor?: number; + }; + extractorOptions?: { + mode?: "markdown" | "llm-extraction" | "llm-extraction-from-raw-html" | "llm-extraction-from-markdown"; + extractionPrompt?: string; + extractionSchema?: Record | z.ZodSchema | any; + }; + timeout?: number; +} + +/** + * Response interface for scraping operations. + * Defines the structure of the response received after a scraping operation. + */ +export interface ScrapeResponse extends FirecrawlDocument { success: boolean; - data?: FirecrawlDocument[]; + warning?: string; error?: string; } + +/** + * Response interface for scraping operations on v0. + * Similar to ScrapeResponse but tailored for responses from API version v0. + */ +export interface ScrapeResponseV0 { + success: boolean; + data?: FirecrawlDocumentV0; + error?: string; +} + +/** + * Parameters for crawling operations. + * Includes options for both scraping and mapping during a crawl. + */ +export interface CrawlParams { + scrapeOptions?: ScrapeParams; + crawlerOptions?: { + includePaths?: string[] + excludePaths?: string[] + maxDepth?: number + limit?: number + allowBackwardLinks?: boolean + allowExternalLinks?: boolean + ignoreSitemap?: boolean + }; +} + +/** + * Parameters for crawling operations on v0. + * Tailored for API version v0, includes specific options for crawling. + */ +export interface CrawlParamsV0 { + crawlerOptions?: { + includes?: string[]; + excludes?: string[]; + generateImgAltText?: boolean; + returnOnlyUrls?: boolean; + maxDepth?: number; + mode?: "default" | "fast"; + ignoreSitemap?: boolean; + limit?: number; + allowBackwardCrawling?: boolean; + allowExternalContentLinks?: boolean; + }; + pageOptions?: { + headers?: Record; + includeHtml?: boolean; + includeRawHtml?: boolean; + onlyIncludeTags?: string[]; + onlyMainContent?: boolean; + removeTags?: string[]; + replaceAllPathsWithAbsolutePaths?: boolean; + screenshot?: boolean; + fullPageScreenshot?: boolean; + waitFor?: number; + }; +} + /** * Response interface for crawling operations. + * Defines the structure of the response received after initiating a crawl. */ export interface CrawlResponse { + id?: string; + url?: string; success: boolean; + error?: string; +} + +/** + * Response interface for crawling operations on v0. + * Similar to CrawlResponse but tailored for responses from API version v0. + */ +export interface CrawlResponseV0 { jobId?: string; + success: boolean; + error?: string; +} + +/** + * Response interface for job status checks. + * Provides detailed status of a crawl job including progress and results. 
+ */ +export interface CrawlStatusResponse { + success: boolean; + total: number; + completed: number; + creditsUsed: number; + expiresAt: Date; + status: "scraping" | "completed" | "failed"; + next: string; data?: FirecrawlDocument[]; error?: string; } + /** - * Response interface for job status checks. + * Response interface for job status checks on v0. + * Tailored for API version v0, provides status and partial data of a crawl job. */ -export interface JobStatusResponse { +export interface CrawlStatusResponseV0 { success: boolean; status: string; current?: number; current_url?: string; current_step?: string; total?: number; - jobId?: string; - data?: FirecrawlDocument[]; - partial_data?: FirecrawlDocument[]; + data?: FirecrawlDocumentV0[]; + partial_data?: FirecrawlDocumentV0[]; error?: string; } + + /** - * Generic parameter interface. + * Parameters for mapping operations. + * Defines options for mapping URLs during a crawl. */ -export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; +export interface MapParams { + includePaths?: string[] + excludePaths?: string[] + maxDepth?: number + limit?: number + allowBackwardLinks?: boolean + allowExternalLinks?: boolean + ignoreSitemap?: boolean +} + +/** + * Response interface for mapping operations. + * Defines the structure of the response received after a mapping operation. + */ +export interface MapResponse { + success: boolean; + links?: string[]; + error?: string; +} + +/** + * Parameters for searching operations on v0. + * Tailored for API version v0, includes specific options for searching content. + */ +export interface SearchParamsV0 { + pageOptions?: { + onlyMainContent?: boolean; + fetchPageContent?: boolean; + includeHtml?: boolean; + includeRawHtml?: boolean; + }; + searchOptions?: { + limit?: number; }; } + +/** + * Response interface for searching operations on v0. + * Defines the structure of the response received after a search operation on v0. + */ +export interface SearchResponseV0 { + success: boolean; + data?: FirecrawlDocumentV0[]; + error?: string; +} + /** * Main class for interacting with the Firecrawl API. + * Provides methods for scraping, searching, crawling, and mapping web content. */ -export default class FirecrawlApp { +export default class FirecrawlApp { private apiKey: string; private apiUrl: string; + public version: T; /** * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + * @param config - Configuration options for the FirecrawlApp instance. */ - constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { + constructor({ apiKey = null, apiUrl = null, version = "v1" }: FirecrawlAppConfig) { this.apiKey = apiKey || ""; this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + this.version = version as T; if (!this.apiKey) { throw new Error("No API key provided"); } @@ -141,21 +336,21 @@ export default class FirecrawlApp { /** * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. + * @param url - The URL to scrape. + * @param params - Additional parameters for the scrape request. + * @returns The response from the scrape operation. 
*/ async scrapeUrl( url: string, - params: Params | null = null - ): Promise { + params?: ScrapeParams | ScrapeParamsV0 + ): Promise { const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; - let jsonData: Params = { url, ...params }; - if (params?.extractorOptions?.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; + let jsonData: any = { url, ...params }; + if (jsonData?.extractorOptions?.extractionSchema) { + let schema = jsonData.extractorOptions.extractionSchema; // Check if schema is an instance of ZodSchema to correctly identify Zod schemas if (schema instanceof z.ZodSchema) { schema = zodToJsonSchema(schema); @@ -163,22 +358,27 @@ export default class FirecrawlApp { jsonData = { ...jsonData, extractorOptions: { - ...params.extractorOptions, + ...jsonData.extractorOptions, extractionSchema: schema, - mode: params.extractorOptions.mode || "llm-extraction", + mode: jsonData.extractorOptions.mode || "llm-extraction", }, }; } try { const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/scrape", + this.apiUrl + `/${this.version}/scrape`, jsonData, { headers } ); if (response.status === 200) { const responseData = response.data; if (responseData.success) { - return responseData; + return (this.version === 'v0' ? responseData as ScrapeResponseV0 : { + success: true, + warning: responseData.warning, + error: responseData.error, + ...responseData.data + }) as ScrapeResponse; } else { throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); } @@ -188,24 +388,28 @@ export default class FirecrawlApp { } catch (error: any) { throw new Error(error.message); } - return { success: false, error: "Internal server error." }; + return { success: false, error: "Internal server error." } as this['version'] extends 'v0' ? ScrapeResponseV0 : ScrapeResponse; } /** * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. + * @param query - The query to search for. + * @param params - Additional parameters for the search request. + * @returns The response from the search operation. */ async search( query: string, - params: Params | null = null - ): Promise { + params?: SearchParamsV0 + ): Promise { + if (this.version === "v1") { + throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0."); + } + const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; - let jsonData: Params = { query }; + let jsonData: any = { query }; if (params) { jsonData = { ...jsonData, ...params }; } @@ -233,93 +437,160 @@ export default class FirecrawlApp { /** * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. + * @param url - The URL to crawl. + * @param params - Additional parameters for the crawl request. 
+ * @param waitUntilDone - Whether to wait for the crawl job to complete. + * @param pollInterval - Time in seconds for job status checks. + * @param idempotencyKey - Optional idempotency key for the request. + * @returns The response from the crawl operation. */ async crawlUrl( url: string, - params: Params | null = null, + params?: this['version'] extends 'v0' ? CrawlParamsV0 : CrawlParams, waitUntilDone: boolean = true, pollInterval: number = 2, idempotencyKey?: string - ): Promise { + ): Promise< + this['version'] extends 'v0' + ? CrawlResponseV0 | CrawlStatusResponseV0 | FirecrawlDocumentV0[] + : CrawlResponse | CrawlStatusResponse + > { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: Params = { url }; - if (params) { - jsonData = { ...jsonData, ...params }; - } + let jsonData: any = { url, ...params }; try { const response: AxiosResponse = await this.postRequest( - this.apiUrl + "/v0/crawl", + this.apiUrl + `/${this.version}/crawl`, jsonData, headers ); if (response.status === 200) { - const jobId: string = response.data.jobId; + const id: string = this.version === 'v0' ? response.data.jobId : response.data.id; + let checkUrl: string | undefined = undefined; if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); + if (this.version === 'v1') { checkUrl = response.data.url } + return this.monitorJobStatus(id, headers, pollInterval, checkUrl); } else { - return { success: true, jobId }; + if (this.version === 'v0') { + return { + success: true, + jobId: id + } as CrawlResponseV0; + } else { + return { + success: true, + id: id + } as CrawlResponse; + } } } else { this.handleError(response, "start crawl job"); } } catch (error: any) { - console.log(error); - throw new Error(error.message); + if (error.response?.data?.error) { + throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); + } else { + throw new Error(error.message); + } } - return { success: false, error: "Internal server error." }; + return { success: false, error: "Internal server error." } as this['version'] extends 'v0' ? CrawlResponseV0 : CrawlResponse; } /** * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. + * @param id - The ID of the crawl operation. + * @returns The response containing the job status. */ - async checkCrawlStatus(jobId: string): Promise { + async checkCrawlStatus(id?: string): Promise { + if (!id) { + throw new Error("No crawl ID provided"); + } + const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, + this.version === 'v1' ? + `${this.apiUrl}/${this.version}/crawl/${id}` : + `${this.apiUrl}/${this.version}/crawl/status/${id}`, headers ); if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? 
response.data.partial_data - : undefined, - }; + if (this.version === 'v0') { + return ({ + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + } as CrawlStatusResponseV0) as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse; + } else { + return ({ + success: true, + status: response.data.status, + total: response.data.total, + completed: response.data.completed, + creditsUsed: response.data.creditsUsed, + expiresAt: new Date(response.data.expiresAt), + next: response.data.next, + data: response.data.data, + error: response.data.error + } as CrawlStatusResponse) as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse; + } } else { this.handleError(response, "check crawl status"); } } catch (error: any) { throw new Error(error.message); } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; + + return this.version === 'v0' ? + ({ + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + } as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse) : + ({ + success: false, + error: "Internal server error.", + } as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse); + } + + async mapUrl(url: string, params?: MapParams): Promise { + if (this.version == 'v0') { + throw new Error("Map is not supported in v0"); + } + const headers = this.prepareHeaders(); + let jsonData: { url: string } & MapParams = { url, ...params }; + + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/${this.version}/map`, + jsonData, + headers + ); + if (response.status === 200) { + return response.data as MapResponse; + } else { + this.handleError(response, "map"); + } + } catch (error: any) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." } as MapResponse; } /** * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. + * @param idempotencyKey - Optional key to ensure idempotency. + * @returns The prepared headers. */ prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { return { @@ -331,14 +602,14 @@ export default class FirecrawlApp { /** * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. + * @param url - The URL to send the request to. + * @param data - The data to send in the request. + * @param headers - The headers for the request. + * @returns The response from the POST request. */ postRequest( url: string, - data: Params, + data: any, headers: AxiosRequestHeaders ): Promise { return axios.post(url, data, { headers }); @@ -346,9 +617,9 @@ export default class FirecrawlApp { /** * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. 
+ * @param url - The URL to send the request to. + * @param headers - The headers for the request. + * @returns The response from the GET request. */ getRequest( url: string, @@ -359,38 +630,44 @@ export default class FirecrawlApp { /** * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. + * @param id - The ID of the crawl operation. + * @param headers - The headers for the request. + * @param checkInterval - Interval in seconds for job status checks. + * @param checkUrl - Optional URL to check the status (used for v1 API) + * @returns The final job status or data. */ async monitorJobStatus( - jobId: string, + id: string, headers: AxiosRequestHeaders, - checkInterval: number - ): Promise { + checkInterval: number, + checkUrl?: string + ): Promise { + let apiUrl: string = ''; while (true) { + if (this.version === 'v1') { + apiUrl = checkUrl ?? `${this.apiUrl}/v1/crawl/${id}`; + } else if (this.version === 'v0') { + apiUrl = `${this.apiUrl}/v0/crawl/status/${id}`; + } const statusResponse: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, + apiUrl, headers ); if (statusResponse.status === 200) { const statusData = statusResponse.data; if (statusData.status === "completed") { if ("data" in statusData) { - return statusData.data; + return this.version === 'v0' ? statusData.data : statusData; } else { throw new Error("Crawl job completed but no data was returned"); } } else if ( - ["active", "paused", "pending", "queued"].includes(statusData.status) + ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status) ) { - if (checkInterval < 2) { - checkInterval = 2; - } + checkInterval = Math.max(checkInterval, 2); await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000) - ); // Wait for the specified timeout before checking again + ); } else { throw new Error( `Crawl job failed or was stopped. Status: ${statusData.status}` diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index d7764a46..56f13ced 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -11,7 +11,7 @@ // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ /* Language and Environment */ - "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + "target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ // "jsx": "preserve", /* Specify what JSX code is generated. */ // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ @@ -25,9 +25,9 @@ // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ /* Modules */ - "module": "NodeNext", /* Specify what module code is generated. */ + "module": "commonjs", /* Specify what module code is generated. */ "rootDir": "./src", /* Specify the root folder within your source files. 
*/ - "moduleResolution": "nodenext", /* Specify how TypeScript looks up a file from a given module specifier. */ + "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index ca337062..95dd7d27 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.19", + "@mendable/firecrawl-js": "^0.0.36", "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", @@ -422,15 +422,29 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "0.0.19", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz", - "integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==", + "version": "0.0.36", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.36.tgz", + "integrity": "sha512-5zQMWUD49r6Q7cxj+QBthQ964Bm9fMooW4E8E4nIca3BMXCeEuQFVf5C3OEWwZf0SjJvR+5Yx2wUbXJWd1wCOA==", "dependencies": { "axios": "^1.6.8", + "dotenv": "^16.4.5", + "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, + "node_modules/@mendable/firecrawl-js/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@tsconfig/node10": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", @@ -531,6 +545,17 @@ "node": ">=0.3.1" } }, + "node_modules/dotenv": { + "version": "16.4.5", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", + "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/esbuild": { "version": "0.20.2", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz", diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 2d2c36e8..2be17886 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.19", + "@mendable/firecrawl-js": "^0.0.36", "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", diff --git a/apps/python-sdk/build/lib/firecrawl/__init__.py b/apps/python-sdk/build/lib/firecrawl/__init__.py deleted file mode 100644 index e7f8063d..00000000 --- a/apps/python-sdk/build/lib/firecrawl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .firecrawl import FirecrawlApp diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py deleted file mode 100644 index 3f50c798..00000000 --- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py +++ /dev/null @@ -1,299 +0,0 @@ -""" -FirecrawlApp Module - -This module provides a class 
`FirecrawlApp` for interacting with the Firecrawl API. -It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, -and check the status of these jobs. The module uses requests for HTTP communication -and handles retries for certain HTTP status codes. - -Classes: - - FirecrawlApp: Main class for interacting with the Firecrawl API. -""" - -import os -import time -from typing import Any, Dict, Optional - -import requests - - -class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. - - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - raise ValueError('No API key provided') - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: - """ - Scrape the specified URL using the Firecrawl API. - - Args: - url (str): The URL to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. - - Returns: - Any: The scraped data if the request is successful. - - Raises: - Exception: If the scrape request fails. - """ - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } - # Prepare the base scrape parameters with the URL - scrape_params = {'url': url} - - # If there are additional params, process them - if params: - # Initialize extractorOptions if present - extractor_options = params.get('extractorOptions', {}) - # Check and convert the extractionSchema if it's a Pydantic model - if 'extractionSchema' in extractor_options: - if hasattr(extractor_options['extractionSchema'], 'schema'): - extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() - # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided - extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') - # Update the scrape_params with the processed extractorOptions - scrape_params['extractorOptions'] = extractor_options - - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key != 'extractorOptions': - scrape_params[key] = value - # Make the POST request with the prepared headers and JSON data - response = requests.post( - f'{self.api_url}/v0/scrape', - headers=headers, - json=scrape_params, - ) - if response.status_code == 200: - response = response.json() - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') - - def search(self, query, params=None): - """ - Perform a search using the Firecrawl API. - - Args: - query (str): The search query. - params (Optional[Dict[str, Any]]): Additional parameters for the search request. - - Returns: - Any: The search results if the request is successful. - - Raises: - Exception: If the search request fails. 
- """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } - json_data = {'query': query} - if params: - json_data.update(params) - response = requests.post( - f'{self.api_url}/v0/search', - headers=headers, - json=json_data - ) - if response.status_code == 200: - response = response.json() - - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to search. Error: {response["error"]}') - - elif response.status_code in [402, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Failed to search. Status code: {response.status_code}') - - def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None): - """ - Initiate a crawl job for the specified URL using the Firecrawl API. - - Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - wait_until_done (bool): Whether to wait until the crawl job is completed. - timeout (int): Timeout between status checks when waiting for job completion. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. - - Returns: - Any: The crawl job ID or the crawl results if waiting until completion. - - Raises: - Exception: If the crawl job initiation or monitoring fails. - """ - headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) - if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, timeout) - else: - return {'jobId': job_id} - else: - self._handle_error(response, 'start crawl job') - - def check_crawl_status(self, job_id): - """ - Check the status of a crawl job using the Firecrawl API. - - Args: - job_id (str): The ID of the crawl job. - - Returns: - Any: The status of the crawl job. - - Raises: - Exception: If the status check request fails. - """ - headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) - if response.status_code == 200: - return response.json() - else: - self._handle_error(response, 'check crawl status') - - def _prepare_headers(self, idempotency_key=None): - """ - Prepare the headers for API requests. - - Args: - idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. - - Returns: - Dict[str, str]: The headers including content type, authorization, and optionally idempotency key. - """ - if idempotency_key: - return { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - 'x-idempotency-key': idempotency_key - } - - return { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - } - - def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): - """ - Make a POST request with retries. - - Args: - url (str): The URL to send the POST request to. - data (Dict[str, Any]): The JSON data to include in the POST request. - headers (Dict[str, str]): The headers to include in the POST request. - retries (int): Number of retries for the request. - backoff_factor (float): Backoff factor for retries. - - Returns: - requests.Response: The response from the POST request. 
- - Raises: - requests.RequestException: If the request fails after the specified retries. - """ - for attempt in range(retries): - response = requests.post(url, headers=headers, json=data) - if response.status_code == 502: - time.sleep(backoff_factor * (2 ** attempt)) - else: - return response - return response - - def _get_request(self, url, headers, retries=3, backoff_factor=0.5): - """ - Make a GET request with retries. - - Args: - url (str): The URL to send the GET request to. - headers (Dict[str, str]): The headers to include in the GET request. - retries (int): Number of retries for the request. - backoff_factor (float): Backoff factor for retries. - - Returns: - requests.Response: The response from the GET request. - - Raises: - requests.RequestException: If the request fails after the specified retries. - """ - for attempt in range(retries): - response = requests.get(url, headers=headers) - if response.status_code == 502: - time.sleep(backoff_factor * (2 ** attempt)) - else: - return response - return response - - def _monitor_job_status(self, job_id, headers, timeout): - """ - Monitor the status of a crawl job until completion. - - Args: - job_id (str): The ID of the crawl job. - headers (Dict[str, str]): The headers to include in the status check requests. - timeout (int): Timeout between status checks. - - Returns: - Any: The crawl results if the job is completed successfully. - - Raises: - Exception: If the job fails or an error occurs during status checks. - """ - while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) - if status_response.status_code == 200: - status_data = status_response.json() - if status_data['status'] == 'completed': - if 'data' in status_data: - return status_data['data'] - else: - raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: - timeout=max(timeout,2) - time.sleep(timeout) # Wait for the specified timeout before checking again - else: - raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') - else: - self._handle_error(status_response, 'check crawl status') - - def _handle_error(self, response, action): - """ - Handle errors from API responses. - - Args: - response (requests.Response): The response object from the API request. - action (str): Description of the action that was being performed. - - Raises: - Exception: An exception with a message containing the status code and error details from the response. - """ - if response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Unexpected error occurred while trying to {action}. 
Status code: {response.status_code}') diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz deleted file mode 100644 index 83cd7221..00000000 Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl deleted file mode 100644 index b96c8f48..00000000 Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl and /dev/null differ diff --git a/apps/python-sdk/examplev0.py b/apps/python-sdk/examplev0.py new file mode 100644 index 00000000..d80fa795 --- /dev/null +++ b/apps/python-sdk/examplev0.py @@ -0,0 +1,75 @@ +import uuid +from firecrawl.firecrawl import FirecrawlApp + +app = FirecrawlApp(api_key="fc-YOUR_API_KEY") + +# Scrape a website: +scrape_result = app.scrape_url('firecrawl.dev') +print(scrape_result['markdown']) + +# Crawl a website: +idempotency_key = str(uuid.uuid4()) # optional idempotency key +crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key) +print(crawl_result) + +# LLM Extraction: +# Define schema to extract contents into using pydantic +from pydantic import BaseModel, Field +from typing import List + +class ArticleSchema(BaseModel): + title: str + points: int + by: str + commentsURL: str + +class TopArticlesSchema(BaseModel): + top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") + +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { + 'extractorOptions': { + 'extractionSchema': TopArticlesSchema.model_json_schema(), + 'mode': 'llm-extraction' + }, + 'pageOptions':{ + 'onlyMainContent': True + } +}) + +print(llm_extraction_result['llm_extraction']) + +# Define schema to extract contents into using json schema +json_schema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +} + +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { + 'extractorOptions': { + 'extractionSchema': json_schema, + 'mode': 'llm-extraction' + }, + 'pageOptions':{ + 'onlyMainContent': True + } +}) + +print(llm_extraction_result['llm_extraction']) \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index fbb2bdbf..1beaa043 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "0.0.16" +__version__ = "1.0.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 452d4982..8945d74d 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -7,7 +7,7 @@ from dotenv import load_dotenv load_dotenv() -API_URL = "http://127.0.0.1:3002"; +API_URL = "http://127.0.0.1:3002" ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" TEST_API_KEY = 
os.getenv('TEST_API_KEY') @@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp def test_no_api_key(): with pytest.raises(Exception) as excinfo: - invalid_app = FirecrawlApp(api_url=API_URL) + invalid_app = FirecrawlApp(api_url=API_URL, version='v0') assert "No API key provided" in str(excinfo.value) def test_scrape_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.scrape_url('https://firecrawl.dev') assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') with pytest.raises(Exception) as excinfo: app.scrape_url(blocklisted_url) assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): - app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') response = app.scrape_url('https://roastmywebsite.ai') assert response is not None assert 'content' in response assert "_Roast_" in response['content'] def test_scrape_url_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai') + print(response) + assert response is not None assert 'content' in response assert 'markdown' in response @@ -54,7 +56,7 @@ def test_scrape_url_e2e(): assert "_Roast_" in response['content'] def test_successful_response_with_valid_api_key_and_include_html(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}}) assert response is not None assert 'content' in response @@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html(): assert " 0 @@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e(): assert "_Roast_" in response[0]['content'] def test_crawl_url_with_idempotency_key_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') uniqueIdempotencyKey = str(uuid4()) response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) assert response is not None @@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e(): assert "Conflict: Failed to start crawl job due to a conflict. 
Idempotency key already used" in str(excinfo.value) def test_check_crawl_status_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) assert response is not None assert 'jobId' in response @@ -131,21 +133,21 @@ def test_check_crawl_status_e2e(): assert len(status_response['data']) > 0 def test_search_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.search("test query") assert response is not None assert 'content' in response[0] assert len(response) > 2 def test_search_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.search("test query") assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_llm_extraction(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.scrape_url("https://mendable.ai", { + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') + response = app.scrape_url("https://firecrawl.dev", { 'extractorOptions': { 'mode': 'llm-extraction', 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example new file mode 100644 index 00000000..904887bf --- /dev/null +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example @@ -0,0 +1,3 @@ +API_URL=http://localhost:3002 +ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py +TEST_API_KEY=fc-YOUR_API_KEY \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc new file mode 100644 index 00000000..5ba1f132 Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py new file mode 100644 index 00000000..12fa10ce --- /dev/null +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -0,0 +1,352 @@ +import importlib.util +import pytest +import time +import os +from uuid import uuid4 +from dotenv import load_dotenv +from datetime import datetime + +load_dotenv() + +API_URL = "http://127.0.0.1:3002"; +ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" +TEST_API_KEY = os.getenv('TEST_API_KEY') + +print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}") + +spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH) +firecrawl = importlib.util.module_from_spec(spec) +spec.loader.exec_module(firecrawl) +FirecrawlApp = firecrawl.FirecrawlApp + +def test_no_api_key(): + with 
pytest.raises(Exception) as excinfo: + invalid_app = FirecrawlApp(api_url=API_URL) + assert "No API key provided" in str(excinfo.value) + +def test_scrape_url_invalid_api_key(): + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.scrape_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url(): + blocklisted_url = "https://facebook.com/fake-test" + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(Exception) as excinfo: + app.scrape_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) + +def test_successful_response_with_valid_preview_token(): + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + response = app.scrape_url('https://roastmywebsite.ai') + assert response is not None + assert "_Roast_" in response['markdown'] + assert "content" not in response + assert "html" not in response + assert "metadata" in response + assert "links" not in response + assert "rawHtml" not in response + +def test_successful_response_for_valid_scrape(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://roastmywebsite.ai') + assert response is not None + assert 'markdown' in response + assert "_Roast_" in response['markdown'] + assert 'metadata' in response + assert 'content' not in response + assert 'html' not in response + assert 'rawHtml' not in response + assert 'screenshot' not in response + assert 'links' not in response + +def test_successful_response_with_valid_api_key_and_options(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + params = { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {'x-key': 'test'}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'timeout': 30000, + 'waitFor': 1000 + } + response = app.scrape_url('https://roastmywebsite.ai', params) + assert response is not None + assert 'content' not in response + assert 'markdown' in response + assert 'html' in response + assert 'rawHtml' in response + assert 'screenshot' in response + assert 'links' in response + assert "_Roast_" in response['markdown'] + assert " 0 + assert "https://" in response['links'][0] + assert 'metadata' in response + assert 'title' in response['metadata'] + assert 'description' in response['metadata'] + assert 'keywords' in response['metadata'] + assert 'robots' in response['metadata'] + assert 'ogTitle' in response['metadata'] + assert 'ogDescription' in response['metadata'] + assert 'ogUrl' in response['metadata'] + assert 'ogImage' in response['metadata'] + assert 'ogLocaleAlternate' in response['metadata'] + assert 'ogSiteName' in response['metadata'] + assert 'sourceURL' in response['metadata'] + assert 'statusCode' in response['metadata'] + assert 'pageStatusCode' not in response['metadata'] + assert 'pageError' not in response['metadata'] + assert 'error' not in response['metadata'] + assert response['metadata']['title'] == "Roast My Website" + assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️" + assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl" + assert response['metadata']['robots'] == "follow, index" + assert response['metadata']['ogTitle'] == "Roast My Website" + assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai" + assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png" + assert response['metadata']['ogLocaleAlternate'] == [] + assert response['metadata']['ogSiteName'] == "Roast My Website" + assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai" + assert response['metadata']['statusCode'] == 200 + +def test_successful_response_for_valid_scrape_with_pdf_file(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf') + assert response is not None + assert 'content' not in response + assert 'metadata' in response + assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] + +def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001') + time.sleep(1) # wait for 1 second + assert response is not None + assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] + +def test_crawl_url_invalid_api_key(): + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.crawl_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_should_return_error_for_blocklisted_url(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + blocklisted_url = "https://twitter.com/fake-test" + with pytest.raises(Exception) as excinfo: + app.crawl_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) + +def test_crawl_url_wait_for_completion_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30) + assert response is not None + assert 'total' in response + assert response['total'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' not in response['data'][0] + assert 'rawHtml' not in response['data'][0] + assert 'screenshot' not in response['data'][0] + assert 'links' not in response['data'][0] + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] + +def test_crawl_url_with_options_and_wait_for_completion(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://roastmywebsite.ai', { + 'excludePaths': ['blog/*'], + 'includePaths': ['/'], + 'maxDepth': 2, + 'ignoreSitemap': True, + 'limit': 10, + 'allowBackwardLinks': True, + 'allowExternalLinks': True, + 'scrapeOptions': { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {"x-key": "test"}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'waitFor': 1000 + } + }, True, 30) + assert response is not None + assert 'total' in response + assert response['total'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' in response['data'][0] + assert " 0 + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] + +def test_crawl_url_with_idempotency_key_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + uniqueIdempotencyKey = str(uuid4()) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey) + assert response is not None + assert 'id' in response + + with pytest.raises(Exception) as excinfo: + app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey) + assert "Idempotency key already used" in str(excinfo.value) + +def test_check_crawl_status_e2e(): + app = FirecrawlApp(api_url=API_URL, 
api_key=TEST_API_KEY) + response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False) + assert response is not None + assert 'id' in response + + max_checks = 15 + checks = 0 + status_response = app.check_crawl_status(response['id']) + + while status_response['status'] == 'scraping' and checks < max_checks: + time.sleep(1) # wait for 1 second + assert 'partial_data' not in status_response + assert 'current' not in status_response + assert 'data' in status_response + assert 'total' in status_response + assert 'creditsUsed' in status_response + assert 'expiresAt' in status_response + assert 'status' in status_response + assert 'next' in status_response + assert status_response['total'] > 0 + assert status_response['creditsUsed'] > 0 + assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert status_response['status'] == 'scraping' + assert '/v1/crawl/' in status_response['next'] + status_response = app.check_crawl_status(response['id']) + checks += 1 + + assert status_response is not None + assert 'total' in status_response + assert status_response['total'] > 0 + assert 'creditsUsed' in status_response + assert status_response['creditsUsed'] > 0 + assert 'expiresAt' in status_response + assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in status_response + assert status_response['status'] == 'completed' + assert len(status_response['data']) > 0 + assert 'markdown' in status_response['data'][0] + assert len(status_response['data'][0]['markdown']) > 10 + assert 'content' not in status_response['data'][0] + assert 'html' in status_response['data'][0] + assert " 0 + assert 'metadata' in status_response['data'][0] + assert 'title' in status_response['data'][0]['metadata'] + assert 'description' in status_response['data'][0]['metadata'] + assert 'language' in status_response['data'][0]['metadata'] + assert 'sourceURL' in status_response['data'][0]['metadata'] + assert 'statusCode' in status_response['data'][0]['metadata'] + assert 'error' not in status_response['data'][0]['metadata'] + +def test_invalid_api_key_on_map(): + invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) + with pytest.raises(Exception) as excinfo: + invalid_app.map_url('https://roastmywebsite.ai') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url_on_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + blocklisted_url = "https://facebook.com/fake-test" + with pytest.raises(Exception) as excinfo: + app.map_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) + +def test_successful_response_with_valid_preview_token_on_map(): + app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + +def test_successful_response_for_valid_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + assert any("https://" in link for link in response) + filtered_links = [link for link in response if "roastmywebsite.ai" in link] + assert len(filtered_links) > 0 + +def test_search_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(NotImplementedError) as excinfo: + app.search("test query") + assert "Search is not supported in v1" in str(excinfo.value) + +# def test_llm_extraction(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# response = app.scrape_url("https://mendable.ai", { +# 'extractorOptions': { +# 'mode': 'llm-extraction', +# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", +# 'extractionSchema': { +# 'type': 'object', +# 'properties': { +# 'company_mission': {'type': 'string'}, +# 'supports_sso': {'type': 'boolean'}, +# 'is_open_source': {'type': 'boolean'} +# }, +# 'required': ['company_mission', 'supports_sso', 'is_open_source'] +# } +# } +# }) +# assert response is not None +# assert 'llm_extraction' in response +# llm_extraction = response['llm_extraction'] +# assert 'company_mission' in llm_extraction +# assert isinstance(llm_extraction['supports_sso'], bool) +# assert isinstance(llm_extraction['is_open_source'], bool) + + + \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7ec0d33f..89c51803 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -19,24 +19,22 @@ import requests logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None: + """ + Initialize the FirecrawlApp instance with API key, API URL, and version. - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - logger.warning("No API key provided") - raise ValueError('No API key provided') - else: - logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) - - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - if self.api_url != 'https://api.firecrawl.dev': - logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. + version (str): API version, either 'v0' or 'v1'. 
+ """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + self.version = version + if self.api_key is None: + logger.warning("No API key provided") + raise ValueError('No API key provided') + logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}") def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ @@ -75,9 +73,11 @@ class FirecrawlApp: for key, value in params.items(): if key != 'extractorOptions': scrape_params[key] = value + + endpoint = f'/{self.version}/scrape' # Make the POST request with the prepared headers and JSON data response = requests.post( - f'{self.api_url}/v0/scrape', + f'{self.api_url}{endpoint}', headers=headers, json=scrape_params, ) @@ -104,6 +104,9 @@ class FirecrawlApp: Raises: Exception: If the search request fails. """ + if self.version == 'v1': + raise NotImplementedError("Search is not supported in v1") + headers = self._prepare_headers() json_data = {'query': query} if params: @@ -145,26 +148,37 @@ class FirecrawlApp: Raises: Exception: If the crawl job initiation or monitoring fails. """ + endpoint = f'/{self.version}/crawl' headers = self._prepare_headers(idempotency_key) json_data = {'url': url} if params: json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, poll_interval) + if self.version == 'v0': + id = response.json().get('jobId') else: - return {'jobId': job_id} + id = response.json().get('id') + + if wait_until_done: + check_url = None + if self.version == 'v1': + check_url = response.json().get('url') + return self._monitor_job_status(id, headers, poll_interval, check_url) + else: + if self.version == 'v0': + return {'jobId': id} + else: + return {'id': id} else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, job_id: str) -> Any: + def check_crawl_status(self, id: str) -> Any: """ Check the status of a crawl job using the Firecrawl API. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. Returns: Any: The status of the crawl job. @@ -172,13 +186,73 @@ class FirecrawlApp: Raises: Exception: If the status check request fails. 
""" + + if self.version == 'v0': + endpoint = f'/{self.version}/crawl/status/{id}' + else: + endpoint = f'/{self.version}/crawl/{id}' + headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: - return response.json() + data = response.json() + if self.version == 'v0': + return { + 'success': True, + 'status': data.get('status'), + 'current': data.get('current'), + 'current_url': data.get('current_url'), + 'current_step': data.get('current_step'), + 'total': data.get('total'), + 'data': data.get('data'), + 'partial_data': data.get('partial_data') if not data.get('data') else None, + } + elif self.version == 'v1': + return { + 'success': True, + 'status': data.get('status'), + 'total': data.get('total'), + 'completed': data.get('completed'), + 'creditsUsed': data.get('creditsUsed'), + 'expiresAt': data.get('expiresAt'), + 'next': data.get('next'), + 'data': data.get('data'), + 'error': data.get('error') + } else: self._handle_error(response, 'check crawl status') + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + """ + Perform a map search using the Firecrawl API. + """ + if self.version == 'v0': + raise NotImplementedError("Map is not supported in v0") + + endpoint = f'/{self.version}/map' + headers = self._prepare_headers() + + # Prepare the base scrape parameters with the URL + json_data = {'url': url} + if params: + json_data.update(params) + + # Make the POST request with the prepared headers and JSON data + response = requests.post( + f'{self.api_url}{endpoint}', + headers=headers, + json=json_data, + ) + if response.status_code == 200: + response = response.json() + print(response) + if response['success'] and 'links' in response: + return response['links'] + else: + raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + self._handle_error(response, 'map') + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. @@ -257,15 +331,15 @@ class FirecrawlApp: return response return response - def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any: + def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int, check_url: Optional[str] = None) -> Any: """ Monitor the status of a crawl job until completion. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. headers (Dict[str, str]): The headers to include in the status check requests. poll_interval (int): Secounds between status checks. - + check_url (Optional[str]): The URL to check for the crawl job. Returns: Any: The crawl results if the job is completed successfully. @@ -273,15 +347,30 @@ class FirecrawlApp: Exception: If the job fails or an error occurs during status checks. 
""" while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + api_url = '' + if (self.version == 'v0'): + if check_url: + api_url = check_url + else: + api_url = f'{self.api_url}/v0/crawl/status/{id}' + else: + if check_url: + api_url = check_url + else: + api_url = f'{self.api_url}/v1/crawl/{id}' + + status_response = self._get_request(api_url, headers) if status_response.status_code == 200: status_data = status_response.json() if status_data['status'] == 'completed': if 'data' in status_data: - return status_data['data'] + if self.version == 'v0': + return status_data['data'] + else: + return status_data else: raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: + elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: poll_interval=max(poll_interval,2) time.sleep(poll_interval) # Wait for the specified interval before checking again else: @@ -300,18 +389,19 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. """ - error_message = response.json().get('error', 'No additional error details provided.') + error_message = response.json().get('error', 'No error message provided.') + error_details = response.json().get('details', 'No additional error details provided.') if response.status_code == 402: - message = f"Payment Required: Failed to {action}. {error_message}" + message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" elif response.status_code == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}" + message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" elif response.status_code == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message}" + message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" elif response.status_code == 500: - message = f"Internal Server Error: Failed to {action}. {error_message}" + message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" else: - message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}" + message = f"Unexpected error during {action}: Status code {response.status_code}. 
{error_message} - {error_details}" # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO deleted file mode 100644 index 288eb7a5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO +++ /dev/null @@ -1,179 +0,0 @@ -Metadata-Version: 2.1 -Name: firecrawl-py -Version: 0.0.12 -Summary: Python SDK for Firecrawl API -Home-page: https://github.com/mendableai/firecrawl -Author: Mendable.ai -Author-email: nick@mendable.ai -License: GNU General Public License v3 (GPLv3) -Project-URL: Documentation, https://docs.firecrawl.dev -Project-URL: Source, https://github.com/mendableai/firecrawl -Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues -Keywords: SDK API firecrawl -Classifier: Development Status :: 5 - Production/Stable -Classifier: Environment :: Web Environment -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) -Classifier: Natural Language :: English -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.8 -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Topic :: Internet -Classifier: Topic :: Internet :: WWW/HTTP -Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search -Classifier: Topic :: Software Development -Classifier: Topic :: Software Development :: Libraries -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Classifier: Topic :: Text Processing -Classifier: Topic :: Text Processing :: Indexing -Requires-Python: >=3.8 -Description-Content-Type: text/markdown - -# Firecrawl Python SDK - -The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. - -## Installation - -To install the Firecrawl Python SDK, you can use pip: - -```bash -pip install firecrawl-py -``` - -## Usage - -1. Get an API key from [firecrawl.dev](https://firecrawl.dev) -2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - - -Here's an example of how to use the SDK: - -```python -from firecrawl import FirecrawlApp - -# Initialize the FirecrawlApp with your API key -app = FirecrawlApp(api_key='your_api_key') - -# Scrape a single URL -url = 'https://mendable.ai' -scraped_data = app.scrape_url(url) - -# Crawl a website -crawl_url = 'https://mendable.ai' -params = { - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params) -``` - -### Scraping a URL - -To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary. - -```python -url = 'https://example.com' -scraped_data = app.scrape_url(url) -``` -### Extracting structured data from a URL - -With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. 
Here is how you to use it: - -```python -class ArticleSchema(BaseModel): - title: str - points: int - by: str - commentsURL: str - -class TopArticlesSchema(BaseModel): - top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") - -data = app.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': TopArticlesSchema.model_json_schema(), - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True - } -}) -print(data["llm_extraction"]) -``` - -### Search for a query - -Used to search the web, get the most relevant results, scrap each page and return the markdown. - -```python -query = 'what is mendable?' -search_result = app.search(query) -``` - -### Crawling a Website - -To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. - -The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. - -```python -crawl_url = 'https://example.com' -params = { - 'crawlerOptions': { - 'excludes': ['blog/*'], - 'includes': [], # leave empty for all pages - 'limit': 1000, - }, - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) -``` - -If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. - -### Checking Crawl Status - -To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. - -```python -job_id = crawl_result['jobId'] -status = app.check_crawl_status(job_id) -``` - -## Error Handling - -The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. - -## Running the Tests with Pytest - -To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling. - -### Running the Tests - -To run the tests, execute the following commands: - -Install pytest: -```bash -pip install pytest -``` - -Run: -```bash -pytest firecrawl/__tests__/e2e_withAuth/test.py -``` - - -## Contributing - -Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. - -## License - -The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 
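For readers tracking the SDK surface that `firecrawl.py` now defaults to, here is a minimal sketch of the v1 calls introduced in this patch. It assumes a real key in place of `fc-YOUR_API_KEY` and a reachable API; the target URLs, the `limit` value, and the chosen formats are illustrative, taken from the tests above rather than mandated.

```python
from firecrawl.firecrawl import FirecrawlApp

# version defaults to 'v1'; pass version='v0' to keep the legacy endpoints.
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

# v1 scrape: request output formats directly instead of pageOptions/extractorOptions.
scrape = app.scrape_url('https://docs.firecrawl.dev', {'formats': ['markdown', 'html']})
print(scrape['markdown'][:200])

# v1 map: returns the discovered links as a flat list (raises NotImplementedError on v0).
links = app.map_url('https://docs.firecrawl.dev')
print(f"mapped {len(links)} links")

# v1 crawl: starting without waiting (third positional arg False) returns {'id': ...};
# poll it yourself with check_crawl_status, which now reports total/creditsUsed/next.
job = app.crawl_url('https://docs.firecrawl.dev', {'limit': 10}, False)
status = app.check_crawl_status(job['id'])
print(status['status'], status.get('creditsUsed'))
```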
diff --git a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt b/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt deleted file mode 100644 index c25567c5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt +++ /dev/null @@ -1,9 +0,0 @@ -README.md -setup.py -firecrawl/__init__.py -firecrawl/firecrawl.py -firecrawl_py.egg-info/PKG-INFO -firecrawl_py.egg-info/SOURCES.txt -firecrawl_py.egg-info/dependency_links.txt -firecrawl_py.egg-info/requires.txt -firecrawl_py.egg-info/top_level.txt \ No newline at end of file diff --git a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt b/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/apps/python-sdk/firecrawl_py.egg-info/requires.txt b/apps/python-sdk/firecrawl_py.egg-info/requires.txt deleted file mode 100644 index c8d341f5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/requires.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests -pytest -python-dotenv diff --git a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt b/apps/python-sdk/firecrawl_py.egg-info/top_level.txt deleted file mode 100644 index 8bce1a1f..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -firecrawl diff --git a/docker-compose.yaml b/docker-compose.yaml index 8c160f4a..24b51762 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -15,7 +15,6 @@ x-common-service: &common-service - OPENAI_BASE_URL=${OPENAI_BASE_URL} - MODEL_NAME=${MODEL_NAME:-gpt-4o} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} - - SERPER_API_KEY=${SERPER_API_KEY} - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} - LOGTAIL_KEY=${LOGTAIL_KEY} - BULL_AUTH_KEY=${BULL_AUTH_KEY} diff --git a/examples/kubernetes/cluster-install/secret.yaml b/examples/kubernetes/cluster-install/secret.yaml index 2be96320..6d8eed3b 100644 --- a/examples/kubernetes/cluster-install/secret.yaml +++ b/examples/kubernetes/cluster-install/secret.yaml @@ -6,7 +6,6 @@ type: Opaque data: OPENAI_API_KEY: "" SLACK_WEBHOOK_URL: "" - SERPER_API_KEY: "" LLAMAPARSE_API_KEY: "" LOGTAIL_KEY: "" BULL_AUTH_KEY: ""
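The pre-1.0 behaviour exercised by the retained v0 test suite stays reachable by pinning the version explicitly. A sketch of that compatibility path, again assuming a real key; the crawl options mirror the ones used in the v0 tests:

```python
from firecrawl.firecrawl import FirecrawlApp

# Pin v0 to keep jobId-based crawls and the search endpoint.
app_v0 = FirecrawlApp(api_key="fc-YOUR_API_KEY", version='v0')  # placeholder key

# A v0 crawl started without waiting returns {'jobId': ...} and is polled
# at /v0/crawl/status/{jobId} via check_crawl_status.
job = app_v0.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
status = app_v0.check_crawl_status(job['jobId'])
print(status['status'])

# search() is only wired up for v0; on a v1 client it raises NotImplementedError.
results = app_v0.search("test query")
print(len(results))
```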