diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index bb47b47f..bbc1e098 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,7 +1,7 @@ --- name: Bug report about: Create a report to help us improve -title: "[BUG]" +title: "[Bug] " labels: bug assignees: '' diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index b01699b7..6760afa8 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,7 +1,7 @@ --- name: Feature request about: Suggest an idea for this project -title: "[Feat]" +title: "[Feat] " labels: '' assignees: '' diff --git a/.github/ISSUE_TEMPLATE/self_host_issue.md b/.github/ISSUE_TEMPLATE/self_host_issue.md new file mode 100644 index 00000000..73a0ef9d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/self_host_issue.md @@ -0,0 +1,40 @@ +--- +name: Self-host issue +about: Report an issue with self-hosting Firecrawl +title: "[Self-Host] " +labels: self-host +assignees: '' + +--- + +**Describe the Issue** +Provide a clear and concise description of the self-hosting issue you're experiencing. + +**To Reproduce** +Steps to reproduce the issue: +1. Configure the environment or settings with '...' +2. Run the command '...' +3. Observe the error or unexpected output at '...' +4. Log output/error message + +**Expected Behavior** +A clear and concise description of what you expected to happen when self-hosting. + +**Screenshots** +If applicable, add screenshots or copies of the command line output to help explain the self-hosting issue. + +**Environment (please complete the following information):** +- OS: [e.g. macOS, Linux, Windows] +- Firecrawl Version: [e.g. 1.2.3] +- Node.js Version: [e.g. 14.x] +- Docker Version (if applicable): [e.g. 20.10.14] +- Database Type and Version: [e.g. PostgreSQL 13.4] + +**Logs** +If applicable, include detailed logs to help understand the self-hosting problem. + +**Configuration** +Provide relevant parts of your configuration files (with sensitive information redacted). + +**Additional Context** +Add any other context about the self-hosting issue here, such as specific infrastructure details, network setup, or any modifications made to the original Firecrawl setup. 
diff --git a/.github/archive/publish-rust-sdk.yml b/.github/archive/publish-rust-sdk.yml new file mode 100644 index 00000000..9856bd77 --- /dev/null +++ b/.github/archive/publish-rust-sdk.yml @@ -0,0 +1,42 @@ +name: Publish Rust SDK + +on: [] + +env: + CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} + +jobs: + build-and-publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + default: true + profile: minimal + + - name: Install dependencies + run: cargo build --release + + - name: Run version check script + id: version_check_script + run: | + VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name) + echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV + + - name: Build the package + if: ${{ env.VERSION_INCREMENTED == 'true' }} + run: cargo package + working-directory: ./apps/rust-sdk + + - name: Publish to crates.io + if: ${{ env.VERSION_INCREMENTED == 'true' }} + env: + CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} + run: cargo publish + working-directory: ./apps/rust-sdk \ No newline at end of file diff --git a/.github/archive/rust-sdk.yml b/.github/archive/rust-sdk.yml new file mode 100644 index 00000000..62deeaab --- /dev/null +++ b/.github/archive/rust-sdk.yml @@ -0,0 +1,61 @@ +name: Run Rust SDK E2E Tests + +on: [] + +env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + HOST: ${{ secrets.HOST }} + LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} + LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} + POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} + POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} + NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} + PORT: ${{ secrets.PORT }} + REDIS_URL: ${{ secrets.REDIS_URL }} + SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} + SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} + SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} + HDX_NODE_BETA_MODE: 1 + + +jobs: + build: + runs-on: ubuntu-latest + services: + redis: + image: redis + ports: + - 6379:6379 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Install pnpm + run: npm install -g pnpm + - name: Install dependencies for API + run: pnpm install + working-directory: ./apps/api + - name: Start the application + run: npm start & + working-directory: ./apps/api + id: start_app + - name: Start workers + run: npm run workers & + working-directory: ./apps/api + id: start_workers + - name: Set up Rust + uses: actions/setup-rust@v1 + with: + rust-version: stable + - name: Try the lib build + working-directory: ./apps/rust-sdk + run: cargo build + - name: Run E2E tests for Rust SDK + run: cargo test --test e2e_with_auth diff --git a/.github/scripts/check_version_has_incremented.py b/.github/scripts/check_version_has_incremented.py index e437c934..6dba065f 100644 --- a/.github/scripts/check_version_has_incremented.py +++ b/.github/scripts/check_version_has_incremented.py @@ -15,6 +15,7 @@ false """ import json +import toml import os import re import sys @@ -53,6 +54,19 @@ def get_npm_version(package_name: str) -> str: version = 
response.json()['version'] return version.strip() +def get_rust_version(file_path: str) -> str: + """Extract version string from Cargo.toml.""" + cargo_toml = toml.load(file_path) + if 'package' in cargo_toml and 'version' in cargo_toml['package']: + return cargo_toml['package']['version'].strip() + raise RuntimeError("Unable to find version string in Cargo.toml.") + +def get_crates_version(package_name: str) -> str: + """Get latest version of Rust package from crates.io.""" + response = requests.get(f"https://crates.io/api/v1/crates/{package_name}") + version = response.json()['crate']['newest_version'] + return version.strip() + def is_version_incremented(local_version: str, published_version: str) -> bool: """Compare local and published versions.""" local_version_parsed: Version = parse_version(local_version) @@ -74,6 +88,12 @@ if __name__ == "__main__": current_version = get_js_version(os.path.join(package_path, 'package.json')) # Get published version from npm published_version = get_npm_version(package_name) + if package_type == "rust": + # Get current version from Cargo.toml + current_version = get_rust_version(os.path.join(package_path, 'Cargo.toml')) + # Get published version from crates.io + published_version = get_crates_version(package_name) + else: raise ValueError("Invalid package type. Use 'python' or 'js'.") diff --git a/.github/scripts/requirements.txt b/.github/scripts/requirements.txt index 0bfc6762..60f8e191 100644 --- a/.github/scripts/requirements.txt +++ b/.github/scripts/requirements.txt @@ -1,2 +1,3 @@ requests -packaging \ No newline at end of file +packaging +toml \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2e42e4a..8a9a74cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,8 @@ env: HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} - + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} + ENV: ${{ secrets.ENV }} jobs: pre-deploy: diff --git a/.github/workflows/fly-direct.yml b/.github/workflows/fly-direct.yml index d395ff31..2473642c 100644 --- a/.github/workflows/fly-direct.yml +++ b/.github/workflows/fly-direct.yml @@ -22,12 +22,19 @@ env: SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }} + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} + ENV: ${{ secrets.ENV }} jobs: deploy: name: Deploy app runs-on: ubuntu-latest + timeout-minutes: 15 steps: - uses: actions/checkout@v3 - uses: superfly/flyctl-actions/setup-flyctl@master diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 5b1b9f69..7b45921a 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -26,7 +26,10 @@ env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} + ENV: ${{ secrets.ENV }} jobs: pre-deploy-e2e-tests: @@ -56,6 +59,9 @@ jobs: run: npm run workers & working-directory: ./apps/api id: start_workers + - name: Wait for 
the application to be ready + run: | + sleep 10 - name: Run E2E tests run: | npm run test:prod @@ -132,7 +138,7 @@ jobs: working-directory: ./apps/python-sdk - name: Run E2E tests for Python SDK run: | - pytest firecrawl/__tests__/e2e_withAuth/test.py + pytest firecrawl/__tests__/v1/e2e_withAuth/test.py working-directory: ./apps/python-sdk js-sdk-tests: @@ -169,7 +175,7 @@ jobs: - name: Run E2E tests for JavaScript SDK run: npm run test working-directory: ./apps/js-sdk/firecrawl - + go-sdk-tests: name: Go SDK Tests needs: pre-deploy-e2e-tests @@ -205,10 +211,45 @@ jobs: run: go test -v ./... -timeout 180s working-directory: ./apps/go-sdk/firecrawl + rust-sdk-tests: + name: Rust SDK Tests + needs: pre-deploy-e2e-tests + runs-on: ubuntu-latest + services: + redis: + image: redis + ports: + - 6379:6379 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Install pnpm + run: npm install -g pnpm + - name: Install dependencies for API + run: pnpm install + working-directory: ./apps/api + - name: Start the application + run: npm start & + working-directory: ./apps/api + id: start_app + - name: Start workers + run: npm run workers & + working-directory: ./apps/api + id: start_workers + - name: Set up Rust + uses: actions/setup-rust@v1 + with: + rust-version: stable + - name: Try the lib build + working-directory: ./apps/rust-sdk + run: cargo build + - name: Run E2E tests for Rust SDK + run: cargo test --test e2e_with_auth + deploy: name: Deploy app runs-on: ubuntu-latest - needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests] + needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests, rust-sdk-tests] steps: - uses: actions/checkout@v3 - uses: superfly/flyctl-actions/setup-flyctl@master @@ -285,7 +326,7 @@ jobs: - name: Install dependencies for JavaScript SDK run: pnpm install working-directory: ./apps/js-sdk/firecrawl - + - name: Run version check script id: version_check_script run: | @@ -299,4 +340,39 @@ jobs: run: | npm run build-and-publish working-directory: ./apps/js-sdk/firecrawl - \ No newline at end of file + build-and-publish-rust-sdk: + name: Build and publish Rust SDK + runs-on: ubuntu-latest + needs: deploy + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + default: true + profile: minimal + + - name: Install dependencies + run: cargo build --release + + - name: Run version check script + id: version_check_script + run: | + VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name) + echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV + + - name: Build the package + if: ${{ env.VERSION_INCREMENTED == 'true' }} + run: cargo package + working-directory: ./apps/rust-sdk + + - name: Publish to crates.io + if: ${{ env.VERSION_INCREMENTED == 'true' }} + env: + CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} + run: cargo publish + working-directory: ./apps/rust-sdk \ No newline at end of file diff --git a/.gitignore b/.gitignore index 367f28a7..42be56cf 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,10 @@ apps/test-suite/load-test-results/test-run-report.json apps/playwright-service-ts/node_modules/ apps/playwright-service-ts/package-lock.json -/examples/o1_web_crawler /venv + +/examples/o1_web_crawler/venv +*.pyc +.rdb + +apps/js-sdk/firecrawl/dist + diff --git a/.gitmodules b/.gitmodules index d56adf88..b42c5d23 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "apps/go-sdk/firecrawl"] - 
path = apps/go-sdk/firecrawl +[submodule "apps/go-sdk/firecrawl-go"] + path = apps/go-sdk/firecrawl-go url = https://github.com/mendableai/firecrawl-go -[submodule "apps/go-sdk/examples"] - path = apps/go-sdk/examples +[submodule "apps/go-sdk/firecrawl-go-examples"] + path = apps/go-sdk/firecrawl-go-examples url = https://github.com/mendableai/firecrawl-go-examples diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cece879b..d0145a6b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -44,7 +44,6 @@ BULL_AUTH_KEY= @ LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs -SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs POSTHOG_HOST= # set if you'd like to send posthog events like job logs diff --git a/README.md b/README.md index 01324690..96878ea2 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,37 @@ +
+ [README header badges: License, Downloads, GitHub Contributors, Visit firecrawl.dev, Follow on X, Follow on LinkedIn, Join our Discord — badge markup omitted]
+ # 🔥 Firecrawl Crawl and convert any website into LLM-ready markdown or structured data. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the Firecrawl community. Includes powerful scraping, crawling and data extraction capabilities. @@ -6,11 +40,13 @@ _This repository is in its early development stages. We are still merging custom ## What is Firecrawl? -[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. +[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev). _Pst. hey, you, join our stargazers :)_ - + + GitHub stars + ## How to use it? @@ -41,18 +77,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl. ```bash -curl -X POST https://api.firecrawl.dev/v0/crawl \ +curl -X POST https://api.firecrawl.dev/v1/crawl \ -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer YOUR_API_KEY' \ + -H 'Authorization: Bearer fc-YOUR_API_KEY' \ -d '{ - "url": "https://mendable.ai" + "url": "https://docs.firecrawl.dev", + "limit": 100, + "scrapeOptions": { + "formats": ["markdown", "html"] + } }' ``` -Returns a jobId +Returns a crawl job id and the url to check the status of the crawl. ```json -{ "jobId": "1234-5678-9101" } +{ + "success": true, + "id": "123-456-789", + "url": "https://api.firecrawl.dev/v1/crawl/123-456-789" +} ``` ### Check Crawl Job @@ -60,7 +104,7 @@ Returns a jobId Used to check the status of a crawl job and get its result. ```bash -curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ +curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' ``` @@ -68,18 +112,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ ```json { "status": "completed", - "current": 22, - "total": 22, + "total": 36, + "creditsUsed": 36, + "expiresAt": "2024-00-00T00:00:00.000Z", "data": [ { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", + "markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...", + "html": "...", "metadata": { - "title": "Mendable | AI for CX and Sales", - "description": "AI for CX and Sales", - "language": null, - "sourceURL": "https://www.mendable.ai/" + "title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl", + "language": "en", + "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3", + "description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.", + "ogLocaleAlternate": [], + "statusCode": 200 } } ] @@ -88,14 +134,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ ### Scraping -Used to scrape a URL and get its content. +Used to scrape a URL and get its content in the specified formats. 
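For SDK users, the same kind of request can be made from Python — a minimal sketch, assuming the v1 `firecrawl-py` interface in which `FirecrawlApp.scrape_url` accepts a `params` dict with a `formats` list (verify against the published SDK before relying on it):

```python
# Illustrative sketch only — assumes the v1 firecrawl-py interface described above;
# not part of this diff. Install with: pip install firecrawl-py
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Mirror the cURL example below: scrape one URL and request markdown plus HTML.
result = app.scrape_url(
    "https://docs.firecrawl.dev",
    params={"formats": ["markdown", "html"]},
)

print(result["metadata"]["title"])  # page metadata (title, sourceURL, statusCode, ...)
print(result["markdown"][:300])     # first few hundred characters of the markdown
```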
```bash -curl -X POST https://api.firecrawl.dev/v0/scrape \ +curl -X POST https://api.firecrawl.dev/v1/scrape \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' \ -d '{ - "url": "https://mendable.ai" + "url": "https://docs.firecrawl.dev", + "formats" : ["markdown", "html"] }' ``` @@ -105,68 +152,95 @@ Response: { "success": true, "data": { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", + "markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...", + "html": " ": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + + } + }, + "llm_extraction": { + "type": "object", + "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", + "nullable": true + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." + } + } + } + } + }, + "CrawlStatusResponseObj": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + }, + "SearchResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } + } + } + } + } + } + } + }, + "CrawlResponse": { + "type": "object", + "properties": { + "jobId": { + "type": "string" + } + } + } + } + }, + "security": [ + { + "bearerAuth": [] + } + ] +} \ No newline at end of file diff --git a/apps/api/openapi.json b/apps/api/openapi.json index fb0c4305..5bd3e3d8 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -18,8 +18,8 @@ "paths": { "/scrape": { "post": { - "summary": "Scrape a single URL and optionally extract information using an LLM", - "operationId": "scrapeAndExtractFromUrl", + "summary": "Scrape a single URL", + "operationId": "scrape", "tags": ["Scraping"], "security": [ { @@ -38,94 +38,47 @@ "format": "uri", "description": "The URL to scrape" }, - "pageOptions": { - "type": "object", - "properties": { - "headers": { - "type": "object", - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." - }, - "includeHtml": { - "type": "boolean", - "description": "Include the HTML version of the content on page. Will output a html key in the response.", - "default": false - }, - "includeRawHtml": { - "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", - "default": false - }, - "onlyIncludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false - }, - "removeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" - }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, - "screenshot": { - "type": "boolean", - "description": "Include a screenshot of the top of the page that you are scraping.", - "default": false - }, - "fullPageScreenshot": { - "type": "boolean", - "description": "Include a full page screenshot of the page that you are scraping.", - "default": false - }, - "waitFor": { - "type": "integer", - "description": "Wait x amount of milliseconds for the page to load to fetch content", - "default": 0 - } - } + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"] + }, + "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)", + "default": ["markdown"] }, - "extractorOptions": { + "headers": { "type": "object", - "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", - "default": {}, - "properties": { - "mode": { - "type": "string", - "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"], - "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM." - }, - "extractionPrompt": { - "type": "string", - "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes." - }, - "extractionSchema": { - "type": "object", - "additionalProperties": true, - "description": "The schema for the data to be extracted, required only for LLM extraction modes.", - "required": [ - "company_mission", - "supports_sso", - "is_open_source" - ] - } - } + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. 
Example: 'script, .ad, #footer'" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true }, "timeout": { "type": "integer", "description": "Timeout in milliseconds for the request", "default": 30000 + }, + "waitFor": { + "type": "integer", + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "default": 0 } }, "required": ["url"] @@ -741,24 +694,42 @@ "success": { "type": "boolean" }, + "warning": { + "type": "string", + "nullable": true, + "description": "Warning message to let you know of any issues." + }, "data": { "type": "object", "properties": { "markdown": { - "type": "string" - }, - "content": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if `includeHtml` is true" + "description": "HTML version of the content on page if the `html` format was specified" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeRawHtml` is true" + "description": "Raw HTML content of the page if the `rawHtml` format was specified" + }, + "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" }, "metadata": { "type": "object", @@ -780,27 +751,16 @@ " ": { "type": "string" }, - "pageStatusCode": { + "statusCode": { "type": "integer", "description": "The status code of the page" }, - "pageError": { + "error": { "type": "string", "nullable": true, "description": "The error message of the page" } - } - }, - "llm_extraction": { - "type": "object", - "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", - "nullable": true - }, - "warning": { - "type": "string", - "nullable": true, - "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." } } } @@ -810,24 +770,33 @@ "type": "object", "properties": { "markdown": { - "type": "string" - }, - "content": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" }, "html": { "type": "string", "nullable": true, - "description": "HTML version of the content on page if `includeHtml` is true" + "description": "HTML version of the content on page if the `html` format was specified" }, "rawHtml": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeRawHtml` is true" + "description": "Raw HTML content of the page if the `rawHtml` format was specified" }, - "index": { - "type": "integer", - "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." 
+ "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" }, "metadata": { "type": "object", @@ -849,11 +818,11 @@ " ": { "type": "string" }, - "pageStatusCode": { + "statusCode": { "type": "integer", "description": "The status code of the page" }, - "pageError": { + "error": { "type": "string", "nullable": true, "description": "The error message of the page" @@ -871,34 +840,63 @@ "data": { "type": "array", "items": { - "type": "object", - "properties": { - "url": { - "type": "string" + "markdown": { + "type": "string", + "nullable": true, + "description": "Markdown content of the page if the `markdown` format was specified (default)" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if the `html` format was specified" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if the `rawHtml` format was specified" + }, + "links": { + "type": "array", + "items": { + "type": "string", + "format": "uri" }, - "markdown": { - "type": "string" - }, - "content": { - "type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" - } + "nullable": true, + "description": "Links on the page if the `links` format was specified" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } } @@ -909,8 +907,15 @@ "CrawlResponse": { "type": "object", "properties": { - "jobId": { + "success": { + "type": "boolean" + }, + "id": { "type": "string" + }, + "url": { + "type": "string", + "format": "uri" } } } diff --git a/apps/api/package.json b/apps/api/package.json index c9058943..dc26b34b 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -61,6 +61,8 @@ "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", "@supabase/supabase-js": "^2.44.2", + "@types/express-ws": "^3.0.4", + "@types/ws": "^8.5.12", "ajv": "^8.16.0", "async": "^3.2.5", "async-mutex": "^0.5.0", @@ -76,6 +78,7 @@ "dotenv": "^16.3.1", "dotenv-cli": "^7.4.2", "express-rate-limit": "^7.3.1", + "express-ws": "^5.0.2", "form-data": "^4.0.0", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", @@ -83,6 +86,7 @@ "joplin-turndown-plugin-gfm": "^1.0.12", "json-schema-to-zod": "^2.3.0", "keyword-extractor": "^0.0.28", + "koffi": "^2.9.0", "langchain": "^0.2.8", "languagedetect": "^2.0.0", "logsnag": "^1.0.0", @@ -91,7 +95,7 @@ "moment": "^2.29.4", "mongoose": "^8.4.4", "natural": "^7.0.7", - "openai": "^4.52.2", + "openai": 
"^4.57.0", "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", @@ -110,8 +114,9 @@ "unstructured-client": "^0.11.3", "uuid": "^10.0.0", "wordpos": "^2.1.0", + "ws": "^8.18.0", "xml2js": "^0.6.2", - "zod": "^3.23.4", + "zod": "^3.23.8", "zod-to-json-schema": "^3.23.1" }, "nodemonConfig": { diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index efbe9d80..b8f876a8 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -47,6 +47,12 @@ importers: '@supabase/supabase-js': specifier: ^2.44.2 version: 2.44.2 + '@types/express-ws': + specifier: ^3.0.4 + version: 3.0.4 + '@types/ws': + specifier: ^8.5.12 + version: 8.5.12 ajv: specifier: ^8.16.0 version: 8.16.0 @@ -92,6 +98,9 @@ importers: express-rate-limit: specifier: ^7.3.1 version: 7.3.1(express@4.19.2) + express-ws: + specifier: ^5.0.2 + version: 5.0.2(express@4.19.2) form-data: specifier: ^4.0.0 version: 4.0.0 @@ -113,9 +122,12 @@ importers: keyword-extractor: specifier: ^0.0.28 version: 0.0.28 + koffi: + specifier: ^2.9.0 + version: 2.9.0 langchain: specifier: ^0.2.8 - version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1) + version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -138,8 +150,8 @@ importers: specifier: ^7.0.7 version: 7.0.7(socks@2.8.3) openai: - specifier: ^4.52.2 - version: 4.52.2 + specifier: ^4.57.0 + version: 4.57.0(zod@3.23.8) pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -194,11 +206,14 @@ importers: wordpos: specifier: ^2.1.0 version: 2.1.0 + ws: + specifier: ^8.18.0 + version: 8.18.0 xml2js: specifier: ^0.6.2 version: 0.6.2 zod: - specifier: ^3.23.4 + specifier: ^3.23.8 version: 3.23.8 zod-to-json-schema: specifier: ^3.23.1 @@ -1637,6 +1652,9 @@ packages: '@types/express-serve-static-core@4.19.3': resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==} + '@types/express-ws@3.0.4': + resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==} + '@types/express@4.17.21': resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==} @@ -1739,8 +1757,8 @@ packages: '@types/whatwg-url@11.0.5': resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==} - '@types/ws@8.5.10': - resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==} + '@types/ws@8.5.12': + resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==} '@types/yargs-parser@21.0.3': resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==} @@ -2506,6 +2524,12 @@ packages: peerDependencies: express: 4 || 5 || ^5.0.0-beta.1 + express-ws@5.0.2: + resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==} + engines: {node: '>=4.5.0'} + peerDependencies: + 
express: ^4.0.0 || ^5.0.0-alpha.1 + express@4.19.2: resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==} engines: {node: '>= 0.10.0'} @@ -3149,6 +3173,9 @@ packages: resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==} engines: {node: '>=6'} + koffi@2.9.0: + resolution: {integrity: sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==} + langchain@0.2.8: resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==} engines: {node: '>=18'} @@ -3712,9 +3739,14 @@ packages: openai@3.3.0: resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==} - openai@4.52.2: - resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==} + openai@4.57.0: + resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==} hasBin: true + peerDependencies: + zod: ^3.23.8 + peerDependenciesMeta: + zod: + optional: true openapi-types@12.1.3: resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} @@ -4647,8 +4679,20 @@ packages: resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} - ws@8.17.1: - resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==} + ws@7.5.10: + resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==} + engines: {node: '>=8.3.0'} + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: ^5.0.2 + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + + ws@8.18.0: + resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==} engines: {node: '>=10.0.0'} peerDependencies: bufferutil: ^4.0.1 @@ -5286,13 +5330,13 @@ snapshots: '@js-sdsl/ordered-map@4.4.2': {} - '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)': + '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': dependencies: ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.12 - langsmith: 
0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) ml-distance: 4.0.1 mustache: 4.2.0 p-queue: 6.6.2 @@ -5304,20 +5348,20 @@ snapshots: - langchain - openai - '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))': + '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) js-tiktoken: 1.0.12 - openai: 4.52.2 + openai: 4.57.0(zod@3.23.8) zod: 3.23.8 zod-to-json-schema: 3.23.1(zod@3.23.8) transitivePeerDependencies: - encoding - langchain - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)': + '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': dependencies: - '@langchain/core': 
0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) js-tiktoken: 1.0.12 transitivePeerDependencies: - langchain @@ -6545,8 +6589,8 @@ snapshots: dependencies: '@supabase/node-fetch': 2.6.15 '@types/phoenix': 1.6.5 - '@types/ws': 8.5.10 - ws: 8.17.1 + '@types/ws': 8.5.12 + ws: 8.18.0 transitivePeerDependencies: - bufferutil - utf-8-validate @@ -6643,6 +6687,12 @@ snapshots: '@types/range-parser': 1.2.7 '@types/send': 0.17.4 + '@types/express-ws@3.0.4': + dependencies: + '@types/express': 4.17.21 + '@types/express-serve-static-core': 4.19.3 + '@types/ws': 8.5.12 + '@types/express@4.17.21': dependencies: '@types/body-parser': 1.19.5 @@ -6766,7 +6816,7 @@ snapshots: dependencies: '@types/webidl-conversions': 7.0.3 - '@types/ws@8.5.10': + '@types/ws@8.5.12': dependencies: '@types/node': 20.14.1 @@ -7521,6 +7571,14 @@ snapshots: dependencies: express: 4.19.2 + express-ws@5.0.2(express@4.19.2): + dependencies: + express: 4.19.2 + ws: 7.5.10 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + express@4.19.2: dependencies: accepts: 1.3.8 @@ -8440,17 +8498,19 @@ snapshots: kleur@3.0.3: {} - langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1): + koffi@2.9.0: {} + + langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) - '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) 
+ '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) + '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) binary-extensions: 2.3.0 js-tiktoken: 1.0.12 js-yaml: 4.1.0 jsonpointer: 5.0.1 langchainhub: 0.0.11 - langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) + langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -8470,14 +8530,14 @@ snapshots: pdf-parse: 1.1.1 puppeteer: 22.12.1(typescript@5.4.5) redis: 4.6.14 - ws: 8.17.1 + ws: 8.18.0 transitivePeerDependencies: - encoding - openai langchainhub@0.0.11: {} - langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2): + 
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)): dependencies: '@types/uuid': 9.0.8 commander: 10.0.1 @@ -8486,9 +8546,9 @@ snapshots: p-retry: 4.6.2 uuid: 9.0.1 optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) - langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1) - openai: 4.52.2 + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + openai: 4.57.0(zod@3.23.8) languagedetect@2.0.0: {} @@ -8881,16 +8941,19 @@ snapshots: transitivePeerDependencies: - debug - openai@4.52.2: + openai@4.57.0(zod@3.23.8): dependencies: '@types/node': 18.19.39 '@types/node-fetch': 2.6.11 + '@types/qs': 6.9.15 abort-controller: 3.0.0 agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 node-fetch: 2.7.0 - web-streams-polyfill: 3.3.3 + qs: 6.12.2 + optionalDependencies: + zod: 3.23.8 transitivePeerDependencies: - encoding @@ -9195,7 +9258,7 @@ snapshots: chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070) debug: 4.3.5 devtools-protocol: 0.0.1299070 - ws: 8.17.1 + ws: 8.18.0 transitivePeerDependencies: - bufferutil - supports-color @@ -9877,7 +9940,9 @@ snapshots: imurmurhash: 0.1.4 signal-exit: 4.1.0 - ws@8.17.1: {} + ws@7.5.10: {} + + ws@8.18.0: {} xml2js@0.6.2: dependencies: diff --git a/apps/api/requests.http b/apps/api/requests.http index 3a1a9902..3e7bd2b7 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,12 +1,16 @@ ### Crawl Website POST http://localhost:3002/v0/scrape HTTP/1.1 -Authorization: Bearer fc +Authorization: Bearer fc- content-type: application/json { - "url":"firecrawl.dev" + "url":"corterix.com" } +### Check Job Status +GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 +Authorization: Bearer fc- + ### Check Job Status GET http://localhost:3002/v0/jobs/active HTTP/1.1 diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts 
b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 019bc968..b1708abc 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); + .send({ url: 'https://docs.firecrawl.dev' }); expect(firstResponse.statusCode).toBe(200); @@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); + .send({ url: 'https://docs.firecrawl.dev' }); expect(secondResponse.statusCode).toBe(409); expect(secondResponse.body.error).toBe('Idempotency key already used'); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts new file mode 100644 index 00000000..8aabf748 --- /dev/null +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -0,0 +1,961 @@ +import request from "supertest"; +import { configDotenv } from "dotenv"; +import { + ScrapeRequest, + ScrapeResponseRequestTest, +} from "../../controllers/v1/types"; + +configDotenv(); +const TEST_URL = "http://127.0.0.1:3002"; + +describe("E2E Tests for v1 API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; + }); + + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; + }); + + describe("GET /is-production", () => { + it.concurrent("should return the production status", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL).get( + "/is-production" + ); + + console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION); + console.log('?', process.env.USE_DB_AUTHENTICATION === 'true'); + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + console.log('!!useDbAuthentication', !!useDbAuthentication); + console.log('!useDbAuthentication', !useDbAuthentication); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); + + describe("POST /v1/scrape", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .send({ url: "https://firecrawl.dev"}) + + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://facebook.com/fake-test", + }; + + const response = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(403); + expect(response.body.error).toBe("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.metadata.error).toBeUndefined(); + expect(response.body.data.metadata.title).toBe("Roast My Website"); + expect(response.body.data.metadata.description).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(response.body.data.metadata.keywords).toBe( + "Roast My Website,Roast,Website,GitHub,Firecrawl" + ); + expect(response.body.data.metadata.robots).toBe("follow, index"); + expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); + expect(response.body.data.metadata.ogDescription).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️" ); + expect(response.body.data.metadata.ogUrl).toBe( + "https://www.roastmywebsite.ai" + ); + expect(response.body.data.metadata.ogImage).toBe( + "https://www.roastmywebsite.ai/og.png" + ); + expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); + expect(response.body.data.metadata.sourceURL).toBe( + "https://roastmywebsite.ai" + ); + expect(response.body.data.metadata.statusCode).toBe(200); + }, + 30000 + ); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key and includeHtml set to true", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["markdown", "html"], + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.html).toContain("<h1"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 30000 + ); + + it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" + // formats: ["markdown", "html"], + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, 60000); + + it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/pdf/astro-ph/9301001" + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, 60000); + + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const scrapeRequest: ScrapeRequest = { + url:
"https://www.scrapethissite.com/", + onlyMainContent: false // default is true + }; + const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + + if (!("data" in responseWithoutRemoveTags.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav + expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer + + const scrapeRequestWithRemoveTags: ScrapeRequest = { + url: "https://www.scrapethissite.com/", + excludeTags: ['.nav', '#footer', 'strong'], + onlyMainContent: false // default is true + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequestWithRemoveTags); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); + expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // + }, 30000); + + it.concurrent('should return a successful response for a scrape with 400 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(400); + }, 60000); + + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(401); + }, 60000); + + it.concurrent('should return a successful response 
for a scrape with 403 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(403); + }, 60000); + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(404); + }, 60000); + + it.concurrent('should return a successful response for a scrape with 405 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/405' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(405); + }, 60000); + + it.concurrent('should return a successful response for a scrape with 500 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/500' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(500); + }, 60000); + + it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); + + expect(response.statusCode).toBe(408); + }, 3000); + + it.concurrent( + "should return a successful response with a valid API 
key and includeHtml set to true", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["html","rawHtml"], + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.html).toContain(" { + const scrapeRequest: ScrapeRequest = { + url: "https://ycombinator.com/companies", + formats: ["markdown"], + waitFor: 8000 + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("links"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain("PagerDuty"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + + }, + 30000 + ); + + it.concurrent( + "should return a successful response with a valid links on page", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["links"], + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("links"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.links).toContain("https://firecrawl.dev"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 30000 + ); + + + }); + +describe("POST /v1/map", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: 
"https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return a successful response with a valid API key", async () => { + const mapRequest = { + url: "https://roastmywebsite.ai" + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + }); + + it.concurrent("should return a successful response with a valid API key and search", async () => { + const mapRequest = { + url: "https://usemotion.com", + search: "pricing" + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("usemotion.com/pricing"); + }); + + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => { + const mapRequest = { + url: "https://firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); + }); + + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); 
+ + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); + }, 10000) + + it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: false + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).not.toContain("docs.firecrawl.dev"); + }) + + it.concurrent("should return an error for invalid URL", async () => { + const mapRequest = { + url: "invalid-url", + includeSubdomains: true, + search: "test", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(400); + expect(response.body).toHaveProperty("success", false); + expect(response.body).toHaveProperty("error"); + }); +}); + + +describe("POST /v1/crawl", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://facebook.com/fake-test", + }; + + const response = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(403); + expect(response.body.error).toBe("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("id"); + expect(response.body.id).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("url"); + expect(response.body.url).toContain("/v1/crawl/"); + }); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 10, + includePaths: ["blog/*"], + }); + + let response; + let isFinished = false; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url).toContain("firecrawl.dev/blog"); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0 + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 10, + excludePaths: ["blog/*"], + }); + + let isFinished = false; + 
let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request( + TEST_URL + ) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(3); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); + }); + }, + 90000 + ); // 90 seconds + + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + maxDepth: 1, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request( + TEST_URL + ) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 
1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); +}) + +describe("GET /v1/crawl/:jobId", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).get("/v1/crawl/123"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + } + ); + + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect( + completedResponse.body.data[0].metadata.error + ).toBeUndefined(); + + const childrenLinks = completedResponse.body.data.filter( + (doc) => + doc.metadata && + doc.metadata.sourceURL + ); + + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "If someone cancels a crawl job, it should turn into failed status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.tatum.io", limit: 200 }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 10000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const
completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("cancelled"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 60000 + ); // 60 seconds +}) +}); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 0771d10e..26caf63e 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,11 +1,15 @@ import request from "supertest"; import dotenv from "dotenv"; -import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types"; +import { + FirecrawlCrawlResponse, + FirecrawlCrawlStatusResponse, + FirecrawlScrapeResponse, +} from "../../types"; dotenv.config(); const TEST_URL = "http://127.0.0.1:3002"; -describe("E2E Tests for API Routes", () => { +describe("E2E Tests for v0 API Routes", () => { beforeAll(() => { process.env.USE_DB_AUTHENTICATION = "true"; }); @@ -24,273 +28,365 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL).post("/v0/scrape"); + const response: FirecrawlScrapeResponse = await request(TEST_URL).post( + "/v0/scrape" + ); expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a successful response with a valid API key", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://roastmywebsite.ai" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.metadata.pageError).toBeUndefined(); - expect(response.body.data.metadata.title).toBe("Roast My Website"); - expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool 
for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); - expect(response.body.data.metadata.robots).toBe("follow, index"); - expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); - expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai"); - expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png"); - expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); - expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); - expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai"); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - }, 30000); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://roastmywebsite.ai" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.metadata.pageError).toBeUndefined(); + expect(response.body.data.metadata.title).toBe("Roast My Website"); + expect(response.body.data.metadata.description).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(response.body.data.metadata.keywords).toBe( + "Roast My Website,Roast,Website,GitHub,Firecrawl" + ); + expect(response.body.data.metadata.robots).toBe("follow, index"); + expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); + expect(response.body.data.metadata.ogDescription).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️" + ); + expect(response.body.data.metadata.ogUrl).toBe( + "https://www.roastmywebsite.ai" + ); + expect(response.body.data.metadata.ogImage).toBe( + "https://www.roastmywebsite.ai/og.png" + ); + expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); + expect(response.body.data.metadata.sourceURL).toBe( + "https://roastmywebsite.ai" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + }, + 30000 + ); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key and includeHtml set to true", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true }, + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.html).toContain(" { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("html"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.markdown).toContain("_Roast_"); - expect(response.body.data.html).toContain(" { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); - await new Promise((r) => setTimeout(r, 6000)); + it.concurrent( + "should return a successful response for a valid scrape with PDF file", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" }); + await new Promise((r) => setTimeout(r, 6000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - expect(response.body.data.metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds - - it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { - const response: 
FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); - await new Promise((r) => setTimeout(r, 6000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - expect(response.body.data.metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a valid scrape with PDF file without explicit .pdf extension", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" }); + await new Promise((r) => setTimeout(r, 6000)); - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { - const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com/" }); - expect(responseWithoutRemoveTags.statusCode).toBe(200); - expect(responseWithoutRemoveTags.body).toHaveProperty("data"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); - expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); - expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); - expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer - expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav - expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, + 60000 + ); // 60 seconds - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - 
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.content).toContain("Scrape This Site"); - expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer - expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav - expect(response.body.data.content).not.toContain("web scraping"); // strong - }, 30000); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key with removeTags option", + async () => { + const responseWithoutRemoveTags: FirecrawlScrapeResponse = + await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain( + "Scrape This Site" + ); + expect(responseWithoutRemoveTags.body.data.content).toContain( + "Lessons and Videos" + ); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain( + "[Sandbox](" + ); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain( + "web scraping" + ); // strong - it.concurrent('should return a successful response for a scrape with 400 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/400' }); - await new Promise((r) => setTimeout(r, 5000)); + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com/", + pageOptions: { removeTags: [".nav", "#footer", "strong"] }, + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, + 30000 + ); // 30 seconds timeout - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - 
expect(response.body.data.metadata.pageStatusCode).toBe(400); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 400 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/400" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 401 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/401' }); - await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "bad request" + ); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(401); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 401 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/401" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent("should return a successful response for a scrape with 403 page", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/403' }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "unauthorized" + ); + }, + 60000 + ); // 60 seconds - await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(403); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 403 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", 
"application/json") + .send({ url: "https://httpstat.us/403" }); - it.concurrent('should return a successful response for a scrape with 404 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/404' }); - await new Promise((r) => setTimeout(r, 5000)); + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "forbidden" + ); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(404); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 404 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/404" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 405 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/405' }); - await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(404); + }, + 60000 + ); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(405); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 405 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/405" }); + await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 500 page', async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/500' }); - await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + }, + 60000 + 
); // 60 seconds - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(500); - }, 60000); // 60 seconds + it.concurrent( + "should return a successful response for a scrape with 500 page", + async () => { + const response: FirecrawlScrapeResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/500" }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + }, + 60000 + ); // 60 seconds }); describe("POST /v0/crawl", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl"); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return an error response with an invalid API key", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return a successful response with a valid API key for crawl", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + const response: FirecrawlCrawlResponse = await request(TEST_URL).post( + "/v0/crawl" ); + expect(response.statusCode).toBe(401); }); - - it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - limit: 10, - crawlerOptions: { - includes: ["blog/*"], - }, - }); - + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return a successful response with a valid API key for crawl", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + 
expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + } + ); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + includes: ["blog/*"], + }, + }); + let response: FirecrawlCrawlStatusResponse; let isFinished = false; @@ -310,153 +406,189 @@ describe("E2E Tests for API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["blog/*"], + }, + }); + + let isFinished = false; + let response: FirecrawlCrawlStatusResponse; + + while (!isFinished) { + response = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); - }); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); // 180 
seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; - it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - limit: 10, - crawlerOptions: { - excludes: ["blog/*"], - }, - }); - - let isFinished = false; - let response: FirecrawlCrawlStatusResponse; + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } - while (!isFinished) { - response = await request(TEST_URL) + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse: FirecrawlCrawlStatusResponse = await request( + TEST_URL + ) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeFalsy(); + }); + }, + 90000 + ); // 90 seconds + + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 1 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response: FirecrawlCrawlStatusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); - }); - }, 90000); // 90 seconds - - it.concurrent("should return a successful response 
with max depth option for a valid crawl job", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response: FirecrawlCrawlStatusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + const completedResponse: FirecrawlCrawlStatusResponse = await request( + TEST_URL + ) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); - // Check if all URLs have a maximum depth of 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, 180000); + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); }); describe("POST /v0/crawlWebsitePreview", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview"); + const response: FirecrawlCrawlResponse = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", timeout: 1000 }); + it.concurrent( + "should return a timeout error when scraping takes longer than the specified timeout", + async () => { + const response: FirecrawlCrawlResponse = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); - expect(response.statusCode).toBe(408); - }, 3000); + expect(response.statusCode).toBe(408); + }, + 3000 + ); }); describe("POST /v0/search", () => { @@ -465,26 +597,33 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a successful response with a valid API key for search", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(200); - 
expect(response.body).toHaveProperty("success"); - expect(response.body.success).toBe(true); - expect(response.body).toHaveProperty("data"); - }, 60000); // 60 seconds timeout + it.concurrent( + "should return a successful response with a valid API key for search", + async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, + 60000 + ); // 60 seconds timeout }); describe("GET /v0/crawl/status/:jobId", () => { @@ -493,66 +632,81 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/123") - .set("Authorization", `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return Job not found for invalid job ID", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it.concurrent("should return a successful crawl status response for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://mendable.ai/blog" }); - expect(crawlResponse.statusCode).toBe(200); - - let isCompleted = false; - - while (!isCompleted) { + it.concurrent( + "should return an error response with an invalid API key", + async () => { const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + } + ); + + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev/blog" }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - 
expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - if (response.body.status === "completed") { - isCompleted = true; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Firecrawl"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const childrenLinks = completedResponse.body.data.filter( + (doc) => + doc.metadata && + doc.metadata.sourceURL && + doc.metadata.sourceURL.includes("firecrawl.dev/blog") + ); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 120 seconds - const childrenLinks = completedResponse.body.data.filter(doc => - doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") - ); - - expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 180000); // 120 seconds - // TODO: review the test below // it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { // const crawlResponse = await request(TEST_URL) @@ -599,97 +753,118 @@ describe("E2E Tests for API Routes", () => { // expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); // }, 180000); // 120 seconds - it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); + it.concurrent( + "If someone cancels a crawl job, it should turn into failed status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } }); - expect(crawlResponse.statusCode).toBe(200); + expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => 
setTimeout(r, 10000)); - const responseCancel = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); - expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); - let isNullOrEmptyArray = false; - if (completedResponse.body.data === null || completedResponse.body.data.length === 0) { - isNullOrEmptyArray = true; - } - expect(isNullOrEmptyArray).toBe(true); - expect(completedResponse.body.data).toEqual(expect.arrayContaining([])); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + let isNullOrEmptyArray = false; + if ( + completedResponse.body.data === null || + completedResponse.body.data.length === 0 + ) { + isNullOrEmptyArray = true; + } + expect(isNullOrEmptyArray).toBe(true); + expect(completedResponse.body.data).toEqual(expect.arrayContaining([])); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty( + "content" + ); + expect(completedResponse.body.partial_data[0]).toHaveProperty( + "markdown" + ); + expect(completedResponse.body.partial_data[0]).toHaveProperty( + "metadata" + ); + expect( + completedResponse.body.partial_data[0].metadata.pageStatusCode + ).toBe(200); + expect( + completedResponse.body.partial_data[0].metadata.pageError + ).toBeUndefined(); + }, + 60000 + ); // 60 seconds }); describe("POST /v0/scrape with LLM Extraction", () => { - it.concurrent("should extract data using LLM extraction mode", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - pageOptions: { - onlyMainContent: true, - }, - extractorOptions: { 
- mode: "llm-extraction", - extractionPrompt: - "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractionSchema: { - type: "object", - properties: { - company_mission: { - type: "string", - }, - supports_sso: { - type: "boolean", - }, - is_open_source: { - type: "boolean", - }, - }, - required: ["company_mission", "supports_sso", "is_open_source"], + it.concurrent( + "should extract data using LLM extraction mode", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true, }, - }, - }); + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string", + }, + supports_sso: { + type: "boolean", + }, + is_open_source: { + type: "boolean", + }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, + }); - // Ensure that the job was successfully created before proceeding with LLM extraction - expect(response.statusCode).toBe(200); + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); - // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` - let llmExtraction = response.body.data.llm_extraction; + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; - // Check if the llm_extraction object has the required properties with correct types and values - expect(llmExtraction).toHaveProperty("company_mission"); - expect(typeof llmExtraction.company_mission).toBe("string"); - expect(llmExtraction).toHaveProperty("supports_sso"); - expect(llmExtraction.supports_sso).toBe(true); - expect(typeof llmExtraction.supports_sso).toBe("boolean"); - expect(llmExtraction).toHaveProperty("is_open_source"); - expect(llmExtraction.is_open_source).toBe(false); - expect(typeof llmExtraction.is_open_source).toBe("boolean"); - }, 60000); // 60 secs + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, + 60000 + ); // 60 secs }); }); diff --git a/apps/api/src/controllers/__tests__/crawl.test.ts b/apps/api/src/controllers/__tests__/crawl.test.ts index 621c7436..e65523cb 100644 --- a/apps/api/src/controllers/__tests__/crawl.test.ts +++ b/apps/api/src/controllers/__tests__/crawl.test.ts @@ -1,4 +1,4 @@ -import { crawlController } from '../crawl' +import { crawlController } from '../v0/crawl' import { Request, Response } from 'express'; import { authenticateUser } from '../auth'; // Ensure this import is correct import { createIdempotencyKey } from 
'../../services/idempotency/create'; diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 151733c0..d634b9ed 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -1,22 +1,36 @@ -import { parseApi } from "../../src/lib/parseApi"; -import { getRateLimiter } from "../../src/services/rate-limiter"; +import { parseApi } from "../lib/parseApi"; +import { getRateLimiter } from "../services/rate-limiter"; import { AuthResponse, NotificationType, + PlanType, RateLimiterMode, -} from "../../src/types"; -import { supabase_service } from "../../src/services/supabase"; -import { withAuth } from "../../src/lib/withAuth"; +} from "../types"; +import { supabase_service } from "../services/supabase"; +import { withAuth } from "../lib/withAuth"; import { RateLimiterRedis } from "rate-limiter-flexible"; import { setTraceAttributes } from "@hyperdx/node-opentelemetry"; import { sendNotification } from "../services/notification/email_notification"; import { Logger } from "../lib/logger"; -import { redlock } from "../../src/services/redlock"; -import { getValue } from "../../src/services/redis"; -import { setValue } from "../../src/services/redis"; +import { redlock } from "../services/redlock"; +import { getValue } from "../services/redis"; +import { setValue } from "../services/redis"; import { validate } from "uuid"; import * as Sentry from "@sentry/node"; - +// const { data, error } = await supabase_service +// .from('api_keys') +// .select(` +// key, +// team_id, +// teams ( +// subscriptions ( +// price_id +// ) +// ) +// `) +// .eq('key', normalizedApi) +// .limit(1) +// .single(); function normalizedApiIsUuid(potentialUuid: string): boolean { // Check if the string is a valid UUID return validate(potentialUuid); @@ -88,9 +102,10 @@ export async function supaAuthenticateUser( team_id?: string; error?: string; status?: number; - plan?: string; + plan?: PlanType; }> { - const authHeader = req.headers.authorization; + + const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? 
`Bearer ${req.headers["sec-websocket-protocol"]}` : null); if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; } @@ -118,7 +133,11 @@ export async function supaAuthenticateUser( let priceId: string | null = null; if (token == "this_is_just_a_preview_token") { - rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); + if (mode == RateLimiterMode.CrawlStatus) { + rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); + } else { + rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); + } teamId = "preview"; } else { normalizedApi = parseApi(token); @@ -154,7 +173,7 @@ export async function supaAuthenticateUser( await setValue( cacheKey, JSON.stringify({ team_id: teamId, price_id: priceId }), - 10 + 60 ); } } catch (error) { @@ -233,6 +252,13 @@ export async function supaAuthenticateUser( subscriptionData.plan ); break; + case RateLimiterMode.Map: + rateLimiter = getRateLimiter( + RateLimiterMode.Map, + token, + subscriptionData.plan + ); + break; case RateLimiterMode.CrawlStatus: rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); break; @@ -285,6 +311,9 @@ export async function supaAuthenticateUser( token === "this_is_just_a_preview_token" && (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || + mode === RateLimiterMode.Map || + mode === RateLimiterMode.Crawl || + mode === RateLimiterMode.CrawlStatus || mode === RateLimiterMode.Search) ) { return { success: true, team_id: "preview" }; @@ -327,10 +356,10 @@ export async function supaAuthenticateUser( return { success: true, team_id: subscriptionData.team_id, - plan: subscriptionData.plan ?? "", + plan: (subscriptionData.plan ?? "") as PlanType, }; } -function getPlanByPriceId(price_id: string) { +function getPlanByPriceId(price_id: string): PlanType { switch (price_id) { case process.env.STRIPE_PRICE_ID_STARTER: return "starter"; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts deleted file mode 100644 index 0b4df13c..00000000 --- a/apps/api/src/controllers/scrape.ts +++ /dev/null @@ -1,231 +0,0 @@ - import { ExtractorOptions, PageOptions } from './../lib/entities'; -import { Request, Response } from "express"; -import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../types"; -import { logJob } from "../services/logging/log_job"; -import { Document } from "../lib/entities"; -import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function -import { numTokensFromString } from '../lib/LLM-extraction/helpers'; -import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; -import { addScrapeJob } from '../services/queue-jobs'; -import { getScrapeQueue } from '../services/queue-service'; -import { v4 as uuidv4 } from "uuid"; -import { Logger } from '../lib/logger'; -import * as Sentry from "@sentry/node"; - -export async function scrapeHelper( - jobId: string, - req: Request, - team_id: string, - crawlerOptions: any, - pageOptions: PageOptions, - extractorOptions: ExtractorOptions, - timeout: number, - plan?: string -): Promise<{ - success: boolean; - error?: string; - data?: Document; - returnCode: number; -}> { - const url = req.body.url; - if (!url) { - return { success: false, error: "Url is required", returnCode: 400 }; - } - - if (isUrlBlocked(url)) { - return { success: false, error: "Firecrawl currently does not 
support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; - } - - const job = await addScrapeJob({ - url, - mode: "single_urls", - crawlerOptions, - team_id, - pageOptions, - extractorOptions, - origin: req.body.origin ?? defaultOrigin, - }, {}, jobId); - - let doc; - - const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => { - try { - doc = (await new Promise((resolve, reject) => { - const start = Date.now(); - const int = setInterval(async () => { - if (Date.now() >= start + timeout) { - clearInterval(int); - reject(new Error("Job wait ")); - } else { - const state = await job.getState(); - if (state === "completed") { - clearInterval(int); - resolve((await getScrapeQueue().getJob(job.id)).returnvalue); - } else if (state === "failed") { - clearInterval(int); - reject((await getScrapeQueue().getJob(job.id)).failedReason); - } - } - }, 1000); - }))[0] - } catch (e) { - if (e instanceof Error && e.message.startsWith("Job wait")) { - span.setAttribute("timedOut", true); - return { - success: false, - error: "Request timed out", - returnCode: 408, - } - } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) { - return { - success: false, - error: e, - returnCode: 500, - }; - } else { - throw e; - } - } - span.setAttribute("result", JSON.stringify(doc)); - return null; - }); - - if (err !== null) { - return err; - } - - await job.remove(); - - if (!doc) { - console.error("!!! PANIC DOC IS", doc, job); - return { success: true, error: "No page found", returnCode: 200, data: doc }; - } - - delete doc.index; - delete doc.provider; - - // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html - if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") { - delete doc.rawHtml; - } - - return { - success: true, - data: doc, - returnCode: 200, - }; -} - -export async function scrapeController(req: Request, res: Response) { - try { - let earlyReturn = false; - // make sure to authenticate user first, Bearer - const { success, team_id, error, status, plan } = await authenticateUser( - req, - res, - RateLimiterMode.Scrape - ); - if (!success) { - return res.status(status).json({ error }); - } - - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; - const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; - const origin = req.body.origin ?? defaultOrigin; - let timeout = req.body.timeout ?? defaultTimeout; - - if (extractorOptions.mode.includes("llm-extraction")) { - if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) { - return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" }); - } - - pageOptions.onlyMainContent = true; - timeout = req.body.timeout ?? 
90000; - } - - // checkCredits - try { - const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - earlyReturn = true; - return res.status(402).json({ error: "Insufficient credits" }); - } - } catch (error) { - Logger.error(error); - earlyReturn = true; - return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." }); - } - - const jobId = uuidv4(); - - const startTime = new Date().getTime(); - const result = await scrapeHelper( - jobId, - req, - team_id, - crawlerOptions, - pageOptions, - extractorOptions, - timeout, - plan - ); - const endTime = new Date().getTime(); - const timeTakenInSeconds = (endTime - startTime) / 1000; - const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0; - - if (result.success) { - let creditsToBeBilled = 0; // billing for doc done on queue end - const creditsPerLLMExtract = 50; - - if (extractorOptions.mode.includes("llm-extraction")) { - // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); - creditsToBeBilled += creditsPerLLMExtract; - } - - let startTimeBilling = new Date().getTime(); - - if (earlyReturn) { - // Don't bill if we're early returning - return; - } - const billingResult = await billTeam( - team_id, - creditsToBeBilled - ); - if (!billingResult.success) { - return res.status(402).json({ - success: false, - error: "Failed to bill team. Insufficient credits or subscription not found.", - }); - } - } - - logJob({ - job_id: jobId, - success: result.success, - message: result.error, - num_docs: 1, - docs: [result.data], - time_taken: timeTakenInSeconds, - team_id: team_id, - mode: "scrape", - url: req.body.url, - crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - origin: origin, - extractor_options: extractorOptions, - num_tokens: numTokens, - }); - - - - return res.status(result.returnCode).json(result); - } catch (error) { - Sentry.captureException(error); - Logger.error(error); - return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? 
"Internal Server Error") }); - } -} diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/v0/admin/queue.ts similarity index 62% rename from apps/api/src/controllers/admin/queue.ts rename to apps/api/src/controllers/v0/admin/queue.ts index 06844bea..71748002 100644 --- a/apps/api/src/controllers/admin/queue.ts +++ b/apps/api/src/controllers/v0/admin/queue.ts @@ -1,11 +1,10 @@ import { Request, Response } from "express"; import { Job } from "bullmq"; -import { Logger } from "../../lib/logger"; -import { getScrapeQueue } from "../../services/queue-service"; -import { checkAlerts } from "../../services/alerts"; -import { exec } from "node:child_process"; -import { sendSlackWebhook } from "../../services/alerts/slack"; +import { Logger } from "../../../lib/logger"; +import { getScrapeQueue } from "../../../services/queue-service"; +import { checkAlerts } from "../../../services/alerts"; +import { sendSlackWebhook } from "../../../services/alerts/slack"; export async function cleanBefore24hCompleteJobsController( req: Request, @@ -94,26 +93,34 @@ export async function autoscalerController(req: Request, res: Response) { const scrapeQueue = getScrapeQueue(); - const [webScraperActive, webScraperWaiting, webScraperPriority] = await Promise.all([ - scrapeQueue.getActiveCount(), - scrapeQueue.getWaitingCount(), - scrapeQueue.getPrioritizedCount(), - ]); + const [webScraperActive, webScraperWaiting, webScraperPriority] = + await Promise.all([ + scrapeQueue.getActiveCount(), + scrapeQueue.getWaitingCount(), + scrapeQueue.getPrioritizedCount(), + ]); let waitingAndPriorityCount = webScraperWaiting + webScraperPriority; // get number of machines active - const request = await fetch('https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines', + const request = await fetch( + "https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines", { headers: { - 'Authorization': `Bearer ${process.env.FLY_API_TOKEN}` - } + Authorization: `Bearer ${process.env.FLY_API_TOKEN}`, + }, } - ) + ); const machines = await request.json(); // Only worker machines - const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting" || machine.state === "replacing") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length; + const activeMachines = machines.filter( + (machine) => + (machine.state === "started" || + machine.state === "starting" || + machine.state === "replacing") && + machine.config.env["FLY_PROCESS_GROUP"] === "worker" + ).length; let targetMachineCount = activeMachines; @@ -123,29 +130,57 @@ export async function autoscalerController(req: Request, res: Response) { // Scale up logic if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) { - targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 3)); + targetMachineCount = Math.min( + maxNumberOfMachines, + activeMachines + baseScaleUp * 3 + ); } else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) { - targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 2)); + targetMachineCount = Math.min( + maxNumberOfMachines, + activeMachines + baseScaleUp * 2 + ); } else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) { - targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + baseScaleUp); + targetMachineCount = Math.min( + maxNumberOfMachines, + activeMachines + baseScaleUp + ); } // Scale down logic if (webScraperActive < 100 && waitingAndPriorityCount < 50) { - targetMachineCount = 
Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 3)); + targetMachineCount = Math.max( + minNumberOfMachines, + activeMachines - baseScaleDown * 3 + ); } else if (webScraperActive < 500 && waitingAndPriorityCount < 200) { - targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 2)); + targetMachineCount = Math.max( + minNumberOfMachines, + activeMachines - baseScaleDown * 2 + ); } else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) { - targetMachineCount = Math.max(minNumberOfMachines, activeMachines - baseScaleDown); + targetMachineCount = Math.max( + minNumberOfMachines, + activeMachines - baseScaleDown + ); } if (targetMachineCount !== activeMachines) { - Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`); + Logger.info( + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting` + ); - if(targetMachineCount > activeMachines) { - sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? ""); + if (targetMachineCount > activeMachines) { + sendSlackWebhook( + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, + false, + process.env.SLACK_AUTOSCALER ?? "" + ); } else { - sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? ""); + sendSlackWebhook( + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, + false, + process.env.SLACK_AUTOSCALER ?? 
"" + ); } return res.status(200).json({ mode: "scale-descale", diff --git a/apps/api/src/controllers/admin/redis-health.ts b/apps/api/src/controllers/v0/admin/redis-health.ts similarity index 95% rename from apps/api/src/controllers/admin/redis-health.ts rename to apps/api/src/controllers/v0/admin/redis-health.ts index 3b1e2518..dc58d745 100644 --- a/apps/api/src/controllers/admin/redis-health.ts +++ b/apps/api/src/controllers/v0/admin/redis-health.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import Redis from "ioredis"; -import { Logger } from "../../lib/logger"; -import { redisRateLimitClient } from "../../services/rate-limiter"; +import { Logger } from "../../../lib/logger"; +import { redisRateLimitClient } from "../../../services/rate-limiter"; export async function redisHealthController(req: Request, res: Response) { const retryOperation = async (operation, retries = 3) => { diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts new file mode 100644 index 00000000..efcd454a --- /dev/null +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -0,0 +1,60 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { supabase_service } from "../../../src/services/supabase"; +import { Logger } from "../../../src/lib/logger"; +import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; +import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); + +export async function crawlCancelController(req: Request, res: Response) { + try { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.CrawlStatus + ); + if (!success) { + return res.status(status).json({ error }); + } + + const sc = await getCrawl(req.params.jobId); + if (!sc) { + return res.status(404).json({ error: "Job not found" }); + } + + // check if the job belongs to the team + if (useDbAuthentication) { + const { data, error: supaError } = await supabase_service + .from("bulljobs_teams") + .select("*") + .eq("job_id", req.params.jobId) + .eq("team_id", team_id); + if (supaError) { + return res.status(500).json({ error: supaError.message }); + } + + if (data.length === 0) { + return res.status(403).json({ error: "Unauthorized" }); + } + } + + try { + sc.cancelled = true; + await saveCrawl(req.params.jobId, sc); + } catch (error) { + Logger.error(error); + } + + res.json({ + status: "cancelled" + }); + } catch (error) { + Sentry.captureException(error); + Logger.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts similarity index 67% rename from apps/api/src/controllers/crawl-status.ts rename to apps/api/src/controllers/v0/crawl-status.ts index 76147263..41491f86 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -1,17 +1,19 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { getScrapeQueue } from "../../src/services/queue-service"; -import { Logger } from "../../src/lib/logger"; -import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis"; -import { supabaseGetJobsById } from "../../src/lib/supabase-jobs"; +import { 
authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { getScrapeQueue } from "../../../src/services/queue-service"; +import { Logger } from "../../../src/lib/logger"; +import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; +import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); -export async function getJobs(ids: string[]) { +export async function getJobs(crawlId: string, ids: string[]) { const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); if (process.env.USE_DB_AUTHENTICATION === "true") { - const supabaseData = await supabaseGetJobsById(ids); + const supabaseData = await supabaseGetJobsByCrawlId(crawlId); supabaseData.forEach(x => { const job = jobs.find(y => y.id === x.job_id); @@ -50,12 +52,25 @@ export async function crawlStatusController(req: Request, res: Response) { const jobIDs = await getCrawlJobs(req.params.jobId); - const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); const jobStatuses = await Promise.all(jobs.map(x => x.getState())); const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); + if ( + jobs.length > 0 && + jobs[0].data && + jobs[0].data.pageOptions && + !jobs[0].data.pageOptions.includeRawHtml + ) { + data.forEach(item => { + if (item) { + delete item.rawHtml; + } + }); + } + res.json({ status: jobStatus, current: jobStatuses.filter(x => x === "completed" || x === "failed").length, diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/v0/crawl.ts similarity index 78% rename from apps/api/src/controllers/crawl.ts rename to apps/api/src/controllers/v0/crawl.ts index c5f440e2..aefdb5e5 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -1,35 +1,24 @@ import { Request, Response } from "express"; -import { checkTeamCredits } from "../../src/services/billing/credit_billing"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { addScrapeJob } from "../../src/services/queue-jobs"; -import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; -import { logCrawl } from "../../src/services/logging/crawl_log"; -import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; -import { createIdempotencyKey } from "../../src/services/idempotency/create"; -import { - defaultCrawlPageOptions, - defaultCrawlerOptions, - defaultOrigin, -} from "../../src/lib/default-values"; +import { checkTeamCredits } from "../../../src/services/billing/credit_billing"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { addScrapeJob } from "../../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; +import { logCrawl } from "../../../src/services/logging/crawl_log"; +import { validateIdempotencyKey } from "../../../src/services/idempotency/validate"; +import { createIdempotencyKey } from "../../../src/services/idempotency/create"; +import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } 
from "../../../src/lib/default-values"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../src/lib/logger"; -import { - addCrawlJob, - addCrawlJobs, - crawlToCrawler, - lockURL, - lockURLs, - saveCrawl, - StoredCrawl, -} from "../../src/lib/crawl-redis"; -import { getScrapeQueue } from "../../src/services/queue-service"; -import { checkAndUpdateURL } from "../../src/lib/validateUrl"; +import { Logger } from "../../../src/lib/logger"; +import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; +import { getScrapeQueue } from "../../../src/services/queue-service"; +import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; +import { getJobPriority } from "../../lib/job-priority"; export async function crawlController(req: Request, res: Response) { try { - const { success, team_id, error, status } = await authenticateUser( + const { success, team_id, error, status, plan } = await authenticateUser( req, res, RateLimiterMode.Crawl @@ -148,6 +137,7 @@ export async function crawlController(req: Request, res: Response) { crawlerOptions, pageOptions, team_id, + plan, createdAt: Date.now(), }; @@ -163,7 +153,15 @@ export async function crawlController(req: Request, res: Response) { ? null : await crawler.tryGetSitemap(); + if (sitemap !== null && sitemap.length > 0) { + let jobPriority = 20; + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(sitemap.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan, team_id, basePriority: 21}) + } const jobs = sitemap.map((x) => { const url = x.url; const uuid = uuidv4(); @@ -181,7 +179,7 @@ export async function crawlController(req: Request, res: Response) { }, opts: { jobId: uuid, - priority: 20, + priority: jobPriority, }, }; }); @@ -204,6 +202,10 @@ export async function crawlController(req: Request, res: Response) { } } else { await lockURL(id, sc, url); + + // Not needed, first one should be 15. 
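A minimal standalone sketch of the sitemap-priority rule added in the crawlController hunk above; the getJobPriority helper and the PlanType import are the ones this diff already uses, the 1000-URL threshold and the 20/21 values mirror the added code, and the sitemapJobPriority wrapper itself is hypothetical, shown only to make the branching explicit:

```typescript
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";

// Sketch only: mirrors the priority logic of the sitemap branch in crawlController.
async function sitemapJobPriority(
  sitemapLength: number,
  plan: PlanType,
  team_id: string
): Promise<number> {
  // Small sitemaps keep the fixed default crawl priority of 20 (no lookup needed).
  if (sitemapLength <= 1000) {
    return 20;
  }
  // Large sitemaps get a plan- and team-aware priority computed from a base of 21,
  // presumably so one huge crawl cannot crowd out other teams' jobs.
  return await getJobPriority({ plan, team_id, basePriority: 21 });
}
```

In BullMQ a larger priority number means the job is scheduled later, which appears to be the intent here: oversized sitemap crawls are pushed behind default-priority work rather than ahead of it.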
+ // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10}) + const job = await addScrapeJob( { url, diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts similarity index 87% rename from apps/api/src/controllers/crawlPreview.ts rename to apps/api/src/controllers/v0/crawlPreview.ts index 59b54458..f8706867 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -1,17 +1,17 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../../src/types"; +import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../src/lib/logger"; -import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis"; -import { addScrapeJob } from "../../src/services/queue-jobs"; -import { checkAndUpdateURL } from "../../src/lib/validateUrl"; +import { Logger } from "../../../src/lib/logger"; +import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; +import { addScrapeJob } from "../../../src/services/queue-jobs"; +import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; export async function crawlPreviewController(req: Request, res: Response) { try { - const { success, error, status } = await authenticateUser( + const { success, error, status, team_id:a, plan } = await authenticateUser( req, res, RateLimiterMode.Preview @@ -89,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) { crawlerOptions, pageOptions, team_id, + plan, robots, createdAt: Date.now(), }; diff --git a/apps/api/src/controllers/keyAuth.ts b/apps/api/src/controllers/v0/keyAuth.ts similarity index 83% rename from apps/api/src/controllers/keyAuth.ts rename to apps/api/src/controllers/v0/keyAuth.ts index 351edd18..b70d672a 100644 --- a/apps/api/src/controllers/keyAuth.ts +++ b/apps/api/src/controllers/v0/keyAuth.ts @@ -1,8 +1,8 @@ -import { AuthResponse, RateLimiterMode } from "../types"; +import { AuthResponse, RateLimiterMode } from "../../types"; import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; +import { authenticateUser } from "../auth"; export const keyAuthController = async (req: Request, res: Response) => { diff --git a/apps/api/src/controllers/liveness.ts b/apps/api/src/controllers/v0/liveness.ts similarity index 100% rename from apps/api/src/controllers/liveness.ts rename to apps/api/src/controllers/v0/liveness.ts diff --git a/apps/api/src/controllers/readiness.ts b/apps/api/src/controllers/v0/readiness.ts similarity index 100% rename from apps/api/src/controllers/readiness.ts rename to apps/api/src/controllers/v0/readiness.ts diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts new file mode 100644 index 00000000..c46ebc62 --- /dev/null +++ b/apps/api/src/controllers/v0/scrape.ts @@ -0,0 +1,295 @@ +import { ExtractorOptions, PageOptions } from "./../../lib/entities"; +import { Request, Response } from "express"; +import { + billTeam, + checkTeamCredits, +} from "../../services/billing/credit_billing"; +import { authenticateUser } from "../auth"; +import 
{ PlanType, RateLimiterMode } from "../../types"; +import { logJob } from "../../services/logging/log_job"; +import { Document } from "../../lib/entities"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function +import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; +import { + defaultPageOptions, + defaultExtractorOptions, + defaultTimeout, + defaultOrigin, +} from "../../lib/default-values"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; +import { getScrapeQueue } from "../../services/queue-service"; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from "../../lib/logger"; +import * as Sentry from "@sentry/node"; +import { getJobPriority } from "../../lib/job-priority"; + +export async function scrapeHelper( + jobId: string, + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: PageOptions, + extractorOptions: ExtractorOptions, + timeout: number, + plan?: PlanType +): Promise<{ + success: boolean; + error?: string; + data?: Document; + returnCode: number; +}> { + const url = req.body.url; + if (typeof url !== "string") { + return { success: false, error: "Url is required", returnCode: 400 }; + } + + if (isUrlBlocked(url)) { + return { + success: false, + error: + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + returnCode: 403, + }; + } + + const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); + + const job = await addScrapeJob( + { + url, + mode: "single_urls", + crawlerOptions, + team_id, + pageOptions, + extractorOptions, + origin: req.body.origin ?? defaultOrigin, + is_scrape: true, + }, + {}, + jobId, + jobPriority + ); + + let doc; + + const err = await Sentry.startSpan( + { + name: "Wait for job to finish", + op: "bullmq.wait", + attributes: { job: jobId }, + }, + async (span) => { + try { + doc = (await waitForJob(job.id, timeout))[0]; + } catch (e) { + if (e instanceof Error && e.message.startsWith("Job wait")) { + span.setAttribute("timedOut", true); + return { + success: false, + error: "Request timed out", + returnCode: 408, + }; + } else if ( + typeof e === "string" && + (e.includes("Error generating completions: ") || + e.includes("Invalid schema for function") || + e.includes( + "LLM extraction did not match the extraction schema you provided." + )) + ) { + return { + success: false, + error: e, + returnCode: 500, + }; + } else { + throw e; + } + } + span.setAttribute("result", JSON.stringify(doc)); + return null; + } + ); + + if (err !== null) { + return err; + } + + await job.remove(); + + if (!doc) { + console.error("!!! 
PANIC DOC IS", doc, job); + return { + success: true, + error: "No page found", + returnCode: 200, + data: doc, + }; + } + + delete doc.index; + delete doc.provider; + + // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html + if ( + !pageOptions.includeRawHtml && + extractorOptions.mode == "llm-extraction-from-raw-html" + ) { + if (doc.rawHtml) { + delete doc.rawHtml; + } + } + + if (!pageOptions.includeHtml) { + if (doc.html) { + delete doc.html; + } + } + + return { + success: true, + data: doc, + returnCode: 200, + }; +} + +export async function scrapeController(req: Request, res: Response) { + try { + let earlyReturn = false; + // make sure to authenticate user first, Bearer + const { success, team_id, error, status, plan } = await authenticateUser( + req, + res, + RateLimiterMode.Scrape + ); + if (!success) { + return res.status(status).json({ error }); + } + + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; + const extractorOptions = { + ...defaultExtractorOptions, + ...req.body.extractorOptions, + }; + const origin = req.body.origin ?? defaultOrigin; + let timeout = req.body.timeout ?? defaultTimeout; + + if (extractorOptions.mode.includes("llm-extraction")) { + if ( + typeof extractorOptions.extractionSchema !== "object" || + extractorOptions.extractionSchema === null + ) { + return res.status(400).json({ + error: + "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified", + }); + } + + pageOptions.onlyMainContent = true; + timeout = req.body.timeout ?? 90000; + } + + // checkCredits + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + earlyReturn = true; + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + Logger.error(error); + earlyReturn = true; + return res.status(500).json({ + error: + "Error checking team credits. Please contact hello@firecrawl.com for help.", + }); + } + + const jobId = uuidv4(); + + const startTime = new Date().getTime(); + const result = await scrapeHelper( + jobId, + req, + team_id, + crawlerOptions, + pageOptions, + extractorOptions, + timeout, + plan + ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + const numTokens = + result.data && result.data.markdown + ? 
numTokensFromString(result.data.markdown, "gpt-3.5-turbo") + : 0; + + if (result.success) { + let creditsToBeBilled = 1; + const creditsPerLLMExtract = 4; + + if (extractorOptions.mode.includes("llm-extraction")) { + // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); + creditsToBeBilled += creditsPerLLMExtract; + } + + let startTimeBilling = new Date().getTime(); + + if (earlyReturn) { + // Don't bill if we're early returning + return; + } + if (creditsToBeBilled > 0) { + // billing for doc done on queue end, bill only for llm extraction + billTeam(team_id, creditsToBeBilled).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); + } + } + + let doc = result.data; + if (!pageOptions || !pageOptions.includeRawHtml) { + if (doc && doc.rawHtml) { + delete doc.rawHtml; + } + } + + if(pageOptions && pageOptions.includeExtract) { + if(!pageOptions.includeMarkdown && doc && doc.markdown) { + delete doc.markdown; + } + } + + logJob({ + job_id: jobId, + success: result.success, + message: result.error, + num_docs: 1, + docs: [doc], + time_taken: timeTakenInSeconds, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + origin: origin, + extractor_options: extractorOptions, + num_tokens: numTokens, + }); + + return res.status(result.returnCode).json(result); + } catch (error) { + Sentry.captureException(error); + Logger.error(error); + return res.status(500).json({ + error: + typeof error === "string" + ? error + : error?.message ?? "Internal Server Error", + }); + } +} diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/v0/search.ts similarity index 72% rename from apps/api/src/controllers/search.ts rename to apps/api/src/controllers/v0/search.ts index aeb044d8..5ef2b767 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -1,17 +1,18 @@ import { Request, Response } from "express"; -import { WebScraperDataProvider } from "../scraper/WebScraper"; -import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../types"; -import { logJob } from "../services/logging/log_job"; -import { PageOptions, SearchOptions } from "../lib/entities"; -import { search } from "../search"; -import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { WebScraperDataProvider } from "../../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; +import { authenticateUser } from "../auth"; +import { PlanType, RateLimiterMode } from "../../types"; +import { logJob } from "../../services/logging/log_job"; +import { PageOptions, SearchOptions } from "../../lib/entities"; +import { search } from "../../search"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../lib/logger"; -import { getScrapeQueue } from "../services/queue-service"; +import { Logger } from "../../lib/logger"; +import { getScrapeQueue } from "../../services/queue-service"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import * as Sentry from "@sentry/node"; -import { addScrapeJob } from "../services/queue-jobs"; +import { getJobPriority } from "../../lib/job-priority"; export async 
function searchHelper( jobId: string, @@ -20,6 +21,7 @@ export async function searchHelper( crawlerOptions: any, pageOptions: PageOptions, searchOptions: SearchOptions, + plan: PlanType ): Promise<{ success: boolean; error?: string; @@ -52,18 +54,10 @@ export async function searchHelper( if (justSearch) { - const billingResult = await billTeam( - team_id, - res.length - ); - if (!billingResult.success) { - return { - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - returnCode: 402, - }; - } + billTeam(team_id, res.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); return { success: true, data: res, returnCode: 200 }; } @@ -76,6 +70,8 @@ export async function searchHelper( return { success: true, error: "No search results found", returnCode: 200 }; } + const jobPriority = await getJobPriority({plan, team_id, basePriority: 20}); + // filter out social media links const jobDatas = res.map(x => { @@ -92,7 +88,7 @@ export async function searchHelper( }, opts: { jobId: uuid, - priority: 20, + priority: jobPriority, } }; }) @@ -108,24 +104,7 @@ export async function searchHelper( await getScrapeQueue().addBulk(jobs); } - const docs = (await Promise.all(jobs.map(x => new Promise((resolve, reject) => { - const start = Date.now(); - const int = setInterval(async () => { - if (Date.now() >= start + 60000) { - clearInterval(int); - reject(new Error("Job wait ")); - } else { - const state = await x.getState(); - if (state === "completed") { - clearInterval(int); - resolve((await getScrapeQueue().getJob(x.id)).returnvalue); - } else if (state === "failed") { - clearInterval(int); - reject((await getScrapeQueue().getJob(x.id)).failedReason); - } - } - }, 1000); - })))).map(x => x[0]); + const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]); if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; @@ -152,7 +131,7 @@ export async function searchHelper( export async function searchController(req: Request, res: Response) { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser( + const { success, team_id, error, status, plan } = await authenticateUser( req, res, RateLimiterMode.Search @@ -162,17 +141,16 @@ export async function searchController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { - includeHtml: false, - onlyMainContent: true, - fetchPageContent: true, - removeTags: [], - fallback: false, + includeHtml: req.body.pageOptions?.includeHtml ?? false, + onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false, + fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true, + removeTags: req.body.pageOptions?.removeTags ?? [], + fallback: req.body.pageOptions?.fallback ?? false, }; const origin = req.body.origin ?? "api"; const searchOptions = req.body.searchOptions ?? 
{ limit: 5 }; - const jobId = uuidv4(); try { @@ -194,6 +172,7 @@ export async function searchController(req: Request, res: Response) { crawlerOptions, pageOptions, searchOptions, + plan ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/v0/status.ts similarity index 86% rename from apps/api/src/controllers/status.ts rename to apps/api/src/controllers/v0/status.ts index 362f1f24..bf8d2834 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/v0/status.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; -import { Logger } from "../../src/lib/logger"; -import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis"; +import { Logger } from "../../../src/lib/logger"; +import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; @@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons // } // } - const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); const jobStatuses = await Promise.all(jobs.map(x => x.getState())); const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; diff --git a/apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP b/apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP new file mode 100644 index 00000000..621c7436 --- /dev/null +++ b/apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP @@ -0,0 +1,47 @@ +import { crawlController } from '../crawl' +import { Request, Response } from 'express'; +import { authenticateUser } from '../auth'; // Ensure this import is correct +import { createIdempotencyKey } from '../../services/idempotency/create'; +import { validateIdempotencyKey } from '../../services/idempotency/validate'; +import { v4 as uuidv4 } from 'uuid'; + +jest.mock('../auth', () => ({ + authenticateUser: jest.fn().mockResolvedValue({ + success: true, + team_id: 'team123', + error: null, + status: 200 + }), + reduce: jest.fn() +})); +jest.mock('../../services/idempotency/validate'); + +describe('crawlController', () => { + it('should prevent duplicate requests using the same idempotency key', async () => { + const req = { + headers: { + 'x-idempotency-key': await uuidv4(), + 'Authorization': `Bearer ${process.env.TEST_API_KEY}` + }, + body: { + url: 'https://mendable.ai' + } + } as unknown as Request; + const res = { + status: jest.fn().mockReturnThis(), + json: jest.fn() + } as unknown as Response; + + // Mock the idempotency key validation to return false for the second call + (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false); + + // First request should succeed + await crawlController(req, res); + expect(res.status).not.toHaveBeenCalledWith(409); + + // Second request with the same key should fail + await crawlController(req, res); + expect(res.status).toHaveBeenCalledWith(409); + expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts new file mode 100644 index 00000000..0a9931d3 --- /dev/null +++ 
b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -0,0 +1,64 @@ +import { url } from "../types"; + +describe("URL Schema Validation", () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + it("should prepend http:// to URLs without a protocol", () => { + const result = url.parse("example.com"); + expect(result).toBe("http://example.com"); + }); + + it("should allow valid URLs with http or https", () => { + expect(() => url.parse("http://example.com")).not.toThrow(); + expect(() => url.parse("https://example.com")).not.toThrow(); + }); + + it("should allow valid URLs with http or https", () => { + expect(() => url.parse("example.com")).not.toThrow(); + }); + + it("should reject URLs with unsupported protocols", () => { + expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL"); + }); + + it("should reject URLs without a valid top-level domain", () => { + expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path"); + }); + + it("should reject blocked URLs", () => { + expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should handle URLs with subdomains correctly", () => { + expect(() => url.parse("http://sub.example.com")).not.toThrow(); + expect(() => url.parse("https://blog.example.com")).not.toThrow(); + }); + + it("should handle URLs with paths correctly", () => { + expect(() => url.parse("http://example.com/path")).not.toThrow(); + expect(() => url.parse("https://example.com/another/path")).not.toThrow(); + }); + + it("should handle URLs with subdomains that are blocked", () => { + expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should handle URLs with paths that are blocked", () => { + expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should reject malformed URLs starting with 'http://http'", () => { + expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. 
Invalid protocol."); + }); + + it("should allow URLs containing 'http://' later in the path", () => { + expect(() => url.parse("http://example.com/http://example.com")).not.toThrow(); + }); + + it("should reject URLs containing spaces", () => { + expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL"); + }); +}) \ No newline at end of file diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts similarity index 81% rename from apps/api/src/controllers/crawl-cancel.ts rename to apps/api/src/controllers/v1/crawl-cancel.ts index 1de9af60..21fc7cf9 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -1,10 +1,12 @@ import { Request, Response } from "express"; -import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { supabase_service } from "../../src/services/supabase"; -import { Logger } from "../../src/lib/logger"; -import { getCrawl, saveCrawl } from "../../src/lib/crawl-redis"; +import { authenticateUser } from "../auth"; +import { RateLimiterMode } from "../../types"; +import { supabase_service } from "../../services/supabase"; +import { Logger } from "../../lib/logger"; +import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function crawlCancelController(req: Request, res: Response) { try { diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts new file mode 100644 index 00000000..16a67682 --- /dev/null +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -0,0 +1,162 @@ +import { authMiddleware } from "../../routes/v1"; +import { RateLimiterMode } from "../../types"; +import { authenticateUser } from "../auth"; +import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; +import { WebSocket } from "ws"; +import { v4 as uuidv4 } from "uuid"; +import { Logger } from "../../lib/logger"; +import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis"; +import { getScrapeQueue } from "../../services/queue-service"; +import { getJob, getJobs } from "./crawl-status"; +import * as Sentry from "@sentry/node"; + +type ErrorMessage = { + type: "error", + error: string, +} + +type CatchupMessage = { + type: "catchup", + data: CrawlStatusResponse, +} + +type DocumentMessage = { + type: "document", + data: Document, +} + +type DoneMessage = { type: "done" } + +type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage; + +function send(ws: WebSocket, msg: Message) { + if (ws.readyState === 1) { + return new Promise((resolve, reject) => { + ws.send(JSON.stringify(msg), (err) => { + if (err) reject(err); + else resolve(null); + }); + }); + } +} + +function close(ws: WebSocket, code: number, msg: Message) { + if (ws.readyState <= 1) { + ws.close(code, JSON.stringify(msg)); + } +} + +async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth) { + const sc = await getCrawl(req.params.jobId); + if (!sc) { + return close(ws, 1008, { type: "error", error: "Job not found" }); + } + + if (sc.team_id !== req.auth.team_id) { + return close(ws, 3003, { type: "error", error: "Forbidden" }); + } + + let doneJobIDs = []; + let finished = false; + + const loop = async ()
=> { + if (finished) return; + + const jobIDs = await getCrawlJobs(req.params.jobId); + + if (jobIDs.length === doneJobIDs.length) { + return close(ws, 1000, { type: "done" }); + } + + const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x)); + const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)])); + const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]); + + for (const jobID of newlyDoneJobIDs) { + const job = await getJob(jobID); + + if (job.returnvalue) { + send(ws, { + type: "document", + data: legacyDocumentConverter(job.returnvalue), + }) + } else { + return close(ws, 3000, { type: "error", error: job.failedReason }); + } + } + + doneJobIDs.push(...newlyDoneJobIDs); + + setTimeout(loop, 1000); + }; + + setTimeout(loop, 1000); + + doneJobIDs = await getDoneJobsOrdered(req.params.jobId); + + const jobIDs = await getCrawlJobs(req.params.jobId); + const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x))); + const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping"; + const doneJobs = await getJobs(doneJobIDs); + const data = doneJobs.map(x => x.returnvalue); + + send(ws, { + type: "catchup", + data: { + success: true, + status, + total: jobIDs.length, + completed: doneJobIDs.length, + creditsUsed: jobIDs.length, + expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), + data: data.map(x => legacyDocumentConverter(x)), + } + }); + + if (status !== "scraping") { + finished = true; + return close(ws, 1000, { type: "done" }); + } +} + +// Basically just middleware and error wrapping +export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth) { + try { + const { success, team_id, error, status, plan } = await authenticateUser( + req, + null, + RateLimiterMode.CrawlStatus, + ); + + if (!success) { + return close(ws, 3000, { + type: "error", + error, + }); + } + + req.auth = { team_id, plan }; + + await crawlStatusWS(ws, req); + } catch (err) { + Sentry.captureException(err); + + const id = uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); + return close(ws, 1011, { + type: "error", + error: "An unexpected error occurred. Please contact hello@firecrawl.com for help.
Your exception ID is " + id + }); + } +} diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts new file mode 100644 index 00000000..2ee0638c --- /dev/null +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -0,0 +1,130 @@ +import { Response } from "express"; +import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; +import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis"; +import { getScrapeQueue } from "../../services/queue-service"; +import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; +import { configDotenv } from "dotenv"; +configDotenv(); + +export async function getJob(id: string) { + const job = await getScrapeQueue().getJob(id); + if (!job) return job; + + if (process.env.USE_DB_AUTHENTICATION === "true") { + const supabaseData = await supabaseGetJobById(id); + + if (supabaseData) { + job.returnvalue = supabaseData.docs; + } + } + + job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + + return job; +} + +export async function getJobs(ids: string[]) { + const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); + + if (process.env.USE_DB_AUTHENTICATION === "true") { + const supabaseData = await supabaseGetJobsById(ids); + + supabaseData.forEach(x => { + const job = jobs.find(y => y.id === x.job_id); + if (job) { + job.returnvalue = x.docs; + } + }) + } + + jobs.forEach(job => { + job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + }); + + return jobs; +} + +export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) { + const sc = await getCrawl(req.params.jobId); + if (!sc) { + return res.status(404).json({ success: false, error: "Job not found" }); + } + + if (sc.team_id !== req.auth.team_id) { + return res.status(403).json({ success: false, error: "Forbidden" }); + } + + const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0; + const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined; + + const jobIDs = await getCrawlJobs(req.params.jobId); + const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x))); + const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping"; + const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); + const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ??
-1); + + let doneJobs = []; + + if (end === undefined) { // determine 10 megabyte limit + let bytes = 0; + const bytesLimit = 10485760; // 10 MiB in bytes + const factor = 100; // chunking for faster retrieval + + for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) { + // get current chunk and retrieve jobs + const currentIDs = doneJobsOrder.slice(i, i+factor); + const jobs = await getJobs(currentIDs); + + // iterate through jobs and add them one by one to the byte counter + // both loops will break once we cross the byte counter + for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) { + const job = jobs[ii]; + doneJobs.push(job); + bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length; + } + } + + // if we ran over the bytes limit, remove the last document + if (bytes > bytesLimit) { + doneJobs.splice(doneJobs.length - 1, 1); + } + } else { + doneJobs = await getJobs(doneJobsOrder); + } + + const data = doneJobs.map(x => x.returnvalue); + + const protocol = process.env.ENV === "local" ? req.protocol : "https"; + const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); + + nextURL.searchParams.set("skip", (start + data.length).toString()); + + if (typeof req.query.limit === "string") { + nextURL.searchParams.set("limit", req.query.limit); + } + + if (data.length > 0) { + if (!doneJobs[0].data.pageOptions.includeRawHtml) { + for (let ii = 0; ii < doneJobs.length; ii++) { + if (data[ii]) { + delete data[ii].rawHtml; + } + } + } + } + + res.status(200).json({ + success: true, + status, + completed: doneJobsLength, + total: jobIDs.length, + creditsUsed: jobIDs.length, + expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), + next: + status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this + ? undefined + : nextURL.href, + data: data.map(x => legacyDocumentConverter(x)), + }); +} + diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts new file mode 100644 index 00000000..e0883fa8 --- /dev/null +++ b/apps/api/src/controllers/v1/crawl.ts @@ -0,0 +1,167 @@ +import { Response } from "express"; +import { v4 as uuidv4 } from "uuid"; +import { + CrawlRequest, + crawlRequestSchema, + CrawlResponse, + legacyCrawlerOptions, + legacyScrapeOptions, + RequestWithAuth, +} from "./types"; +import { + addCrawlJob, + addCrawlJobs, + crawlToCrawler, + lockURL, + lockURLs, + saveCrawl, + StoredCrawl, +} from "../../lib/crawl-redis"; +import { logCrawl } from "../../services/logging/crawl_log"; +import { getScrapeQueue } from "../../services/queue-service"; +import { addScrapeJob } from "../../services/queue-jobs"; +import { Logger } from "../../lib/logger"; +import { getJobPriority } from "../../lib/job-priority"; +import { callWebhook } from "../../services/webhook"; + +export async function crawlController( + req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, + res: Response +) { + req.body = crawlRequestSchema.parse(req.body); + + const id = uuidv4(); + + await logCrawl(id, req.auth.team_id); + + const { remainingCredits } = req.account; + + const crawlerOptions = legacyCrawlerOptions(req.body); + const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); + + // TODO: @rafa, is this right?
copied from v0 + if (Array.isArray(crawlerOptions.includes)) { + for (const x of crawlerOptions.includes) { + try { + new RegExp(x); + } catch (e) { + return res.status(400).json({ success: false, error: e.message }); + } + } + } + + if (Array.isArray(crawlerOptions.excludes)) { + for (const x of crawlerOptions.excludes) { + try { + new RegExp(x); + } catch (e) { + return res.status(400).json({ success: false, error: e.message }); + } + } + } + + crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); + + const sc: StoredCrawl = { + originUrl: req.body.url, + crawlerOptions, + pageOptions, + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan, + }; + + const crawler = crawlToCrawler(id, sc); + + try { + sc.robots = await crawler.getRobotsTxt(); + } catch (e) { + Logger.debug( + `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( + e + )}` + ); + } + + await saveCrawl(id, sc); + + const sitemap = sc.crawlerOptions.ignoreSitemap + ? null + : await crawler.tryGetSitemap(); + + if (sitemap !== null && sitemap.length > 0) { + let jobPriority = 20; + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(sitemap.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + } + const jobs = sitemap.map((x) => { + const url = x.url; + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls", + team_id: req.auth.team_id, + crawlerOptions, + pageOptions, + origin: "api", + crawl_id: id, + sitemapped: true, + webhook: req.body.webhook, + v1: true, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + await lockURLs( + id, + jobs.map((x) => x.data.url) + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId) + ); + await getScrapeQueue().addBulk(jobs); + } else { + await lockURL(id, sc, req.body.url); + const job = await addScrapeJob( + { + url: req.body.url, + mode: "single_urls", + crawlerOptions: crawlerOptions, + team_id: req.auth.team_id, + pageOptions: pageOptions, + origin: "api", + crawl_id: id, + webhook: req.body.webhook, + v1: true, + }, + { + priority: 15, + } + ); + await addCrawlJob(id, job.id); + } + + if(req.body.webhook) { + await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started"); + } + + const protocol = process.env.ENV === "local" ? 
req.protocol : "https"; + + return res.status(200).json({ + success: true, + id, + url: `${protocol}://${req.get("host")}/v1/crawl/${id}`, + }); +} + + diff --git a/apps/api/src/controllers/v1/liveness.ts b/apps/api/src/controllers/v1/liveness.ts new file mode 100644 index 00000000..8ff1a96f --- /dev/null +++ b/apps/api/src/controllers/v1/liveness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function livenessController(req: Request, res: Response) { + //TODO: add checks if the application is live and healthy like checking the redis connection + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts new file mode 100644 index 00000000..a9c61d04 --- /dev/null +++ b/apps/api/src/controllers/v1/map.ts @@ -0,0 +1,142 @@ +import { Response } from "express"; +import { v4 as uuidv4 } from "uuid"; +import { + legacyCrawlerOptions, + mapRequestSchema, + RequestWithAuth, +} from "./types"; +import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; +import { MapResponse, MapRequest } from "./types"; +import { configDotenv } from "dotenv"; +import { + checkAndUpdateURLForMap, + isSameDomain, + isSameSubdomain, + removeDuplicateUrls, +} from "../../lib/validateUrl"; +import { fireEngineMap } from "../../search/fireEngine"; +import { billTeam } from "../../services/billing/credit_billing"; +import { logJob } from "../../services/logging/log_job"; +import { performCosineSimilarity } from "../../lib/map-cosine"; +import { Logger } from "../../lib/logger"; + +configDotenv(); + +export async function mapController( + req: RequestWithAuth<{}, MapResponse, MapRequest>, + res: Response +) { + const startTime = new Date().getTime(); + + req.body = mapRequestSchema.parse(req.body); + + + const limit : number = req.body.limit ?? 5000; + + const id = uuidv4(); + let links: string[] = [req.body.url]; + + const sc: StoredCrawl = { + originUrl: req.body.url, + crawlerOptions: legacyCrawlerOptions(req.body), + pageOptions: {}, + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan, + }; + + const crawler = crawlToCrawler(id, sc); + + const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap(); + + if (sitemap !== null) { + sitemap.map((x) => { + links.push(x.url); + }); + } + + let urlWithoutWww = req.body.url.replace("www.", ""); + + let mapUrl = req.body.search + ? `"${req.body.search}" site:${urlWithoutWww}` + : `site:${req.body.url}`; + // www. 
seems to exclude subdomains in some cases + const mapResults = await fireEngineMap(mapUrl, { + // limit to 100 results (beta) + numResults: Math.min(limit, 100), + }); + + if (mapResults.length > 0) { + if (req.body.search) { + // Ensure all map results are first, maintaining their order + links = [ + mapResults[0].url, + ...mapResults.slice(1).map((x) => x.url), + ...links, + ]; + } else { + mapResults.map((x) => { + links.push(x.url); + }); + } + } + + // Perform cosine similarity between the search query and the list of links + if (req.body.search) { + const searchQuery = req.body.search.toLowerCase(); + + links = performCosineSimilarity(links, searchQuery); + } + + links = links.map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim() + } catch (_) { + return null; + } + }).filter(x => x !== null); + + // allows for subdomains to be included + links = links.filter((x) => isSameDomain(x, req.body.url)); + + // if includeSubdomains is false, filter out subdomains + if (!req.body.includeSubdomains) { + links = links.filter((x) => isSameSubdomain(x, req.body.url)); + } + + // remove duplicates that could be due to http/https or www + links = removeDuplicateUrls(links); + + billTeam(req.auth.team_id, 1).catch(error => { + Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); + + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + + const linksToReturn = links.slice(0, limit); + + logJob({ + job_id: id, + success: links.length > 0, + message: "Map completed", + num_docs: linksToReturn.length, + docs: linksToReturn, + time_taken: timeTakenInSeconds, + team_id: req.auth.team_id, + mode: "map", + url: req.body.url, + crawlerOptions: {}, + pageOptions: {}, + origin: req.body.origin, + extractor_options: { mode: "markdown" }, + num_tokens: 0, + }); + + return res.status(200).json({ + success: true, + links: linksToReturn, + scrape_id: req.body.origin?.includes("website") ? 
id : undefined, + }); +} diff --git a/apps/api/src/controllers/v1/readiness.ts b/apps/api/src/controllers/v1/readiness.ts new file mode 100644 index 00000000..cdb1f02c --- /dev/null +++ b/apps/api/src/controllers/v1/readiness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function readinessController(req: Request, res: Response) { + // TODO: add checks when the application is ready to serve traffic + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts new file mode 100644 index 00000000..5e0aecb6 --- /dev/null +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -0,0 +1,38 @@ +import { Response } from "express"; +import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs"; +import { scrapeStatusRateLimiter } from "../../services/rate-limiter"; + +export async function scrapeStatusController(req: any, res: any) { + try { + const rateLimiter = scrapeStatusRateLimiter; + const incomingIP = (req.headers["x-forwarded-for"] || + req.socket.remoteAddress) as string; + const iptoken = incomingIP; + await rateLimiter.consume(iptoken); + + const job = await supabaseGetJobByIdOnlyData(req.params.jobId); + + if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){ + return res.status(403).json({ + success: false, + error: "You are not allowed to access this resource.", + }); + } + return res.status(200).json({ + success: true, + data: job?.docs[0], + }); + } catch (error) { + if (error instanceof Error && error.message == "Too Many Requests") { + return res.status(429).json({ + success: false, + error: "Rate limit exceeded. Please try again later.", + }); + } else { + return res.status(500).json({ + success: false, + error: "An unexpected error occurred.", + }); + } + } +} diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts new file mode 100644 index 00000000..f0744c22 --- /dev/null +++ b/apps/api/src/controllers/v1/scrape.ts @@ -0,0 +1,148 @@ +import { Request, Response } from "express"; +import { Logger } from "../../lib/logger"; +import { + Document, + legacyDocumentConverter, + legacyExtractorOptions, + legacyScrapeOptions, + RequestWithAuth, + ScrapeRequest, + scrapeRequestSchema, + ScrapeResponse, +} from "./types"; +import { billTeam } from "../../services/billing/credit_billing"; +import { v4 as uuidv4 } from "uuid"; +import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; +import { logJob } from "../../services/logging/log_job"; +import { getJobPriority } from "../../lib/job-priority"; +import { PlanType } from "../../types"; + +export async function scrapeController( + req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, + res: Response +) { + req.body = scrapeRequestSchema.parse(req.body); + let earlyReturn = false; + + const origin = req.body.origin; + const timeout = req.body.timeout; + const pageOptions = legacyScrapeOptions(req.body); + const extractorOptions = req.body.extract ? 
legacyExtractorOptions(req.body.extract) : undefined; + const jobId = uuidv4(); + + const startTime = new Date().getTime(); + const jobPriority = await getJobPriority({ + plan: req.auth.plan as PlanType, + team_id: req.auth.team_id, + basePriority: 10, + }); + + const job = await addScrapeJob( + { + url: req.body.url, + mode: "single_urls", + crawlerOptions: {}, + team_id: req.auth.team_id, + pageOptions, + extractorOptions, + origin: req.body.origin, + is_scrape: true, + }, + {}, + jobId, + jobPriority + ); + + let doc: any | undefined; + try { + doc = (await waitForJob(job.id, timeout))[0]; + } catch (e) { + Logger.error(`Error in scrapeController: ${e}`); + if (e instanceof Error && e.message.startsWith("Job wait")) { + return res.status(408).json({ + success: false, + error: "Request timed out", + }); + } else { + return res.status(500).json({ + success: false, + error: `(Internal server error) - ${e && e?.message ? e.message : e} ${ + extractorOptions && extractorOptions.mode !== "markdown" + ? " - Could be due to LLM parsing issues" + : "" + }`, + }); + } + } + + await job.remove(); + + if (!doc) { + console.error("!!! PANIC DOC IS", doc, job); + return res.status(200).json({ + success: true, + warning: "No page found", + data: doc, + }); + } + + delete doc.index; + delete doc.provider; + + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + const numTokens = + doc && doc.markdown + ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") + : 0; + + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (earlyReturn) { + // Don't bill if we're early returning + return; + } + if(req.body.extract && req.body.formats.includes("extract")) { + creditsToBeBilled = 5; + } + + billTeam(req.auth.team_id, creditsToBeBilled).catch(error => { + Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); + + if (!pageOptions || !pageOptions.includeRawHtml) { + if (doc && doc.rawHtml) { + delete doc.rawHtml; + } + } + + if(pageOptions && pageOptions.includeExtract) { + if(!pageOptions.includeMarkdown && doc && doc.markdown) { + delete doc.markdown; + } + } + + logJob({ + job_id: jobId, + success: true, + message: "Scrape completed", + num_docs: 1, + docs: [doc], + time_taken: timeTakenInSeconds, + team_id: req.auth.team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: {}, + pageOptions: pageOptions, + origin: origin, + extractor_options: { mode: "markdown" }, + num_tokens: numTokens, + }); + + return res.status(200).json({ + success: true, + data: legacyDocumentConverter(doc), + scrape_id: origin?.includes("website") ? 
jobId : undefined, + }); +} diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts new file mode 100644 index 00000000..c44c1cc5 --- /dev/null +++ b/apps/api/src/controllers/v1/types.ts @@ -0,0 +1,380 @@ +import { Request, Response } from "express"; +import { z } from "zod"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; +import { ExtractorOptions, PageOptions } from "../../lib/entities"; +import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; +import { PlanType } from "../../types"; + +export type Format = + | "markdown" + | "html" + | "rawHtml" + | "links" + | "screenshot" + | "screenshot@fullPage" + | "extract"; + +export const url = z.preprocess( + (x) => { + if (!protocolIncluded(x as string)) { + return `http://${x}`; + } + return x; + }, + z + .string() + .url() + .regex(/^https?:\/\//, "URL uses unsupported protocol") + .refine( + (x) => /\.[a-z]{2,}(\/|$)/i.test(x), + "URL must have a valid top-level domain or be a valid path" + ) + .refine( + (x) => { + try { + checkUrl(x as string) + return true; + } catch (_) { + return false; + } + }, + "Invalid URL" + ) + .refine( + (x) => !isUrlBlocked(x as string), + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ) +); + +const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes"; + +export const extractOptions = z.object({ + mode: z.enum(["llm"]).default("llm"), + schema: z.any().optional(), + systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."), + prompt: z.string().optional() +}).strict(strictMessage); + +export type ExtractOptions = z.infer<typeof extractOptions>; + +export const scrapeOptions = z.object({ + formats: z + .enum([ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "extract" + ]) + .array() + .optional() + .default(["markdown"]), + headers: z.record(z.string(), z.string()).optional(), + includeTags: z.string().array().optional(), + excludeTags: z.string().array().optional(), + onlyMainContent: z.boolean().default(true), + timeout: z.number().int().positive().finite().safe().default(30000), + waitFor: z.number().int().nonnegative().finite().safe().default(0), + extract: extractOptions.optional(), + parsePDF: z.boolean().default(true), +}).strict(strictMessage) + + +export type ScrapeOptions = z.infer<typeof scrapeOptions>; + +export const scrapeRequestSchema = scrapeOptions.extend({ + url, + origin: z.string().optional().default("api"), +}).strict(strictMessage).refine( + (obj) => { + const hasExtractFormat = obj.formats?.includes("extract"); + const hasExtractOptions = obj.extract !== undefined; + return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions); + }, + { + message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + } +).transform((obj) => { + if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { + return { ...obj, timeout: 60000 }; + } + return obj; +}); + +// export type ScrapeRequest = { +// url: string; +// formats?: Format[]; +// headers?: { [K: string]: string }; +// includeTags?: string[]; +// excludeTags?: string[]; +// onlyMainContent?: boolean; +// timeout?: number; +// waitFor?: number; +// } + +export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>; + +const
crawlerOptions = z.object({ + includePaths: z.string().array().default([]), + excludePaths: z.string().array().default([]), + maxDepth: z.number().default(10), // default? + limit: z.number().default(10000), // default? + allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? + allowExternalLinks: z.boolean().default(false), + ignoreSitemap: z.boolean().default(true), +}).strict(strictMessage); + +// export type CrawlerOptions = { +// includePaths?: string[]; +// excludePaths?: string[]; +// maxDepth?: number; +// limit?: number; +// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME??? +// allowExternalLinks?: boolean; +// ignoreSitemap?: boolean; +// }; + +export type CrawlerOptions = z.infer<typeof crawlerOptions>; + +export const crawlRequestSchema = crawlerOptions.extend({ + url, + origin: z.string().optional().default("api"), + scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), + webhook: z.string().url().optional(), + limit: z.number().default(10000), +}).strict(strictMessage); + +// export type CrawlRequest = { +// url: string; +// crawlerOptions?: CrawlerOptions; +// scrapeOptions?: Exclude; +// }; + +// export type ExtractorOptions = { +// mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html"; +// extractionPrompt?: string; +// extractionSchema?: Record; +// } + + +export type CrawlRequest = z.infer<typeof crawlRequestSchema>; + +export const mapRequestSchema = crawlerOptions.extend({ + url, + origin: z.string().optional().default("api"), + includeSubdomains: z.boolean().default(true), + search: z.string().optional(), + ignoreSitemap: z.boolean().default(false), + limit: z.number().min(1).max(5000).default(5000).optional(), +}).strict(strictMessage); + +// export type MapRequest = { +// url: string; +// crawlerOptions?: CrawlerOptions; +// }; + +export type MapRequest = z.infer<typeof mapRequestSchema>; + +export type Document = { + markdown?: string; + extract?: string; + html?: string; + rawHtml?: string; + links?: string[]; + screenshot?: string; + metadata: { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dcTermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dcTermsType?: string; + dcType?: string; + dcTermsAudience?: string; + dcTermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dcTermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + statusCode?: number; + error?: string; + }; +}; + +export type ErrorResponse = { + success: false; + error: string; + details?: any; +}; + +export type ScrapeResponse = + | ErrorResponse + | { + success: true; + warning?: string; + data: Document; + scrape_id?: string; + }; + +export interface ScrapeResponseRequestTest { + statusCode: number; + body: ScrapeResponse; + error?: string; +} + +export type CrawlResponse = + | ErrorResponse + | { + success: true; + id: string; + url: string; + }; + +export type MapResponse = + | ErrorResponse + | { + success: true; + links: string[]; + scrape_id?: string; + }; + +export type CrawlStatusParams = { + jobId: string; +}; + +export type CrawlStatusResponse = + | ErrorResponse + | { + success: true; + status: "scraping" | "completed" | "failed" | "cancelled"; + completed:
number; + total: number; + creditsUsed: number; + expiresAt: string; + next?: string; + data: Document[]; + }; + +type AuthObject = { + team_id: string; + plan: PlanType; +}; + +type Account = { + remainingCredits: number; +}; + +export interface RequestWithMaybeAuth< + ReqParams = {}, + ReqBody = undefined, + ResBody = undefined +> extends Request<ReqParams, ReqBody, ResBody> { + auth?: AuthObject; + account?: Account; +} + +export interface RequestWithAuth< + ReqParams = {}, + ReqBody = undefined, + ResBody = undefined, +> extends Request<ReqParams, ReqBody, ResBody> { + auth: AuthObject; + account?: Account; +} + +export interface ResponseWithSentry< + ResBody = undefined, +> extends Response<ResBody> { + sentry?: string, +} + +export function legacyCrawlerOptions(x: CrawlerOptions) { + return { + includes: x.includePaths, + excludes: x.excludePaths, + maxCrawledLinks: x.limit, + maxDepth: x.maxDepth, + limit: x.limit, + generateImgAltText: false, + allowBackwardCrawling: x.allowBackwardLinks, + allowExternalContentLinks: x.allowExternalLinks, + }; +} + +export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { + return { + includeMarkdown: x.formats.includes("markdown"), + includeHtml: x.formats.includes("html"), + includeRawHtml: x.formats.includes("rawHtml"), + includeExtract: x.formats.includes("extract"), + onlyIncludeTags: x.includeTags, + removeTags: x.excludeTags, + onlyMainContent: x.onlyMainContent, + waitFor: x.waitFor, + headers: x.headers, + includeLinks: x.formats.includes("links"), + screenshot: x.formats.includes("screenshot"), + fullPageScreenshot: x.formats.includes("screenshot@fullPage"), + parsePDF: x.parsePDF, + }; +} + +export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions { + return { + mode: x.mode ? "llm-extraction" : "markdown", + extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.", + extractionSchema: x.schema, + userPrompt: x.prompt ?? "", + }; +} + +export function legacyDocumentConverter(doc: any): Document { + if (doc === null || doc === undefined) return null; + + if (doc.metadata) { + if (doc.metadata.screenshot) { + doc.screenshot = doc.metadata.screenshot; + delete doc.metadata.screenshot; + } + + if (doc.metadata.fullPageScreenshot) { + doc.fullPageScreenshot = doc.metadata.fullPageScreenshot; + delete doc.metadata.fullPageScreenshot; + } + } + + return { + markdown: doc.markdown, + links: doc.linksOnPage, + rawHtml: doc.rawHtml, + html: doc.html, + extract: doc.llm_extraction, + screenshot: doc.screenshot ??
doc.fullPageScreenshot, + metadata: { + ...doc.metadata, + pageError: undefined, + pageStatusCode: undefined, + error: doc.metadata.pageError, + statusCode: doc.metadata.pageStatusCode, + }, + }; +} diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0674a46f..7d8817af 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,7 +1,7 @@ import "dotenv/config"; import "./services/sentry" import * as Sentry from "@sentry/node"; -import express from "express"; +import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; import cors from "cors"; import { getScrapeQueue } from "./services/queue-service"; @@ -15,8 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events"; import http from 'node:http'; import https from 'node:https'; import CacheableLookup from 'cacheable-lookup'; - - +import { v1Router } from "./routes/v1"; +import expressWs from "express-ws"; +import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; +import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; +import { ZodError } from "zod"; +import { v4 as uuidv4 } from "uuid"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -49,7 +53,8 @@ if (cluster.isMaster) { } }); } else { - const app = express(); + const ws = expressWs(express()); + const app = ws.app; global.isProduction = process.env.IS_PRODUCTION === "true"; @@ -82,6 +87,7 @@ if (cluster.isMaster) { // register router app.use(v0Router); + app.use("/v1", v1Router); app.use(adminRouter); const DEFAULT_PORT = process.env.PORT ?? 3002; @@ -184,11 +190,42 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { + if (err instanceof ZodError) { + res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); + } else { + next(err); + } + }); + Sentry.setupExpressErrorHandler(app); + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { + if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { + return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); + } + + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. 
Your exception ID is " + id }); + }); + Logger.info(`Worker ${process.pid} started`); } + + // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index af8b0bb1..d05f9bd7 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -15,7 +15,8 @@ export async function generateCompletions( // const schema = zodToJsonSchema(options.schema) const schema = extractionOptions.extractionSchema; - const prompt = extractionOptions.extractionPrompt; + const systemPrompt = extractionOptions.extractionPrompt; + const prompt = extractionOptions.userPrompt; const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider @@ -24,30 +25,35 @@ export async function generateCompletions( switch (switchVariable) { case "openAI": const llm = new OpenAI(); - try{ - const completionResult = await generateOpenAICompletions({ - client: llm, - document: document, - schema: schema, - prompt: prompt, - mode: mode, - }); - // Validate the JSON output against the schema using AJV - const validate = ajv.compile(schema); - if (!validate(completionResult.llm_extraction)) { - //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. - throw new Error( - `JSON parsing error(s): ${validate.errors - ?.map((err) => err.message) - .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.` - ); - } + try { + const completionResult = await generateOpenAICompletions({ + client: llm, + document: document, + schema: schema, + prompt: prompt, + systemPrompt: systemPrompt, + mode: mode, + }); + // Validate the JSON output against the schema using AJV + if (schema) { + const validate = ajv.compile(schema); + if (!validate(completionResult.llm_extraction)) { + //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. + throw new Error( + `JSON parsing error(s): ${validate.errors + ?.map((err) => err.message) + .join( + ", " + )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. 
Try adjusting your prompt, and if it doesn't work reach out to support.` + ); + } + } - return completionResult; - } catch (error) { - Logger.error(`Error generating completions: ${error}`); - throw error; - } + return completionResult; + } catch (error) { + Logger.error(`Error generating completions: ${error}`); + throw error; + } default: throw new Error("Invalid client"); } diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 8ca6bbd4..23147b12 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -16,7 +16,6 @@ function prepareOpenAIDoc( document: Document, mode: "markdown" | "raw-html" ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null { - let markdown = document.markdown; let extractionTarget = document.markdown; @@ -33,34 +32,32 @@ function prepareOpenAIDoc( // ); } - - - // count number of tokens const numTokens = numTokensFromString(extractionTarget, "gpt-4"); if (numTokens > maxTokens) { // trim the document to the maximum number of tokens, tokens != characters - extractionTarget = extractionTarget.slice(0, (maxTokens * modifier)); + extractionTarget = extractionTarget.slice(0, maxTokens * modifier); } - return [[{ type: "text", text: extractionTarget }], numTokens]; } export async function generateOpenAICompletions({ client, - model = process.env.MODEL_NAME || "gpt-4o", + model = process.env.MODEL_NAME || "gpt-4o-mini", document, schema, //TODO - add zod dynamic type checking - prompt = defaultPrompt, + systemPrompt = defaultPrompt, + prompt, temperature, - mode + mode, }: { client: OpenAI; model?: string; document: Document; schema: any; // This should be replaced with a proper Zod schema type when available prompt?: string; + systemPrompt?: string; temperature?: number; mode: "markdown" | "raw-html"; }): Promise { @@ -70,45 +67,79 @@ export async function generateOpenAICompletions({ if (preparedDoc === null) { return { ...document, - warning: "LLM extraction was not performed since the document's content is empty or missing.", + warning: + "LLM extraction was not performed since the document's content is empty or missing.", }; } - const [content, numTokens] = preparedDoc; - const completion = await openai.chat.completions.create({ - model, - messages: [ - { - role: "system", - content: prompt, - }, - { role: "user", content }, - ], - tools: [ - { - type: "function", - function: { - name: "extract_content", - description: "Extracts the content from the given webpage(s)", - parameters: schema, + let completion; + let llmExtraction; + if (prompt && !schema) { + const jsonCompletion = await openai.chat.completions.create({ + model, + messages: [ + { + role: "system", + content: systemPrompt, }, - }, - ], - tool_choice: { "type": "function", "function": {"name": "extract_content"}}, - temperature, - }); + { role: "user", content }, + { + role: "user", + content: `Transform the above content into structured json output based on the following user request: ${prompt}`, + }, + ], + response_format: { type: "json_object" }, + temperature, + }); - const c = completion.choices[0].message.tool_calls[0].function.arguments; + try { + llmExtraction = JSON.parse( + jsonCompletion.choices[0].message.content.trim() + ); + } catch (e) { + throw new Error("Invalid JSON"); + } + } else { + completion = await openai.chat.completions.create({ + model, + messages: [ + { + role: "system", + content: systemPrompt, + }, + { role: "user", content }, + ], + tools: [ + { + type: 
"function", + function: { + name: "extract_content", + description: "Extracts the content from the given webpage(s)", + parameters: schema, + }, + }, + ], + tool_choice: { type: "function", function: { name: "extract_content" } }, + temperature, + }); + const c = completion.choices[0].message.tool_calls[0].function.arguments; - // Extract the LLM extraction content from the completion response - const llmExtraction = JSON.parse(c); + // Extract the LLM extraction content from the completion response + try { + llmExtraction = JSON.parse(c); + } catch (e) { + throw new Error("Invalid JSON"); + } + } // Return the document with the LLM extraction content added return { ...document, llm_extraction: llmExtraction, - warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined, + warning: + numTokens > maxTokens + ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` + : undefined, }; } - diff --git a/apps/api/src/lib/__tests__/html-to-markdown.test.ts b/apps/api/src/lib/__tests__/html-to-markdown.test.ts new file mode 100644 index 00000000..3c68c959 --- /dev/null +++ b/apps/api/src/lib/__tests__/html-to-markdown.test.ts @@ -0,0 +1,40 @@ +import { parseMarkdown } from '../html-to-markdown'; + +describe('parseMarkdown', () => { + it('should correctly convert simple HTML to Markdown', async () => { + const html = '
<p>Hello, world!</p>'; + const expectedMarkdown = 'Hello, world!'; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should convert complex HTML with nested elements to Markdown', async () => { + const html = '<p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul>
'; + const expectedMarkdown = 'Hello **bold** world!\n\n- List item'; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should return empty string when input is empty', async () => { + const html = ''; + const expectedMarkdown = ''; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should handle null input gracefully', async () => { + const html = null; + const expectedMarkdown = ''; + await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); + }); + + it('should handle various types of invalid HTML gracefully', async () => { + const invalidHtmls = [ + { html: '
<div><p>Unclosed tag', expected: 'Unclosed tag' }, + { html: '<div>Missing closing div', expected: 'Missing closing div' }, + { html: '<p><strong>Wrong nesting</p></strong>
', expected: '**Wrong nesting**' }, + { html: 'Link without closing tag', expected: '[Link without closing tag](http://example.com)' } + ]; + + for (const { html, expected } of invalidHtmls) { + await expect(parseMarkdown(html)).resolves.toBe(expected); + } + }); +}); diff --git a/apps/api/src/lib/__tests__/job-priority.test.ts b/apps/api/src/lib/__tests__/job-priority.test.ts new file mode 100644 index 00000000..82477379 --- /dev/null +++ b/apps/api/src/lib/__tests__/job-priority.test.ts @@ -0,0 +1,134 @@ +import { + getJobPriority, + addJobPriority, + deleteJobPriority, +} from "../job-priority"; +import { redisConnection } from "../../services/queue-service"; +import { PlanType } from "../../types"; + +jest.mock("../../services/queue-service", () => ({ + redisConnection: { + sadd: jest.fn(), + srem: jest.fn(), + scard: jest.fn(), + expire: jest.fn(), + }, +})); + +describe("Job Priority Tests", () => { + afterEach(() => { + jest.clearAllMocks(); + }); + + test("addJobPriority should add job_id to the set and set expiration", async () => { + const team_id = "team1"; + const job_id = "job1"; + await addJobPriority(team_id, job_id); + expect(redisConnection.sadd).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + job_id + ); + expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + }); + + test("deleteJobPriority should remove job_id from the set", async () => { + const team_id = "team1"; + const job_id = "job1"; + await deleteJobPriority(team_id, job_id); + expect(redisConnection.srem).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + job_id + ); + }); + + test("getJobPriority should return correct priority based on plan and set length", async () => { + const team_id = "team1"; + const plan: PlanType = "standard"; + (redisConnection.scard as jest.Mock).mockResolvedValue(150); + + const priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(10); + + (redisConnection.scard as jest.Mock).mockResolvedValue(250); + const priorityExceeded = await getJobPriority({ plan, team_id }); + expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.4) + }); + + test("getJobPriority should handle different plans correctly", async () => { + const team_id = "team1"; + + (redisConnection.scard as jest.Mock).mockResolvedValue(50); + let plan: PlanType = "hobby"; + let priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(10); + + (redisConnection.scard as jest.Mock).mockResolvedValue(150); + plan = "hobby"; + priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(25); // basePriority + Math.ceil((150 - 50) * 0.3) + + (redisConnection.scard as jest.Mock).mockResolvedValue(25); + plan = "free"; + priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(10); + + (redisConnection.scard as jest.Mock).mockResolvedValue(60); + plan = "free"; + priority = await getJobPriority({ plan, team_id }); + expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5) + }); + + test("addJobPriority should reset expiration time when adding new job", async () => { + const team_id = "team1"; + const job_id1 = "job1"; + const job_id2 = "job2"; + + await addJobPriority(team_id, job_id1); + expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + + // Clear the mock calls + (redisConnection.expire as jest.Mock).mockClear(); + + // Add another job + await addJobPriority(team_id, job_id2); + 
expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + }); + + test("Set should expire after 60 seconds", async () => { + const team_id = "team1"; + const job_id = "job1"; + + jest.useFakeTimers(); + + await addJobPriority(team_id, job_id); + expect(redisConnection.expire).toHaveBeenCalledWith( + `limit_team_id:${team_id}`, + 60 + ); + + // Fast-forward time by 59 seconds + jest.advanceTimersByTime(59000); + + // The set should still exist + expect(redisConnection.scard).not.toHaveBeenCalled(); + + // Fast-forward time by 2 more seconds (total 61 seconds) + jest.advanceTimersByTime(2000); + + // Check if the set has been removed (scard should return 0) + (redisConnection.scard as jest.Mock).mockResolvedValue(0); + const setSize = await redisConnection.scard(`limit_team_id:${team_id}`); + expect(setSize).toBe(0); + + jest.useRealTimers(); + }); +}); diff --git a/apps/api/src/lib/checkCredits.ts b/apps/api/src/lib/checkCredits.ts new file mode 100644 index 00000000..7e9d988d --- /dev/null +++ b/apps/api/src/lib/checkCredits.ts @@ -0,0 +1,32 @@ +import { checkTeamCredits } from "../services/billing/credit_billing"; +import { Logger } from "./logger"; + +type checkCreditsResponse = { + status: number; + error: string | null; +} + +export const checkCredits = async (team_id: string): Promise => { + try { + const { + success: creditsCheckSuccess, + message: creditsCheckMessage + } = await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return { + status: 402, + error: "Insufficient credits" + }; + } + } catch (error) { + Logger.error(error); + return { + status: 500, + error: "Error checking team credits. Please contact hello@firecrawl.com for help." + }; + } + return { + status: 200, + error: null + } +}; \ No newline at end of file diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 9e8a0cf6..9240018e 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -6,6 +6,7 @@ export type StoredCrawl = { crawlerOptions: any; pageOptions: any; team_id: string; + plan: string; robots?: string; cancelled?: boolean; createdAt: number; @@ -26,6 +27,14 @@ export async function getCrawl(id: string): Promise { return JSON.parse(x); } +export async function getCrawlExpiry(id: string): Promise { + const d = new Date(); + const ttl = await redisConnection.pttl("crawl:" + id); + d.setMilliseconds(d.getMilliseconds() + ttl); + d.setMilliseconds(0); + return d; +} + export async function addCrawlJob(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs", job_id); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); @@ -38,13 +47,27 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobDone(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); + await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id); await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX"); +} + +export async function getDoneJobsOrderedLength(id: string): Promise { + return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); +} + +export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise { + return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end); } export async function isCrawlFinished(id: string) { 
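  // A crawl counts as finished once the set of completed job ids is the same size as the set of all job ids registered for the crawl.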
return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs")); } +export async function isCrawlFinishedLocked(id: string) { + return (await redisConnection.exists("crawl:" + id + ":finish")); +} + export async function finishCrawl(id: string) { if (await isCrawlFinished(id)) { const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); diff --git a/apps/api/src/lib/custom-error.ts b/apps/api/src/lib/custom-error.ts index 20a01cb6..2ffe52e9 100644 --- a/apps/api/src/lib/custom-error.ts +++ b/apps/api/src/lib/custom-error.ts @@ -19,3 +19,4 @@ export class CustomError extends Error { Object.setPrototypeOf(this, CustomError.prototype); } } + diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index d833bda0..d7ec2a83 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -11,6 +11,8 @@ export interface Progress { } export type PageOptions = { + includeMarkdown?: boolean; + includeExtract?: boolean; onlyMainContent?: boolean; includeHtml?: boolean; includeRawHtml?: boolean; @@ -24,8 +26,9 @@ export type PageOptions = { parsePDF?: boolean; removeTags?: string | string[]; onlyIncludeTags?: string | string[]; + includeLinks?: boolean; useFastMode?: boolean; // beta - disableJSDom?: boolean; // beta + disableJsDom?: boolean; // beta atsv?: boolean; // beta }; @@ -33,6 +36,7 @@ export type ExtractorOptions = { mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html"; extractionPrompt?: string; extractionSchema?: Record; + userPrompt?: string; } export type SearchOptions = { diff --git a/apps/api/src/lib/go-html-to-md/README.md b/apps/api/src/lib/go-html-to-md/README.md new file mode 100644 index 00000000..4ad510c3 --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/README.md @@ -0,0 +1,7 @@ +To build the go-html-to-md library, run the following command: + +```bash +cd apps/api/src/lib/go-html-to-md +go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go +chmod +x html-to-markdown.so +``` \ No newline at end of file diff --git a/apps/api/src/lib/go-html-to-md/go.mod b/apps/api/src/lib/go-html-to-md/go.mod new file mode 100644 index 00000000..0836f441 --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/go.mod @@ -0,0 +1,14 @@ +module html-to-markdown.go + +go 1.19 + +require github.com/JohannesKaufmann/html-to-markdown v1.6.0 + +require ( + github.com/PuerkitoBio/goquery v1.9.2 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/kr/pretty v0.3.0 // indirect + golang.org/x/net v0.25.0 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect +) diff --git a/apps/api/src/lib/go-html-to-md/go.sum b/apps/api/src/lib/go-html-to-md/go.sum new file mode 100644 index 00000000..7961629d --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/go.sum @@ -0,0 +1,93 @@ +github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k= +github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ= +github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= +github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/creack/pty 
v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= +github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= +github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= +github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U= +github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net 
v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 
v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/apps/api/src/lib/go-html-to-md/html-to-markdown.go b/apps/api/src/lib/go-html-to-md/html-to-markdown.go new file mode 100644 index 00000000..9905a69a --- /dev/null +++ b/apps/api/src/lib/go-html-to-md/html-to-markdown.go @@ -0,0 +1,25 @@ +package main + +import ( + "C" + "log" + + md "github.com/JohannesKaufmann/html-to-markdown" + "github.com/JohannesKaufmann/html-to-markdown/plugin" +) + +//export ConvertHTMLToMarkdown +func ConvertHTMLToMarkdown(html *C.char) *C.char { + converter := md.NewConverter("", true, nil) + converter.Use(plugin.GitHubFlavored()) + + markdown, err := converter.ConvertString(C.GoString(html)) + if err != nil { + log.Fatal(err) + } + return C.CString(markdown) +} + +func main() { + // This function is required for the main package +} diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 002cb7be..a542a434 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,8 +1,68 @@ -export async function parseMarkdown(html: string) { +import koffi from 'koffi'; +import { join } from 'path'; +import "../services/sentry" +import * as Sentry from "@sentry/node"; + +import dotenv from 'dotenv'; +import { Logger } from './logger'; +dotenv.config(); + +// TODO: add a timeout to the Go parser + +class GoMarkdownConverter { + private static instance: GoMarkdownConverter; + private convert: any; + + private constructor() { + const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so'); + const lib = koffi.load(goExecutablePath); + this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); + } + + public static getInstance(): GoMarkdownConverter { + if (!GoMarkdownConverter.instance) { + GoMarkdownConverter.instance = new GoMarkdownConverter(); + } + return GoMarkdownConverter.instance; + } + + public async convertHTMLToMarkdown(html: string): Promise { + return new Promise((resolve, reject) => { + this.convert.async(html, (err: Error, res: string) => { + if (err) { + reject(err); + } else { + resolve(res); + } + }); + }); + } +} + +export async function parseMarkdown(html: string): Promise { + if (!html) { + return ''; + } + + try { + if (process.env.USE_GO_MARKDOWN_PARSER == "true") { + const converter = GoMarkdownConverter.getInstance(); + let markdownContent = await converter.convertHTMLToMarkdown(html); + + markdownContent = processMultiLineLinks(markdownContent); + markdownContent = removeSkipToContentLinks(markdownContent); + Logger.info(`HTML to Markdown conversion using Go parser successful`); + return markdownContent; + } + } catch (error) { + Sentry.captureException(error); + Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`); + } + + // Fallback to TurndownService if Go parser fails or is not enabled var 
TurndownService = require("turndown"); - var turndownPluginGfm = require('joplin-turndown-plugin-gfm') - + var turndownPluginGfm = require('joplin-turndown-plugin-gfm'); const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { @@ -21,29 +81,20 @@ export async function parseMarkdown(html: string) { }); var gfm = turndownPluginGfm.gfm; turndownService.use(gfm); - let markdownContent = ""; - const turndownPromise = new Promise((resolve, reject) => { - try { - const result = turndownService.turndown(html); - resolve(result); - } catch (error) { - reject("Error converting HTML to Markdown: " + error); - } - }); - - const timeoutPromise = new Promise((resolve, reject) => { - const timeout = 5000; // Timeout in milliseconds - setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout); - }); try { - markdownContent = await Promise.race([turndownPromise, timeoutPromise]); + let markdownContent = await turndownService.turndown(html); + markdownContent = processMultiLineLinks(markdownContent); + markdownContent = removeSkipToContentLinks(markdownContent); + + return markdownContent; } catch (error) { - console.error(error); + console.error("Error converting HTML to Markdown: ", error); return ""; // Optionally return an empty string or handle the error as needed } +} - // multiple line links +function processMultiLineLinks(markdownContent: string): string { let insideLinkContent = false; let newMarkdownContent = ""; let linkOpenCount = 0; @@ -63,12 +114,14 @@ export async function parseMarkdown(html: string) { newMarkdownContent += char; } } - markdownContent = newMarkdownContent; + return newMarkdownContent; +} +function removeSkipToContentLinks(markdownContent: string): string { // Remove [Skip to Content](#page) and [Skip to content](#skip) - markdownContent = markdownContent.replace( + const newMarkdownContent = markdownContent.replace( /\[Skip to Content\]\(#[^\)]*\)/gi, "" ); - return markdownContent; -} + return newMarkdownContent; +} \ No newline at end of file diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts new file mode 100644 index 00000000..bb6158f9 --- /dev/null +++ b/apps/api/src/lib/job-priority.ts @@ -0,0 +1,91 @@ +import { redisConnection } from "../../src/services/queue-service"; +import { PlanType } from "../../src/types"; +import { Logger } from "./logger"; + +const SET_KEY_PREFIX = "limit_team_id:"; +export async function addJobPriority(team_id, job_id) { + try { + const setKey = SET_KEY_PREFIX + team_id; + + // Add scrape job id to the set + await redisConnection.sadd(setKey, job_id); + + // This approach will reset the expiration time to 60 seconds every time a new job is added to the set. 
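+ // As a result, the set (and the team's active-job count used by getJobPriority) expires 60 seconds after the last job was added.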
+ await redisConnection.expire(setKey, 60); + } catch (e) { + Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`); + } +} + +export async function deleteJobPriority(team_id, job_id) { + try { + const setKey = SET_KEY_PREFIX + team_id; + + // remove job_id from the set + await redisConnection.srem(setKey, job_id); + } catch (e) { + Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`); + } +} + +export async function getJobPriority({ + plan, + team_id, + basePriority = 10, +}: { + plan: PlanType; + team_id: string; + basePriority?: number; +}): Promise { + try { + const setKey = SET_KEY_PREFIX + team_id; + + // Get the length of the set + const setLength = await redisConnection.scard(setKey); + + // Determine the priority based on the plan and set length + let planModifier = 1; + let bucketLimit = 0; + + switch (plan) { + case "free": + bucketLimit = 25; + planModifier = 0.5; + break; + case "hobby": + bucketLimit = 100; + planModifier = 0.3; + break; + case "standard": + case "standardnew": + bucketLimit = 200; + planModifier = 0.2; + break; + case "growth": + case "growthdouble": + bucketLimit = 400; + planModifier = 0.1; + break; + + default: + bucketLimit = 25; + planModifier = 1; + break; + } + + // if length set is smaller than set, just return base priority + if (setLength <= bucketLimit) { + return basePriority; + } else { + // If not, we keep base priority + planModifier + return Math.ceil( + basePriority + Math.ceil((setLength - bucketLimit) * planModifier) + ); + } + } catch (e) { + Logger.error( + `Get job priority failed: ${team_id}, ${plan}, ${basePriority}` + ); + return basePriority; + } +} diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index 872dbf51..cb8b4119 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -1,3 +1,6 @@ +import { configDotenv } from "dotenv"; +configDotenv(); + enum LogLevel { NONE = 'NONE', // No logs will be output. ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. @@ -25,7 +28,8 @@ export class Logger { const color = Logger.colors[level]; console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); - // if (process.env.USE_DB_AUTH) { + // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + // if (useDbAuthentication) { // save to supabase? another place? 
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); // } diff --git a/apps/api/src/lib/map-cosine.ts b/apps/api/src/lib/map-cosine.ts new file mode 100644 index 00000000..db2491a9 --- /dev/null +++ b/apps/api/src/lib/map-cosine.ts @@ -0,0 +1,46 @@ +import { Logger } from "./logger"; + +export function performCosineSimilarity(links: string[], searchQuery: string) { + try { + // Function to calculate cosine similarity + const cosineSimilarity = (vec1: number[], vec2: number[]): number => { + const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0); + const magnitude1 = Math.sqrt( + vec1.reduce((sum, val) => sum + val * val, 0) + ); + const magnitude2 = Math.sqrt( + vec2.reduce((sum, val) => sum + val * val, 0) + ); + if (magnitude1 === 0 || magnitude2 === 0) return 0; + return dotProduct / (magnitude1 * magnitude2); + }; + + // Function to convert text to vector + const textToVector = (text: string): number[] => { + const words = searchQuery.toLowerCase().split(/\W+/); + return words.map((word) => { + const count = (text.toLowerCase().match(new RegExp(word, "g")) || []) + .length; + return count / text.length; + }); + }; + + // Calculate similarity scores + const similarityScores = links.map((link) => { + const linkVector = textToVector(link); + const searchVector = textToVector(searchQuery); + return cosineSimilarity(linkVector, searchVector); + }); + + // Sort links based on similarity scores and print scores + const a = links + .map((link, index) => ({ link, score: similarityScores[index] })) + .sort((a, b) => b.score - a.score); + + links = a.map((item) => item.link); + return links; + } catch (error) { + Logger.error(`Error performing cosine similarity: ${error}`); + return links; + } +} diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 04850b4e..ad70dfef 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -2,6 +2,8 @@ import { Job } from "bullmq"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; import { Logger } from "./logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export type ScrapeErrorEvent = { type: "error", @@ -36,7 +38,8 @@ export class ScrapeEvents { static async insert(jobId: string, content: ScrapeEvent) { if (jobId === "TEST") return null; - if (process.env.USE_DB_AUTHENTICATION) { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { try { const result = await supabase.from("scrape_events").insert({ job_id: jobId, diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index b4247883..c418a6e0 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -1,10 +1,17 @@ import { supabase_service } from "../services/supabase"; +import { Logger } from "./logger"; +import * as Sentry from "@sentry/node"; +/** + * Get a single firecrawl_job by ID + * @param jobId ID of Job + * @returns {any | null} Job + */ export const supabaseGetJobById = async (jobId: string) => { const { data, error } = await supabase_service - .from('firecrawl_jobs') - .select('*') - .eq('job_id', jobId) + .from("firecrawl_jobs") + .select("*") + .eq("job_id", jobId) .single(); if (error) { @@ -16,15 +23,22 @@ export const supabaseGetJobById = async (jobId: string) => { } return data; -} +}; +/** + * Get multiple firecrawl_jobs by ID. 
Use this if you're not requesting a lot (50+) of jobs at once. + * @param jobIds IDs of Jobs + * @returns {any[]} Jobs + */ export const supabaseGetJobsById = async (jobIds: string[]) => { const { data, error } = await supabase_service - .from('firecrawl_jobs') - .select('*') - .in('job_id', jobIds); + .from("firecrawl_jobs") + .select() + .in("job_id", jobIds); if (error) { + Logger.error(`Error in supabaseGetJobsById: ${error}`); + Sentry.captureException(error); return []; } @@ -33,5 +47,47 @@ export const supabaseGetJobsById = async (jobIds: string[]) => { } return data; -} +}; +/** + * Get multiple firecrawl_jobs by crawl ID. Use this if you need a lot of jobs at once. + * @param crawlId ID of crawl + * @returns {any[]} Jobs + */ +export const supabaseGetJobsByCrawlId = async (crawlId: string) => { + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .select() + .eq("crawl_id", crawlId) + + if (error) { + Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`); + Sentry.captureException(error); + return []; + } + + if (!data) { + return []; + } + + return data; +}; + + +export const supabaseGetJobByIdOnlyData = async (jobId: string) => { + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .select("docs, team_id") + .eq("job_id", jobId) + .single(); + + if (error) { + return null; + } + + if (!data) { + return null; + } + + return data; +}; \ No newline at end of file diff --git a/apps/api/src/lib/validateUrl.test.ts b/apps/api/src/lib/validateUrl.test.ts new file mode 100644 index 00000000..eec39f97 --- /dev/null +++ b/apps/api/src/lib/validateUrl.test.ts @@ -0,0 +1,159 @@ +import { isSameDomain, removeDuplicateUrls } from "./validateUrl"; +import { isSameSubdomain } from "./validateUrl"; + +describe("isSameDomain", () => { + it("should return true for a subdomain", () => { + const result = isSameDomain("http://sub.example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return true for the same domain", () => { + const result = isSameDomain("http://example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return false for different domains", () => { + const result = isSameDomain("http://example.com", "http://another.com"); + expect(result).toBe(false); + }); + + it("should return true for a subdomain with different protocols", () => { + const result = isSameDomain("https://sub.example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return false for invalid URLs", () => { + const result = isSameDomain("invalid-url", "http://example.com"); + expect(result).toBe(false); + const result2 = isSameDomain("http://example.com", "invalid-url"); + expect(result2).toBe(false); + }); + + it("should return true for a subdomain with www prefix", () => { + const result = isSameDomain("http://www.sub.example.com", "http://example.com"); + expect(result).toBe(true); + }); + + it("should return true for the same domain with www prefix", () => { + const result = isSameDomain("http://docs.s.s.example.com", "http://example.com"); + expect(result).toBe(true); + }); +}); + + + + +describe("isSameSubdomain", () => { + it("should return false for a subdomain", () => { + const result = isSameSubdomain("http://example.com", "http://docs.example.com"); + expect(result).toBe(false); + }); + + it("should return true for the same subdomain", () => { + const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com"); + expect(result).toBe(true); + }); + + 
it("should return false for different subdomains", () => { + const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com"); + expect(result).toBe(false); + }); + + it("should return false for different domains", () => { + const result = isSameSubdomain("http://example.com", "http://another.com"); + expect(result).toBe(false); + }); + + it("should return false for invalid URLs", () => { + const result = isSameSubdomain("invalid-url", "http://example.com"); + expect(result).toBe(false); + const result2 = isSameSubdomain("http://example.com", "invalid-url"); + expect(result2).toBe(false); + }); + + it("should return true for the same subdomain with different protocols", () => { + const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com"); + expect(result).toBe(true); + }); + + it("should return true for the same subdomain with www prefix", () => { + const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com"); + expect(result).toBe(true); + }); + + it("should return false for a subdomain with www prefix and different subdomain", () => { + const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com"); + expect(result).toBe(false); + }); +}); + +describe("removeDuplicateUrls", () => { + it("should remove duplicate URLs with different protocols", () => { + const urls = [ + "http://example.com", + "https://example.com", + "http://www.example.com", + "https://www.example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); + + it("should keep URLs with different paths", () => { + const urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page1?param=1", + "https://example.com/page1#section1" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual([ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page1?param=1", + "https://example.com/page1#section1" + ]); + }); + + it("should prefer https over http", () => { + const urls = [ + "http://example.com", + "https://example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); + + it("should prefer non-www over www", () => { + const urls = [ + "https://www.example.com", + "https://example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); + + it("should handle empty input", () => { + const urls: string[] = []; + const result = removeDuplicateUrls(urls); + expect(result).toEqual([]); + }); + + it("should handle URLs with different cases", () => { + const urls = [ + "https://EXAMPLE.com", + "https://example.com" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://EXAMPLE.com"]); + }); + + it("should handle URLs with trailing slashes", () => { + const urls = [ + "https://example.com", + "https://example.com/" + ]; + const result = removeDuplicateUrls(urls); + expect(result).toEqual(["https://example.com"]); + }); +}); diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts index 2d2111c8..14a74de8 100644 --- a/apps/api/src/lib/validateUrl.ts +++ b/apps/api/src/lib/validateUrl.ts @@ -1,9 +1,8 @@ - -const protocolIncluded = (url: string) => { +export const protocolIncluded = (url: string) => { // if :// not in the start of the url assume http (maybe https?) // regex checks if :// appears before any . 
- return(/^([^.:]+:\/\/)/.test(url)); -} + return /^([^.:]+:\/\/)/.test(url); +}; const getURLobj = (s: string) => { // URL fails if we dont include the protocol ie google.com @@ -18,7 +17,6 @@ const getURLobj = (s: string) => { }; export const checkAndUpdateURL = (url: string) => { - if (!protocolIncluded(url)) { url = `http://${url}`; } @@ -30,9 +28,143 @@ export const checkAndUpdateURL = (url: string) => { const typedUrlObj = urlObj as URL; - if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { + if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { throw new Error("Invalid URL"); } return { urlObj: typedUrlObj, url: url }; +}; + +export const checkUrl = (url: string) => { + const { error, urlObj } = getURLobj(url); + if (error) { + throw new Error("Invalid URL"); + } + + const typedUrlObj = urlObj as URL; + + if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { + throw new Error("Invalid URL"); + } + + if ((url.split(".")[0].match(/:/g) || []).length !== 1) { + throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com + } + + return url; +}; + +/** + * Same domain check + * It checks if the domain of the url is the same as the base url + * It accounts true for subdomains and www.subdomains + * @param url + * @param baseUrl + * @returns + */ +export function isSameDomain(url: string, baseUrl: string) { + const { urlObj: urlObj1, error: error1 } = getURLobj(url); + const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl); + + if (error1 || error2) { + return false; + } + + const typedUrlObj1 = urlObj1 as URL; + const typedUrlObj2 = urlObj2 as URL; + + const cleanHostname = (hostname: string) => { + return hostname.startsWith('www.') ? hostname.slice(4) : hostname; + }; + + const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.'); + const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.'); + + return domain1 === domain2; +} + + +export function isSameSubdomain(url: string, baseUrl: string) { + const { urlObj: urlObj1, error: error1 } = getURLobj(url); + const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl); + + if (error1 || error2) { + return false; + } + + const typedUrlObj1 = urlObj1 as URL; + const typedUrlObj2 = urlObj2 as URL; + + const cleanHostname = (hostname: string) => { + return hostname.startsWith('www.') ? 
hostname.slice(4) : hostname; + }; + + const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.'); + const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.'); + + const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.'); + const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.'); + + // Check if the domains are the same and the subdomains are the same + return domain1 === domain2 && subdomain1 === subdomain2; +} + + +export const checkAndUpdateURLForMap = (url: string) => { + if (!protocolIncluded(url)) { + url = `http://${url}`; + } + // remove last slash if present + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + + + const { error, urlObj } = getURLobj(url); + if (error) { + throw new Error("Invalid URL"); + } + + const typedUrlObj = urlObj as URL; + + if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { + throw new Error("Invalid URL"); + } + + // remove any query params + url = url.split("?")[0].trim(); + + return { urlObj: typedUrlObj, url: url }; +}; + + + + + +export function removeDuplicateUrls(urls: string[]): string[] { + const urlMap = new Map(); + + for (const url of urls) { + const parsedUrl = new URL(url); + const protocol = parsedUrl.protocol; + const hostname = parsedUrl.hostname.replace(/^www\./, ''); + const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash; + + const key = `${hostname}${path}`; + + if (!urlMap.has(key)) { + urlMap.set(key, url); + } else { + const existingUrl = new URL(urlMap.get(key)!); + const existingProtocol = existingUrl.protocol; + + if (protocol === 'https:' && existingProtocol === 'http:') { + urlMap.set(key, url); + } else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) { + urlMap.set(key, url); + } + } + } + + return [...new Set(Array.from(urlMap.values()))]; } \ No newline at end of file diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index 353c144b..b45b8973 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,5 +1,8 @@ import { AuthResponse } from "../../src/types"; import { Logger } from "./logger"; +import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); let warningCount = 0; @@ -7,7 +10,8 @@ export function withAuth( originalFunction: (...args: U) => Promise ) { return async function (...args: U): Promise { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { if (warningCount < 5) { Logger.warn("You're bypassing authentication"); warningCount++; @@ -17,6 +21,7 @@ export function withAuth( try { return await originalFunction(...args); } catch (error) { + Sentry.captureException(error); Logger.error(`Error in withAuth function: ${error}`); return { success: false, error: error.message } as T; } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index aea7876e..f67a1cd0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -12,6 +12,8 @@ import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; import { Logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function startWebScraperPipeline({ job, @@ 
-26,7 +28,12 @@ export async function startWebScraperPipeline({ mode: job.data.mode, crawlerOptions: job.data.crawlerOptions, extractorOptions: job.data.extractorOptions, - pageOptions: job.data.pageOptions, + pageOptions: { + ...job.data.pageOptions, + ...(job.data.crawl_id ? ({ + includeRawHtml: true, + }): {}), + }, inProgress: (progress) => { Logger.debug(`🐂 Job in progress ${job.id}`); if (progress.currentDocument) { @@ -49,6 +56,7 @@ export async function startWebScraperPipeline({ team_id: job.data.team_id, bull_job_id: job.id.toString(), priority: job.opts.priority, + is_scrape: job.data.is_scrape ?? false, })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ @@ -63,6 +71,7 @@ export async function runWebScraper({ team_id, bull_job_id, priority, + is_scrape=false, }: RunWebScraperParams): Promise { try { const provider = new WebScraperDataProvider(); @@ -109,18 +118,16 @@ export async function runWebScraper({ } }) : docs; - - const billingResult = await billTeam(team_id, filteredDocs.length); - if (!billingResult.success) { - // throw new Error("Failed to bill team, no subscription was found"); - return { - success: false, - message: "Failed to bill team, no subscription was found", - docs: [], - }; + if(is_scrape === false) { + billTeam(team_id, filteredDocs.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); } + + // This is where the returnvalue from the job is set onSuccess(filteredDocs, mode); @@ -134,7 +141,8 @@ export async function runWebScraper({ const saveJob = async (job: Job, result: any, token: string, mode: string) => { try { - if (process.env.USE_DB_AUTHENTICATION === "true") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { const { data, error } = await supabase_service .from("firecrawl_jobs") .update({ docs: result }) diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index d32808ce..38611eac 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -1,11 +1,11 @@ import express from "express"; -import { redisHealthController } from "../controllers/admin/redis-health"; +import { redisHealthController } from "../controllers/v0/admin/redis-health"; import { autoscalerController, checkQueuesController, cleanBefore24hCompleteJobsController, queuesController, -} from "../controllers/admin/queue"; +} from "../controllers/v0/admin/queue"; export const adminRouter = express.Router(); diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 9c68d9bb..3a7bda65 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -1,14 +1,14 @@ import express from "express"; -import { crawlController } from "../../src/controllers/crawl"; -import { crawlStatusController } from "../../src/controllers/crawl-status"; -import { scrapeController } from "../../src/controllers/scrape"; -import { crawlPreviewController } from "../../src/controllers/crawlPreview"; -import { crawlJobStatusPreviewController } from "../../src/controllers/status"; -import { searchController } from "../../src/controllers/search"; -import { crawlCancelController } from "../../src/controllers/crawl-cancel"; -import { keyAuthController } from "../../src/controllers/keyAuth"; -import { livenessController } from "../controllers/liveness"; -import { readinessController } from 
"../controllers/readiness"; +import { crawlController } from "../../src/controllers/v0/crawl"; +import { crawlStatusController } from "../../src/controllers/v0/crawl-status"; +import { scrapeController } from "../../src/controllers/v0/scrape"; +import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview"; +import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status"; +import { searchController } from "../../src/controllers/v0/search"; +import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel"; +import { keyAuthController } from "../../src/controllers/v0/keyAuth"; +import { livenessController } from "../controllers/v0/liveness"; +import { readinessController } from "../controllers/v0/readiness"; export const v0Router = express.Router(); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts new file mode 100644 index 00000000..484ab5dc --- /dev/null +++ b/apps/api/src/routes/v1.ts @@ -0,0 +1,164 @@ +import express, { NextFunction, Request, Response } from "express"; +import { crawlController } from "../controllers/v1/crawl"; +// import { crawlStatusController } from "../../src/controllers/v1/crawl-status"; +import { scrapeController } from "../../src/controllers/v1/scrape"; +import { crawlStatusController } from "../controllers/v1/crawl-status"; +import { mapController } from "../controllers/v1/map"; +import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types"; +import { RateLimiterMode } from "../types"; +import { authenticateUser } from "../controllers/auth"; +import { createIdempotencyKey } from "../services/idempotency/create"; +import { validateIdempotencyKey } from "../services/idempotency/validate"; +import { checkTeamCredits } from "../services/billing/credit_billing"; +import expressWs from "express-ws"; +import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { crawlCancelController } from "../controllers/v1/crawl-cancel"; +import { Logger } from "../lib/logger"; +import { scrapeStatusController } from "../controllers/v1/scrape-status"; +// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; +// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; +// import { searchController } from "../../src/controllers/v1/search"; +// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel"; +// import { keyAuthController } from "../../src/controllers/v1/keyAuth"; +// import { livenessController } from "../controllers/v1/liveness"; +// import { readinessController } from "../controllers/v1/readiness"; + +function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void { + return (req, res, next) => { + (async () => { + if (!minimum && req.body) { + minimum = (req.body as any)?.limit ?? 
1; + } + const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum); + if (!success) { + Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); + if (!res.headersSent) { + return res.status(402).json({ success: false, error: "Insufficient credits" }); + } + } + req.account = { remainingCredits } + next(); + })() + .catch(err => next(err)); + }; +} + +export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { + return (req, res, next) => { + (async () => { + const { success, team_id, error, status, plan } = await authenticateUser( + req, + res, + rateLimiterMode, + ); + + if (!success) { + if (!res.headersSent) { + return res.status(status).json({ success: false, error }); + } + } + + req.auth = { team_id, plan }; + next(); + })() + .catch(err => next(err)); + } +} + +function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) { + (async () => { + if (req.headers["x-idempotency-key"]) { + const isIdempotencyValid = await validateIdempotencyKey(req); + if (!isIdempotencyValid) { + if (!res.headersSent) { + return res.status(409).json({ success: false, error: "Idempotency key already used" }); + } + } + createIdempotencyKey(req); + } + next(); + })() + .catch(err => next(err)); +} + +function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { + if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { + if (!res.headersSent) { + return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); + } + } + next(); +} + +function wrap(controller: (req: Request, res: Response) => Promise): (req: Request, res: Response, next: NextFunction) => any { + return (req, res, next) => { + controller(req, res) + .catch(err => next(err)) + } +} + +expressWs(express()); + +export const v1Router = express.Router(); + +v1Router.post( + "/scrape", + authMiddleware(RateLimiterMode.Scrape), + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(scrapeController) +); + +v1Router.post( + "/crawl", + authMiddleware(RateLimiterMode.Crawl), + checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, + wrap(crawlController) +); + +v1Router.post( + "/map", + authMiddleware(RateLimiterMode.Map), + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(mapController) +); + +v1Router.get( + "/crawl/:jobId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(crawlStatusController) +); + +v1Router.get( + "/scrape/:jobId", + wrap(scrapeStatusController) +); + +v1Router.ws( + "/crawl/:jobId", + crawlStatusWSController +); + +// v1Router.post("/crawlWebsitePreview", crawlPreviewController); + + +v1Router.delete( + "/crawl/:jobId", + authMiddleware(RateLimiterMode.Crawl), + crawlCancelController +); +// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController); + +// // Auth route for key based authentication +// v1Router.get("/keyAuth", keyAuthController); + +// // Search routes +// v0Router.post("/search", searchController); + +// Health/Probe routes +// v1Router.get("/health/liveness", livenessController); +// v1Router.get("/health/readiness", readinessController); diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index da66830b..02c8a7e0 100644 --- 
a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -34,4 +34,4 @@ it('should return a list of links on the firecrawl.ai page', async () => { expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); expect(result.linksOnPage).toContain('https://flutterbricks.com/features') -}, 10000); +}, 15000); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 92b9ae40..d5dadaf8 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -309,6 +309,23 @@ export class WebCrawler { return null; } + public extractLinksFromHTML(html: string, url: string) { + let links: string[] = []; + + const $ = load(html); + $("a").each((_, element) => { + const href = $(element).attr("href"); + if (href) { + const u = this.filterURL(href, url); + if (u !== null) { + links.push(u); + } + } + }); + + return links; + } + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; @@ -352,15 +369,7 @@ export class WebCrawler { links.push({ url, html: content, pageStatusCode, pageError }); } - $("a").each((_, element) => { - const href = $(element).attr("href"); - if (href) { - const u = this.filterURL(href, url); - if (u !== null) { - links.push({ url: u, html: content, pageStatusCode, pageError }); - } - } - }); + links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError }))); if (this.visited.size === 1) { return links; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 38d0cc32..2f7efa47 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -294,28 +294,32 @@ export class WebScraperDataProvider { documents = await this.getSitemapData(this.urls[0], documents); } - documents = this.applyPathReplacements(documents); - // documents = await this.applyImgAltText(documents); - if ( - (this.extractorOptions.mode === "llm-extraction" || - this.extractorOptions.mode === "llm-extraction-from-markdown") && - this.mode === "single_urls" - ) { - documents = await generateCompletions( - documents, - this.extractorOptions, - "markdown" - ); + if (this.pageOptions.includeMarkdown) { + documents = this.applyPathReplacements(documents); } - if ( - this.extractorOptions.mode === "llm-extraction-from-raw-html" && - this.mode === "single_urls" - ) { - documents = await generateCompletions( - documents, - this.extractorOptions, - "raw-html" - ); + + if (!this.pageOptions.includeHtml) { + for (let document of documents) { + delete document.html; + } + } + + // documents = await this.applyImgAltText(documents); + if (this.mode === "single_urls" && this.pageOptions.includeExtract) { + const extractionMode = this.extractorOptions?.mode ?? "markdown"; + const completionMode = extractionMode === "llm-extraction-from-raw-html" ? 
"raw-html" : "markdown"; + + if ( + extractionMode === "llm-extraction" || + extractionMode === "llm-extraction-from-markdown" || + extractionMode === "llm-extraction-from-raw-html" + ) { + documents = await generateCompletions( + documents, + this.extractorOptions, + completionMode + ); + } } return documents.concat(pdfDocuments).concat(docxDocuments); } @@ -347,6 +351,7 @@ export class WebScraperDataProvider { }); return { content: content, + markdown: content, metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, provider: "web-scraper", }; @@ -569,12 +574,24 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { - onlyMainContent: false, - includeHtml: false, - replaceAllPathsWithAbsolutePaths: false, - parsePDF: true, - removeTags: [], + this.pageOptions = { + onlyMainContent: options.pageOptions?.onlyMainContent ?? false, + includeHtml: options.pageOptions?.includeHtml ?? false, + replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true, + parsePDF: options.pageOptions?.parsePDF ?? true, + onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [], + removeTags: options.pageOptions?.removeTags ?? [], + includeMarkdown: options.pageOptions?.includeMarkdown ?? true, + includeRawHtml: options.pageOptions?.includeRawHtml ?? false, + includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false, + waitFor: options.pageOptions?.waitFor ?? undefined, + headers: options.pageOptions?.headers ?? undefined, + includeLinks: options.pageOptions?.includeLinks ?? true, + fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, + screenshot: options.pageOptions?.screenshot ?? false, + useFastMode: options.pageOptions?.useFastMode ?? false, + disableJsDom: options.pageOptions?.disableJsDom ?? false, + atsv: options.pageOptions?.atsv ?? false }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = @@ -599,6 +616,8 @@ export class WebScraperDataProvider { this.priority = options.priority; this.teamId = options.teamId ?? null; + + // make sure all urls start with https:// this.urls = this.urls.map((url) => { if (!url.trim().startsWith("http")) { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index aa86ad5e..80ac7924 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -55,7 +55,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); let waitParam = reqParams["params"]?.wait ?? waitFor; - let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; + let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? 
fireEngineOptions; @@ -69,15 +69,15 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? - Logger.info( - `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` - ); - if (pageOptions?.useFastMode) { fireEngineOptionsParam.engine = "tlsclient"; engine = "tlsclient"; } + Logger.info( + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` + ); + // atsv is only available for beta customers const betaCustomersString = process.env.BETA_CUSTOMERS; const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : []; @@ -96,6 +96,7 @@ export async function scrapWithFireEngine({ const _response = await Sentry.startSpan({ name: "Call to fire-engine" }, async span => { + return await axiosInstance.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, { @@ -104,12 +105,13 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, fullPageScreenshot: fullPageScreenshotParam, headers: headers, - pageOptions: pageOptions, disableJsDom: pageOptions?.disableJsDom ?? false, priority, engine, instantReturn: true, ...fireEngineOptionsParam, + atsv: pageOptions?.atsv ?? false, + scrollXPaths: pageOptions?.scrollXPaths ?? [], }, { headers: { @@ -125,7 +127,7 @@ export async function scrapWithFireEngine({ let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { - await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second + await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 6998a665..2be65899 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -23,12 +23,15 @@ import { clientSideError } from "../../strings"; dotenv.config(); +const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined; +const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined; + export const baseScrapers = [ - "fire-engine;chrome-cdp", - "fire-engine", - "scrapingBee", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", - "scrapingBeeLoad", + useFireEngine ? "fire-engine;chrome-cdp" : undefined, + useFireEngine ? "fire-engine" : undefined, + useScrapingBee ? "scrapingBee" : undefined, + useFireEngine ? undefined : "playwright", + useScrapingBee ? "scrapingBeeLoad" : undefined, "fetch", ].filter(Boolean); @@ -85,23 +88,23 @@ function getScrapingFallbackOrder( }); let defaultOrder = [ - !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", - !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine", - "scrapingBee", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", - "scrapingBeeLoad", + useFireEngine ? "fire-engine;chrome-cdp" : undefined, + useFireEngine ? "fire-engine" : undefined, + useScrapingBee ? 
"scrapingBee" : undefined, + useScrapingBee ? "scrapingBeeLoad" : undefined, + useFireEngine ? undefined : "playwright", "fetch", ].filter(Boolean); - if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { - defaultOrder = [ - "fire-engine", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", - ...defaultOrder.filter( - (scraper) => scraper !== "fire-engine" && scraper !== "playwright" - ), - ].filter(Boolean); - } + // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { + // defaultOrder = [ + // "fire-engine", + // useFireEngine ? undefined : "playwright", + // ...defaultOrder.filter( + // (scraper) => scraper !== "fire-engine" && scraper !== "playwright" + // ), + // ].filter(Boolean); + // } const filteredDefaultOrder = defaultOrder.filter( (scraper: (typeof baseScrapers)[number]) => @@ -122,22 +125,42 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( jobId: string, urlToScrap: string, - pageOptions: PageOptions = { - onlyMainContent: true, - includeHtml: false, - includeRawHtml: false, - waitFor: 0, - screenshot: false, - fullPageScreenshot: false, - headers: undefined, - }, - extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown", - }, - existingHtml: string = "", + pageOptions: PageOptions, + extractorOptions?: ExtractorOptions, + existingHtml?: string, priority?: number, teamId?: string ): Promise { + pageOptions = { + includeMarkdown: pageOptions.includeMarkdown ?? true, + includeExtract: pageOptions.includeExtract ?? false, + onlyMainContent: pageOptions.onlyMainContent ?? false, + includeHtml: pageOptions.includeHtml ?? false, + includeRawHtml: pageOptions.includeRawHtml ?? false, + waitFor: pageOptions.waitFor ?? undefined, + screenshot: pageOptions.screenshot ?? false, + fullPageScreenshot: pageOptions.fullPageScreenshot ?? false, + headers: pageOptions.headers ?? undefined, + includeLinks: pageOptions.includeLinks ?? true, + replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true, + parsePDF: pageOptions.parsePDF ?? true, + removeTags: pageOptions.removeTags ?? [], + onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], + useFastMode: pageOptions.useFastMode ?? false, + disableJsDom: pageOptions.disableJsDom ?? false, + atsv: pageOptions.atsv ?? false + } + + if (extractorOptions) { + extractorOptions = { + mode: extractorOptions?.mode ?? 
"llm-extraction-from-markdown", + } + } + + if (!existingHtml) { + existingHtml = ""; + } + urlToScrap = urlToScrap.trim(); const attemptScraping = async ( @@ -180,6 +203,7 @@ export async function scrapSingleUrl( fireEngineOptions: { engine: engine, atsv: pageOptions.atsv, + disableJsDom: pageOptions.disableJsDom, }, priority, teamId, @@ -341,8 +365,8 @@ export async function scrapSingleUrl( pageError = undefined; } - if (text && text.trim().length >= 100) { - Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); + if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; } if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) { @@ -364,20 +388,22 @@ export async function scrapSingleUrl( let linksOnPage: string[] | undefined; - linksOnPage = extractLinks(rawHtml, urlToScrap); + if (pageOptions.includeLinks) { + linksOnPage = extractLinks(rawHtml, urlToScrap); + } let document: Document; if (screenshot && screenshot.length > 0) { document = { content: text, - markdown: text, + markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) ? rawHtml : undefined, - linksOnPage, + linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, metadata: { ...metadata, screenshot: screenshot, @@ -389,11 +415,11 @@ export async function scrapSingleUrl( } else { document = { content: text, - markdown: text, + markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) ? rawHtml : undefined, metadata: { @@ -402,7 +428,7 @@ export async function scrapSingleUrl( pageStatusCode: pageStatusCode, pageError: pageError, }, - linksOnPage, + linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, }; } @@ -416,9 +442,9 @@ export async function scrapSingleUrl( }); return { content: "", - markdown: "", + markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined, html: "", - linksOnPage: [], + linksOnPage: pageOptions.includeLinks ? 
[] : undefined, metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode, diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 99eb6bd2..e076d890 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -17,6 +17,8 @@ const socialMediaBlocklist = [ 'researchhub.com', 'youtube.com', 'corterix.com', + 'southwest.com', + 'ryanair.com' ]; const allowedKeywords = [ diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index af8d1f34..8169d9d3 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -242,5 +242,13 @@ export const urlSpecificParams = { engine: "chrome-cdp", }, }, + }, + "lorealparis.hu":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + engine: "tlsclient", + }, + }, } }; diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts index bb9c5194..400ef84f 100644 --- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -39,16 +39,8 @@ export const excludeNonMainTags = [ "#search", ".share", "#share", - ".pagination", - "#pagination", ".widget", "#widget", - ".related", - "#related", - ".tag", - "#tag", - ".category", - "#category", ".cookie", "#cookie" ]; diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 9496d569..fac53b38 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { description = soup('meta[name="description"]').attr("content") || null; // Assuming the language is part of the URL as per the regex pattern - const pattern = /([a-zA-Z]+-[A-Z]{2})/; - const match = pattern.exec(url); - language = match ? match[1] : null; + language = soup('html').attr('lang') || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts new file mode 100644 index 00000000..7c6d8a4d --- /dev/null +++ b/apps/api/src/search/fireEngine.ts @@ -0,0 +1,45 @@ +import axios from "axios"; +import dotenv from "dotenv"; +import { SearchResult } from "../../src/lib/entities"; + +dotenv.config(); + +export async function fireEngineMap(q: string, options: { + tbs?: string; + filter?: string; + lang?: string; + country?: string; + location?: string; + numResults: number; + page?: number; +}): Promise { + let data = JSON.stringify({ + query: q, + lang: options.lang, + country: options.country, + location: options.location, + tbs: options.tbs, + numResults: options.numResults, + page: options.page ?? 
1, + }); + + if (!process.env.FIRE_ENGINE_BETA_URL) { + console.warn("(v1/map Beta) Results might differ from cloud offering currently."); + return []; + } + + let config = { + method: "POST", + url: `${process.env.FIRE_ENGINE_BETA_URL}/search`, + headers: { + "Content-Type": "application/json", + }, + data: data, + }; + const response = await axios(config); + if (response && response) { + return response.data + } else { + return []; + } +} diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 060f4bd8..0e247702 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string -export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { +export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { let proxies = null; if (proxy) { if (proxy.startsWith("https")) { diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index f5bc06e3..f4c5b6d0 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -1,11 +1,9 @@ import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; -import { google_search } from "./googlesearch"; +import { googleSearch } from "./googlesearch"; +import { fireEngineMap } from "./fireEngine"; import { serper_search } from "./serper"; - - - export async function search({ query, advanced = false, @@ -30,12 +28,20 @@ export async function search({ proxy?: string; sleep_interval?: number; timeout?: number; -}) : Promise { +}): Promise { try { - if (process.env.SERPER_API_KEY ) { - return await serper_search(query, {num_results, tbs, filter, lang, country, location}); + + if (process.env.SERPER_API_KEY) { + return await serper_search(query, { + num_results, + tbs, + filter, + lang, + country, + location, + }); } - return await google_search( + return await googleSearch( query, advanced, num_results, @@ -49,7 +55,6 @@ export async function search({ ); } catch (error) { Logger.error(`Error in search function: ${error}`); - return [] + return []; } - // if process.env.SERPER_API_KEY is set, use serper } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 2ad07318..6a71b40a 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -5,7 +5,7 @@ import { supabase_service } from "../supabase"; import { Logger } from "../../lib/logger"; import { getValue, setValue } from "../redis"; import { redlock } from "../redlock"; - +import * as Sentry from "@sentry/node"; const FREE_CREDITS = 500; @@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) { ]); let couponCredits = 0; + let sortedCoupons = []; + if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( (total, coupon) => total + coupon.credits, 0 ); + sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits); } - - let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits); // using coupon credits: if (couponCredits > 0) { // if there is no subscription and they have enough coupon credits @@ -175,9 +176,25 @@ export async 
function supaCheckTeamCredits(team_id: string, credits: number) { return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity }; } - // Retrieve the team's active subscription and check for available coupons concurrently - const [{ data: subscription, error: subscriptionError }, { data: coupons }] = - await Promise.all([ + + let cacheKeySubscription = `subscription_${team_id}`; + let cacheKeyCoupons = `coupons_${team_id}`; + + // Try to get data from cache first + const [cachedSubscription, cachedCoupons] = await Promise.all([ + getValue(cacheKeySubscription), + getValue(cacheKeyCoupons) + ]); + + let subscription, subscriptionError; + let coupons : {credits: number}[]; + + if (cachedSubscription && cachedCoupons) { + subscription = JSON.parse(cachedSubscription); + coupons = JSON.parse(cachedCoupons); + } else { + // If not in cache, retrieve from database + const [subscriptionResult, couponsResult] = await Promise.all([ supabase_service .from("subscriptions") .select("id, price_id, current_period_start, current_period_end") @@ -191,6 +208,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { .eq("status", "active"), ]); + subscription = subscriptionResult.data; + subscriptionError = subscriptionResult.error; + coupons = couponsResult.data; + + // Cache the results for a minute, sub can be null and that's fine + await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null + await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute + + } + let couponCredits = 0; if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( @@ -199,30 +226,67 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { ); } + + // If there are available coupons and they are enough for the operation + if (couponCredits >= credits) { + return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; + } + + // Free credits, no coupons - if (subscriptionError || !subscription) { - // If there is no active subscription but there are available coupons - if (couponCredits >= credits) { - return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; - } + if (!subscription || subscriptionError) { - const { data: creditUsages, error: creditUsageError } = - await supabase_service - .from("credit_usage") - .select("credits_used") - .is("subscription_id", null) - .eq("team_id", team_id); + let creditUsages; + let creditUsageError; + let totalCreditsUsed = 0; + const cacheKeyCreditUsage = `credit_usage_${team_id}`; - if (creditUsageError) { - throw new Error( - `Failed to retrieve credit usage for team_id: ${team_id}` + // Try to get credit usage from cache + const cachedCreditUsage = await getValue(cacheKeyCreditUsage); + + if (cachedCreditUsage) { + totalCreditsUsed = parseInt(cachedCreditUsage); + } else { + let retries = 0; + const maxRetries = 3; + const retryInterval = 2000; // 2 seconds + + while (retries < maxRetries) { + // Reminder, this has an 1000 limit. 
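+            // (Presumably the 1000 limit mentioned above is Supabase's default 1,000-row cap per select;
+            // a team with more than 1,000 credit_usage rows could be undercounted by the sum below unless
+            // the query is paginated or aggregated server-side.)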
+ const result = await supabase_service + .from("credit_usage") + .select("credits_used") + .is("subscription_id", null) + .eq("team_id", team_id); + + creditUsages = result.data; + creditUsageError = result.error; + + if (!creditUsageError) { + break; + } + + retries++; + if (retries < maxRetries) { + await new Promise(resolve => setTimeout(resolve, retryInterval)); + } + } + + if (creditUsageError) { + Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`); + throw new Error( + `Failed to retrieve credit usage for team_id: ${team_id}` + ); + } + + totalCreditsUsed = creditUsages.reduce( + (acc, usage) => acc + usage.credits_used, + 0 ); - } - const totalCreditsUsed = creditUsages.reduce( - (acc, usage) => acc + usage.credits_used, - 0 - ); + // Cache the result for 30 seconds + await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30); + } Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`); @@ -230,9 +294,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { end.setDate(end.getDate() + 30); // check if usage is within 80% of the limit const creditLimit = FREE_CREDITS; - const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit; + const creditUsagePercentage = totalCreditsUsed / creditLimit; - if (creditUsagePercentage >= 0.8) { + // Add a check to ensure totalCreditsUsed is greater than 0 + if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { + Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`); await sendNotification( team_id, NotificationType.APPROACHING_LIMIT, @@ -242,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { } // 5. Compare the total credits used with the credits allowed by the plan. 
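+    // i.e. the free-tier cutoff below counts only credits already consumed (totalCreditsUsed >= FREE_CREDITS);
+    // the cost of the current request is not added in before the comparison.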
- if (totalCreditsUsed + credits > FREE_CREDITS) { + if (totalCreditsUsed >= FREE_CREDITS) { // Send email notification for insufficient credits await sendNotification( team_id, @@ -286,7 +352,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { if (creditUsages && creditUsages.length > 0) { totalCreditsUsed = creditUsages[0].total_credits_used; - await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes + await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes // Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`); } } @@ -299,24 +365,47 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { // Adjust total credits used by subtracting coupon value const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits); - // Get the price details - const { data: price, error: priceError } = await supabase_service - .from("prices") - .select("credits") - .eq("id", subscription.price_id) - .single(); - if (priceError) { - throw new Error( - `Failed to retrieve price for price_id: ${subscription.price_id}` - ); + // Get the price details from cache or database + const priceCacheKey = `price_${subscription.price_id}`; + let price : {credits: number}; + + try { + const cachedPrice = await getValue(priceCacheKey); + if (cachedPrice) { + price = JSON.parse(cachedPrice); + } else { + const { data, error: priceError } = await supabase_service + .from("prices") + .select("credits") + .eq("id", subscription.price_id) + .single(); + + if (priceError) { + throw new Error( + `Failed to retrieve price for price_id: ${subscription.price_id}` + ); + } + + price = data; + // There are only 21 records, so this is super fine + // Cache the price for a long time (e.g., 1 day) + await setValue(priceCacheKey, JSON.stringify(price), 86400); + } + } catch (error) { + Logger.error(`Error retrieving or caching price: ${error}`); + Sentry.captureException(error); + // If errors, just assume it's a big number so user don't get an error + price = { credits: 10000000 }; } const creditLimit = price.credits; - const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit; + + // Removal of + credits + const creditUsagePercentage = adjustedCreditsUsed / creditLimit; // Compare the adjusted total credits used with the credits allowed by the plan - if (adjustedCreditsUsed + credits > price.credits) { + if (adjustedCreditsUsed >= price.credits) { await sendNotification( team_id, NotificationType.LIMIT_REACHED, @@ -324,7 +413,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { subscription.current_period_end ); return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed }; - } else if (creditUsagePercentage >= 0.8) { + } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit await sendNotification( team_id, @@ -439,8 +528,8 @@ async function createCreditUsage({ subscription_id?: string; credits: number; }) { - const { data: credit_usage } = await supabase_service - .from("credit_usage") + await supabase_service + .from("credit_usage") .insert([ { team_id, @@ -448,8 +537,7 @@ async function createCreditUsage({ subscription_id: subscription_id || null, created_at: new Date(), }, - ]) - .select(); + ]); - return { success: true, credit_usage }; + return { success: true }; } diff --git 
a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index 68008e02..3850e05b 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -1,9 +1,11 @@ import { supabase_service } from "../supabase"; import { Logger } from "../../../src/lib/logger"; -import "dotenv/config"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logCrawl(job_id: string, team_id: string) { - if (process.env.USE_DB_AUTHENTICATION === 'true') { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { try { const { data, error } = await supabase_service .from("bulljobs_teams") diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 61983be0..4d8ee014 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; import { Logger } from "../../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logJob(job: FirecrawlJob) { try { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { return; } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 099e4a0b..fbe41653 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types"; import { supabase_service } from "../supabase"; import { PageOptions } from "../../lib/entities"; import { Logger } from "../../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { Logger.debug("Skipping logging scrape to Supabase"); return; } diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 888cdefc..7a698772 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -8,10 +8,11 @@ async function addScrapeJobRaw( webScraperOptions: any, options: any, jobId: string, + jobPriority: number = 10 ): Promise { return await getScrapeQueue().add(jobId, webScraperOptions, { ...options, - priority: webScraperOptions.crawl_id ? 
20 : 10, + priority: jobPriority, jobId, }); } @@ -20,7 +21,9 @@ export async function addScrapeJob( webScraperOptions: WebScraperOptions, options: any = {}, jobId: string = uuidv4(), + jobPriority: number = 10 ): Promise { + if (Sentry.isInitialized()) { const size = JSON.stringify(webScraperOptions).length; return await Sentry.startSpan({ @@ -39,10 +42,31 @@ export async function addScrapeJob( baggage: Sentry.spanToBaggageHeader(span), size, }, - }, options, jobId); + }, options, jobId, jobPriority); }); } else { - return await addScrapeJobRaw(webScraperOptions, options, jobId); + return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority); } } +export function waitForJob(jobId: string, timeout: number) { + return new Promise((resolve, reject) => { + const start = Date.now(); + const int = setInterval(async () => { + if (Date.now() >= start + timeout) { + clearInterval(int); + reject(new Error("Job wait ")); + } else { + const state = await getScrapeQueue().getJobState(jobId); + if (state === "completed") { + clearInterval(int); + resolve((await getScrapeQueue().getJob(jobId)).returnvalue); + } else if (state === "failed") { + // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason); + clearInterval(int); + reject((await getScrapeQueue().getJob(jobId)).failedReason); + } + } + }, 500); + }) +} diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 113b3fa3..14dddebe 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -16,6 +16,14 @@ export function getScrapeQueue() { scrapeQueueName, { connection: redisConnection, + defaultJobOptions: { + removeOnComplete: { + age: 90000, // 25 hours + }, + removeOnFail: { + age: 90000, // 25 hours + }, + }, } // { // settings: { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a7d20383..37e14baf 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./sentry" +import "./sentry"; import * as Sentry from "@sentry/node"; import { CustomError } from "../lib/custom-error"; import { @@ -17,10 +17,27 @@ import { Logger } from "../lib/logger"; import { Worker } from "bullmq"; import systemMonitor from "./system-monitor"; import { v4 as uuidv4 } from "uuid"; -import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis"; +import { + addCrawlJob, + addCrawlJobDone, + crawlToCrawler, + finishCrawl, + getCrawl, + getCrawlJobs, + lockURL, +} from "../lib/crawl-redis"; import { StoredCrawl } from "../lib/crawl-redis"; import { addScrapeJob } from "./queue-jobs"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { + addJobPriority, + deleteJobPriority, + getJobPriority, +} from "../../src/lib/job-priority"; +import { PlanType } from "../types"; +import { getJobs } from "../../src/controllers/v1/crawl-status"; +import { configDotenv } from "dotenv"; +configDotenv(); if (process.env.ENV === "production") { initSDK({ @@ -50,23 +67,24 @@ const processJobInternal = async (token: string, job: Job) => { await job.extendLock(token, jobLockExtensionTime); }, jobLockExtendInterval); + await addJobPriority(job.data.team_id, job.id); let err = null; try { const result = await processJob(job, token); - try{ + try { if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { await job.moveToCompleted(null, token, 
false); } else { await job.moveToCompleted(result.docs, token, false); } - }catch(e){ - } + } catch (e) {} } catch (error) { console.log("Job failed, error:", error); Sentry.captureException(error); err = error; await job.moveToFailed(error, token, false); } finally { + await deleteJobPriority(job.data.team_id, job.id); clearInterval(extendLockInterval); } @@ -80,7 +98,10 @@ process.on("SIGINT", () => { isShuttingDown = true; }); -const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise) => { +const workerFun = async ( + queueName: string, + processJobInternal: (token: string, job: Job) => Promise +) => { const worker = new Worker(queueName, null, { connection: redisConnection, lockDuration: 1 * 60 * 1000, // 1 minute @@ -109,46 +130,62 @@ const workerFun = async (queueName: string, processJobInternal: (token: string, const job = await worker.getNextJob(token); if (job) { if (job.data && job.data.sentry && Sentry.isInitialized()) { - Sentry.continueTrace({ sentryTrace: job.data.sentry.trace, baggage: job.data.sentry.baggage }, () => { - Sentry.startSpan({ + Sentry.continueTrace( + { + sentryTrace: job.data.sentry.trace, + baggage: job.data.sentry.baggage, + }, + () => { + Sentry.startSpan( + { + name: "Scrape job", + attributes: { + job: job.id, + worker: process.env.FLY_MACHINE_ID ?? worker.id, + }, + }, + async (span) => { + await Sentry.startSpan( + { + name: "Process scrape job", + op: "queue.process", + attributes: { + "messaging.message.id": job.id, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": job.data.sentry.size, + "messaging.message.receive.latency": + Date.now() - (job.processedOn ?? job.timestamp), + "messaging.message.retry.count": job.attemptsMade, + }, + }, + async () => { + const res = await processJobInternal(token, job); + if (res !== null) { + span.setStatus({ code: 2 }); // ERROR + } else { + span.setStatus({ code: 1 }); // OK + } + } + ); + } + ); + } + ); + } else { + Sentry.startSpan( + { name: "Scrape job", attributes: { job: job.id, worker: process.env.FLY_MACHINE_ID ?? worker.id, }, - }, async (span) => { - await Sentry.startSpan({ - name: "Process scrape job", - op: "queue.process", - attributes: { - "messaging.message.id": job.id, - "messaging.destination.name": getScrapeQueue().name, - "messaging.message.body.size": job.data.sentry.size, - "messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp), - "messaging.message.retry.count": job.attemptsMade, - } - }, async () => { - const res = await processJobInternal(token, job); - if (res !== null) { - span.setStatus({ code: 2 }); // ERROR - } else { - span.setStatus({ code: 1 }); // OK - } - }); - }); - }); - } else { - Sentry.startSpan({ - name: "Scrape job", - attributes: { - job: job.id, - worker: process.env.FLY_MACHINE_ID ?? 
worker.id, }, - }, () => { - processJobInternal(token, job); - }); + () => { + processJobInternal(token, job); + } + ); } - + await sleep(gotJobInterval); } else { await sleep(connectionMonitorInterval); @@ -163,13 +200,20 @@ async function processJob(job: Job, token: string) { // Check if the job URL is researchhub and block it immediately // TODO: remove this once solve the root issue - if (job.data.url && (job.data.url.includes("researchhub.com") || job.data.url.includes("ebay.com") || job.data.url.includes("youtube.com") || job.data.url.includes("microsoft.com") )) { + if ( + job.data.url && + (job.data.url.includes("researchhub.com") || + job.data.url.includes("ebay.com") || + job.data.url.includes("youtube.com") || + job.data.url.includes("microsoft.com")) + ) { Logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); const data = { success: false, docs: [], project_id: job.data.project_id, - error: "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.", + error: + "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.", }; await job.moveToCompleted(data.docs, token, false); return data; @@ -183,14 +227,21 @@ async function processJob(job: Job, token: string) { current_url: "", }); const start = Date.now(); - + const { success, message, docs } = await startWebScraperPipeline({ job, token, }); + + // Better if we throw here so we capture with the correct error + if (!success) { + throw new Error(message); + } const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; + const rawHtml = docs[0] ? docs[0].rawHtml : ""; + const data = { success, result: { @@ -206,8 +257,26 @@ async function processJob(job: Job, token: string) { docs, }; - if (job.data.mode === "crawl") { - await callWebhook(job.data.team_id, job.id as string, data); + // No idea what this does and when it is called. + if (job.data.mode === "crawl" && !job.data.v1) { + callWebhook( + job.data.team_id, + job.id as string, + data, + job.data.webhook, + job.data.v1 + ); + } + if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { + await callWebhook( + job.data.team_id, + job.data.crawl_id, + data, + job.data.webhook, + job.data.v1, + "crawl.page", + true + ); } if (job.data.crawl_id) { @@ -229,35 +298,48 @@ async function processJob(job: Job, token: string) { await addCrawlJobDone(job.data.crawl_id, job.id); - const sc = await getCrawl(job.data.crawl_id) as StoredCrawl; + const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; if (!job.data.sitemapped) { if (!sc.cancelled) { const crawler = crawlToCrawler(job.data.crawl_id, sc); - let linksOnPage = []; - try{ - linksOnPage = data.docs[0]?.linksOnPage ?? []; - }catch(e){ - linksOnPage = [] - } + const links = crawler.filterLinks( - linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl)) - .filter(x => x !== null), + crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl), Infinity, sc.crawlerOptions?.maxDepth ?? 10 - ) - + ); + for (const link of links) { if (await lockURL(job.data.crawl_id, sc, link)) { - const newJob = await addScrapeJob({ - url: link, - mode: "single_urls", - crawlerOptions: sc.crawlerOptions, + // This seems to work really welel + const jobPriority = await getJobPriority({ + plan: sc.plan as PlanType, team_id: sc.team_id, - pageOptions: sc.pageOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, + basePriority: job.data.crawl_id ? 
20 : 10, }); + const jobId = uuidv4(); + + // console.log("plan: ", sc.plan); + // console.log("team_id: ", sc.team_id) + // console.log("base priority: ", job.data.crawl_id ? 20 : 10) + // console.log("job priority: " , jobPriority, "\n\n\n") + + const newJob = await addScrapeJob( + { + url: link, + mode: "single_urls", + crawlerOptions: sc.crawlerOptions, + team_id: sc.team_id, + pageOptions: sc.pageOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + v1: job.data.v1, + }, + {}, + jobId, + jobPriority + ); await addCrawlJob(job.data.crawl_id, newJob.id); } @@ -266,67 +348,98 @@ async function processJob(job: Job, token: string) { } if (await finishCrawl(job.data.crawl_id)) { - const jobIDs = await getCrawlJobs(job.data.crawl_id); + - const jobs = (await Promise.all(jobIDs.map(async x => { - if (x === job.id) { - return { - async getState() { - return "completed" - }, - timestamp: Date.now(), - returnvalue: docs, - } + if (!job.data.v1) { + const jobIDs = await getCrawlJobs(job.data.crawl_id); + + const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + const jobStatuses = await Promise.all(jobs.map((x) => x.getState())); + const jobStatus = + sc.cancelled || jobStatuses.some((x) => x === "failed") + ? "failed" + : "completed"; + + const fullDocs = jobs.map((x) => + Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue + ); + + await logJob({ + job_id: job.data.crawl_id, + success: jobStatus === "completed", + message: sc.cancelled ? "Cancelled" : message, + num_docs: fullDocs.length, + docs: [], + time_taken: (Date.now() - sc.createdAt) / 1000, + team_id: job.data.team_id, + mode: "crawl", + url: sc.originUrl, + crawlerOptions: sc.crawlerOptions, + pageOptions: sc.pageOptions, + origin: job.data.origin, + }); + + const data = { + success: jobStatus !== "failed", + result: { + links: fullDocs.map((doc) => { + return { + content: doc, + source: doc?.metadata?.sourceURL ?? doc?.url ?? "", + }; + }), + }, + project_id: job.data.project_id, + error: message /* etc... */, + docs: fullDocs, + }; + + // v0 web hooks, call when done with all the data + if (!job.data.v1) { + callWebhook( + job.data.team_id, + job.data.crawl_id, + data, + job.data.webhook, + job.data.v1, + "crawl.completed" + ); } + } else { + const jobIDs = await getCrawlJobs(job.data.crawl_id); + const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x))); + const jobStatus = + sc.cancelled || jobStatuses.some((x) => x === "failed") + ? "failed" + : "completed"; - const j = await getScrapeQueue().getJob(x); - - if (process.env.USE_DB_AUTHENTICATION === "true") { - const supabaseData = await supabaseGetJobById(j.id); - - if (supabaseData) { - j.returnvalue = supabaseData.docs; + // v1 web hooks, call when done with no data, but with event completed + if (job.data.v1 && job.data.webhook) { + callWebhook( + job.data.team_id, + job.data.crawl_id, + [], + job.data.webhook, + job.data.v1, + "crawl.completed" + ); } - } - - return j; - }))).sort((a, b) => a.timestamp - b.timestamp); - const jobStatuses = await Promise.all(jobs.map(x => x.getState())); - const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed"; - - const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); - await logJob({ - job_id: job.data.crawl_id, - success: jobStatus === "completed", - message: sc.cancelled ? 
"Cancelled" : message, - num_docs: fullDocs.length, - docs: [], - time_taken: (Date.now() - sc.createdAt) / 1000, - team_id: job.data.team_id, - mode: "crawl", - url: sc.originUrl, - crawlerOptions: sc.crawlerOptions, - pageOptions: sc.pageOptions, - origin: job.data.origin, - }); - - const data = { - success: jobStatus !== "failed", - result: { - links: fullDocs.map((doc) => { - return { - content: doc, - source: doc?.metadata?.sourceURL ?? doc?.url ?? "", - }; - }), - }, - project_id: job.data.project_id, - error: message /* etc... */, - docs: fullDocs, - }; - - await callWebhook(job.data.team_id, job.data.crawl_id, data); + await logJob({ + job_id: job.data.crawl_id, + success: jobStatus === "completed", + message: sc.cancelled ? "Cancelled" : message, + num_docs: jobIDs.length, + docs: [], + time_taken: (Date.now() - sc.createdAt) / 1000, + team_id: job.data.team_id, + mode: "crawl", + url: sc.originUrl, + crawlerOptions: sc.crawlerOptions, + pageOptions: sc.pageOptions, + origin: job.data.origin, + }); + } } } @@ -335,11 +448,13 @@ async function processJob(job: Job, token: string) { } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); - Sentry.captureException(error, { - data: { - job: job.id - }, - }) + if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) { + Sentry.captureException(error, { + data: { + job: job.id, + }, + }); + } if (error instanceof CustomError) { // Here we handle the error, then save the failed job @@ -368,11 +483,27 @@ async function processJob(job: Job, token: string) { error: "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, }; - - if (job.data.mode === "crawl" || job.data.crawl_id) { - await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data); + + if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) { + callWebhook( + job.data.team_id, + job.data.crawl_id ?? (job.id as string), + data, + job.data.webhook, + job.data.v1 + ); } - + if (job.data.v1) { + callWebhook( + job.data.team_id, + job.id as string, + [], + job.data.webhook, + job.data.v1, + "crawl.failed" + ); + } + if (job.data.crawl_id) { await logJob({ job_id: job.id as string, @@ -380,7 +511,8 @@ async function processJob(job: Job, token: string) { message: typeof error === "string" ? error - : error.message ?? "Something went wrong... Contact help@mendable.ai", + : error.message ?? + "Something went wrong... Contact help@mendable.ai", num_docs: 0, docs: [], time_taken: 0, @@ -401,7 +533,8 @@ async function processJob(job: Job, token: string) { message: typeof error === "string" ? error - : error.message ?? "Something went wrong... Contact help@mendable.ai", + : error.message ?? + "Something went wrong... 
Contact help@mendable.ai", num_docs: 0, docs: [], time_taken: 0, diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts index c49c85d9..3e252301 100644 --- a/apps/api/src/services/rate-limiter.test.ts +++ b/apps/api/src/services/rate-limiter.test.ts @@ -65,7 +65,7 @@ describe("Rate Limiter Service", () => { "test-prefix:someToken", "standard" ); - expect(limiter2.points).toBe(50); + expect(limiter2.points).toBe(100); const limiter3 = getRateLimiter( "search" as RateLimiterMode, @@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => { "test-prefix:someToken", "growth" ); - expect(limiter4.points).toBe(150); + expect(limiter4.points).toBe(250); }); it("should return the default rate limiter if plan is not provided", () => { @@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => { "crawlStatus" as RateLimiterMode, "test-prefix:someToken" ); - expect(limiter2.points).toBe(150); + expect(limiter2.points).toBe(250); }); it("should consume points correctly for 'crawl' mode", async () => { @@ -188,14 +188,13 @@ describe("Rate Limiter Service", () => { "test-prefix:someTokenXY", "hobby" ); - // expect hobby to have 100 points - expect(limiter.points).toBe(10); + expect(limiter.points).toBe(20); const consumePoints = 5; const res = await limiter.consume("test-prefix:someTokenXY", consumePoints); expect(res.consumedPoints).toBe(5); - expect(res.remainingPoints).toBe(5); + expect(res.remainingPoints).toBe(15); }); it("should return the correct rate limiter for 'crawl' mode", () => { @@ -227,7 +226,7 @@ describe("Rate Limiter Service", () => { "test-prefix:someToken", "free" ); - expect(limiter.points).toBe(5); + expect(limiter.points).toBe(10); const limiter2 = getRateLimiter( "scrape" as RateLimiterMode, @@ -241,7 +240,14 @@ describe("Rate Limiter Service", () => { "test-prefix:someToken", "standard" ); - expect(limiter3.points).toBe(50); + expect(limiter3.points).toBe(100); + + const limiter4 = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someToken", + "growth" + ); + expect(limiter4.points).toBe(1000); }); it("should return the correct rate limiter for 'search' mode", () => { @@ -309,7 +315,7 @@ describe("Rate Limiter Service", () => { "crawlStatus" as RateLimiterMode, "test-prefix:someToken" ); - expect(limiter2.points).toBe(150); + expect(limiter2.points).toBe(250); }); it("should return the correct rate limiter for 'testSuite' mode", () => { diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index d96da069..51a0ecfa 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -6,7 +6,7 @@ const RATE_LIMITS = { crawl: { default: 3, free: 2, - starter: 3, + starter: 10, standard: 5, standardOld: 40, scale: 50, @@ -17,9 +17,22 @@ const RATE_LIMITS = { growthdouble: 50, }, scrape: { + default: 20, + free: 10, + starter: 100, + standard: 100, + standardOld: 100, + scale: 500, + hobby: 20, + standardNew: 100, + standardnew: 100, + growth: 1000, + growthdouble: 1000, + }, + search: { default: 20, free: 5, - starter: 20, + starter: 50, standard: 50, standardOld: 40, scale: 500, @@ -29,12 +42,12 @@ const RATE_LIMITS = { growth: 500, growthdouble: 500, }, - search: { + map:{ default: 20, free: 5, - starter: 20, - standard: 40, - standardOld: 40, + starter: 50, + standard: 50, + standardOld: 50, scale: 500, hobby: 10, standardNew: 50, @@ -52,7 +65,7 @@ const RATE_LIMITS = { }, crawlStatus: { free: 150, - default: 150, + default: 250, }, testSuite: { free: 10000, 
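As a reading aid for the rate-limit table above, here is a minimal TypeScript sketch (not part of this patch) of how a mode/plan points table can be resolved into a RateLimiterRedis limiter with the same fixed 60-second window the file uses for its named limiters. The resolvePoints and makeLimiter helpers, the Redis URL fallback, and the catch-all default of 20 points are illustrative assumptions rather than code from the repository; the point values are a subset copied from the hunks above.

import Redis from "ioredis";
import { RateLimiterRedis } from "rate-limiter-flexible";

// Subset of the points table in this diff: mode -> plan -> requests per minute.
const RATE_LIMITS: Record<string, Record<string, number>> = {
  crawl: { default: 3, free: 2, starter: 10, standard: 5, scale: 50 },
  scrape: { default: 20, free: 10, starter: 100, standard: 100, hobby: 20, growth: 1000 },
  map: { default: 20, free: 5, starter: 50, standard: 50, scale: 500 },
};

// Assumed Redis connection for the limiter store; the real module wires up its own client.
const storeClient = new Redis(process.env.REDIS_RATE_LIMIT_URL ?? "redis://localhost:6379");

// Resolve the points for a mode/plan pair, falling back to the mode's default entry.
function resolvePoints(mode: string, plan?: string): number {
  const config = RATE_LIMITS[mode];
  if (!config) return 20; // assumed catch-all when a mode has no table entry
  return config[plan ?? "default"] ?? config.default;
}

// Build a limiter the way the named limiters in this file are built: points over a 60 s window.
function makeLimiter(mode: string, plan?: string): RateLimiterRedis {
  return new RateLimiterRedis({
    storeClient,
    keyPrefix: mode,
    points: resolvePoints(mode, plan),
    duration: 60, // seconds
  });
}

// Example: under this table a "growth" plan gets 1000 scrape requests per minute.
const growthScrapeLimiter = makeLimiter("scrape", "growth");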
@@ -91,21 +104,44 @@ export const devBRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); +export const manualRateLimiter = new RateLimiterRedis({ + storeClient: redisRateLimitClient, + keyPrefix: "manual", + points: 2000, + duration: 60, // Duration in seconds +}); + + +export const scrapeStatusRateLimiter = new RateLimiterRedis({ + storeClient: redisRateLimitClient, + keyPrefix: "scrape-status", + points: 400, + duration: 60, // Duration in seconds +}); + +const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"]; + +const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; + export function getRateLimiter( mode: RateLimiterMode, token: string, plan?: string, teamId?: string ) { - - if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) { + + if (testSuiteTokens.some(testToken => token.includes(testToken))) { return testSuiteRateLimiter; } - if(teamId === process.env.DEV_B_TEAM_ID) { + if(teamId && teamId === process.env.DEV_B_TEAM_ID) { return devBRateLimiter; } + if(teamId && manual.includes(teamId)) { + return manualRateLimiter; + } + const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} if (!rateLimitConfig) return serverRateLimiter; diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 70ada12b..7636717e 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,5 +1,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import { Logger } from "../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -8,8 +10,9 @@ class SupabaseService { constructor() { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; // Only initialize the Supabase client if both URL and Service Token are provided. - if (process.env.USE_DB_AUTHENTICATION === "false") { + if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null Logger.warn( "Authentication is disabled. Supabase client will not be initialized." diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index b0222ea3..06e5649d 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,22 +1,40 @@ +import axios from "axios"; +import { legacyDocumentConverter } from "../../src/controllers/v1/types"; import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; +import { WebhookEventType } from "../types"; +import { configDotenv } from "dotenv"; +configDotenv(); -export const callWebhook = async (teamId: string, jobId: string,data: any) => { +export const callWebhook = async ( + teamId: string, + id: string, + data: any | null, + specified?: string, + v1 = false, + eventType: WebhookEventType = "crawl.page", + awaitWebhook: boolean = false +) => { try { - const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId); - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - let webhookUrl = selfHostedUrl; + const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace( + "{{JOB_ID}}", + id + ); + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; + let webhookUrl = specified ?? 
selfHostedUrl; - // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set // and the USE_DB_AUTHENTICATION environment variable is set to true - if (!selfHostedUrl && useDbAuthentication) { + if (!webhookUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") .eq("team_id", teamId) .limit(1); if (error) { - Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`); + Logger.error( + `Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}` + ); return null; } @@ -28,29 +46,93 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { } let dataToSend = []; - if (data.result.links && data.result.links.length !== 0) { + if ( + data && + data.result && + data.result.links && + data.result.links.length !== 0 + ) { for (let i = 0; i < data.result.links.length; i++) { - dataToSend.push({ - content: data.result.links[i].content.content, - markdown: data.result.links[i].content.markdown, - metadata: data.result.links[i].content.metadata, - }); + if (v1) { + dataToSend.push( + legacyDocumentConverter(data.result.links[i].content) + ); + } else { + dataToSend.push({ + content: data.result.links[i].content.content, + markdown: data.result.links[i].content.markdown, + metadata: data.result.links[i].content.metadata, + }); + } } } - await fetch(webhookUrl, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - success: data.success, - jobId: jobId, - data: dataToSend, - error: data.error || undefined, - }), - }); + if (awaitWebhook) { + try { + await axios.post( + webhookUrl, + { + success: !v1 + ? data.success + : eventType === "crawl.page" + ? data.success + : true, + type: eventType, + [v1 ? "id" : "jobId"]: id, + data: dataToSend, + error: !v1 + ? data?.error || undefined + : eventType === "crawl.page" + ? data?.error || undefined + : undefined, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) + } + ); + } catch (error) { + Logger.error( + `Axios error (0) sending webhook for team ID: ${teamId}, error: ${error.message}` + ); + } + } else { + axios + .post( + webhookUrl, + { + success: !v1 + ? data.success + : eventType === "crawl.page" + ? data.success + : true, + type: eventType, + [v1 ? "id" : "jobId"]: id, + data: dataToSend, + error: !v1 + ? data?.error || undefined + : eventType === "crawl.page" + ? data?.error || undefined + : undefined, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: v1 ? 
10000 : 30000, // 10 seconds timeout (v1) + } + ) + .catch((error) => { + Logger.error( + `Axios error sending webhook for team ID: ${teamId}, error: ${error.message}` + ); + }); + } } catch (error) { - Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); + Logger.debug( + `Error sending webhook for team ID: ${teamId}, error: ${error.message}` + ); } }; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index b092d310..50fb6eef 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -30,6 +30,9 @@ export interface WebScraperOptions { origin?: string; crawl_id?: string; sitemapped?: boolean; + webhook?: string; + v1?: boolean; + is_scrape?: boolean; } export interface RunWebScraperParams { @@ -44,6 +47,7 @@ export interface RunWebScraperParams { team_id: string; bull_job_id: string; priority?: number; + is_scrape?: boolean; } export interface RunWebScraperResult { @@ -105,6 +109,7 @@ export enum RateLimiterMode { Scrape = "scrape", Preview = "preview", Search = "search", + Map = "map", } @@ -113,7 +118,8 @@ export interface AuthResponse { team_id?: string; error?: string; status?: number; - plan?: string; + api_key?: string; + plan?: PlanType; } @@ -136,4 +142,18 @@ export type ScrapeLog = { html?: string; ipv4_support?: boolean | null; ipv6_support?: boolean | null; -}; \ No newline at end of file +}; + +export type PlanType = + | "starter" + | "standard" + | "scale" + | "hobby" + | "standardnew" + | "growth" + | "growthdouble" + | "free" + | ""; + + +export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed"; \ No newline at end of file diff --git a/apps/go-sdk/examples/.gitignore b/apps/go-sdk/examples/.gitignore deleted file mode 100644 index 6f72f892..00000000 --- a/apps/go-sdk/examples/.gitignore +++ /dev/null @@ -1,25 +0,0 @@ -# If you prefer the allow list template instead of the deny list, see community template: -# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore -# -# Binaries for programs and plugins -*.exe -*.exe~ -*.dll -*.so -*.dylib - -# Test binary, built with `go test -c` -*.test - -# Output of the go coverage tool, specifically when used with LiteIDE -*.out - -# Dependency directories (remove the comment below to include it) -# vendor/ - -# Go workspace file -go.work -go.work.sum - -# env file -.env diff --git a/apps/go-sdk/examples/LICENSE b/apps/go-sdk/examples/LICENSE deleted file mode 100644 index 25800a2e..00000000 --- a/apps/go-sdk/examples/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 Mendable - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/apps/go-sdk/examples/example.go b/apps/go-sdk/examples/example.go deleted file mode 100644 index 48ba49da..00000000 --- a/apps/go-sdk/examples/example.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "log" - - "github.com/google/uuid" - "github.com/mendableai/firecrawl-go" -) - -func main() { - app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev") - if err != nil { - log.Fatalf("Failed to create FirecrawlApp: %v", err) - } - - // Scrape a website - scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil) - if err != nil { - log.Fatalf("Failed to scrape URL: %v", err) - } - fmt.Println(scrapeResult.Markdown) - - // Crawl a website - idempotencyKey := uuid.New().String() // optional idempotency key - crawlParams := map[string]any{ - "crawlerOptions": map[string]any{ - "excludes": []string{"blog/*"}, - }, - } - crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey) - if err != nil { - log.Fatalf("Failed to crawl URL: %v", err) - } - jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ") - if err != nil { - log.Fatalf("Failed to marshal crawl result: %v", err) - } - fmt.Println(string(jsonCrawlResult)) - - // LLM Extraction using JSON schema - jsonSchema := map[string]any{ - "type": "object", - "properties": map[string]any{ - "top": map[string]any{ - "type": "array", - "items": map[string]any{ - "type": "object", - "properties": map[string]any{ - "title": map[string]string{"type": "string"}, - "points": map[string]string{"type": "number"}, - "by": map[string]string{"type": "string"}, - "commentsURL": map[string]string{"type": "string"}, - }, - "required": []string{"title", "points", "by", "commentsURL"}, - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News", - }, - }, - "required": []string{"top"}, - } - - llmExtractionParams := map[string]any{ - "extractorOptions": firecrawl.ExtractorOptions{ - ExtractionSchema: jsonSchema, - Mode: "llm-extraction", - }, - "pageOptions": map[string]any{ - "onlyMainContent": true, - }, - } - - llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams) - if err != nil { - log.Fatalf("Failed to perform LLM extraction: %v", err) - } - - // Pretty print the LLM extraction result - jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", " ") - if err != nil { - log.Fatalf("Failed to marshal LLM extraction result: %v", err) - } - fmt.Println(string(jsonResult)) -} diff --git a/apps/go-sdk/examples/go.mod b/apps/go-sdk/examples/go.mod deleted file mode 100644 index 3ea9b92f..00000000 --- a/apps/go-sdk/examples/go.mod +++ /dev/null @@ -1,9 +0,0 @@ -module github.com/mendableai/firecrawl-go-examples - -go 1.22.5 - -replace github.com/mendableai/firecrawl => ../ - -require github.com/google/uuid v1.6.0 - -require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect diff --git a/apps/go-sdk/examples/go.sum b/apps/go-sdk/examples/go.sum deleted file mode 100644 index 760ca553..00000000 --- a/apps/go-sdk/examples/go.sum +++ /dev/null @@ -1,14 +0,0 @@ -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= -github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= -github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs= -github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/apps/go-sdk/firecrawl/.gitignore b/apps/go-sdk/firecrawl/.gitignore deleted file mode 100644 index db27dc80..00000000 --- a/apps/go-sdk/firecrawl/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.env -vendor \ No newline at end of file diff --git a/apps/go-sdk/firecrawl/README.md b/apps/go-sdk/firecrawl/README.md deleted file mode 100644 index 7e17c10f..00000000 --- a/apps/go-sdk/firecrawl/README.md +++ /dev/null @@ -1,189 +0,0 @@ -# Firecrawl Go SDK - -The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. - -## Installation - -To install the Firecrawl Go SDK, you can - -```bash -go get github.com/mendableai/firecrawl -``` - -## Usage - -1. Get an API key from [firecrawl.dev](https://firecrawl.dev) -2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - - -Here's an example of how to use the SDK with error handling: - -```go -import ( - "fmt" - "log" - - "github.com/mendableai/firecrawl/firecrawl" -) - -func main() { - // Initialize the FirecrawlApp with your API key - app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY") - if err != nil { - log.Fatalf("Failed to initialize FirecrawlApp: %v", err) - } - - // Scrape a single URL - url := "https://mendable.ai" - scrapedData, err := app.ScrapeURL(url, nil) - if err != nil { - log.Fatalf("Error occurred while scraping: %v", err) - } - fmt.Println(scrapedData) - - // Crawl a website - crawlUrl := "https://mendable.ai" - params := map[string]any{ - "pageOptions": map[string]any{ - "onlyMainContent": true, - }, - } - - crawlResult, err := app.CrawlURL(crawlUrl, params) - if err != nil { - log.Fatalf("Error occurred while crawling: %v", err) - } - fmt.Println(crawlResult) -} -``` - -### Scraping a URL - -To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data as a dictionary. - -```go -url := "https://mendable.ai" -scrapedData, err := app.ScrapeURL(url, nil) -if err != nil { - log.Fatalf("Failed to scrape URL: %v", err) -} -fmt.Println(scrapedData) -``` - -### Extracting structured data from a URL - -With LLM extraction, you can easily extract structured data from any URL. 
Here is how you to use it: - -```go -jsonSchema := map[string]any{ - "type": "object", - "properties": map[string]any{ - "top": map[string]any{ - "type": "array", - "items": map[string]any{ - "type": "object", - "properties": map[string]any{ - "title": map[string]string{"type": "string"}, - "points": map[string]string{"type": "number"}, - "by": map[string]string{"type": "string"}, - "commentsURL": map[string]string{"type": "string"}, - }, - "required": []string{"title", "points", "by", "commentsURL"}, - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News", - }, - }, - "required": []string{"top"}, -} - -llmExtractionParams := map[string]any{ - "extractorOptions": firecrawl.ExtractorOptions{ - ExtractionSchema: jsonSchema, - }, -} - -scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams) -if err != nil { - log.Fatalf("Failed to perform LLM extraction: %v", err) -} -fmt.Println(scrapeResult) -``` - -### Search for a query - -To search the web, get the most relevant results, scrap each page and return the markdown, use the `Search` method. The method takes the query as a parameter and returns the search results. - - -```go -query := "what is mendable?" -searchResult, err := app.Search(query) -if err != nil { - log.Fatalf("Failed to search: %v", err) -} -fmt.Println(searchResult) -``` - -### Crawling a Website - -To crawl a website, use the `CrawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. - -```go -crawlParams := map[string]any{ - "crawlerOptions": map[string]any{ - "excludes": []string{"blog/*"}, - "includes": []string{}, // leave empty for all pages - "limit": 1000, - }, - "pageOptions": map[string]any{ - "onlyMainContent": true, - }, -} -crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey) -if err != nil { - log.Fatalf("Failed to crawl URL: %v", err) -} -fmt.Println(crawlResult) -``` - -### Checking Crawl Status - -To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job. - -```go -status, err := app.CheckCrawlStatus(jobId) -if err != nil { - log.Fatalf("Failed to check crawl status: %v", err) -} -fmt.Println(status) -``` - -### Canceling a Crawl Job -To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job. - -```go -canceled, err := app.CancelCrawlJob(jobId) -if err != nil { - log.Fatalf("Failed to cancel crawl job: %v", err) -} -fmt.Println(canceled) -``` - -## Error Handling - -The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. - -## Contributing - -Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. - -## License - -The Firecrawl Go SDK is licensed under the MIT License. 
This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions: - -- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details. diff --git a/apps/go-sdk/firecrawl/firecrawl.go b/apps/go-sdk/firecrawl/firecrawl.go deleted file mode 100644 index 9a9dcfef..00000000 --- a/apps/go-sdk/firecrawl/firecrawl.go +++ /dev/null @@ -1,584 +0,0 @@ -// Package firecrawl provides a client for interacting with the Firecrawl API. -package firecrawl - -import ( - "bytes" - "encoding/json" - "fmt" - "io" - "math" - "net/http" - "os" - "time" -) - -// FirecrawlDocumentMetadata represents metadata for a Firecrawl document -type FirecrawlDocumentMetadata struct { - Title string `json:"title,omitempty"` - Description string `json:"description,omitempty"` - Language string `json:"language,omitempty"` - Keywords string `json:"keywords,omitempty"` - Robots string `json:"robots,omitempty"` - OGTitle string `json:"ogTitle,omitempty"` - OGDescription string `json:"ogDescription,omitempty"` - OGURL string `json:"ogUrl,omitempty"` - OGImage string `json:"ogImage,omitempty"` - OGAudio string `json:"ogAudio,omitempty"` - OGDeterminer string `json:"ogDeterminer,omitempty"` - OGLocale string `json:"ogLocale,omitempty"` - OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"` - OGSiteName string `json:"ogSiteName,omitempty"` - OGVideo string `json:"ogVideo,omitempty"` - DCTermsCreated string `json:"dctermsCreated,omitempty"` - DCDateCreated string `json:"dcDateCreated,omitempty"` - DCDate string `json:"dcDate,omitempty"` - DCTermsType string `json:"dctermsType,omitempty"` - DCType string `json:"dcType,omitempty"` - DCTermsAudience string `json:"dctermsAudience,omitempty"` - DCTermsSubject string `json:"dctermsSubject,omitempty"` - DCSubject string `json:"dcSubject,omitempty"` - DCDescription string `json:"dcDescription,omitempty"` - DCTermsKeywords string `json:"dctermsKeywords,omitempty"` - ModifiedTime string `json:"modifiedTime,omitempty"` - PublishedTime string `json:"publishedTime,omitempty"` - ArticleTag string `json:"articleTag,omitempty"` - ArticleSection string `json:"articleSection,omitempty"` - SourceURL string `json:"sourceURL,omitempty"` - PageStatusCode int `json:"pageStatusCode,omitempty"` - PageError string `json:"pageError,omitempty"` -} - -// FirecrawlDocument represents a document in Firecrawl -type FirecrawlDocument struct { - ID string `json:"id,omitempty"` - URL string `json:"url,omitempty"` - Content string `json:"content"` - Markdown string `json:"markdown,omitempty"` - HTML string `json:"html,omitempty"` - LLMExtraction map[string]any `json:"llm_extraction,omitempty"` - CreatedAt *time.Time `json:"createdAt,omitempty"` - UpdatedAt *time.Time 
`json:"updatedAt,omitempty"` - Type string `json:"type,omitempty"` - Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` - ChildrenLinks []string `json:"childrenLinks,omitempty"` - Provider string `json:"provider,omitempty"` - Warning string `json:"warning,omitempty"` - Index int `json:"index,omitempty"` -} - -// ExtractorOptions represents options for extraction. -type ExtractorOptions struct { - Mode string `json:"mode,omitempty"` - ExtractionPrompt string `json:"extractionPrompt,omitempty"` - ExtractionSchema any `json:"extractionSchema,omitempty"` -} - -// ScrapeResponse represents the response for scraping operations -type ScrapeResponse struct { - Success bool `json:"success"` - Data *FirecrawlDocument `json:"data,omitempty"` -} - -// SearchResponse represents the response for searching operations -type SearchResponse struct { - Success bool `json:"success"` - Data []*FirecrawlDocument `json:"data,omitempty"` -} - -// CrawlResponse represents the response for crawling operations -type CrawlResponse struct { - Success bool `json:"success"` - JobID string `json:"jobId,omitempty"` - Data []*FirecrawlDocument `json:"data,omitempty"` -} - -// JobStatusResponse represents the response for checking crawl job status -type JobStatusResponse struct { - Success bool `json:"success"` - Status string `json:"status"` - Current int `json:"current,omitempty"` - CurrentURL string `json:"current_url,omitempty"` - CurrentStep string `json:"current_step,omitempty"` - Total int `json:"total,omitempty"` - JobID string `json:"jobId,omitempty"` - Data []*FirecrawlDocument `json:"data,omitempty"` - PartialData []*FirecrawlDocument `json:"partial_data,omitempty"` -} - -// CancelCrawlJobResponse represents the response for canceling a crawl job -type CancelCrawlJobResponse struct { - Success bool `json:"success"` - Status string `json:"status"` -} - -// requestOptions represents options for making requests. -type requestOptions struct { - retries int - backoff int -} - -// requestOption is a functional option type for requestOptions. -type requestOption func(*requestOptions) - -// newRequestOptions creates a new requestOptions instance with the provided options. -// -// Parameters: -// - opts: Optional request options. -// -// Returns: -// - *requestOptions: A new instance of requestOptions with the provided options. -func newRequestOptions(opts ...requestOption) *requestOptions { - options := &requestOptions{retries: 1} - for _, opt := range opts { - opt(options) - } - return options -} - -// withRetries sets the number of retries for a request. -// -// Parameters: -// - retries: The number of retries to be performed. -// -// Returns: -// - requestOption: A functional option that sets the number of retries for a request. -func withRetries(retries int) requestOption { - return func(opts *requestOptions) { - opts.retries = retries - } -} - -// withBackoff sets the backoff interval for a request. -// -// Parameters: -// - backoff: The backoff interval (in milliseconds) to be used for retries. -// -// Returns: -// - requestOption: A functional option that sets the backoff interval for a request. -func withBackoff(backoff int) requestOption { - return func(opts *requestOptions) { - opts.backoff = backoff - } -} - -// FirecrawlApp represents a client for the Firecrawl API. -type FirecrawlApp struct { - APIKey string - APIURL string - Client *http.Client -} - -// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL. 
-// If the API key or API URL is not provided, it attempts to retrieve them from environment variables. -// If the API key is still not found, it returns an error. -// -// Parameters: -// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable. -// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev". -// -// Returns: -// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL. -// - error: An error if the API key is not provided or retrieved. -func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) { - if apiKey == "" { - apiKey = os.Getenv("FIRECRAWL_API_KEY") - if apiKey == "" { - return nil, fmt.Errorf("no API key provided") - } - } - - if apiURL == "" { - apiURL = os.Getenv("FIRECRAWL_API_URL") - if apiURL == "" { - apiURL = "https://api.firecrawl.dev" - } - } - - client := &http.Client{ - Timeout: 60 * time.Second, - } - - return &FirecrawlApp{ - APIKey: apiKey, - APIURL: apiURL, - Client: client, - }, nil -} - -// ScrapeURL scrapes the content of the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to be scraped. -// - params: Optional parameters for the scrape request, including extractor options for LLM extraction. -// -// Returns: -// - *FirecrawlDocument: The scraped document data. -// - error: An error if the scrape request fails. -func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) { - headers := app.prepareHeaders("") - scrapeBody := map[string]any{"url": url} - - if params != nil { - if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok { - if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok { - extractorOptions.ExtractionSchema = schema.schema() - } - if extractorOptions.Mode == "" { - extractorOptions.Mode = "llm-extraction" - } - scrapeBody["extractorOptions"] = extractorOptions - } - - for key, value := range params { - if key != "extractorOptions" { - scrapeBody[key] = value - } - } - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v0/scrape", app.APIURL), - scrapeBody, - headers, - "scrape URL", - ) - if err != nil { - return nil, err - } - - var scrapeResponse ScrapeResponse - err = json.Unmarshal(resp, &scrapeResponse) - if err != nil { - return nil, err - } - - if scrapeResponse.Success { - return scrapeResponse.Data, nil - } - - return nil, fmt.Errorf("failed to scrape URL") -} - -// Search performs a search query using the Firecrawl API and returns the search results. -// -// Parameters: -// - query: The search query string. -// - params: Optional parameters for the search request. -// -// Returns: -// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results. -// - error: An error if the search request fails. 
-func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) { - headers := app.prepareHeaders("") - searchBody := map[string]any{"query": query} - for k, v := range params { - searchBody[k] = v - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v0/search", app.APIURL), - searchBody, - headers, - "search", - ) - if err != nil { - return nil, err - } - - var searchResponse SearchResponse - err = json.Unmarshal(resp, &searchResponse) - if err != nil { - return nil, err - } - - if searchResponse.Success { - return searchResponse.Data, nil - } - - return nil, fmt.Errorf("failed to search") -} - -// CrawlURL starts a crawl job for the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to crawl. -// - params: Optional parameters for the crawl request. -// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning. -// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true. -// - idempotencyKey: An optional idempotency key to ensure the request is idempotent. -// -// Returns: -// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true. -// - error: An error if the crawl request fails. -func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) { - headers := app.prepareHeaders(idempotencyKey) - crawlBody := map[string]any{"url": url} - for k, v := range params { - crawlBody[k] = v - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v0/crawl", app.APIURL), - crawlBody, - headers, - "start crawl job", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var crawlResponse CrawlResponse - err = json.Unmarshal(resp, &crawlResponse) - if err != nil { - return nil, err - } - - if waitUntilDone { - return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval) - } - - if crawlResponse.JobID == "" { - return nil, fmt.Errorf("failed to get job ID") - } - - return crawlResponse.JobID, nil -} - -// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API. -// -// Parameters: -// - jobID: The ID of the crawl job to check. -// -// Returns: -// - *JobStatusResponse: The status of the crawl job. -// - error: An error if the crawl status check request fails. -func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) { - headers := app.prepareHeaders("") - resp, err := app.makeRequest( - http.MethodGet, - fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID), - nil, - headers, - "check crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var jobStatusResponse JobStatusResponse - err = json.Unmarshal(resp, &jobStatusResponse) - if err != nil { - return nil, err - } - - return &jobStatusResponse, nil -} - -// CancelCrawlJob cancels a crawl job using the Firecrawl API. -// -// Parameters: -// - jobID: The ID of the crawl job to cancel. -// -// Returns: -// - string: The status of the crawl job after cancellation. -// - error: An error if the crawl job cancellation request fails. 
-func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) { - headers := app.prepareHeaders("") - resp, err := app.makeRequest( - http.MethodDelete, - fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID), - nil, - headers, - "cancel crawl job", - ) - if err != nil { - return "", err - } - - var cancelCrawlJobResponse CancelCrawlJobResponse - err = json.Unmarshal(resp, &cancelCrawlJobResponse) - if err != nil { - return "", err - } - - return cancelCrawlJobResponse.Status, nil -} - -// prepareHeaders prepares the headers for an HTTP request. -// -// Parameters: -// - idempotencyKey: A string representing the idempotency key to be included in the headers. -// If the idempotency key is an empty string, it will not be included in the headers. -// -// Returns: -// - map[string]string: A map containing the headers for the HTTP request. -func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string { - headers := map[string]string{ - "Content-Type": "application/json", - "Authorization": fmt.Sprintf("Bearer %s", app.APIKey), - } - if idempotencyKey != "" { - headers["x-idempotency-key"] = idempotencyKey - } - return headers -} - -// makeRequest makes a request to the specified URL with the provided method, data, headers, and options. -// -// Parameters: -// - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE"). -// - url: The URL to send the request to. -// - data: The data to be sent in the request body. -// - headers: The headers to be included in the request. -// - action: A string describing the action being performed. -// - opts: Optional request options. -// -// Returns: -// - []byte: The response body from the request. -// - error: An error if the request fails. -func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) { - var body []byte - var err error - if data != nil { - body, err = json.Marshal(data) - if err != nil { - return nil, err - } - } - - req, err := http.NewRequest(method, url, bytes.NewBuffer(body)) - if err != nil { - return nil, err - } - - for key, value := range headers { - req.Header.Set(key, value) - } - - var resp *http.Response - options := newRequestOptions(opts...) - for i := 0; i < options.retries; i++ { - resp, err = app.Client.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - if resp.StatusCode != 502 { - break - } - - time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond) - } - - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return nil, err - } - - statusCode := resp.StatusCode - if statusCode != 200 { - return nil, app.handleError(statusCode, respBody, action) - } - - return respBody, nil -} - -// monitorJobStatus monitors the status of a crawl job using the Firecrawl API. -// -// Parameters: -// - jobID: The ID of the crawl job to monitor. -// - headers: The headers to be included in the request. -// - pollInterval: The interval (in seconds) at which to poll the job status. -// -// Returns: -// - []*FirecrawlDocument: The crawl result if the job is completed. -// - error: An error if the crawl status check request fails. 
-func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) { - attempts := 0 - for { - resp, err := app.makeRequest( - http.MethodGet, - fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID), - nil, - headers, - "check crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var statusData JobStatusResponse - err = json.Unmarshal(resp, &statusData) - if err != nil { - return nil, err - } - - status := statusData.Status - if status == "" { - return nil, fmt.Errorf("invalid status in response") - } - - if status == "completed" { - if statusData.Data != nil { - return statusData.Data, nil - } - attempts++ - if attempts > 3 { - return nil, fmt.Errorf("crawl job completed but no data was returned") - } - } else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" { - pollInterval = max(pollInterval, 2) - time.Sleep(time.Duration(pollInterval) * time.Second) - } else { - return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status) - } - } -} - -// handleError handles errors returned by the Firecrawl API. -// -// Parameters: -// - resp: The HTTP response object. -// - body: The response body from the HTTP response. -// - action: A string describing the action being performed. -// -// Returns: -// - error: An error describing the failure reason. -func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error { - var errorData map[string]any - err := json.Unmarshal(body, &errorData) - if err != nil { - return fmt.Errorf("failed to parse error response: %v", err) - } - - errorMessage, _ := errorData["error"].(string) - if errorMessage == "" { - errorMessage = "No additional error details provided." - } - - var message string - switch statusCode { - case 402: - message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage) - case 408: - message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage) - case 409: - message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage) - case 500: - message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage) - default: - message = fmt.Sprintf("Unexpected error during %s: Status code %d. 
%s", action, statusCode, errorMessage) - } - - return fmt.Errorf(message) -} diff --git a/apps/go-sdk/firecrawl/firecrawl_test.go b/apps/go-sdk/firecrawl/firecrawl_test.go deleted file mode 100644 index 9d56c7ac..00000000 --- a/apps/go-sdk/firecrawl/firecrawl_test.go +++ /dev/null @@ -1,292 +0,0 @@ -package firecrawl - -import ( - "log" - "os" - "testing" - "time" - - "github.com/google/uuid" - "github.com/joho/godotenv" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var API_URL string -var TEST_API_KEY string - -func init() { - err := godotenv.Load("../.env") - if err != nil { - log.Fatalf("Error loading .env file: %v", err) - } - API_URL = os.Getenv("API_URL") - TEST_API_KEY = os.Getenv("TEST_API_KEY") -} - -func TestNoAPIKey(t *testing.T) { - _, err := NewFirecrawlApp("", API_URL) - assert.Error(t, err) - assert.Contains(t, err.Error(), "no API key provided") -} - -func TestScrapeURLInvalidAPIKey(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL) - require.NoError(t, err) - - _, err = app.ScrapeURL("https://firecrawl.dev", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") -} - -func TestBlocklistedURL(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) - require.NoError(t, err) - - _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.") -} - -func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) { - app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL) - require.NoError(t, err) - - response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) - require.NoError(t, err) - assert.NotNil(t, response) - - assert.Contains(t, response.Content, "_Roast_") -} - -func TestScrapeURLE2E(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) - require.NoError(t, err) - - response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) - require.NoError(t, err) - assert.NotNil(t, response) - - assert.Contains(t, response.Content, "_Roast_") - assert.NotEqual(t, response.Markdown, "") - assert.NotNil(t, response.Metadata) - assert.Equal(t, response.HTML, "") -} - -func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) - require.NoError(t, err) - - params := map[string]any{ - "pageOptions": map[string]any{ - "includeHtml": true, - }, - } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) - require.NoError(t, err) - assert.NotNil(t, response) - - assert.Contains(t, response.Content, "_Roast_") - assert.Contains(t, response.Markdown, "_Roast_") - assert.Contains(t, response.HTML, " { -// Crawl a website: -const idempotencyKey = uuidv4(); // optional -const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey); -console.log(crawlResult) + // Scrape a website: + const scrapeResult = await app.scrapeUrl('firecrawl.dev'); -const jobId = await crawlResult['jobId']; -console.log(jobId); - -let job; -while (true) { - job = await app.checkCrawlStatus(jobId); - if (job.status == 'completed') { - break; + if (scrapeResult.success) { + console.log(scrapeResult.markdown) } - await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 
second -} -console.log(job.data[0].content); + // Crawl a website: + const crawlResult = await app.crawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); + console.log(crawlResult); -// Search for a query: -const query = 'what is mendable?' -const searchResult = await app.search(query) -console.log(searchResult) + // Asynchronously crawl a website: + const asyncCrawlResult = await app.asyncCrawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); + + if (asyncCrawlResult.success) { + const id = asyncCrawlResult.id; + console.log(id); -// LLM Extraction: -// Define schema to extract contents into using zod schema -const zodSchema = z.object({ - top: z - .array( - z.object({ - title: z.string(), - points: z.number(), - by: z.string(), - commentsURL: z.string(), - }) - ) - .length(5) - .describe("Top 5 stories on Hacker News"), -}); + let checkStatus; + if (asyncCrawlResult.success) { + while (true) { + checkStatus = await app.checkCrawlStatus(id); + if (checkStatus.success && checkStatus.status === 'completed') { + break; + } + await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second + } -let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: zodSchema }, -}); - -console.log(llmExtractionResult.data.llm_extraction); - -// Define schema to extract contents into using json schema -const jsonSchema = { - "type": "object", - "properties": { - "top": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "points": {"type": "number"}, - "by": {"type": "string"}, - "commentsURL": {"type": "string"} - }, - "required": ["title", "points", "by", "commentsURL"] - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News" + if (checkStatus.success && checkStatus.data) { + console.log(checkStatus.data[0].markdown); + } } - }, - "required": ["top"] + } + + // Map a website: + const mapResult = await app.mapUrl('https://firecrawl.dev'); + console.log(mapResult) + + + // Crawl a website with WebSockets: + const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); + + watch.addEventListener("document", doc => { + console.log("DOC", doc.detail); + }); + + watch.addEventListener("error", err => { + console.error("ERR", err.detail.error); + }); + + watch.addEventListener("done", state => { + console.log("DONE", state.detail.status); + }); } -llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: jsonSchema }, -}); - -console.log(llmExtractionResult.data.llm_extraction); \ No newline at end of file +main() diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts index f314c080..7412e479 100644 --- a/apps/js-sdk/example.ts +++ b/apps/js-sdk/example.ts @@ -1,92 +1,61 @@ -import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js'; -import { z } from "zod"; +import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from 'firecrawl'; const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); -// Scrape a website: -const scrapeResult = await app.scrapeUrl('firecrawl.dev'); +const main = async () => { -if (scrapeResult.data) { - console.log(scrapeResult.data.content) -} + // Scrape a website: + const scrapeResult = await app.scrapeUrl('firecrawl.dev'); -// Crawl a website: -const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); 
-console.log(crawlResult) - -const jobId: string = await crawlResult['jobId']; -console.log(jobId); - -let job: JobStatusResponse; -while (true) { - job = await app.checkCrawlStatus(jobId); - if (job.status === 'completed') { - break; + if (scrapeResult.success) { + console.log(scrapeResult.markdown) } - await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second -} -if (job.data) { - console.log(job.data[0].content); -} + // Crawl a website: + const crawlResult = await app.crawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); + console.log(crawlResult); -// Search for a query: -const query = 'what is mendable?' -const searchResult = await app.search(query) + // Asynchronously crawl a website: + const asyncCrawlResult = await app.asyncCrawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); + + if (asyncCrawlResult.success) { + const id = asyncCrawlResult.id; + console.log(id); -// LLM Extraction: -// Define schema to extract contents into using zod schema -const zodSchema = z.object({ - top: z - .array( - z.object({ - title: z.string(), - points: z.number(), - by: z.string(), - commentsURL: z.string(), - }) - ) - .length(5) - .describe("Top 5 stories on Hacker News"), -}); + let checkStatus: CrawlStatusResponse | ErrorResponse; + if (asyncCrawlResult.success) { + while (true) { + checkStatus = await app.checkCrawlStatus(id); + if (checkStatus.success && checkStatus.status === 'completed') { + break; + } + await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second + } -let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: zodSchema }, -}); - -if (llmExtractionResult.data) { - console.log(llmExtractionResult.data.llm_extraction); -} - -// Define schema to extract contents into using json schema -const jsonSchema = { - "type": "object", - "properties": { - "top": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "points": {"type": "number"}, - "by": {"type": "string"}, - "commentsURL": {"type": "string"} - }, - "required": ["title", "points", "by", "commentsURL"] - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News" + if (checkStatus.success && checkStatus.data) { + console.log(checkStatus.data[0].markdown); + } } - }, - "required": ["top"] -} - -llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: jsonSchema }, -}); - -if (llmExtractionResult.data) { - console.log(llmExtractionResult.data.llm_extraction); + } + + // Map a website: + const mapResult = await app.mapUrl('https://firecrawl.dev'); + console.log(mapResult) + + // Crawl a website with WebSockets: + const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); + + watch.addEventListener("document", doc => { + console.log("DOC", doc.detail); + }); + + watch.addEventListener("error", err => { + console.error("ERR", err.detail.error); + }); + + watch.addEventListener("done", state => { + console.log("DONE", state.detail.status); + }); } +main() \ No newline at end of file diff --git a/apps/go-sdk/firecrawl/LICENSE b/apps/js-sdk/firecrawl/LICENSE similarity index 100% rename from apps/go-sdk/firecrawl/LICENSE rename to apps/js-sdk/firecrawl/LICENSE diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index d916bf70..0f3a6824 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -1,10 
+1,10 @@ -# Firecrawl JavaScript SDK +# Firecrawl Node SDK -The Firecrawl JavaScript SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. +The Firecrawl Node SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. ## Installation -To install the Firecrawl JavaScript SDK, you can use npm: +To install the Firecrawl Node SDK, you can use npm: ```bash npm install @mendable/firecrawl-js @@ -15,44 +15,31 @@ npm install @mendable/firecrawl-js 1. Get an API key from [firecrawl.dev](https://firecrawl.dev) 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - Here's an example of how to use the SDK with error handling: ```js - import FirecrawlApp from '@mendable/firecrawl-js'; +import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js'; - async function main() { - try { - // Initialize the FirecrawlApp with your API key - const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" }); +const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); - // Scrape a single URL - const url = 'https://mendable.ai'; - const scrapedData = await app.scrapeUrl(url); - console.log(scrapedData); - - // Crawl a website - const crawlUrl = 'https://mendable.ai'; - const params = { - crawlerOptions: { - excludes: ['blog/'], - includes: [], // leave empty for all pages - limit: 1000, - }, - pageOptions: { - onlyMainContent: true - } - }; +// Scrape a website +const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', { + formats: ['markdown', 'html'], +}); - const crawlResult = await app.crawlUrl(crawlUrl, params); - console.log(crawlResult); +if (scrapeResponse) { + console.log(scrapeResponse) +} - } catch (error) { - console.error('An error occurred:', error.message); - } +// Crawl a website +const crawlResponse = await app.crawlUrl('https://firecrawl.dev', { + limit: 100, + scrapeOptions: { + formats: ['markdown', 'html'], } +}) - main(); +console.log(crawlResponse) ``` ### Scraping a URL @@ -60,31 +47,54 @@ Here's an example of how to use the SDK with error handling: To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary. ```js - async function scrapeExample() { - try { - const url = 'https://example.com'; - const scrapedData = await app.scrapeUrl(url); - console.log(scrapedData); +const url = "https://example.com"; +const scrapedData = await app.scrapeUrl(url); +``` - } catch (error) { - console.error( - 'Error occurred while scraping:', - error.message - ); - } +### Crawling a Website + +To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. + +```js +const crawlResponse = await app.crawlUrl('https://firecrawl.dev', { + limit: 100, + scrapeOptions: { + formats: ['markdown', 'html'], } - - scrapeExample(); +}) +``` + + +### Asynchronous Crawl + +To initiate an asynchronous crawl of a website, utilize the AsyncCrawlURL method. 
This method requires the starting URL and optional parameters as inputs. The params argument enables you to define various settings for the asynchronous crawl, such as the maximum number of pages to crawl, permitted domains, and the output format. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the crawl. + +```js +const asyncCrawlResult = await app.asyncCrawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); +``` + +### Checking Crawl Status + +To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job. + +```js +const status = await app.checkCrawlStatus(id); +``` + ### Extracting structured data from a URL -With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it easier for you too. Here is how you to use it: +With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it easier for you too. Here is how to use it: ```js +import FirecrawlApp from "@mendable/firecrawl-js"; import { z } from "zod"; -const zodSchema = z.object({ +const app = new FirecrawlApp({ + apiKey: "fc-YOUR_API_KEY", +}); + +// Define schema to extract contents into +const schema = z.object({ top: z .array( z.object({ @@ -98,98 +108,53 @@ const zodSchema = z.object({ .describe("Top 5 stories on Hacker News"), }); -let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: zodSchema }, +const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", { + extractorOptions: { extractionSchema: schema }, }); -console.log(llmExtractionResult.data.llm_extraction); +console.log(scrapeResult.data["llm_extraction"]); ``` -### Search for a query +### Map a Website -Used to search the web, get the most relevant results, scrap each page and return the markdown. +Use `mapUrl` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap. ```js -query = 'what is mendable?' -searchResult = app.search(query) +const mapResult = await app.mapUrl('https://example.com') as MapResponse; +console.log(mapResult) ``` -### Crawling a Website +### Crawl a website with WebSockets -To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. +To crawl a website with WebSockets, use the `crawlUrlAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. 
```js -async function crawlExample() { - try { - const crawlUrl = 'https://example.com'; - const params = { - crawlerOptions: { - excludes: ['blog/'], - includes: [], // leave empty for all pages - limit: 1000, - }, - pageOptions: { - onlyMainContent: true - } - }; - const waitUntilDone = true; - const timeout = 5; - const crawlResult = await app.crawlUrl( - crawlUrl, - params, - waitUntilDone, - timeout - ); +// Crawl a website with WebSockets: +const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5}); - console.log(crawlResult); +watch.addEventListener("document", doc => { + console.log("DOC", doc.detail); +}); - } catch (error) { - console.error( - 'Error occurred while crawling:', - error.message - ); - } -} +watch.addEventListener("error", err => { + console.error("ERR", err.detail.error); +}); -crawlExample(); -``` - - -### Checking Crawl Status - -To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job. - -```js -async function checkStatusExample(jobId) { - try { - const status = await app.checkCrawlStatus(jobId); - console.log(status); - - } catch (error) { - console.error( - 'Error occurred while checking crawl status:', - error.message - ); - } -} -// Example usage, assuming you have a jobId -checkStatusExample('your_job_id_here'); -``` - -## Running Locally -To use the SDK when running Firecrawl locally, you can change the initial Firecrawl app instance to: -```js -const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY", apiUrl: "http://localhost:3002" }); +watch.addEventListener("done", state => { + console.log("DONE", state.detail.status); +}); ``` ## Error Handling The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks. -## Contributing - -Contributions to the Firecrawl JavaScript SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. - ## License -The Firecrawl JavaScript SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). +The Firecrawl Node SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions: + +- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details. 
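For context on the webhook changes in `apps/api/src/services/webhook.ts` above: for v1 crawls, `callWebhook` now posts events of type `crawl.started`, `crawl.page`, `crawl.completed`, or `crawl.failed` (see `WebhookEventType` in `apps/api/src/types.ts`), carrying the crawl `id`, a `data` array of converted documents, and `success`/`error` fields. Below is a minimal sketch of what a receiving endpoint could look like, assuming an Express server; the endpoint path, port, and simplified document shape are illustrative assumptions, not part of this patch.

```ts
import express from "express";

// Approximate shape of the v1 payload assembled in callWebhook
// (simplified; the real documents come from legacyDocumentConverter).
type FirecrawlWebhookEvent = {
  success: boolean;
  type: "crawl.started" | "crawl.page" | "crawl.completed" | "crawl.failed";
  id: string; // v1 payloads use "id"; legacy payloads use "jobId"
  data: { markdown?: string; metadata?: Record<string, unknown> }[];
  error?: string;
};

const server = express();
server.use(express.json());

// Hypothetical receiver; its URL would be passed as the crawl's `webhook` option.
server.post("/firecrawl-webhook", (req, res) => {
  const event = req.body as FirecrawlWebhookEvent;

  // Acknowledge immediately: with awaitWebhook, callWebhook waits on this
  // response with a 10 s timeout for v1 payloads (30 s otherwise).
  res.sendStatus(200);

  switch (event.type) {
    case "crawl.page":
      console.log(`crawl ${event.id}: page received`, event.data[0]?.markdown?.slice(0, 80));
      break;
    case "crawl.completed":
      console.log(`crawl ${event.id}: completed`);
      break;
    case "crawl.failed":
      console.error(`crawl ${event.id}: failed`, event.error);
      break;
    default:
      console.log(`crawl ${event.id}: ${event.type}`);
  }
});

server.listen(3000);
```

Keeping the handler fast matters because, per the diff above, a slow or failing webhook response is only logged as an Axios error on the API side rather than retried.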
diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js deleted file mode 100644 index dbc2d6b9..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/index.js +++ /dev/null @@ -1,271 +0,0 @@ -"use strict"; -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const axios_1 = __importDefault(require("axios")); -const zod_1 = require("zod"); -const zod_to_json_schema_1 = require("zod-to-json-schema"); -/** - * Main class for interacting with the Firecrawl API. - */ -class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof zod_1.z.ZodSchema) { - schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. 
- * @returns {Promise} The response from the search operation. - */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. 
- */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios_1.default.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios_1.default.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} -exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json deleted file mode 100644 index b731bd61..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js deleted file mode 100644 index 99de5e2b..00000000 --- a/apps/js-sdk/firecrawl/build/esm/index.js +++ /dev/null @@ -1,265 +0,0 @@ -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? 
value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -import axios from "axios"; -import { z } from "zod"; -import { zodToJsonSchema } from "zod-to-json-schema"; -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. 
Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. 
- * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. 
Status code: ${response.status}`); - } - } -} diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json deleted file mode 100644 index 6990891f..00000000 --- a/apps/js-sdk/firecrawl/build/esm/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "module"} diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 4d9254ac..2dcca44d 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,17 +1,17 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.36", + "version": "1.3.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "0.0.36", + "version": "1.3.0", "license": "MIT", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", - "uuid": "^9.0.1", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" }, @@ -23,9 +23,12 @@ "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", + "dotenv": "^16.4.5", "jest": "^29.7.0", "ts-jest": "^29.2.2", - "typescript": "^5.4.5" + "tsup": "^8.2.4", + "typescript": "^5.4.5", + "uuid": "^9.0.1" } }, "node_modules/@ampproject/remapping": { @@ -598,6 +601,486 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.23.1.tgz", + "integrity": "sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.23.1.tgz", + "integrity": "sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.23.1.tgz", + "integrity": "sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.23.1.tgz", + "integrity": "sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.23.1.tgz", + "integrity": "sha512-YsS2e3Wtgnw7Wq53XXBLcV6JhRsEq8hkfg91ESVadIrzr9wO6jJDMZnCQbHm1Guc5t/CdDiFSSfWP58FNuvT3Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.23.1.tgz", + 
"integrity": "sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.23.1.tgz", + "integrity": "sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.23.1.tgz", + "integrity": "sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.23.1.tgz", + "integrity": "sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.23.1.tgz", + "integrity": "sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.23.1.tgz", + "integrity": "sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.23.1.tgz", + "integrity": "sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.23.1.tgz", + "integrity": "sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==", + "cpu": [ + "mips64el" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.23.1.tgz", + "integrity": "sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.23.1", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.23.1.tgz", + "integrity": "sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.23.1.tgz", + "integrity": "sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.23.1.tgz", + "integrity": "sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.23.1.tgz", + "integrity": "sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.23.1.tgz", + "integrity": "sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.23.1.tgz", + "integrity": "sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.23.1.tgz", + "integrity": "sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.23.1.tgz", + "integrity": "sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.23.1.tgz", + "integrity": "sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.23.1", + "resolved": 
"https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.23.1.tgz", + "integrity": "sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@isaacs/cliui": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", + "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", + "dev": true, + "dependencies": { + "string-width": "^5.1.2", + "string-width-cjs": "npm:string-width@^4.2.0", + "strip-ansi": "^7.0.1", + "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", + "wrap-ansi": "^8.1.0", + "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@isaacs/cliui/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/emoji-regex": { + "version": "9.2.2", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", + "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", + "dev": true + }, + "node_modules/@isaacs/cliui/node_modules/string-width": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", + "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", + "dev": true, + "dependencies": { + "eastasianwidth": "^0.2.0", + "emoji-regex": "^9.2.2", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@isaacs/cliui/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "dev": true, + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/wrap-ansi": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", + "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", + "dev": true, + "dependencies": { + "ansi-styles": "^6.1.0", + "string-width": "^5.0.1", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", "resolved": 
"https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", @@ -949,6 +1432,259 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@pkgjs/parseargs": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", + "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", + "dev": true, + "optional": true, + "engines": { + "node": ">=14" + } + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.21.2.tgz", + "integrity": "sha512-fSuPrt0ZO8uXeS+xP3b+yYTCBUd05MoSp2N/MFOgjhhUhMmchXlpTQrTpI8T+YAwAQuK7MafsCOxW7VrPMrJcg==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.21.2.tgz", + "integrity": "sha512-xGU5ZQmPlsjQS6tzTTGwMsnKUtu0WVbl0hYpTPauvbRAnmIvpInhJtgjj3mcuJpEiuUw4v1s4BimkdfDWlh7gA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.21.2.tgz", + "integrity": "sha512-99AhQ3/ZMxU7jw34Sq8brzXqWH/bMnf7ZVhvLk9QU2cOepbQSVTns6qoErJmSiAvU3InRqC2RRZ5ovh1KN0d0Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.21.2.tgz", + "integrity": "sha512-ZbRaUvw2iN/y37x6dY50D8m2BnDbBjlnMPotDi/qITMJ4sIxNY33HArjikDyakhSv0+ybdUxhWxE6kTI4oX26w==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.21.2.tgz", + "integrity": "sha512-ztRJJMiE8nnU1YFcdbd9BcH6bGWG1z+jP+IPW2oDUAPxPjo9dverIOyXz76m6IPA6udEL12reYeLojzW2cYL7w==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.21.2.tgz", + "integrity": "sha512-flOcGHDZajGKYpLV0JNc0VFH361M7rnV1ee+NTeC/BQQ1/0pllYcFmxpagltANYt8FYf9+kL6RSk80Ziwyhr7w==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.21.2.tgz", + "integrity": "sha512-69CF19Kp3TdMopyteO/LJbWufOzqqXzkrv4L2sP8kfMaAQ6iwky7NoXTp7bD6/irKgknDKM0P9E/1l5XxVQAhw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.21.2.tgz", + "integrity": "sha512-48pD/fJkTiHAZTnZwR0VzHrao70/4MlzJrq0ZsILjLW/Ab/1XlVUStYyGt7tdyIiVSlGZbnliqmult/QGA2O2w==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.21.2.tgz", + "integrity": "sha512-cZdyuInj0ofc7mAQpKcPR2a2iu4YM4FQfuUzCVA2u4HI95lCwzjoPtdWjdpDKyHxI0UO82bLDoOaLfpZ/wviyQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.21.2.tgz", + "integrity": "sha512-RL56JMT6NwQ0lXIQmMIWr1SW28z4E4pOhRRNqwWZeXpRlykRIlEpSWdsgNWJbYBEWD84eocjSGDu/XxbYeCmwg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.21.2.tgz", + "integrity": "sha512-PMxkrWS9z38bCr3rWvDFVGD6sFeZJw4iQlhrup7ReGmfn7Oukrr/zweLhYX6v2/8J6Cep9IEA/SmjXjCmSbrMQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.21.2.tgz", + "integrity": "sha512-B90tYAUoLhU22olrafY3JQCFLnT3NglazdwkHyxNDYF/zAxJt5fJUB/yBoWFoIQ7SQj+KLe3iL4BhOMa9fzgpw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.21.2.tgz", + "integrity": "sha512-7twFizNXudESmC9oneLGIUmoHiiLppz/Xs5uJQ4ShvE6234K0VB1/aJYU3f/4g7PhssLGKBVCC37uRkkOi8wjg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.21.2.tgz", + "integrity": "sha512-9rRero0E7qTeYf6+rFh3AErTNU1VCQg2mn7CQcI44vNUWM9Ze7MSRS/9RFuSsox+vstRt97+x3sOhEey024FRQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + 
"node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.21.2.tgz", + "integrity": "sha512-5rA4vjlqgrpbFVVHX3qkrCo/fZTj1q0Xxpg+Z7yIo3J2AilW7t2+n6Q8Jrx+4MrYpAnjttTYF8rr7bP46BPzRw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.21.2.tgz", + "integrity": "sha512-6UUxd0+SKomjdzuAcp+HAmxw1FlGBnl1v2yEPSabtx4lBfdXHDVsW7+lQkgz9cNFJGY3AWR7+V8P5BqkD9L9nA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", @@ -1034,6 +1770,12 @@ "dotenv": "*" } }, + "node_modules/@types/estree": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz", + "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==", + "dev": true + }, "node_modules/@types/graceful-fs": { "version": "4.1.9", "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz", @@ -1158,6 +1900,12 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, + "node_modules/any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", + "dev": true + }, "node_modules/anymatch": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", @@ -1180,6 +1928,15 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/array-union": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/async": { "version": "3.2.5", "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", @@ -1314,6 +2071,18 @@ "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", "dev": true }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -1395,6 +2164,30 @@ "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", "dev": true }, + "node_modules/bundle-require": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/bundle-require/-/bundle-require-5.0.0.tgz", + "integrity": "sha512-GuziW3fSSmopcx4KRymQEJVbZUfqlCqcq7dvs6TYwKRZiegK/2buMxQTPs6MGlNv50wms1699qYO54R8XfRX4w==", + "dev": true, + "dependencies": { + "load-tsconfig": "^0.2.3" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "peerDependencies": { + "esbuild": 
">=0.18" + } + }, + "node_modules/cac": { + "version": "6.7.14", + "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", + "integrity": "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -1458,6 +2251,30 @@ "node": ">=10" } }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "dev": true, + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, "node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -1538,12 +2355,30 @@ "node": ">= 0.8" } }, + "node_modules/commander": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", + "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", + "dev": true, + "engines": { + "node": ">= 6" + } + }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", "dev": true }, + "node_modules/consola": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/consola/-/consola-3.2.3.tgz", + "integrity": "sha512-I5qxpzLv+sJhTVEoLYNcTW+bThDCPsit0vLNKShZx6rLtpilNpmmeTPaeqJb9ZE9dV3DGaeby6Vuhrw38WjeyQ==", + "dev": true, + "engines": { + "node": "^14.18.0 || >=16.10.0" + } + }, "node_modules/convert-source-map": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", @@ -1586,12 +2421,12 @@ } }, "node_modules/debug": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", - "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz", + "integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==", "dev": true, "dependencies": { - "ms": "2.1.2" + "ms": "^2.1.3" }, "engines": { "node": ">=6.0" @@ -1651,10 +2486,23 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/dir-glob": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "integrity": "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==", + "dev": true, + "dependencies": { + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/dotenv": { "version": "16.4.5", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "dev": true, "engines": { "node": ">=12" }, @@ -1662,6 +2510,12 @@ "url": "https://dotenvx.com" } 
}, + "node_modules/eastasianwidth": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", + "dev": true + }, "node_modules/ejs": { "version": "3.1.10", "resolved": "https://registry.npmjs.org/ejs/-/ejs-3.1.10.tgz", @@ -1710,6 +2564,45 @@ "is-arrayish": "^0.2.1" } }, + "node_modules/esbuild": { + "version": "0.23.1", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.23.1.tgz", + "integrity": "sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==", + "dev": true, + "hasInstallScript": true, + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.23.1", + "@esbuild/android-arm": "0.23.1", + "@esbuild/android-arm64": "0.23.1", + "@esbuild/android-x64": "0.23.1", + "@esbuild/darwin-arm64": "0.23.1", + "@esbuild/darwin-x64": "0.23.1", + "@esbuild/freebsd-arm64": "0.23.1", + "@esbuild/freebsd-x64": "0.23.1", + "@esbuild/linux-arm": "0.23.1", + "@esbuild/linux-arm64": "0.23.1", + "@esbuild/linux-ia32": "0.23.1", + "@esbuild/linux-loong64": "0.23.1", + "@esbuild/linux-mips64el": "0.23.1", + "@esbuild/linux-ppc64": "0.23.1", + "@esbuild/linux-riscv64": "0.23.1", + "@esbuild/linux-s390x": "0.23.1", + "@esbuild/linux-x64": "0.23.1", + "@esbuild/netbsd-x64": "0.23.1", + "@esbuild/openbsd-arm64": "0.23.1", + "@esbuild/openbsd-x64": "0.23.1", + "@esbuild/sunos-x64": "0.23.1", + "@esbuild/win32-arm64": "0.23.1", + "@esbuild/win32-ia32": "0.23.1", + "@esbuild/win32-x64": "0.23.1" + } + }, "node_modules/escalade": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.2.tgz", @@ -1789,12 +2682,37 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/fast-glob": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.4" + }, + "engines": { + "node": ">=8.6.0" + } + }, "node_modules/fast-json-stable-stringify": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", "dev": true }, + "node_modules/fastq": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", + "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", + "dev": true, + "dependencies": { + "reusify": "^1.0.4" + } + }, "node_modules/fb-watchman": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz", @@ -1878,6 +2796,34 @@ } } }, + "node_modules/foreground-child": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz", + "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==", + "dev": true, + "dependencies": { + "cross-spawn": "^7.0.0", + "signal-exit": "^4.0.1" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": 
"https://github.com/sponsors/isaacs" + } + }, + "node_modules/foreground-child/node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/form-data": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", @@ -1979,6 +2925,18 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/globals": { "version": "11.12.0", "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz", @@ -1988,6 +2946,26 @@ "node": ">=4" } }, + "node_modules/globby": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.1.0.tgz", + "integrity": "sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==", + "dev": true, + "dependencies": { + "array-union": "^2.1.0", + "dir-glob": "^3.0.1", + "fast-glob": "^3.2.9", + "ignore": "^5.2.0", + "merge2": "^1.4.1", + "slash": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -2030,6 +3008,15 @@ "node": ">=10.17.0" } }, + "node_modules/ignore": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", + "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", + "dev": true, + "engines": { + "node": ">= 4" + } + }, "node_modules/import-local": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.1.0.tgz", @@ -2080,6 +3067,18 @@ "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", "dev": true }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/is-core-module": { "version": "2.13.1", "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.1.tgz", @@ -2092,6 +3091,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-fullwidth-code-point": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", @@ -2110,6 +3118,18 @@ "node": ">=6" } }, + "node_modules/is-glob": { + 
"version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-number": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", @@ -2137,6 +3157,20 @@ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "dev": true }, + "node_modules/isows": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz", + "integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/wagmi-dev" + } + ], + "peerDependencies": { + "ws": "*" + } + }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -2236,6 +3270,21 @@ "node": ">=8" } }, + "node_modules/jackspeak": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", + "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", + "dev": true, + "dependencies": { + "@isaacs/cliui": "^8.0.2" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + }, + "optionalDependencies": { + "@pkgjs/parseargs": "^0.11.0" + } + }, "node_modules/jake": { "version": "10.9.1", "resolved": "https://registry.npmjs.org/jake/-/jake-10.9.1.tgz", @@ -2842,6 +3891,15 @@ "url": "https://github.com/chalk/supports-color?sponsor=1" } }, + "node_modules/joycon": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/joycon/-/joycon-3.1.1.tgz", + "integrity": "sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==", + "dev": true, + "engines": { + "node": ">=10" + } + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -2909,12 +3967,33 @@ "node": ">=6" } }, + "node_modules/lilconfig": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz", + "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", "dev": true }, + "node_modules/load-tsconfig": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/load-tsconfig/-/load-tsconfig-0.2.5.tgz", + "integrity": "sha512-IXO6OCs9yg8tMKzfPZ1YmheJbZCiEsnBdcB03l0OcfK9prKnJb96siuHCr5Fl37/yo9DnKU+TLpxzTUspw9shg==", + "dev": true, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + } + }, "node_modules/locate-path": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", @@ -2933,6 +4012,12 @@ "integrity": "sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==", "dev": true }, + "node_modules/lodash.sortby": { + 
"version": "4.7.0", + "resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz", + "integrity": "sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA==", + "dev": true + }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -3011,6 +4096,15 @@ "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", "dev": true }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, "node_modules/micromatch": { "version": "4.0.5", "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz", @@ -3064,12 +4158,32 @@ "node": "*" } }, + "node_modules/minipass": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", + "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "dev": true, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, "node_modules/ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "dev": true }, + "node_modules/mz": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", + "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", + "dev": true, + "dependencies": { + "any-promise": "^1.0.0", + "object-assign": "^4.0.1", + "thenify-all": "^1.0.0" + } + }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -3109,6 +4223,15 @@ "node": ">=8" } }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -3184,6 +4307,12 @@ "node": ">=6" } }, + "node_modules/package-json-from-dist": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.0.tgz", + "integrity": "sha512-dATvCeZN/8wQsGywez1mzHtTlP22H8OEfPrVMLNr4/eGa+ijtLn/6M5f0dY8UKNrC2O9UCU6SSoG3qRKnt7STw==", + "dev": true + }, "node_modules/parse-json": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", @@ -3235,6 +4364,37 @@ "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", "dev": true }, + "node_modules/path-scurry": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", + "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", + "dev": true, + "dependencies": 
{ + "lru-cache": "^10.2.0", + "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" + }, + "engines": { + "node": ">=16 || 14 >=14.18" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/path-scurry/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "dev": true + }, + "node_modules/path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/picocolors": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.1.tgz", @@ -3274,6 +4434,48 @@ "node": ">=8" } }, + "node_modules/postcss-load-config": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-6.0.1.tgz", + "integrity": "sha512-oPtTM4oerL+UXmx+93ytZVN82RrlY/wPUV8IeDxFrzIjXOLF1pN+EmKPLbubvKHT2HC20xXsCAH2Z+CKV6Oz/g==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "lilconfig": "^3.1.1" + }, + "engines": { + "node": ">= 18" + }, + "peerDependencies": { + "jiti": ">=1.21.0", + "postcss": ">=8.0.9", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "jiti": { + "optional": true + }, + "postcss": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, "node_modules/pretty-format": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", @@ -3318,6 +4520,15 @@ "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, "node_modules/pure-rand": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", @@ -3334,12 +4545,44 @@ } ] }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, "node_modules/react-is": { "version": "18.2.0", "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==", "dev": true }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": 
"sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -3396,6 +4639,74 @@ "node": ">=10" } }, + "node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "dev": true, + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/rollup": { + "version": "4.21.2", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.21.2.tgz", + "integrity": "sha512-e3TapAgYf9xjdLvKQCkQTnbTKd4a6jwlpQSJJFokHGaX2IVjoEqkIIhiQfqsi0cdwlOD+tQGuOd5AJkc5RngBw==", + "dev": true, + "dependencies": { + "@types/estree": "1.0.5" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.21.2", + "@rollup/rollup-android-arm64": "4.21.2", + "@rollup/rollup-darwin-arm64": "4.21.2", + "@rollup/rollup-darwin-x64": "4.21.2", + "@rollup/rollup-linux-arm-gnueabihf": "4.21.2", + "@rollup/rollup-linux-arm-musleabihf": "4.21.2", + "@rollup/rollup-linux-arm64-gnu": "4.21.2", + "@rollup/rollup-linux-arm64-musl": "4.21.2", + "@rollup/rollup-linux-powerpc64le-gnu": "4.21.2", + "@rollup/rollup-linux-riscv64-gnu": "4.21.2", + "@rollup/rollup-linux-s390x-gnu": "4.21.2", + "@rollup/rollup-linux-x64-gnu": "4.21.2", + "@rollup/rollup-linux-x64-musl": "4.21.2", + "@rollup/rollup-win32-arm64-msvc": "4.21.2", + "@rollup/rollup-win32-ia32-msvc": "4.21.2", + "@rollup/rollup-win32-x64-msvc": "4.21.2", + "fsevents": "~2.3.2" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, "node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -3511,6 +4822,21 @@ "node": ">=8" } }, + "node_modules/string-width-cjs": { + "name": "string-width", + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/strip-ansi": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", @@ -3523,6 +4849,19 @@ "node": ">=8" } }, + "node_modules/strip-ansi-cjs": { + "name": "strip-ansi", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + 
"dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/strip-bom": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-4.0.0.tgz", @@ -3553,6 +4892,72 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/sucrase": { + "version": "3.35.0", + "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", + "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==", + "dev": true, + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.2", + "commander": "^4.0.0", + "glob": "^10.3.10", + "lines-and-columns": "^1.1.6", + "mz": "^2.7.0", + "pirates": "^4.0.1", + "ts-interface-checker": "^0.1.9" + }, + "bin": { + "sucrase": "bin/sucrase", + "sucrase-node": "bin/sucrase-node" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/sucrase/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/glob": { + "version": "10.4.5", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", + "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", + "dev": true, + "dependencies": { + "foreground-child": "^3.1.0", + "jackspeak": "^3.1.2", + "minimatch": "^9.0.4", + "minipass": "^7.1.2", + "package-json-from-dist": "^1.0.0", + "path-scurry": "^1.11.1" + }, + "bin": { + "glob": "dist/esm/bin.mjs" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/minimatch": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", + "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "dev": true, + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/supports-color": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", @@ -3591,6 +4996,27 @@ "node": ">=8" } }, + "node_modules/thenify": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", + "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", + "dev": true, + "dependencies": { + "any-promise": "^1.0.0" + } + }, + "node_modules/thenify-all": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", + "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", + "dev": true, + "dependencies": { + "thenify": ">= 3.1.0 < 4" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/tmpl": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", @@ -3618,6 +5044,30 @@ "node": ">=8.0" } }, + "node_modules/tr46": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-1.0.1.tgz", + "integrity": "sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==", + "dev": true, + "dependencies": { + "punycode": 
"^2.1.0" + } + }, + "node_modules/tree-kill": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", + "integrity": "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==", + "dev": true, + "bin": { + "tree-kill": "cli.js" + } + }, + "node_modules/ts-interface-checker": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", + "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", + "dev": true + }, "node_modules/ts-jest": { "version": "29.2.2", "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-29.2.2.tgz", @@ -3699,6 +5149,69 @@ "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", "dev": true }, + "node_modules/tsup": { + "version": "8.2.4", + "resolved": "https://registry.npmjs.org/tsup/-/tsup-8.2.4.tgz", + "integrity": "sha512-akpCPePnBnC/CXgRrcy72ZSntgIEUa1jN0oJbbvpALWKNOz1B7aM+UVDWGRGIO/T/PZugAESWDJUAb5FD48o8Q==", + "dev": true, + "dependencies": { + "bundle-require": "^5.0.0", + "cac": "^6.7.14", + "chokidar": "^3.6.0", + "consola": "^3.2.3", + "debug": "^4.3.5", + "esbuild": "^0.23.0", + "execa": "^5.1.1", + "globby": "^11.1.0", + "joycon": "^3.1.1", + "picocolors": "^1.0.1", + "postcss-load-config": "^6.0.1", + "resolve-from": "^5.0.0", + "rollup": "^4.19.0", + "source-map": "0.8.0-beta.0", + "sucrase": "^3.35.0", + "tree-kill": "^1.2.2" + }, + "bin": { + "tsup": "dist/cli-default.js", + "tsup-node": "dist/cli-node.js" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@microsoft/api-extractor": "^7.36.0", + "@swc/core": "^1", + "postcss": "^8.4.12", + "typescript": ">=4.5.0" + }, + "peerDependenciesMeta": { + "@microsoft/api-extractor": { + "optional": true + }, + "@swc/core": { + "optional": true + }, + "postcss": { + "optional": true + }, + "typescript": { + "optional": true + } + } + }, + "node_modules/tsup/node_modules/source-map": { + "version": "0.8.0-beta.0", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.8.0-beta.0.tgz", + "integrity": "sha512-2ymg6oRBpebeZi9UUNsgQ89bhx01TcTkmNTGnNO88imTmbSgy4nfujrgVEFKWpMTEGA11EDkTt7mqObTPdigIA==", + "dev": true, + "dependencies": { + "whatwg-url": "^7.0.0" + }, + "engines": { + "node": ">= 8" + } + }, "node_modules/type-detect": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", @@ -3733,6 +5246,11 @@ "node": ">=14.17" } }, + "node_modules/typescript-event-target": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz", + "integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==" + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -3773,6 +5291,7 @@ "version": "9.0.1", "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" @@ -3804,6 +5323,23 @@ "makeerror": "1.0.12" } }, + "node_modules/webidl-conversions": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-4.0.2.tgz", + "integrity": 
"sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==", + "dev": true + }, + "node_modules/whatwg-url": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-7.1.0.tgz", + "integrity": "sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==", + "dev": true, + "dependencies": { + "lodash.sortby": "^4.7.0", + "tr46": "^1.0.1", + "webidl-conversions": "^4.0.2" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -3836,6 +5372,24 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, + "node_modules/wrap-ansi-cjs": { + "name": "wrap-ansi", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", @@ -3855,6 +5409,27 @@ "node": "^12.13.0 || ^14.15.0 || >=16.0.0" } }, + "node_modules/ws": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", + "peer": true, + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 4b857b65..f6f14fb2 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,25 +1,22 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.36", + "version": "1.4.2", "description": "JavaScript SDK for Firecrawl API", - "main": "build/cjs/index.js", - "types": "types/index.d.ts", - "type": "module", + "main": "dist/index.js", + "types": "dist/index.d.ts", "exports": { - "require": { - "types": "./types/index.d.ts", - "default": "./build/cjs/index.js" - }, - "import": { - "types": "./types/index.d.ts", - "default": "./build/esm/index.js" + "./package.json": "./package.json", + ".": { + "import": "./dist/index.js", + "default": "./dist/index.cjs" } }, + "type": "module", "scripts": { - "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", + "build": "tsup", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", - "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" + "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/v1/**/*.test.ts" }, "repository": { "type": "git", @@ -29,8 +26,8 @@ "license": "MIT", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", - 
"uuid": "^9.0.1", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" }, @@ -39,6 +36,8 @@ }, "homepage": "https://github.com/mendableai/firecrawl#readme", "devDependencies": { + "uuid": "^9.0.1", + "dotenv": "^16.4.5", "@jest/globals": "^29.7.0", "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", @@ -48,6 +47,7 @@ "@types/uuid": "^9.0.8", "jest": "^29.7.0", "ts-jest": "^29.2.2", + "tsup": "^8.2.4", "typescript": "^5.4.5" }, "keywords": [ diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index ad917de4..7d107afe 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -1,160 +1,330 @@ -import FirecrawlApp from '../../index'; -import { v4 as uuidv4 } from 'uuid'; -import dotenv from 'dotenv'; -import { describe, test, expect } from '@jest/globals'; +import FirecrawlApp, { + CrawlResponseV0, + CrawlStatusResponse, + CrawlStatusResponseV0, + FirecrawlDocumentV0, + ScrapeResponseV0, + SearchResponseV0, +} from "../../index"; +import { v4 as uuidv4 } from "uuid"; +import dotenv from "dotenv"; +import { describe, test, expect } from "@jest/globals"; dotenv.config(); const TEST_API_KEY = process.env.TEST_API_KEY; const API_URL = "http://127.0.0.1:3002"; -describe('FirecrawlApp E2E Tests', () => { - test.concurrent('should throw error for no API key', async () => { +describe('FirecrawlApp<"v0"> E2E Tests', () => { + test.concurrent("should throw error for no API key", async () => { expect(() => { - new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + new FirecrawlApp<"v0">({ apiKey: null, apiUrl: API_URL, version: "v0" }); }).toThrow("No API key provided"); }); - test.concurrent('should throw error for invalid API key on scrape', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); - }); - - test.concurrent('should throw error for blocklisted URL on scrape', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const blocklistedUrl = "https://facebook.com/fake-test"; - await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); - }); - - test.concurrent('should return successful response with valid preview token', async () => { - const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain("_Roast_"); - }, 30000); // 30 seconds timeout - - test.concurrent('should return successful response for valid scrape', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain("_Roast_"); - expect(response.data).toHaveProperty('markdown'); - expect(response.data).toHaveProperty('metadata'); - expect(response.data).not.toHaveProperty('html'); - }, 30000); // 30 seconds timeout - - test.concurrent('should return successful response with valid API key and include HTML', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await 
app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain("_Roast_"); - expect(response.data?.markdown).toContain("_Roast_"); - expect(response.data?.html).toContain(" { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - }, 30000); // 30 seconds timeout - - test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001'); - expect(response).not.toBeNull(); - expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - }, 30000); // 30 seconds timeout - - test.concurrent('should throw error for invalid API key on crawl', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); - }); - - test.concurrent('should throw error for blocklisted URL on crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); - }); - - test.concurrent('should return successful response for crawl and wait for completion', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30); - expect(response).not.toBeNull(); - expect(response[0].content).toContain("_Roast_"); - }, 60000); // 60 seconds timeout - - test.concurrent('should handle idempotency key for crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const uniqueIdempotencyKey = uuidv4(); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey); - expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); - - await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); - }); - - test.concurrent('should check crawl status', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false); - expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); - - let statusResponse = await app.checkCrawlStatus(response.jobId); - const maxChecks = 15; - let checks = 0; - - while (statusResponse.status === 'active' && checks < maxChecks) { - await new Promise(resolve => setTimeout(resolve, 1000)); - expect(statusResponse.partial_data).not.toBeNull(); - expect(statusResponse.current).toBeGreaterThanOrEqual(1); - statusResponse = await app.checkCrawlStatus(response.jobId); - checks++; + test.concurrent( + 
"should throw error for invalid API key on scrape", + async () => { + const invalidApp = new FirecrawlApp<"v0">({ + apiKey: "invalid_api_key", + apiUrl: API_URL, + version: "v0", + }); + await expect( + invalidApp.scrapeUrl("https://roastmywebsite.ai") + ).rejects.toThrow("Request failed with status code 401"); } + ); - expect(statusResponse).not.toBeNull(); - expect(statusResponse.success).toBe(true); - expect(statusResponse.status).toBe('completed'); - expect(statusResponse.total).toEqual(statusResponse.current); - expect(statusResponse.current_step).not.toBeNull(); - expect(statusResponse?.data?.length).toBeGreaterThan(0); - }, 35000); // 35 seconds timeout + test.concurrent( + "should throw error for blocklisted URL on scrape", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow( + "Request failed with status code 403" + ); + } + ); - test.concurrent('should return successful response for search', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.search("test query"); + test.concurrent( + "should return successful response with valid preview token", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: "this_is_just_a_preview_token", + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://roastmywebsite.ai" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should return successful response for valid scrape", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://roastmywebsite.ai" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + expect(response.data).toHaveProperty("markdown"); + expect(response.data).toHaveProperty("metadata"); + expect(response.data).not.toHaveProperty("html"); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should return successful response with valid API key and include HTML", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl("https://roastmywebsite.ai", { + pageOptions: { includeHtml: true }, + })) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + expect(response.data?.markdown).toContain("_Roast_"); + expect(response.data?.html).toContain(" { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + "https://arxiv.org/pdf/astro-ph/9301001.pdf" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should return successful response for valid scrape with PDF file without explicit extension", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl( + 
"https://arxiv.org/pdf/astro-ph/9301001" + )) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should throw error for invalid API key on crawl", + async () => { + const invalidApp = new FirecrawlApp<"v0">({ + apiKey: "invalid_api_key", + apiUrl: API_URL, + version: "v0", + }); + await expect( + invalidApp.crawlUrl("https://roastmywebsite.ai") + ).rejects.toThrow("Request failed with status code 401"); + } + ); + + test.concurrent( + "should throw error for blocklisted URL on crawl", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const blocklistedUrl = "https://twitter.com/fake-test"; + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow( + "Request failed with status code 403" + ); + } + ); + + test.concurrent( + "should return successful response for crawl and wait for completion", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + true, + 10 + )) as FirecrawlDocumentV0[]; + expect(response).not.toBeNull(); + expect(response[0].content).toContain("_Roast_"); + }, + 60000 + ); // 60 seconds timeout + + test.concurrent("should handle idempotency key for crawl", async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const uniqueIdempotencyKey = uuidv4(); + const response = (await app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + false, + 2, + uniqueIdempotencyKey + )) as CrawlResponseV0; expect(response).not.toBeNull(); - expect(response?.data?.[0]?.content).toBeDefined(); - expect(response?.data?.length).toBeGreaterThan(2); - }, 30000); // 30 seconds timeout + expect(response.jobId).toBeDefined(); - test.concurrent('should throw error for invalid API key on search', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401"); + await expect( + app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + true, + 2, + uniqueIdempotencyKey + ) + ).rejects.toThrow("Request failed with status code 409"); }); - test.concurrent('should perform LLM extraction', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl("https://mendable.ai", { - extractorOptions: { - mode: 'llm-extraction', - extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractionSchema: { - type: 'object', - properties: { - company_mission: { type: 'string' }, - supports_sso: { type: 'boolean' }, - is_open_source: { type: 'boolean' } - }, - required: ['company_mission', 'supports_sso', 'is_open_source'] - } + test.concurrent( + "should check crawl status", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response: any = (await app.crawlUrl( + "https://roastmywebsite.ai", + { crawlerOptions: { excludes: ["blog/*"] } }, + false + )) 
as CrawlResponseV0; + expect(response).not.toBeNull(); + expect(response.jobId).toBeDefined(); + + let statusResponse = await app.checkCrawlStatus(response.jobId); + const maxChecks = 15; + let checks = 0; + + while (statusResponse.status === "active" && checks < maxChecks) { + await new Promise((resolve) => setTimeout(resolve, 5000)); + expect(statusResponse.partial_data).not.toBeNull(); + // expect(statusResponse.current).toBeGreaterThanOrEqual(1); + statusResponse = (await app.checkCrawlStatus( + response.jobId + )) as CrawlStatusResponseV0; + checks++; } - }); - expect(response).not.toBeNull(); - expect(response.data?.llm_extraction).toBeDefined(); - const llmExtraction = response.data?.llm_extraction; - expect(llmExtraction?.company_mission).toBeDefined(); - expect(typeof llmExtraction?.supports_sso).toBe('boolean'); - expect(typeof llmExtraction?.is_open_source).toBe('boolean'); - }, 30000); // 30 seconds timeout + + expect(statusResponse).not.toBeNull(); + expect(statusResponse.success).toBe(true); + expect(statusResponse.status).toBe("completed"); + expect(statusResponse.total).toEqual(statusResponse.current); + expect(statusResponse.current_step).not.toBeNull(); + expect(statusResponse.current).toBeGreaterThanOrEqual(1); + + expect(statusResponse?.data?.length).toBeGreaterThan(0); + }, + 35000 + ); // 35 seconds timeout + + test.concurrent( + "should return successful response for search", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.search("test query")) as SearchResponseV0; + expect(response).not.toBeNull(); + expect(response?.data?.[0]?.content).toBeDefined(); + expect(response?.data?.length).toBeGreaterThan(2); + }, + 30000 + ); // 30 seconds timeout + + test.concurrent( + "should throw error for invalid API key on search", + async () => { + const invalidApp = new FirecrawlApp<"v0">({ + apiKey: "invalid_api_key", + apiUrl: API_URL, + version: "v0", + }); + await expect(invalidApp.search("test query")).rejects.toThrow( + "Request failed with status code 401" + ); + } + ); + + test.concurrent( + "should perform LLM extraction", + async () => { + const app = new FirecrawlApp<"v0">({ + apiKey: TEST_API_KEY, + apiUrl: API_URL, + version: "v0", + }); + const response = (await app.scrapeUrl("https://mendable.ai", { + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { type: "string" }, + supports_sso: { type: "boolean" }, + is_open_source: { type: "boolean" }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, + })) as ScrapeResponseV0; + expect(response).not.toBeNull(); + expect(response.data?.llm_extraction).toBeDefined(); + const llmExtraction = response.data?.llm_extraction; + expect(llmExtraction?.company_mission).toBeDefined(); + expect(typeof llmExtraction?.supports_sso).toBe("boolean"); + expect(typeof llmExtraction?.is_open_source).toBe("boolean"); + }, + 30000 + ); // 30 seconds timeout }); diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts index dcda96f7..92951237 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts @@ -31,7 +31,7 @@ describe('the firecrawl JS SDK', () => { }); const 
apiKey = 'YOUR_API_KEY' - const app = new FirecrawlApp({ apiKey }); + const app = new FirecrawlApp<"v0">({ apiKey }); // Scrape a single URL const url = 'https://mendable.ai'; const scrapedData = await app.scrapeUrl(url); diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts new file mode 100644 index 00000000..98a52538 --- /dev/null +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -0,0 +1,335 @@ +import FirecrawlApp, { type CrawlParams, type CrawlResponse, type CrawlStatusResponse, type MapResponse, type ScrapeResponse } from '../../../index'; +import { v4 as uuidv4 } from 'uuid'; +import dotenv from 'dotenv'; +import { describe, test, expect } from '@jest/globals'; + +dotenv.config(); + +const TEST_API_KEY = process.env.TEST_API_KEY; +const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev"; + +describe('FirecrawlApp E2E Tests', () => { + test.concurrent('should throw error for no API key', async () => { + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).toThrow("No API key provided"); + }); + + test.concurrent('should throw error for invalid API key on scrape', async () => { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on scrape', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response?.markdown).toContain("_Roast_"); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response for valid scrape', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response).not.toHaveProperty('content'); // v0 + expect(response).not.toHaveProperty('html'); + expect(response).not.toHaveProperty('rawHtml'); + expect(response).not.toHaveProperty('screenshot'); + expect(response).not.toHaveProperty('links'); + + expect(response).toHaveProperty('markdown'); + expect(response).toHaveProperty('metadata'); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response with valid API key and options', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl( + 'https://roastmywebsite.ai', { + formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + headers: { "x-key": "test" }, + includeTags: ['h1'], + excludeTags: ['h2'], + onlyMainContent: true, + timeout: 30000, + waitFor: 1000 + }) as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response).not.toHaveProperty('content'); // v0 + expect(response.markdown).toContain("_Roast_"); + expect(response.html).toContain(" { + const app = new FirecrawlApp({ 
apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 30000); // 30 seconds timeout + + test.concurrent('should throw error for invalid API key on crawl', async () => { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on crawl', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://twitter.com/fake-test"; + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); + }); + + test.concurrent('should return successful response for crawl and wait for completion', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse; + expect(response).not.toBeNull(); + expect(response).toHaveProperty("total"); + expect(response.total).toBeGreaterThan(0); + expect(response).toHaveProperty("creditsUsed"); + expect(response.creditsUsed).toBeGreaterThan(0); + expect(response).toHaveProperty("expiresAt"); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); + expect(response).toHaveProperty("status"); + expect(response.status).toBe("completed"); + expect(response).not.toHaveProperty("next"); // wait until done + expect(response.data.length).toBeGreaterThan(0); + expect(response.data[0]).not.toBeNull(); + expect(response.data[0]).not.toBeUndefined(); + if (response.data[0]) { + expect(response.data[0]).toHaveProperty("markdown"); + expect(response.data[0].markdown).toContain("_Roast_"); + expect(response.data[0]).not.toHaveProperty('content'); // v0 + expect(response.data[0]).not.toHaveProperty("html"); + expect(response.data[0]).not.toHaveProperty("rawHtml"); + expect(response.data[0]).not.toHaveProperty("screenshot"); + expect(response.data[0]).not.toHaveProperty("links"); + expect(response.data[0]).toHaveProperty("metadata"); + expect(response.data[0].metadata).toHaveProperty("title"); + expect(response.data[0].metadata).toHaveProperty("description"); + expect(response.data[0].metadata).toHaveProperty("language"); + expect(response.data[0].metadata).toHaveProperty("sourceURL"); + expect(response.data[0].metadata).toHaveProperty("statusCode"); + expect(response.data[0].metadata).not.toHaveProperty("error"); + } + }, 60000); // 60 seconds timeout + + test.concurrent('should return successful response for crawl with options and wait for completion', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + 
const response = await app.crawlUrl('https://roastmywebsite.ai', { + excludePaths: ['blog/*'], + includePaths: ['/'], + maxDepth: 2, + ignoreSitemap: true, + limit: 10, + allowBackwardLinks: true, + allowExternalLinks: true, + scrapeOptions: { + formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + headers: { "x-key": "test" }, + includeTags: ['h1'], + excludeTags: ['h2'], + onlyMainContent: true, + waitFor: 1000 + } + } as CrawlParams, 30) as CrawlStatusResponse; + expect(response).not.toBeNull(); + expect(response).toHaveProperty("total"); + expect(response.total).toBeGreaterThan(0); + expect(response).toHaveProperty("creditsUsed"); + expect(response.creditsUsed).toBeGreaterThan(0); + expect(response).toHaveProperty("expiresAt"); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); + expect(response).toHaveProperty("status"); + expect(response.status).toBe("completed"); + expect(response).not.toHaveProperty("next"); + expect(response.data.length).toBeGreaterThan(0); + expect(response.data[0]).not.toBeNull(); + expect(response.data[0]).not.toBeUndefined(); + if (response.data[0]) { + expect(response.data[0]).toHaveProperty("markdown"); + expect(response.data[0].markdown).toContain("_Roast_"); + expect(response.data[0]).not.toHaveProperty('content'); // v0 + expect(response.data[0]).toHaveProperty("html"); + expect(response.data[0].html).toContain(" { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const uniqueIdempotencyKey = uuidv4(); + const response = await app.asyncCrawlUrl('https://roastmywebsite.ai', {}, uniqueIdempotencyKey) as CrawlResponse; + expect(response).not.toBeNull(); + expect(response.id).toBeDefined(); + + await expect(app.crawlUrl('https://roastmywebsite.ai', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); + }); + + test.concurrent('should check crawl status', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.asyncCrawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams) as CrawlResponse; + expect(response).not.toBeNull(); + expect(response.id).toBeDefined(); + + let statusResponse = await app.checkCrawlStatus(response.id); + const maxChecks = 15; + let checks = 0; + + expect(statusResponse.success).toBe(true); + while ((statusResponse as any).status === 'scraping' && checks < maxChecks) { + await new Promise(resolve => setTimeout(resolve, 5000)); + expect(statusResponse).not.toHaveProperty("partial_data"); // v0 + expect(statusResponse).not.toHaveProperty("current"); // v0 + expect(statusResponse).toHaveProperty("data"); + expect(statusResponse).toHaveProperty("total"); + expect(statusResponse).toHaveProperty("creditsUsed"); + expect(statusResponse).toHaveProperty("expiresAt"); + expect(statusResponse).toHaveProperty("status"); + expect(statusResponse).toHaveProperty("next"); + expect(statusResponse.success).toBe(true); + if (statusResponse.success === true) { + expect(statusResponse.total).toBeGreaterThan(0); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse.status).toBe("scraping"); + expect(statusResponse.next).toContain("/v1/crawl/"); + } + statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; + expect(statusResponse.success).toBe(true); + checks++; + } + + 
expect(statusResponse).not.toBeNull(); + expect(statusResponse).toHaveProperty("total"); + expect(statusResponse.success).toBe(true); + if (statusResponse.success === true) { + expect(statusResponse.total).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("creditsUsed"); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("expiresAt"); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse).toHaveProperty("status"); + expect(statusResponse.status).toBe("completed"); + expect(statusResponse.data.length).toBeGreaterThan(0); + expect(statusResponse.data[0]).not.toBeNull(); + expect(statusResponse.data[0]).not.toBeUndefined(); + if (statusResponse.data[0]) { + expect(statusResponse.data[0]).toHaveProperty("markdown"); + expect(statusResponse.data[0].markdown?.length).toBeGreaterThan(10); + expect(statusResponse.data[0]).not.toHaveProperty('content'); // v0 + expect(statusResponse.data[0]).toHaveProperty("html"); + expect(statusResponse.data[0].html).toContain(" { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on map', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + expect(response).not.toBeNull(); + expect(response.links?.length).toBeGreaterThan(0); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response for valid map', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + expect(response).not.toBeNull(); + + expect(response.links?.length).toBeGreaterThan(0); + expect(response.links?.[0]).toContain("https://"); + const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai")); + expect(filteredLinks?.length).toBeGreaterThan(0); + }, 30000); // 30 seconds timeout + + test('should throw NotImplementedError for search on v1', async () => { + const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); + await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1"); + }); +}); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index a42d4618..661ce34b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,8 +1,13 @@ -import axios, { AxiosResponse, AxiosRequestHeaders } from "axios"; -import { z } from "zod"; +import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; +import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; +import { WebSocket } from "isows"; +import { TypedEventTarget } from "typescript-event-target"; + /** * Configuration interface for FirecrawlApp. + * @param apiKey - Optional API key for authentication. 
+ * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'. */ export interface FirecrawlAppConfig { apiKey?: string | null; @@ -11,6 +16,7 @@ export interface FirecrawlAppConfig { /** * Metadata for a Firecrawl document. + * Includes various optional properties for document metadata. */ export interface FirecrawlDocumentMetadata { title?: string; @@ -43,142 +49,198 @@ export interface FirecrawlDocumentMetadata { articleTag?: string; articleSection?: string; sourceURL?: string; - pageStatusCode?: number; - pageError?: string; - [key: string]: any; + statusCode?: number; + error?: string; + [key: string]: any; // Allows for additional metadata properties not explicitly defined. } /** * Document interface for Firecrawl. + * Represents a document retrieved or processed by Firecrawl. */ -export interface FirecrawlDocument { - id?: string; +export interface FirecrawlDocument { url?: string; - content: string; markdown?: string; html?: string; - llm_extraction?: Record; - createdAt?: Date; - updatedAt?: Date; - type?: string; - metadata: FirecrawlDocumentMetadata; - childrenLinks?: string[]; - provider?: string; - warning?: string; + rawHtml?: string; + links?: string[]; + extract?: T; + screenshot?: string; + metadata?: FirecrawlDocumentMetadata; +} - index?: number; +/** + * Parameters for scraping operations. + * Defines the options and configurations available for scraping web content. + */ +export interface CrawlScrapeOptions { + formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[]; + headers?: Record; + includeTags?: string[]; + excludeTags?: string[]; + onlyMainContent?: boolean; + waitFor?: number; + timeout?: number; +} + +export interface ScrapeParams extends CrawlScrapeOptions { + extract?: { + prompt?: string; + schema?: LLMSchema; + systemPrompt?: string; + }; } /** * Response interface for scraping operations. + * Defines the structure of the response received after a scraping operation. */ -export interface ScrapeResponse { - success: boolean; - data?: FirecrawlDocument; +export interface ScrapeResponse extends FirecrawlDocument { + success: true; + warning?: string; error?: string; } + /** - * Response interface for searching operations. + * Parameters for crawling operations. + * Includes options for both scraping and mapping during a crawl. */ -export interface SearchResponse { - success: boolean; - data?: FirecrawlDocument[]; - error?: string; +export interface CrawlParams { + includePaths?: string[]; + excludePaths?: string[]; + maxDepth?: number; + limit?: number; + allowBackwardLinks?: boolean; + allowExternalLinks?: boolean; + ignoreSitemap?: boolean; + scrapeOptions?: CrawlScrapeOptions; + webhook?: string; } + /** * Response interface for crawling operations. + * Defines the structure of the response received after initiating a crawl. */ export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: FirecrawlDocument[]; + id?: string; + url?: string; + success: true; error?: string; } + /** * Response interface for job status checks. + * Provides detailed status of a crawl job including progress and results. 
*/ -export interface JobStatusResponse { - success: boolean; - status: string; - current?: number; - current_url?: string; - current_step?: string; - total?: number; - jobId?: string; - data?: FirecrawlDocument[]; - partial_data?: FirecrawlDocument[]; +export interface CrawlStatusResponse { + success: true; + status: "scraping" | "completed" | "failed" | "cancelled"; + completed: number; + total: number; + creditsUsed: number; + expiresAt: Date; + next?: string; + data: FirecrawlDocument[]; +}; + +/** + * Parameters for mapping operations. + * Defines options for mapping URLs during a crawl. + */ +export interface MapParams { + search?: string; + ignoreSitemap?: boolean; + includeSubdomains?: boolean; + limit?: number; +} + +/** + * Response interface for mapping operations. + * Defines the structure of the response received after a mapping operation. + */ +export interface MapResponse { + success: true; + links?: string[]; error?: string; } + /** - * Generic parameter interface. + * Error response interface. + * Defines the structure of the response received when an error occurs. */ -export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; +export interface ErrorResponse { + success: false; + error: string; } + /** * Main class for interacting with the Firecrawl API. + * Provides methods for scraping, searching, crawling, and mapping web content. */ export default class FirecrawlApp { - private apiKey: string; - private apiUrl: string; + public apiKey: string; + public apiUrl: string; /** * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + * @param config - Configuration options for the FirecrawlApp instance. */ constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { + if (typeof apiKey !== "string") { throw new Error("No API key provided"); } + + this.apiKey = apiKey; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; } /** * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. + * @param url - The URL to scrape. + * @param params - Additional parameters for the scrape request. + * @returns The response from the scrape operation. 
*/ - async scrapeUrl( + async scrapeUrl( url: string, - params: Params | null = null - ): Promise { + params?: ScrapeParams + ): Promise> | ErrorResponse> { const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; - let jsonData: Params = { url, ...params }; - if (params?.extractorOptions?.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { + let jsonData: any = { url, ...params }; + if (jsonData?.extract?.schema) { + let schema = jsonData.extract.schema; + + // Try parsing the schema as a Zod schema + try { schema = zodToJsonSchema(schema); + } catch (error) { + } jsonData = { ...jsonData, - extractorOptions: { - ...params.extractorOptions, - extractionSchema: schema, - mode: params.extractorOptions.mode || "llm-extraction", + extract: { + ...jsonData.extract, + schema: schema, }, }; } try { const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/scrape", + this.apiUrl + `/v1/scrape`, jsonData, { headers } ); if (response.status === 200) { const responseData = response.data; if (responseData.success) { - return responseData; + return { + success: true, + warning: responseData.warning, + error: responseData.error, + ...responseData.data + }; } else { throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); } @@ -192,134 +254,174 @@ export default class FirecrawlApp { } /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. + * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API. + * @param query - The search query string. + * @param params - Additional parameters for the search. + * @returns Throws an error advising to use version 0 of the API. */ async search( query: string, - params: Params | null = null - ): Promise { - const headers: AxiosRequestHeaders = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - } as AxiosRequestHeaders; - let jsonData: Params = { query }; - if (params) { - jsonData = { ...jsonData, ...params }; - } - try { - const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/search", - jsonData, - { headers } - ); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } else { - this.handleError(response, "search"); - } - } catch (error: any) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; + params?: any + ): Promise { + throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0."); } /** * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. 
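
Not part of the patch: a usage sketch for the v1 `scrapeUrl` shown above, which accepts a Zod schema under `extract.schema` and converts it with `zodToJsonSchema` before posting to `/v1/scrape`. The target URL, prompt text, and schema fields mirror the LLM-extraction test earlier in this diff; treat them as illustrative.

```ts
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: process.env.TEST_API_KEY ?? "" });

// A Zod schema is accepted directly; scrapeUrl converts it to JSON Schema internally.
const schema = z.object({
  company_mission: z.string(),
  supports_sso: z.boolean(),
  is_open_source: z.boolean(),
});

async function main() {
  const result = await app.scrapeUrl("https://mendable.ai", {
    formats: ["markdown", "extract"],
    extract: {
      schema,
      prompt:
        "Find the company's mission and whether it supports SSO and is open source.",
    },
  });

  if (result.success) {
    // On success the document fields are spread onto the response object.
    console.log(result.extract);  // object shaped by the schema
    console.log(result.markdown); // markdown returned alongside the extraction
  } else {
    console.error(result.error);
  }
}

main().catch(console.error);
```
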
- * @returns {Promise} The response from the crawl operation. + * @param url - The URL to crawl. + * @param params - Additional parameters for the crawl request. + * @param pollInterval - Time in seconds for job status checks. + * @param idempotencyKey - Optional idempotency key for the request. + * @returns The response from the crawl operation. */ async crawlUrl( url: string, - params: Params | null = null, - waitUntilDone: boolean = true, + params?: CrawlParams, pollInterval: number = 2, idempotencyKey?: string - ): Promise { + ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: Params = { url }; - if (params) { - jsonData = { ...jsonData, ...params }; - } + let jsonData: any = { url, ...params }; try { const response: AxiosResponse = await this.postRequest( - this.apiUrl + "/v0/crawl", + this.apiUrl + `/v1/crawl`, jsonData, headers ); if (response.status === 200) { - const jobId: string = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } else { - return { success: true, jobId }; - } + const id: string = response.data.id; + return this.monitorJobStatus(id, headers, pollInterval); } else { this.handleError(response, "start crawl job"); } } catch (error: any) { - console.log(error); - throw new Error(error.message); + if (error.response?.data?.error) { + throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); + } else { + throw new Error(error.message); + } + } + return { success: false, error: "Internal server error." }; + } + + async asyncCrawlUrl( + url: string, + params?: CrawlParams, + idempotencyKey?: string + ): Promise { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData: any = { url, ...params }; + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/v1/crawl`, + jsonData, + headers + ); + if (response.status === 200) { + return response.data; + } else { + this.handleError(response, "start crawl job"); + } + } catch (error: any) { + if (error.response?.data?.error) { + throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); + } else { + throw new Error(error.message); + } } return { success: false, error: "Internal server error." }; } /** * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. + * @param id - The ID of the crawl operation. + * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) + * @returns The response containing the job status. 
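The JSDoc above covers three related entry points: `crawlUrl` (which blocks by polling via `monitorJobStatus`), `asyncCrawlUrl` (which returns the crawl id immediately), and `checkCrawlStatus`. A hedged sketch of the non-blocking flow follows; the `limit` option and the shape of the async-start response (`success`/`id`) are taken from the crawl examples and `crawlUrlAndWatch` elsewhere in this diff:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// Kick the crawl off without waiting for it to finish.
const started = await app.asyncCrawlUrl("https://firecrawl.dev", { limit: 10 });

if (started.success && started.id) {
  // Poll later; passing `true` pages through the `next` links so `data`
  // holds every document rather than just the first page.
  const status = await app.checkCrawlStatus(started.id, true);
  if (status.success) {
    console.log(`${status.status}: ${status.completed}/${status.total} pages`);
  }
}
```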
*/ - async checkCrawlStatus(jobId: string): Promise { + async checkCrawlStatus(id?: string, getAllData = false): Promise { + if (!id) { + throw new Error("No crawl ID provided"); + } + const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, + `${this.apiUrl}/v1/crawl/${id}`, headers ); if (response.status === 200) { - return { - success: true, + let allData = response.data.data; + if (getAllData && response.data.status === "completed") { + let statusData = response.data + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusData = (await this.getRequest(statusData.next, headers)).data; + data = data.concat(statusData.data); + } + allData = data; + } + } + return ({ + success: response.data.success, status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; + completed: response.data.completed, + creditsUsed: response.data.creditsUsed, + expiresAt: new Date(response.data.expiresAt), + next: response.data.next, + data: allData, + error: response.data.error, + }) } else { this.handleError(response, "check crawl status"); } } catch (error: any) { throw new Error(error.message); } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; + return { success: false, error: "Internal server error." }; + } + + async crawlUrlAndWatch( + url: string, + params?: CrawlParams, + idempotencyKey?: string, + ) { + const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey); + + if (crawl.success && crawl.id) { + const id = crawl.id; + return new CrawlWatcher(id, this); + } + + throw new Error("Crawl job failed to start"); + } + + async mapUrl(url: string, params?: MapParams): Promise { + const headers = this.prepareHeaders(); + let jsonData: { url: string } & MapParams = { url, ...params }; + + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/v1/map`, + jsonData, + headers + ); + if (response.status === 200) { + return response.data as MapResponse; + } else { + this.handleError(response, "map"); + } + } catch (error: any) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; } /** * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. + * @param idempotencyKey - Optional key to ensure idempotency. + * @returns The prepared headers. */ prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { return { @@ -331,14 +433,14 @@ export default class FirecrawlApp { /** * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. + * @param url - The URL to send the request to. + * @param data - The data to send in the request. + * @param headers - The headers for the request. + * @returns The response from the POST request. 
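And a short sketch (again illustrative, not part of the diff) of the new `mapUrl` helper defined above, using the `search` and `limit` options from the `MapParams` interface earlier in this file:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// POSTs to /v1/map and resolves to either MapResponse or ErrorResponse.
const mapped = await app.mapUrl("https://firecrawl.dev", {
  search: "blog",
  limit: 100,
});

if (mapped.success) {
  // MapResponse.links is optional, so fall back to an empty list.
  for (const link of mapped.links ?? []) {
    console.log(link);
  }
} else {
  console.error(mapped.error);
}
```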
*/ postRequest( url: string, - data: Params, + data: any, headers: AxiosRequestHeaders ): Promise { return axios.post(url, data, { headers }); @@ -346,51 +448,65 @@ export default class FirecrawlApp { /** * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. + * @param url - The URL to send the request to. + * @param headers - The headers for the request. + * @returns The response from the GET request. */ - getRequest( + async getRequest( url: string, headers: AxiosRequestHeaders ): Promise { - return axios.get(url, { headers }); + try { + return await axios.get(url, { headers }); + } catch (error) { + if (error instanceof AxiosError && error.response) { + return error.response as AxiosResponse; + } else { + throw error; + } + } } /** * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. + * @param id - The ID of the crawl operation. + * @param headers - The headers for the request. + * @param checkInterval - Interval in seconds for job status checks. + * @param checkUrl - Optional URL to check the status (used for v1 API) + * @returns The final job status or data. */ async monitorJobStatus( - jobId: string, + id: string, headers: AxiosRequestHeaders, checkInterval: number - ): Promise { + ): Promise { while (true) { - const statusResponse: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, + let statusResponse: AxiosResponse = await this.getRequest( + `${this.apiUrl}/v1/crawl/${id}`, headers ); if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } else { - throw new Error("Crawl job completed but no data was returned"); - } - } else if ( - ["active", "paused", "pending", "queued"].includes(statusData.status) + let statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; + return statusData; + } else { + throw new Error("Crawl job completed but no data was returned"); + } + } else if ( + ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) ) { - if (checkInterval < 2) { - checkInterval = 2; - } + checkInterval = Math.max(checkInterval, 2); await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000) - ); // Wait for the specified timeout before checking again + ); } else { throw new Error( `Crawl job failed or was stopped. 
Status: ${statusData.status}` @@ -421,3 +537,111 @@ export default class FirecrawlApp { } } } + +interface CrawlWatcherEvents { + document: CustomEvent>, + done: CustomEvent<{ + status: CrawlStatusResponse["status"]; + data: FirecrawlDocument[]; + }>, + error: CustomEvent<{ + status: CrawlStatusResponse["status"], + data: FirecrawlDocument[], + error: string, + }>, +} + +export class CrawlWatcher extends TypedEventTarget { + private ws: WebSocket; + public data: FirecrawlDocument[]; + public status: CrawlStatusResponse["status"]; + + constructor(id: string, app: FirecrawlApp) { + super(); + this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); + this.status = "scraping"; + this.data = []; + + type ErrorMessage = { + type: "error", + error: string, + } + + type CatchupMessage = { + type: "catchup", + data: CrawlStatusResponse, + } + + type DocumentMessage = { + type: "document", + data: FirecrawlDocument, + } + + type DoneMessage = { type: "done" } + + type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage; + + const messageHandler = (msg: Message) => { + if (msg.type === "done") { + this.status = "completed"; + this.dispatchTypedEvent("done", new CustomEvent("done", { + detail: { + status: this.status, + data: this.data, + }, + })); + } else if (msg.type === "error") { + this.status = "failed"; + this.dispatchTypedEvent("error", new CustomEvent("error", { + detail: { + status: this.status, + data: this.data, + error: msg.error, + }, + })); + } else if (msg.type === "catchup") { + this.status = msg.data.status; + this.data.push(...(msg.data.data ?? [])); + for (const doc of this.data) { + this.dispatchTypedEvent("document", new CustomEvent("document", { + detail: doc, + })); + } + } else if (msg.type === "document") { + this.dispatchTypedEvent("document", new CustomEvent("document", { + detail: msg.data, + })); + } + } + + this.ws.onmessage = ((ev: MessageEvent) => { + if (typeof ev.data !== "string") { + this.ws.close(); + return; + } + + const msg = JSON.parse(ev.data) as Message; + messageHandler(msg); + }).bind(this); + + this.ws.onclose = ((ev: CloseEvent) => { + const msg = JSON.parse(ev.reason) as Message; + messageHandler(msg); + }).bind(this); + + this.ws.onerror = ((_: Event) => { + this.status = "failed" + this.dispatchTypedEvent("error", new CustomEvent("error", { + detail: { + status: this.status, + data: this.data, + error: "WebSocket error", + }, + })); + }).bind(this); + } + + close() { + this.ws.close(); + } +} diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index d7764a46..1297aed9 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -1,110 +1,24 @@ { "compilerOptions": { - /* Visit https://aka.ms/tsconfig to read more about this file */ + // See https://www.totaltypescript.com/tsconfig-cheat-sheet + /* Base Options: */ + "esModuleInterop": true, + "skipLibCheck": true, + "target": "es2022", + "allowJs": true, + "resolveJsonModule": true, + "moduleDetection": "force", + "isolatedModules": true, + "verbatimModuleSyntax": true, - /* Projects */ - // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ - // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ - // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. 
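Rounding out the JS SDK changes, the `CrawlWatcher` defined above dispatches typed `document`, `done`, and `error` events over the crawl WebSocket. A sketch of consuming it via `crawlUrlAndWatch`, assuming `TypedEventTarget` exposes the usual `addEventListener` signature and that documents carry `metadata.sourceURL`:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const watcher = await app.crawlUrlAndWatch("https://firecrawl.dev", { limit: 5 });

watcher.addEventListener("document", (event) => {
  // Each scraped page arrives as a CustomEvent carrying the document.
  console.log("document:", event.detail.metadata?.sourceURL);
});

watcher.addEventListener("error", (event) => {
  console.error("crawl error:", event.detail.error);
});

watcher.addEventListener("done", (event) => {
  console.log(`crawl ${event.detail.status}, ${event.detail.data.length} documents`);
  watcher.close();
});
```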
*/ - // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ - // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ - // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ + /* Strictness */ + "strict": true, + "noUncheckedIndexedAccess": true, + "noImplicitOverride": true, - /* Language and Environment */ - "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ - // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ - // "jsx": "preserve", /* Specify what JSX code is generated. */ - // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ - // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ - // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ - // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ - // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ - // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ - // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ - // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ - // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ - - /* Modules */ - "module": "NodeNext", /* Specify what module code is generated. */ - "rootDir": "./src", /* Specify the root folder within your source files. */ - "moduleResolution": "nodenext", /* Specify how TypeScript looks up a file from a given module specifier. */ - // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ - // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ - // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ - // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ - // "types": [], /* Specify type package names to be included without being referenced in a source file. */ - // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ - // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ - // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ - // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ - // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ - // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ - // "resolveJsonModule": true, /* Enable importing .json files. 
*/ - // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ - // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ - - /* JavaScript Support */ - // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ - // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ - // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ - - /* Emit */ - "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ - // "declarationMap": true, /* Create sourcemaps for d.ts files. */ - // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ - // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ - // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ - // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ - "outDir": "./build", /* Specify an output folder for all emitted files. */ - // "removeComments": true, /* Disable emitting comments. */ - // "noEmit": true, /* Disable emitting files from a compilation. */ - // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ - // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */ - // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ - // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ - // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ - // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ - // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ - // "newLine": "crlf", /* Set the newline character for emitting files. */ - // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ - // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ - // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ - // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ - "declarationDir": "./types", /* Specify the output directory for generated declaration files. */ - // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ - - /* Interop Constraints */ - // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ - // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ - // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. 
*/ - "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ - // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ - "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ - - /* Type Checking */ - "strict": true, /* Enable all strict type-checking options. */ - // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ - // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ - // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ - // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ - // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ - // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ - // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ - // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ - // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ - // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ - // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ - // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ - // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ - // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ - // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ - // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ - // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ - // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ - - /* Completeness */ - // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ - "skipLibCheck": true /* Skip type checking all .d.ts files. 
*/ + /* If NOT transpiling with TypeScript: */ + "module": "NodeNext", + "noEmit": true, }, "include": ["src/**/*"], "exclude": ["node_modules", "dist", "**/__tests__/*"] diff --git a/apps/js-sdk/firecrawl/tsup.config.ts b/apps/js-sdk/firecrawl/tsup.config.ts new file mode 100644 index 00000000..b3b7e42d --- /dev/null +++ b/apps/js-sdk/firecrawl/tsup.config.ts @@ -0,0 +1,9 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + entryPoints: ["src/index.ts"], + format: ["cjs", "esm"], + dts: true, + outDir: "dist", + clean: true, +}); \ No newline at end of file diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts deleted file mode 100644 index bd6cfc20..00000000 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ /dev/null @@ -1,193 +0,0 @@ -import { AxiosResponse, AxiosRequestHeaders } from "axios"; -import { z } from "zod"; -/** - * Configuration interface for FirecrawlApp. - */ -export interface FirecrawlAppConfig { - apiKey?: string | null; - apiUrl?: string | null; -} -/** - * Metadata for a Firecrawl document. - */ -export interface FirecrawlDocumentMetadata { - title?: string; - description?: string; - language?: string; - keywords?: string; - robots?: string; - ogTitle?: string; - ogDescription?: string; - ogUrl?: string; - ogImage?: string; - ogAudio?: string; - ogDeterminer?: string; - ogLocale?: string; - ogLocaleAlternate?: string[]; - ogSiteName?: string; - ogVideo?: string; - dctermsCreated?: string; - dcDateCreated?: string; - dcDate?: string; - dctermsType?: string; - dcType?: string; - dctermsAudience?: string; - dctermsSubject?: string; - dcSubject?: string; - dcDescription?: string; - dctermsKeywords?: string; - modifiedTime?: string; - publishedTime?: string; - articleTag?: string; - articleSection?: string; - sourceURL?: string; - pageStatusCode?: number; - pageError?: string; - [key: string]: any; -} -/** - * Document interface for Firecrawl. - */ -export interface FirecrawlDocument { - id?: string; - url?: string; - content: string; - markdown?: string; - html?: string; - llm_extraction?: Record; - createdAt?: Date; - updatedAt?: Date; - type?: string; - metadata: FirecrawlDocumentMetadata; - childrenLinks?: string[]; - provider?: string; - warning?: string; - index?: number; -} -/** - * Response interface for scraping operations. - */ -export interface ScrapeResponse { - success: boolean; - data?: FirecrawlDocument; - error?: string; -} -/** - * Response interface for searching operations. - */ -export interface SearchResponse { - success: boolean; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Response interface for crawling operations. - */ -export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Response interface for job status checks. - */ -export interface JobStatusResponse { - success: boolean; - status: string; - current?: number; - current_url?: string; - current_step?: string; - total?: number; - jobId?: string; - data?: FirecrawlDocument[]; - partial_data?: FirecrawlDocument[]; - error?: string; -} -/** - * Generic parameter interface. - */ -export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; -} -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - private apiKey; - private apiUrl; - /** - * Initializes a new instance of the FirecrawlApp class. 
- * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey, apiUrl }: FirecrawlAppConfig); - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url: string, params?: Params | null): Promise; - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query: string, params?: Params | null): Promise; - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise; - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId: string): Promise; - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url: string, headers: AxiosRequestHeaders): Promise; - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. 
- */ - handleError(response: AxiosResponse, action: string): void; -} diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index ca337062..b0f358cb 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,8 +9,8 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.19", "axios": "^1.6.8", + "firecrawl": "^1.2.0", "ts-node": "^10.9.2", "typescript": "^5.4.5", "uuid": "^10.0.0", @@ -422,15 +422,31 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "0.0.19", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz", - "integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz", + "integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==", "dependencies": { "axios": "^1.6.8", + "dotenv": "^16.4.5", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", + "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, + "node_modules/@mendable/firecrawl-js/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@tsconfig/node10": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", @@ -531,6 +547,17 @@ "node": ">=0.3.1" } }, + "node_modules/dotenv": { + "version": "16.4.5", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", + "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/esbuild": { "version": "0.20.2", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz", @@ -569,6 +596,32 @@ "@esbuild/win32-x64": "0.20.2" } }, + "node_modules/firecrawl": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/firecrawl/-/firecrawl-1.2.0.tgz", + "integrity": "sha512-Sy1BCCvs5FhGc4yxPP7NG9iWnK8RXdvA1ZS/K1Gj+LrEN3iAT2WRzhYET7x8G2bif25F6rHJg57vdVb5sr6RyQ==", + "dependencies": { + "axios": "^1.6.8", + "dotenv": "^16.4.5", + "isows": "^1.0.4", + "typescript-event-target": "^1.1.1", + "uuid": "^9.0.1", + "zod": "^3.23.8", + "zod-to-json-schema": "^3.23.0" + } + }, + "node_modules/firecrawl/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/follow-redirects": { "version": "1.15.6", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", @@ -627,6 +680,20 @@ "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" } }, + "node_modules/isows": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz", + "integrity": 
"sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/wagmi-dev" + } + ], + "peerDependencies": { + "ws": "*" + } + }, "node_modules/make-error": { "version": "1.3.6", "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", @@ -738,6 +805,11 @@ "node": ">=14.17" } }, + "node_modules/typescript-event-target": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz", + "integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==" + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -761,6 +833,27 @@ "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==" }, + "node_modules/ws": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", + "peer": true, + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/yn": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 2d2c36e8..ac3ef038 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,8 +11,9 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.19", + "@mendable/firecrawl-js": "^1.0.3", "axios": "^1.6.8", + "firecrawl": "^1.2.0", "ts-node": "^10.9.2", "typescript": "^5.4.5", "uuid": "^10.0.0", diff --git a/apps/js-sdk/test.ts b/apps/js-sdk/test.ts deleted file mode 100644 index 5419c2d5..00000000 --- a/apps/js-sdk/test.ts +++ /dev/null @@ -1,28 +0,0 @@ -import FirecrawlApp from "@mendable/firecrawl-js"; -import { z } from "zod"; - -async function a() { - const app = new FirecrawlApp({ - apiKey: "fc-YOUR_API_KEY", - }); - - // Define schema to extract contents into - const schema = z.object({ - top: z - .array( - z.object({ - title: z.string(), - points: z.number(), - by: z.string(), - commentsURL: z.string(), - }) - ) - .length(5) - .describe("Top 5 stories on Hacker News"), - }); - const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", { - extractorOptions: { extractionSchema: schema }, - }); - console.log(scrapeResult.data["llm_extraction"]); -} -a(); diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index 8505fec6..dcf44b25 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -18,23 +18,28 @@ pip install firecrawl-py Here's an example of how to use the SDK: ```python -from firecrawl import FirecrawlApp +from firecrawl.firecrawl import FirecrawlApp -# Initialize the FirecrawlApp with your API key -app = FirecrawlApp(api_key='your_api_key') +app = FirecrawlApp(api_key="fc-YOUR_API_KEY") -# Scrape a single URL -url = 'https://mendable.ai' -scraped_data = app.scrape_url(url) +# Scrape a website: +scrape_status = app.scrape_url( + 'https://firecrawl.dev', + params={'formats': ['markdown', 'html']} +) 
+print(scrape_status) -# Crawl a website -crawl_url = 'https://mendable.ai' -params = { - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params) +# Crawl a website: +crawl_status = app.crawl_url( + 'https://firecrawl.dev', + params={ + 'limit': 100, + 'scrapeOptions': {'formats': ['markdown', 'html']} + }, + poll_interval=30 +) +print(crawl_status) ``` ### Scraping a URL @@ -72,45 +77,77 @@ data = app.scrape_url('https://news.ycombinator.com', { print(data["llm_extraction"]) ``` -### Search for a query - -Used to search the web, get the most relevant results, scrap each page and return the markdown. - -```python -query = 'what is mendable?' -search_result = app.search(query) -``` - ### Crawling a Website To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. -The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. - ```python -crawl_url = 'https://example.com' -params = { - 'crawlerOptions': { - 'excludes': ['blog/*'], - 'includes': [], # leave empty for all pages - 'limit': 1000, - }, - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) +idempotency_key = str(uuid.uuid4()) # optional idempotency key +crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) +print(crawl_result) ``` -If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. +### Crawling a Website Asynchronously + +To crawl a website asynchronously, use the `async_crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. + +```python +crawl_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "") +print(crawl_result) +``` ### Checking Crawl Status To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. ```python -job_id = crawl_result['jobId'] -status = app.check_crawl_status(job_id) +id = crawl_result['id'] +status = app.check_crawl_status(id) +``` + +### Map a Website + +Use `map_url` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap. + +```python +# Map a website: +map_result = app.map_url('https://example.com') +print(map_result) +``` + +### Crawl a Website with WebSockets + +To crawl a website with WebSockets, use the `crawl_url_and_watch` method. It takes the starting URL and optional parameters as arguments.
The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. + +```python +# inside an async function... +nest_asyncio.apply() + +# Define event handlers +def on_document(detail): + print("DOC", detail) + +def on_error(detail): + print("ERR", detail['error']) + +def on_done(detail): + print("DONE", detail['status']) + + # Function to start the crawl and watch process +async def start_crawl_and_watch(): + # Initiate the crawl job and get the watcher + watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 }) + + # Add event listeners + watcher.add_event_listener("document", on_document) + watcher.add_event_listener("error", on_error) + watcher.add_event_listener("done", on_done) + + # Start the watcher + await watcher.connect() + +# Run the event loop +await start_crawl_and_watch() ``` ## Error Handling diff --git a/apps/python-sdk/build/lib/firecrawl/__init__.py b/apps/python-sdk/build/lib/firecrawl/__init__.py deleted file mode 100644 index e7f8063d..00000000 --- a/apps/python-sdk/build/lib/firecrawl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .firecrawl import FirecrawlApp diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py deleted file mode 100644 index 3f50c798..00000000 --- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py +++ /dev/null @@ -1,299 +0,0 @@ -""" -FirecrawlApp Module - -This module provides a class `FirecrawlApp` for interacting with the Firecrawl API. -It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, -and check the status of these jobs. The module uses requests for HTTP communication -and handles retries for certain HTTP status codes. - -Classes: - - FirecrawlApp: Main class for interacting with the Firecrawl API. -""" - -import os -import time -from typing import Any, Dict, Optional - -import requests - - -class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. - - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - raise ValueError('No API key provided') - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: - """ - Scrape the specified URL using the Firecrawl API. - - Args: - url (str): The URL to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. - - Returns: - Any: The scraped data if the request is successful. - - Raises: - Exception: If the scrape request fails. 
- """ - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } - # Prepare the base scrape parameters with the URL - scrape_params = {'url': url} - - # If there are additional params, process them - if params: - # Initialize extractorOptions if present - extractor_options = params.get('extractorOptions', {}) - # Check and convert the extractionSchema if it's a Pydantic model - if 'extractionSchema' in extractor_options: - if hasattr(extractor_options['extractionSchema'], 'schema'): - extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() - # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided - extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') - # Update the scrape_params with the processed extractorOptions - scrape_params['extractorOptions'] = extractor_options - - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key != 'extractorOptions': - scrape_params[key] = value - # Make the POST request with the prepared headers and JSON data - response = requests.post( - f'{self.api_url}/v0/scrape', - headers=headers, - json=scrape_params, - ) - if response.status_code == 200: - response = response.json() - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') - - def search(self, query, params=None): - """ - Perform a search using the Firecrawl API. - - Args: - query (str): The search query. - params (Optional[Dict[str, Any]]): Additional parameters for the search request. - - Returns: - Any: The search results if the request is successful. - - Raises: - Exception: If the search request fails. - """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } - json_data = {'query': query} - if params: - json_data.update(params) - response = requests.post( - f'{self.api_url}/v0/search', - headers=headers, - json=json_data - ) - if response.status_code == 200: - response = response.json() - - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to search. Error: {response["error"]}') - - elif response.status_code in [402, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Failed to search. Status code: {response.status_code}') - - def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None): - """ - Initiate a crawl job for the specified URL using the Firecrawl API. - - Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - wait_until_done (bool): Whether to wait until the crawl job is completed. - timeout (int): Timeout between status checks when waiting for job completion. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. - - Returns: - Any: The crawl job ID or the crawl results if waiting until completion. 
- - Raises: - Exception: If the crawl job initiation or monitoring fails. - """ - headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) - if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, timeout) - else: - return {'jobId': job_id} - else: - self._handle_error(response, 'start crawl job') - - def check_crawl_status(self, job_id): - """ - Check the status of a crawl job using the Firecrawl API. - - Args: - job_id (str): The ID of the crawl job. - - Returns: - Any: The status of the crawl job. - - Raises: - Exception: If the status check request fails. - """ - headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) - if response.status_code == 200: - return response.json() - else: - self._handle_error(response, 'check crawl status') - - def _prepare_headers(self, idempotency_key=None): - """ - Prepare the headers for API requests. - - Args: - idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. - - Returns: - Dict[str, str]: The headers including content type, authorization, and optionally idempotency key. - """ - if idempotency_key: - return { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - 'x-idempotency-key': idempotency_key - } - - return { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - } - - def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): - """ - Make a POST request with retries. - - Args: - url (str): The URL to send the POST request to. - data (Dict[str, Any]): The JSON data to include in the POST request. - headers (Dict[str, str]): The headers to include in the POST request. - retries (int): Number of retries for the request. - backoff_factor (float): Backoff factor for retries. - - Returns: - requests.Response: The response from the POST request. - - Raises: - requests.RequestException: If the request fails after the specified retries. - """ - for attempt in range(retries): - response = requests.post(url, headers=headers, json=data) - if response.status_code == 502: - time.sleep(backoff_factor * (2 ** attempt)) - else: - return response - return response - - def _get_request(self, url, headers, retries=3, backoff_factor=0.5): - """ - Make a GET request with retries. - - Args: - url (str): The URL to send the GET request to. - headers (Dict[str, str]): The headers to include in the GET request. - retries (int): Number of retries for the request. - backoff_factor (float): Backoff factor for retries. - - Returns: - requests.Response: The response from the GET request. - - Raises: - requests.RequestException: If the request fails after the specified retries. - """ - for attempt in range(retries): - response = requests.get(url, headers=headers) - if response.status_code == 502: - time.sleep(backoff_factor * (2 ** attempt)) - else: - return response - return response - - def _monitor_job_status(self, job_id, headers, timeout): - """ - Monitor the status of a crawl job until completion. - - Args: - job_id (str): The ID of the crawl job. - headers (Dict[str, str]): The headers to include in the status check requests. - timeout (int): Timeout between status checks. - - Returns: - Any: The crawl results if the job is completed successfully. 
- - Raises: - Exception: If the job fails or an error occurs during status checks. - """ - while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) - if status_response.status_code == 200: - status_data = status_response.json() - if status_data['status'] == 'completed': - if 'data' in status_data: - return status_data['data'] - else: - raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: - timeout=max(timeout,2) - time.sleep(timeout) # Wait for the specified timeout before checking again - else: - raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') - else: - self._handle_error(status_response, 'check crawl status') - - def _handle_error(self, response, action): - """ - Handle errors from API responses. - - Args: - response (requests.Response): The response object from the API request. - action (str): Description of the action that was being performed. - - Raises: - Exception: An exception with a message containing the status code and error details from the response. - """ - if response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}') diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz deleted file mode 100644 index 83cd7221..00000000 Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl deleted file mode 100644 index b96c8f48..00000000 Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl and /dev/null differ diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index d80fa795..02c06288 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -1,7 +1,9 @@ +import time +import nest_asyncio import uuid from firecrawl.firecrawl import FirecrawlApp -app = FirecrawlApp(api_key="fc-YOUR_API_KEY") +app = FirecrawlApp(api_key="fc-") # Scrape a website: scrape_result = app.scrape_url('firecrawl.dev') @@ -9,9 +11,26 @@ print(scrape_result['markdown']) # Crawl a website: idempotency_key = str(uuid.uuid4()) # optional idempotency key -crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key) +crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) print(crawl_result) +# Asynchronous Crawl a website: +async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "") +print(async_result) + +crawl_status = app.check_crawl_status(async_result['id']) +print(crawl_status) + +attempts = 15 +while attempts > 0 and crawl_status['status'] != 'completed': + print(crawl_status) + crawl_status = app.check_crawl_status(async_result['id']) + attempts -= 1 + time.sleep(1) + +crawl_status = app.get_crawl_status(async_result['id']) +print(crawl_status) + # LLM Extraction: # Define schema to extract contents into using pydantic from pydantic import BaseModel, Field @@ -27,18 +46,15 @@ class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 
stories") llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': TopArticlesSchema.model_json_schema(), - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True + 'formats': ['extract'], + 'extract': { + 'schema': TopArticlesSchema.model_json_schema() } }) -print(llm_extraction_result['llm_extraction']) +print(llm_extraction_result['extract']) -# Define schema to extract contents into using json schema +# # Define schema to extract contents into using json schema json_schema = { "type": "object", "properties": { @@ -62,7 +78,10 @@ json_schema = { "required": ["top"] } -llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { +app2 = FirecrawlApp(api_key="fc-", version="v0") + + +llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', { 'extractorOptions': { 'extractionSchema': json_schema, 'mode': 'llm-extraction' @@ -72,4 +91,36 @@ llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { } }) -print(llm_extraction_result['llm_extraction']) \ No newline at end of file +# print(llm_extraction_result['llm_extraction']) + + +# Map a website: +map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' }) +print(map_result) + +# Crawl a website with WebSockets: +# inside an async function... +nest_asyncio.apply() + +# Define event handlers +def on_document(detail): + print("DOC", detail) + +def on_error(detail): + print("ERR", detail['error']) + +def on_done(detail): + print("DONE", detail['status']) + + # Function to start the crawl and watch process +async def start_crawl_and_watch(): + # Initiate the crawl job and get the watcher + watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 }) + + # Add event listeners + watcher.add_event_listener("document", on_document) + watcher.add_event_listener("error", on_error) + watcher.add_event_listener("done", on_done) + + # Start the watcher + await watcher.connect() diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index fbb2bdbf..540ce67e 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "0.0.16" +__version__ = "1.2.4" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 452d4982..8945d74d 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -7,7 +7,7 @@ from dotenv import load_dotenv load_dotenv() -API_URL = "http://127.0.0.1:3002"; +API_URL = "http://127.0.0.1:3002" ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" TEST_API_KEY = os.getenv('TEST_API_KEY') @@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp def test_no_api_key(): with pytest.raises(Exception) as excinfo: - invalid_app = FirecrawlApp(api_url=API_URL) + invalid_app = FirecrawlApp(api_url=API_URL, version='v0') assert "No API key provided" in str(excinfo.value) def test_scrape_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.scrape_url('https://firecrawl.dev') assert "Unexpected error during 
scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') with pytest.raises(Exception) as excinfo: app.scrape_url(blocklisted_url) assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): - app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') response = app.scrape_url('https://roastmywebsite.ai') assert response is not None assert 'content' in response assert "_Roast_" in response['content'] def test_scrape_url_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai') + print(response) + assert response is not None assert 'content' in response assert 'markdown' in response @@ -54,7 +56,7 @@ def test_scrape_url_e2e(): assert "_Roast_" in response['content'] def test_successful_response_with_valid_api_key_and_include_html(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}}) assert response is not None assert 'content' in response @@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html(): assert " 0 @@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e(): assert "_Roast_" in response[0]['content'] def test_crawl_url_with_idempotency_key_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') uniqueIdempotencyKey = str(uuid4()) response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) assert response is not None @@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e(): assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value) def test_check_crawl_status_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) assert response is not None assert 'jobId' in response @@ -131,21 +133,21 @@ def test_check_crawl_status_e2e(): assert len(status_response['data']) > 0 def test_search_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.search("test query") assert response is not None assert 'content' in response[0] assert len(response) > 2 def test_search_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.search("test query") assert "Unexpected error during search: Status code 401. 
Unauthorized: Invalid token" in str(excinfo.value) def test_llm_extraction(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.scrape_url("https://mendable.ai", { + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') + response = app.scrape_url("https://firecrawl.dev", { 'extractorOptions': { 'mode': 'llm-extraction', 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example new file mode 100644 index 00000000..904887bf --- /dev/null +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example @@ -0,0 +1,3 @@ +API_URL=http://localhost:3002 +ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py +TEST_API_KEY=fc-YOUR_API_KEY \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc new file mode 100644 index 00000000..5ba1f132 Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py new file mode 100644 index 00000000..12fa10ce --- /dev/null +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -0,0 +1,352 @@ +import importlib.util +import pytest +import time +import os +from uuid import uuid4 +from dotenv import load_dotenv +from datetime import datetime + +load_dotenv() + +API_URL = "http://127.0.0.1:3002"; +ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" +TEST_API_KEY = os.getenv('TEST_API_KEY') + +print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}") + +spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH) +firecrawl = importlib.util.module_from_spec(spec) +spec.loader.exec_module(firecrawl) +FirecrawlApp = firecrawl.FirecrawlApp + +def test_no_api_key(): + with pytest.raises(Exception) as excinfo: + invalid_app = FirecrawlApp(api_url=API_URL) + assert "No API key provided" in str(excinfo.value) + +def test_scrape_url_invalid_api_key(): + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.scrape_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url(): + blocklisted_url = "https://facebook.com/fake-test" + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(Exception) as excinfo: + app.scrape_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) + +def test_successful_response_with_valid_preview_token(): + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + response = app.scrape_url('https://roastmywebsite.ai') + assert response is not None + assert "_Roast_" in response['markdown'] + assert "content" not in response + assert "html" not in response + assert "metadata" in response + assert "links" not in response + assert "rawHtml" not in response + +def test_successful_response_for_valid_scrape(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://roastmywebsite.ai') + assert response is not None + assert 'markdown' in response + assert "_Roast_" in response['markdown'] + assert 'metadata' in response + assert 'content' not in response + assert 'html' not in response + assert 'rawHtml' not in response + assert 'screenshot' not in response + assert 'links' not in response + +def test_successful_response_with_valid_api_key_and_options(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + params = { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {'x-key': 'test'}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'timeout': 30000, + 'waitFor': 1000 + } + response = app.scrape_url('https://roastmywebsite.ai', params) + assert response is not None + assert 'content' not in response + assert 'markdown' in response + assert 'html' in response + assert 'rawHtml' in response + assert 'screenshot' in response + assert 'links' in response + assert "_Roast_" in response['markdown'] + assert " 0 + assert "https://" in response['links'][0] + assert 'metadata' in response + assert 'title' in response['metadata'] + assert 'description' in response['metadata'] + assert 'keywords' in response['metadata'] + assert 'robots' in response['metadata'] + assert 'ogTitle' in response['metadata'] + assert 'ogDescription' in response['metadata'] + assert 'ogUrl' in response['metadata'] + assert 'ogImage' in response['metadata'] + assert 'ogLocaleAlternate' in response['metadata'] + assert 'ogSiteName' in response['metadata'] + assert 'sourceURL' in response['metadata'] + assert 'statusCode' in response['metadata'] + assert 'pageStatusCode' not in response['metadata'] + assert 'pageError' not in response['metadata'] + assert 'error' not in response['metadata'] + assert response['metadata']['title'] == "Roast My Website" + assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl" + assert response['metadata']['robots'] == "follow, index" + assert response['metadata']['ogTitle'] == "Roast My Website" + assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️" + assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai" + assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png" + assert response['metadata']['ogLocaleAlternate'] == [] + assert response['metadata']['ogSiteName'] == "Roast My Website" + assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai" + assert response['metadata']['statusCode'] == 200 + +def test_successful_response_for_valid_scrape_with_pdf_file(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf') + assert response is not None + assert 'content' not in response + assert 'metadata' in response + assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] + +def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001') + time.sleep(1) # wait for 1 second + assert response is not None + assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] + +def test_crawl_url_invalid_api_key(): + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.crawl_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_should_return_error_for_blocklisted_url(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + blocklisted_url = "https://twitter.com/fake-test" + with pytest.raises(Exception) as excinfo: + app.crawl_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) + +def test_crawl_url_wait_for_completion_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30) + assert response is not None + assert 'total' in response + assert response['total'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' not in response['data'][0] + assert 'rawHtml' not in response['data'][0] + assert 'screenshot' not in response['data'][0] + assert 'links' not in response['data'][0] + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] + +def test_crawl_url_with_options_and_wait_for_completion(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://roastmywebsite.ai', { + 'excludePaths': ['blog/*'], + 'includePaths': ['/'], + 'maxDepth': 2, + 'ignoreSitemap': True, + 'limit': 10, + 'allowBackwardLinks': True, + 'allowExternalLinks': True, + 'scrapeOptions': { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {"x-key": "test"}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'waitFor': 1000 + } + }, True, 30) + assert response is not None + assert 'total' in response + assert response['total'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' in response['data'][0] + assert " 0 + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] + +def test_crawl_url_with_idempotency_key_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + uniqueIdempotencyKey = str(uuid4()) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey) + assert response is not None + assert 'id' in response + + with pytest.raises(Exception) as excinfo: + app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey) + assert "Idempotency key already used" in str(excinfo.value) + +def test_check_crawl_status_e2e(): + app = FirecrawlApp(api_url=API_URL, 
api_key=TEST_API_KEY) + response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False) + assert response is not None + assert 'id' in response + + max_checks = 15 + checks = 0 + status_response = app.check_crawl_status(response['id']) + + while status_response['status'] == 'scraping' and checks < max_checks: + time.sleep(1) # wait for 1 second + assert 'partial_data' not in status_response + assert 'current' not in status_response + assert 'data' in status_response + assert 'total' in status_response + assert 'creditsUsed' in status_response + assert 'expiresAt' in status_response + assert 'status' in status_response + assert 'next' in status_response + assert status_response['total'] > 0 + assert status_response['creditsUsed'] > 0 + assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert status_response['status'] == 'scraping' + assert '/v1/crawl/' in status_response['next'] + status_response = app.check_crawl_status(response['id']) + checks += 1 + + assert status_response is not None + assert 'total' in status_response + assert status_response['total'] > 0 + assert 'creditsUsed' in status_response + assert status_response['creditsUsed'] > 0 + assert 'expiresAt' in status_response + assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in status_response + assert status_response['status'] == 'completed' + assert len(status_response['data']) > 0 + assert 'markdown' in status_response['data'][0] + assert len(status_response['data'][0]['markdown']) > 10 + assert 'content' not in status_response['data'][0] + assert 'html' in status_response['data'][0] + assert " 0 + assert 'metadata' in status_response['data'][0] + assert 'title' in status_response['data'][0]['metadata'] + assert 'description' in status_response['data'][0]['metadata'] + assert 'language' in status_response['data'][0]['metadata'] + assert 'sourceURL' in status_response['data'][0]['metadata'] + assert 'statusCode' in status_response['data'][0]['metadata'] + assert 'error' not in status_response['data'][0]['metadata'] + +def test_invalid_api_key_on_map(): + invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) + with pytest.raises(Exception) as excinfo: + invalid_app.map_url('https://roastmywebsite.ai') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url_on_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + blocklisted_url = "https://facebook.com/fake-test" + with pytest.raises(Exception) as excinfo: + app.map_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) + +def test_successful_response_with_valid_preview_token_on_map(): + app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + +def test_successful_response_for_valid_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + assert any("https://" in link for link in response) + filtered_links = [link for link in response if "roastmywebsite.ai" in link] + assert len(filtered_links) > 0 + +def test_search_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(NotImplementedError) as excinfo: + app.search("test query") + assert "Search is not supported in v1" in str(excinfo.value) + +# def test_llm_extraction(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# response = app.scrape_url("https://mendable.ai", { +# 'extractorOptions': { +# 'mode': 'llm-extraction', +# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", +# 'extractionSchema': { +# 'type': 'object', +# 'properties': { +# 'company_mission': {'type': 'string'}, +# 'supports_sso': {'type': 'boolean'}, +# 'is_open_source': {'type': 'boolean'} +# }, +# 'required': ['company_mission', 'supports_sso', 'is_open_source'] +# } +# } +# }) +# assert response is not None +# assert 'llm_extraction' in response +# llm_extraction = response['llm_extraction'] +# assert 'company_mission' in llm_extraction +# assert isinstance(llm_extraction['supports_sso'], bool) +# assert isinstance(llm_extraction['is_open_source'], bool) + + + \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7ec0d33f..3961631e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -12,31 +12,29 @@ Classes: import logging import os import time -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List +import json import requests +import websockets logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. - - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - logger.warning("No API key provided") - raise ValueError('No API key provided') - else: - logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + """ + Initialize the FirecrawlApp instance with API key, API URL. - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - if self.api_url != 'https://api.firecrawl.dev': - logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. 
+ """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_key is None: + logger.warning("No API key provided") + raise ValueError('No API key provided') + logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}") def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ @@ -60,24 +58,22 @@ class FirecrawlApp: # If there are additional params, process them if params: - # Initialize extractorOptions if present - extractor_options = params.get('extractorOptions', {}) - # Check and convert the extractionSchema if it's a Pydantic model - if 'extractionSchema' in extractor_options: - if hasattr(extractor_options['extractionSchema'], 'schema'): - extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() - # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided - extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') - # Update the scrape_params with the processed extractorOptions - scrape_params['extractorOptions'] = extractor_options + # Handle extract (for v1) + extract = params.get('extract', {}) + if extract: + if 'schema' in extract and hasattr(extract['schema'], 'schema'): + extract['schema'] = extract['schema'].schema() + scrape_params['extract'] = extract # Include any other params directly at the top level of scrape_params for key, value in params.items(): - if key != 'extractorOptions': + if key not in ['extract']: scrape_params[key] = value + + endpoint = f'/v1/scrape' # Make the POST request with the prepared headers and JSON data response = requests.post( - f'{self.api_url}/v0/scrape', + f'{self.api_url}{endpoint}', headers=headers, json=scrape_params, ) @@ -102,32 +98,14 @@ class FirecrawlApp: Any: The search results if the request is successful. Raises: + NotImplementedError: If the search request is attempted on API version v1. Exception: If the search request fails. """ - headers = self._prepare_headers() - json_data = {'query': query} - if params: - json_data.update(params) - response = requests.post( - f'{self.api_url}/v0/search', - headers=headers, - json=json_data - ) - if response.status_code == 200: - response = response.json() - - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to search. Error: {response["error"]}') - - else: - self._handle_error(response, 'search') + raise NotImplementedError("Search is not supported in v1.") def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, - wait_until_done: bool = True, - poll_interval: int = 2, + poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None) -> Any: """ Initiate a crawl job for the specified URL using the Firecrawl API. @@ -135,8 +113,7 @@ class FirecrawlApp: Args: url (str): The URL to crawl. params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - wait_until_done (bool): Whether to wait until the crawl job is completed. - poll_interval (int): Time in seconds between status checks when waiting for job completion. + poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: @@ -145,26 +122,49 @@ class FirecrawlApp: Raises: Exception: If the crawl job initiation or monitoring fails. 
""" + endpoint = f'/v1/crawl' headers = self._prepare_headers(idempotency_key) json_data = {'url': url} if params: json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, poll_interval) - else: - return {'jobId': job_id} + id = response.json().get('id') + return self._monitor_job_status(id, headers, poll_interval) + else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, job_id: str) -> Any: + + def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + """ + Initiate a crawl job asynchronously. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + Dict[str, Any]: The response from the crawl initiation request. + """ + endpoint = f'/v1/crawl' + headers = self._prepare_headers(idempotency_key) + json_data = {'url': url} + if params: + json_data.update(params) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + if response.status_code == 200: + return response.json() + else: + self._handle_error(response, 'start crawl job') + + def check_crawl_status(self, id: str) -> Any: """ Check the status of a crawl job using the Firecrawl API. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. Returns: Any: The status of the crawl job. @@ -172,13 +172,78 @@ class FirecrawlApp: Raises: Exception: If the status check request fails. """ + endpoint = f'/v1/crawl/{id}' + headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: - return response.json() + data = response.json() + return { + 'success': True, + 'status': data.get('status'), + 'total': data.get('total'), + 'completed': data.get('completed'), + 'creditsUsed': data.get('creditsUsed'), + 'expiresAt': data.get('expiresAt'), + 'next': data.get('next'), + 'data': data.get('data'), + 'error': data.get('error') + } else: self._handle_error(response, 'check crawl status') + def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + """ + Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job. + """ + crawl_response = self.async_crawl_url(url, params, idempotency_key) + if crawl_response['success'] and 'id' in crawl_response: + return CrawlWatcher(crawl_response['id'], self) + else: + raise Exception("Crawl job failed to start") + + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + """ + Perform a map search using the Firecrawl API. + + Args: + url (str): The URL to perform the map search on. 
+ params (Optional[Dict[str, Any]]): Additional parameters for the map search. + + Returns: + Any: The result of the map search, typically a dictionary containing mapping data. + """ + endpoint = f'/v1/map' + headers = self._prepare_headers() + + # Prepare the base scrape parameters with the URL + json_data = {'url': url} + if params: + json_data.update(params) + + # Make the POST request with the prepared headers and JSON data + response = requests.post( + f'{self.api_url}{endpoint}', + headers=headers, + json=json_data, + ) + if response.status_code == 200: + response = response.json() + if response['success'] and 'links' in response: + return response['links'] + else: + raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + self._handle_error(response, 'map') + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. @@ -257,15 +322,14 @@ class FirecrawlApp: return response return response - def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any: + def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any: """ Monitor the status of a crawl job until completion. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. headers (Dict[str, str]): The headers to include in the status check requests. poll_interval (int): Seconds between status checks. - Returns: Any: The crawl results if the job is completed successfully. @@ -273,15 +337,23 @@ class FirecrawlApp: Exception: If the job fails or an error occurs during status checks. """ while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + api_url = f'{self.api_url}/v1/crawl/{id}' + + status_response = self._get_request(api_url, headers) if status_response.status_code == 200: status_data = status_response.json() if status_data['status'] == 'completed': if 'data' in status_data: - return status_data['data'] + data = status_data['data'] + while 'next' in status_data: + status_response = self._get_request(status_data['next'], headers) + status_data = status_response.json() + data.extend(status_data['data']) + status_data['data'] = data + return status_data else: raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: + elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: poll_interval=max(poll_interval,2) time.sleep(poll_interval) # Wait for the specified interval before checking again else: @@ -300,19 +372,66 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. """ - error_message = response.json().get('error', 'No additional error details provided.') + error_message = response.json().get('error', 'No error message provided.') + error_details = response.json().get('details', 'No additional error details provided.') if response.status_code == 402: - message = f"Payment Required: Failed to {action}. {error_message}" + message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" elif response.status_code == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}" + message = f"Request Timeout: Failed to {action} as the request timed out.
{error_message} - {error_details}" elif response.status_code == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message}" + message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" elif response.status_code == 500: - message = f"Internal Server Error: Failed to {action}. {error_message}" + message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" else: - message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}" + message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}" # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) - \ No newline at end of file + +class CrawlWatcher: + def __init__(self, id: str, app: FirecrawlApp): + self.id = id + self.app = app + self.data: List[Dict[str, Any]] = [] + self.status = "scraping" + self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}" + self.event_handlers = { + 'done': [], + 'error': [], + 'document': [] + } + + async def connect(self): + async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: + await self._listen(websocket) + + async def _listen(self, websocket): + async for message in websocket: + msg = json.loads(message) + await self._handle_message(msg) + + def add_event_listener(self, event_type: str, handler): + if event_type in self.event_handlers: + self.event_handlers[event_type].append(handler) + + def dispatch_event(self, event_type: str, detail: Dict[str, Any]): + if event_type in self.event_handlers: + for handler in self.event_handlers[event_type]: + handler(detail) + + async def _handle_message(self, msg: Dict[str, Any]): + if msg['type'] == 'done': + self.status = 'completed' + self.dispatch_event('done', {'status': self.status, 'data': self.data}) + elif msg['type'] == 'error': + self.status = 'failed' + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) + elif msg['type'] == 'catchup': + self.status = msg['data']['status'] + self.data.extend(msg['data'].get('data', [])) + for doc in self.data: + self.dispatch_event('document', doc) + elif msg['type'] == 'document': + self.data.append(msg['data']) + self.dispatch_event('document', msg['data']) \ No newline at end of file diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO deleted file mode 100644 index 288eb7a5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO +++ /dev/null @@ -1,179 +0,0 @@ -Metadata-Version: 2.1 -Name: firecrawl-py -Version: 0.0.12 -Summary: Python SDK for Firecrawl API -Home-page: https://github.com/mendableai/firecrawl -Author: Mendable.ai -Author-email: nick@mendable.ai -License: GNU General Public License v3 (GPLv3) -Project-URL: Documentation, https://docs.firecrawl.dev -Project-URL: Source, https://github.com/mendableai/firecrawl -Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues -Keywords: SDK API firecrawl -Classifier: Development Status :: 5 - Production/Stable -Classifier: Environment :: Web Environment -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) -Classifier: Natural Language :: English -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python -Classifier: 
Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.8 -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Topic :: Internet -Classifier: Topic :: Internet :: WWW/HTTP -Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search -Classifier: Topic :: Software Development -Classifier: Topic :: Software Development :: Libraries -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Classifier: Topic :: Text Processing -Classifier: Topic :: Text Processing :: Indexing -Requires-Python: >=3.8 -Description-Content-Type: text/markdown - -# Firecrawl Python SDK - -The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. - -## Installation - -To install the Firecrawl Python SDK, you can use pip: - -```bash -pip install firecrawl-py -``` - -## Usage - -1. Get an API key from [firecrawl.dev](https://firecrawl.dev) -2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - - -Here's an example of how to use the SDK: - -```python -from firecrawl import FirecrawlApp - -# Initialize the FirecrawlApp with your API key -app = FirecrawlApp(api_key='your_api_key') - -# Scrape a single URL -url = 'https://mendable.ai' -scraped_data = app.scrape_url(url) - -# Crawl a website -crawl_url = 'https://mendable.ai' -params = { - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params) -``` - -### Scraping a URL - -To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary. - -```python -url = 'https://example.com' -scraped_data = app.scrape_url(url) -``` -### Extracting structured data from a URL - -With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how you to use it: - -```python -class ArticleSchema(BaseModel): - title: str - points: int - by: str - commentsURL: str - -class TopArticlesSchema(BaseModel): - top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") - -data = app.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': TopArticlesSchema.model_json_schema(), - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True - } -}) -print(data["llm_extraction"]) -``` - -### Search for a query - -Used to search the web, get the most relevant results, scrap each page and return the markdown. - -```python -query = 'what is mendable?' -search_result = app.search(query) -``` - -### Crawling a Website - -To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. - -The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. 
If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. - -```python -crawl_url = 'https://example.com' -params = { - 'crawlerOptions': { - 'excludes': ['blog/*'], - 'includes': [], # leave empty for all pages - 'limit': 1000, - }, - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) -``` - -If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. - -### Checking Crawl Status - -To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. - -```python -job_id = crawl_result['jobId'] -status = app.check_crawl_status(job_id) -``` - -## Error Handling - -The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. - -## Running the Tests with Pytest - -To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling. - -### Running the Tests - -To run the tests, execute the following commands: - -Install pytest: -```bash -pip install pytest -``` - -Run: -```bash -pytest firecrawl/__tests__/e2e_withAuth/test.py -``` - - -## Contributing - -Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. - -## License - -The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 
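For reference, the v1 SDK code added earlier in this diff replaces the `wait_until_done`/`jobId` flow documented in the removed v0 README above with `crawl_url` (which now polls to completion on its own), `async_crawl_url` plus `check_crawl_status`, and a new `map_url` method. A minimal sketch of those calls, using a placeholder API key and example URLs rather than anything taken from this PR:

```python
from firecrawl import FirecrawlApp

# Placeholder API key and URLs for illustration only.
app = FirecrawlApp(api_key='fc-YOUR_API_KEY')

# Scrape: v1 takes a 'formats' list and keys the result by format.
scrape_result = app.scrape_url('https://example.com', {'formats': ['markdown', 'html']})
print(scrape_result['markdown'])

# Crawl: crawl_url polls until the job completes; async_crawl_url returns
# immediately with an id that check_crawl_status accepts.
job = app.async_crawl_url('https://example.com', {'limit': 5, 'excludePaths': ['blog/*']})
status = app.check_crawl_status(job['id'])
print(status['status'], status.get('total'))

# Map: returns the list of links discovered for the site.
links = app.map_url('https://example.com', {'search': 'docs'})
print(links)
```

As in v0, failed requests raise exceptions, so production code would wrap these calls in try/except.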
diff --git a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt b/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt deleted file mode 100644 index c25567c5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt +++ /dev/null @@ -1,9 +0,0 @@ -README.md -setup.py -firecrawl/__init__.py -firecrawl/firecrawl.py -firecrawl_py.egg-info/PKG-INFO -firecrawl_py.egg-info/SOURCES.txt -firecrawl_py.egg-info/dependency_links.txt -firecrawl_py.egg-info/requires.txt -firecrawl_py.egg-info/top_level.txt \ No newline at end of file diff --git a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt b/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/apps/python-sdk/firecrawl_py.egg-info/requires.txt b/apps/python-sdk/firecrawl_py.egg-info/requires.txt deleted file mode 100644 index c8d341f5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/requires.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests -pytest -python-dotenv diff --git a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt b/apps/python-sdk/firecrawl_py.egg-info/top_level.txt deleted file mode 100644 index 8bce1a1f..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -firecrawl diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 0a732c43..87cb91f1 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -10,6 +10,9 @@ readme = {file="README.md", content-type = "text/markdown"} requires-python = ">=3.8" dependencies = [ "requests", + "python-dotenv", + "websockets", + "nest-asyncio" ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index 1bed5881..db67ceeb 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -1,3 +1,5 @@ requests pytest -python-dotenv \ No newline at end of file +python-dotenv +websockets +nest-asyncio \ No newline at end of file diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index 4978559b..8a67d1fd 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -30,6 +30,9 @@ setup( 'requests', 'pytest', 'python-dotenv', + 'websockets', + 'asyncio', + 'nest-asyncio' ], python_requires=">=3.8", classifiers=[ diff --git a/apps/rust-sdk/.gitignore b/apps/rust-sdk/.gitignore new file mode 100644 index 00000000..2f7896d1 --- /dev/null +++ b/apps/rust-sdk/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/apps/rust-sdk/CHANGELOG.md b/apps/rust-sdk/CHANGELOG.md new file mode 100644 index 00000000..8342b9fa --- /dev/null +++ b/apps/rust-sdk/CHANGELOG.md @@ -0,0 +1,7 @@ +## CHANGELOG + +## [0.1] + +### Added + +- [feat] Firecrawl rust sdk. diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock new file mode 100644 index 00000000..c2b71d6d --- /dev/null +++ b/apps/rust-sdk/Cargo.lock @@ -0,0 +1,1999 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" +dependencies = [ + "memchr", +] + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" + +[[package]] +name = "assert_matches" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78" +dependencies = [ + "autocfg 1.3.0", +] + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "backtrace" +version = "0.3.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4efd02e230a02e18f92fc2735f44597385ed02ad8f831e7c1c1156ee5e1ab3a5" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "blake2b_simd" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" +dependencies = [ + "arrayref", + "arrayvec", + "constant_time_eq", +] + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" + +[[package]] +name = "cc" +version = "1.0.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5208975e568d83b6b05cc0a063c8e7e9acc2b43bee6da15616a5b73e109d7437" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clippy" +version = "0.0.302" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d911ee15579a3f50880d8c1d59ef6e79f9533127a3bd342462f5d584f5e8c294" +dependencies = [ + "term 0.5.2", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + +[[package]] +name = "dirs" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901" +dependencies = [ + "libc", + "redox_users", + "winapi 0.3.9", +] + +[[package]] +name = "dotenv" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" + +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "env_logger" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ddf21e73e016298f5cb37d6ef8e8da8e39f91f9ec8b0df44b7deb16a9f8cd5b" +dependencies = [ + "log 0.3.9", + "regex", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "extprim" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b1a357c911c352439b460d7b375b5c85977b9db395b703dfee5a94dfb4d66a2" +dependencies = [ + "num-traits", + "rand", + "rustc_version", + "semver", + "serde", +] + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "firecrawl" +version = "0.1.0" +dependencies = [ + "assert_matches", + "clippy", + "dotenv", + "log 0.4.22", + "reqwest", + "rustfmt", + "serde", + "serde_json", + "thiserror", + "tokio", + "uuid", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-core", + "futures-io", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] 
+ +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + +[[package]] +name = "h2" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" + +[[package]] +name = "hyper" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +dependencies = [ + "futures-util", + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower", + "tower-service", + "tracing", +] + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg 1.3.0", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" +dependencies = [ + "log 0.4.22", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + 
+[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.48.0", +] + +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log 0.4.22", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg 1.3.0", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "openssl" +version = "0.10.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" 
+dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.2", + "smallvec", + "windows-targets 0.52.6", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" +dependencies = [ + "autocfg 0.1.8", + "libc", + "rand_chacha", + "rand_core 0.4.2", + "rand_hc", + "rand_isaac", + "rand_jitter", + "rand_os", + "rand_pcg", + "rand_xorshift", + "winapi 0.3.9", +] + +[[package]] +name = "rand_chacha" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" +dependencies = [ + "autocfg 0.1.8", + "rand_core 0.3.1", +] + +[[package]] +name = "rand_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" +dependencies = [ + "rand_core 0.4.2", +] + +[[package]] +name = "rand_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" + +[[package]] +name = "rand_hc" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "rand_isaac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "rand_jitter" +version = "0.1.4" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" +dependencies = [ + "libc", + "rand_core 0.4.2", + "winapi 0.3.9", +] + +[[package]] +name = "rand_os" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" +dependencies = [ + "cloudabi", + "fuchsia-cprng", + "libc", + "rand_core 0.4.2", + "rdrand", + "winapi 0.3.9", +] + +[[package]] +name = "rand_pcg" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" +dependencies = [ + "autocfg 0.1.8", + "rand_core 0.4.2", +] + +[[package]] +name = "rand_xorshift" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "redox_syscall" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" + +[[package]] +name = "redox_syscall" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "redox_users" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de0737333e7a9502c789a36d7c7fa6092a49895d4faa31ca5df163857ded2e9d" +dependencies = [ + "getrandom 0.1.16", + "redox_syscall 0.1.57", + "rust-argon2", +] + +[[package]] +name = "regex" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9329abc99e39129fcceabd24cf5d85b4671ef7c29c50e972bc5afe32438ec384" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", + "utf8-ranges", +] + +[[package]] +name = "regex-syntax" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d707a4fa2637f2dca2ef9fd02225ec7661fe01a53623c1e6515b6916511f7a7" +dependencies = [ + "ucd-util", +] + +[[package]] +name = "reqwest" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +dependencies = [ + "base64 0.22.1", + "bytes", + "encoding_rs", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "ipnet", + "js-sys", + "log 0.4.22", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-native-tls", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + 
"cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rust-argon2" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb" +dependencies = [ + "base64 0.13.1", + "blake2b_simd", + "constant_time_eq", + "crossbeam-utils", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + +[[package]] +name = "rustfmt" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec940eed814db0fb7ab928c5f5025f97dc55d1c0e345e39dda2ce9f945557500" +dependencies = [ + "diff", + "env_logger", + "getopts", + "kernel32-sys", + "libc", + "log 0.3.9", + "regex", + "serde", + "serde_derive", + "serde_json", + "strings", + "syntex_errors", + "syntex_syntax", + "term 0.4.6", + "toml", + "unicode-segmentation", + "winapi 0.2.8", +] + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.23.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4828ea528154ae444e5a642dbb7d5623354030dc9822b83fd9bb79683c7399d0" +dependencies = [ + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pemfile" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +dependencies = [ + "base64 0.22.1", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" + +[[package]] +name = "rustls-webpki" +version = "0.102.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9a6fccd794a42c2c105b513a2f62bc3fd8f3ba57a4593677ceb0bd035164d78" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "security-framework" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" +dependencies = [ + "bitflags 
2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + +[[package]] +name = "serde" +version = "1.0.204" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.204" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg 1.3.0", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "strings" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa481ee1bc42fc3df8195f91f7cb43cf8f2b71b48bac40bf5381cfaf7e481f3c" +dependencies = [ + "log 0.3.9", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + +[[package]] +name = "syntex_errors" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3133289179676c9f5c5b2845bf5a2e127769f4889fcbada43035ef6bd662605e" +dependencies = [ + "libc", + "serde", + "serde_derive", + "syntex_pos", + "term 0.4.6", + "unicode-xid", +] + +[[package]] +name = "syntex_pos" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ab669fa003d208c681f874bbc76d91cc3d32550d16b5d9d2087cf477316470" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "syntex_syntax" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03815b9f04d95828770d9c974aa39c6e1f6ef3114eb77a3ce09008a0d15dd142" +dependencies = [ + "bitflags 0.9.1", + "extprim", + "log 0.3.9", + "serde", + "serde_derive", + "serde_json", + "syntex_errors", + "syntex_pos", + "unicode-xid", +] + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "term" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa63644f74ce96fbeb9b794f66aff2a52d601cbd5e80f4b97123e3899f4570f1" +dependencies = [ + "kernel32-sys", + "winapi 0.2.8", +] + +[[package]] +name = "term" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edd106a334b7657c10b7c540a0106114feadeb4dc314513e97df481d5d966f42" +dependencies = [ + "byteorder", + "dirs", + "winapi 0.3.9", +] + +[[package]] +name = "thiserror" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tinyvec" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ce6b6a2fb3a985e99cebfaefa9faa3024743da73304ca1c683a36429613d3d22" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +dependencies = [ + "rustls", + "rustls-pki-types", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758664fc71a3a69038656bee8b6be6477d2a6c315a6b81f7081f591bffa4111f" +dependencies = [ + "serde", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "ucd-util" +version = "0.1.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "abd2fc5d32b590614af8b0a20d837f32eca055edd0bbead59a9cfe80858be003" + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode-width" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + +[[package]] +name = "uuid" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +dependencies = [ + "getrandom 0.2.15", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log 0.4.22", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + 
"windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + 
+[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml new file mode 100644 index 00000000..685545e2 --- /dev/null +++ b/apps/rust-sdk/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "firecrawl" +author="Mendable.ai" +version = "0.1.0" +edition = "2021" +license = "GPL-2.0-or-later" +homepage = "https://www.firecrawl.dev/" +repository ="https://github.com/mendableai/firecrawl" +description = "Rust SDK for Firecrawl API." +authors = ["sanix-darker "] + +[lib] +path = "src/lib.rs" +name = "firecrawl" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] +reqwest = { version = "^0.12", features = ["json", "blocking"] } +serde = { version = "^1.0", features = ["derive"] } +serde_json = "^1.0" +log = "^0.4" +thiserror = "^1.0" +uuid = { version = "^1.10", features = ["v4"] } +tokio = { version = "^1", features = ["full"] } + +[dev-dependencies] +clippy = "^0.0.302" +rustfmt = "^0.10" +assert_matches = "^1.5" +dotenv = "^0.15" +tokio = { version = "1", features = ["full"] } + +[build-dependencies] +tokio = { version = "1", features = ["full"] } diff --git a/apps/rust-sdk/README.md b/apps/rust-sdk/README.md new file mode 100644 index 00000000..54ad9097 --- /dev/null +++ b/apps/rust-sdk/README.md @@ -0,0 +1,181 @@ +# Firecrawl Rust SDK + +The Firecrawl Rust SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. + +## Installation + +To install the Firecrawl Rust SDK, add the following to your `Cargo.toml`: + +```toml +[dependencies] +firecrawl = "^0.1" +tokio = { version = "^1", features = ["full"] } +serde = { version = "^1.0", features = ["derive"] } +serde_json = "^1.0" +uuid = { version = "^1.10", features = ["v4"] } + +[build-dependencies] +tokio = { version = "1", features = ["full"] } +``` + +To add it in your codebase. + +## Usage + +1. Get an API key from [firecrawl.dev](https://firecrawl.dev) +2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` struct. + +Here's an example of how to use the SDK in [example.rs](./examples/example.rs): +All below example can start with : +```rust +use firecrawl::FirecrawlApp; + +#[tokio::main] +async fn main() { + // Initialize the FirecrawlApp with the API key + let api_key = ...; + let api_url = ...; + let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp"); + + // your code here... +} +``` + +### Scraping a URL + +To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a `serde_json::Value`. + +```rust +// Example scrape code... 
+let scrape_result = app.scrape_url("https://example.com", None).await; +match scrape_result { + Ok(data) => println!("Scrape Result:\n{}", data["markdown"]), + Err(e) => eprintln!("Scrape failed: {}", e), +} +``` + +### Extracting structured data from a URL + +With LLM extraction, you can easily extract structured data from any URL. We support Serde for JSON schema validation to make it easier for you too. Here is how you use it: + +```rust +let json_schema = json!({ + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +}); + +let llm_extraction_params = json!({ + "extractorOptions": { + "extractionSchema": json_schema, + "mode": "llm-extraction" + }, + "pageOptions": { + "onlyMainContent": true + } +}); + +// Example scrape code... +let llm_extraction_result = app + .scrape_url("https://news.ycombinator.com", Some(llm_extraction_params)) + .await; +match llm_extraction_result { + Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]), + Err(e) => eprintln!("LLM Extraction failed: {}", e), +} +``` + +### Search for a query + +Used to search the web, get the most relevant results, scrape each page, and return the markdown. + +```rust +// Example query search code... +let query = "what is mendable?"; +let search_result = app.search(query).await; +match search_result { + Ok(data) => println!("Search Result:\n{}", data), + Err(e) => eprintln!("Search failed: {}", e), +} +``` + +### Crawling a Website + +To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. + +The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `true`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `false`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. + +```rust +let random_uuid = String::from(Uuid::new_v4()); +let idempotency_key = Some(random_uuid); // optional idempotency key +let crawl_params = json!({ + "crawlerOptions": { + "excludes": ["blog/*"] + } +}); + +// Example crawl code... +let crawl_result = app + .crawl_url("https://example.com", Some(crawl_params), true, 2, idempotency_key) + .await; +match crawl_result { + Ok(data) => println!("Crawl Result:\n{}", data), + Err(e) => eprintln!("Crawl failed: {}", e), +} +``` + +If `wait_until_done` is set to `true`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. + +### Checking Crawl Status + +To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. 
+ +```rust +let job_id = crawl_result["jobId"].as_str().expect("Job ID not found"); +let status = app.check_crawl_status(job_id).await; +match status { + Ok(data) => println!("Crawl Status:\n{}", data), + Err(e) => eprintln!("Failed to check crawl status: {}", e), +} +``` + +## Error Handling + +The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. + +## Running the Tests with Cargo + +To ensure the functionality of the Firecrawl Rust SDK, we have included end-to-end tests using `cargo`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling. + +### Running the Tests + +To run the tests, execute the following commands: +```bash +$ export $(xargs < ./tests/.env) +$ cargo test --test e2e_with_auth +``` + +## Contributing + +Contributions to the Firecrawl Rust SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. + +## License + +The Firecrawl Rust SDK is open-source and released under the [AGPL License](https://www.gnu.org/licenses/agpl-3.0.en.html). diff --git a/apps/rust-sdk/examples/example.rs b/apps/rust-sdk/examples/example.rs new file mode 100644 index 00000000..c6b96b78 --- /dev/null +++ b/apps/rust-sdk/examples/example.rs @@ -0,0 +1,82 @@ +use firecrawl::FirecrawlApp; +use serde_json::json; +use uuid::Uuid; + +#[tokio::main] +async fn main() { + // Initialize the FirecrawlApp with the API key + let api_key = Some("fc-YOUR_API_KEY".to_string()); + let api_url = Some("http://0.0.0.0:3002".to_string()); + let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp"); + + // Scrape a website + let scrape_result = app.scrape_url("https://firecrawl.dev", None).await; + match scrape_result { + Ok(data) => println!("Scrape Result:\n{}", data["markdown"]), + Err(e) => eprintln!("Scrape failed: {}", e), + } + + // Crawl a website + let random_uuid = String::from(Uuid::new_v4()); + let idempotency_key = Some(random_uuid); // optional idempotency key + let crawl_params = json!({ + "crawlerOptions": { + "excludes": ["blog/*"] + } + }); + let crawl_result = app + .crawl_url( + "https://mendable.ai", + Some(crawl_params), + true, + 2, + idempotency_key, + ) + .await; + match crawl_result { + Ok(data) => println!("Crawl Result:\n{}", data), + Err(e) => eprintln!("Crawl failed: {}", e), + } + + // LLM Extraction with a JSON schema + let json_schema = json!({ + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] + }); + + let llm_extraction_params = json!({ + "extractorOptions": { + "extractionSchema": json_schema, + "mode": "llm-extraction" + }, + "pageOptions": { + "onlyMainContent": true + } + }); + + let llm_extraction_result = app + .scrape_url("https://news.ycombinator.com", Some(llm_extraction_params)) + .await; + match llm_extraction_result { + Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]), + Err(e) => eprintln!("LLM Extraction failed: {}", e), + } +} diff --git a/apps/rust-sdk/src/lib.rs 
b/apps/rust-sdk/src/lib.rs
new file mode 100644
index 00000000..a2ca75ad
--- /dev/null
+++ b/apps/rust-sdk/src/lib.rs
@@ -0,0 +1,373 @@
+/*
+*
+* - Structs and Enums:
+*   FirecrawlError: Custom error enum for handling various errors.
+*   FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client.
+*
+* - Initialization:
+*
+*   FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided.
+*
+* - API Methods:
+*   scrape_url, search, crawl_url, check_crawl_status:
+*   Methods for interacting with the Firecrawl API, similar to the Python methods.
+*   monitor_job_status: Polls the API to monitor the status of a crawl job until completion.
+*/
+
+use std::env;
+use std::thread;
+use std::time::Duration;
+
+use log::debug;
+use reqwest::{Client, Response};
+use serde_json::json;
+use serde_json::Value;
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum FirecrawlError {
+    #[error("HTTP request failed: {0}")]
+    HttpRequestFailed(String),
+    #[error("API key not provided")]
+    ApiKeyNotProvided,
+    #[error("Failed to parse response: {0}")]
+    ResponseParseError(String),
+    #[error("Crawl job failed or stopped: {0}")]
+    CrawlJobFailed(String),
+}
+
+#[derive(Clone, Debug)]
+pub struct FirecrawlApp {
+    api_key: String,
+    api_url: String,
+    client: Client,
+}
+// The API version of Firecrawl.
+const API_VERSION: &str = "/v0";
+
+impl FirecrawlApp {
+    /// Initialize the FirecrawlApp instance.
+    ///
+    /// # Arguments:
+    /// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API.
+    /// * `api_url` (Optional[str]): Base URL for the Firecrawl API.
+    pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
+        let api_key = api_key
+            .or_else(|| env::var("FIRECRAWL_API_KEY").ok())
+            .ok_or(FirecrawlError::ApiKeyNotProvided)?;
+        let api_url = api_url.unwrap_or_else(|| {
+            env::var("FIRECRAWL_API_URL")
+                .unwrap_or_else(|_| "https://api.firecrawl.dev".to_string())
+        });
+
+        debug!("Initialized FirecrawlApp with API key: {}", api_key);
+        debug!("Initialized FirecrawlApp with API URL: {}", api_url);
+
+        Ok(FirecrawlApp {
+            api_key,
+            api_url,
+            client: Client::new(),
+        })
+    }
+
+    /// Scrape the specified URL using the Firecrawl API.
+    ///
+    /// # Arguments:
+    /// * `url` (str): The URL to scrape.
+    /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+    ///
+    /// # Returns:
+    /// * `Any`: The scraped data if the request is successful.
+    ///
+    /// # Raises:
+    /// * `Exception`: If the scrape request fails.
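+    ///
+    /// # Example
+    ///
+    /// A minimal usage sketch (not part of the original docs); it assumes a valid API key
+    /// and network access, so it is marked `no_run`:
+    ///
+    /// ```no_run
+    /// # async fn example() -> Result<(), firecrawl::FirecrawlError> {
+    /// // Hypothetical key for illustration only.
+    /// let app = firecrawl::FirecrawlApp::new(Some("fc-YOUR_API_KEY".to_string()), None)?;
+    /// let page = app.scrape_url("https://example.com", None).await?;
+    /// println!("{}", page["markdown"]);
+    /// # Ok(())
+    /// # }
+    /// ```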
+    pub async fn scrape_url(
+        &self,
+        url: &str,
+        params: Option<Value>,
+    ) -> Result<Value, FirecrawlError> {
+        let headers = self.prepare_headers(None);
+        let mut scrape_params = json!({"url": url});
+
+        if let Some(mut params) = params {
+            if let Some(extractor_options) = params.get_mut("extractorOptions") {
+                if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") {
+                    if extraction_schema.is_object() && extraction_schema.get("schema").is_some() {
+                        extractor_options["extractionSchema"] = extraction_schema["schema"].clone();
+                    }
+                    extractor_options["mode"] = extractor_options
+                        .get("mode")
+                        .cloned()
+                        .unwrap_or_else(|| json!("llm-extraction"));
+                }
+                scrape_params["extractorOptions"] = extractor_options.clone();
+            }
+            for (key, value) in params.as_object().unwrap() {
+                if key != "extractorOptions" {
+                    scrape_params[key] = value.clone();
+                }
+            }
+        }
+
+        let response = self
+            .client
+            .post(&format!("{}{}/scrape", self.api_url, API_VERSION))
+            .headers(headers)
+            .json(&scrape_params)
+            .send()
+            .await
+            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+
+        self.handle_response(response, "scrape URL").await
+    }
+
+    /// Perform a search using the Firecrawl API.
+    ///
+    /// # Arguments:
+    /// * `query` (str): The search query.
+    /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request.
+    ///
+    /// # Returns:
+    /// * `Any`: The search results if the request is successful.
+    ///
+    /// # Raises:
+    /// * `Exception`: If the search request fails.
+    pub async fn search(
+        &self,
+        query: &str,
+        params: Option<Value>,
+    ) -> Result<Value, FirecrawlError> {
+        let headers = self.prepare_headers(None);
+        let mut json_data = json!({"query": query});
+        if let Some(params) = params {
+            for (key, value) in params.as_object().unwrap() {
+                json_data[key] = value.clone();
+            }
+        }
+
+        let response = self
+            .client
+            .post(&format!("{}{}/search", self.api_url, API_VERSION))
+            .headers(headers)
+            .json(&json_data)
+            .send()
+            .await
+            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+
+        self.handle_response(response, "search").await
+    }
+
+    /// Initiate a crawl job for the specified URL using the Firecrawl API.
+    ///
+    /// # Arguments:
+    /// * `url` (str): The URL to crawl.
+    /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+    /// * `wait_until_done` (bool): Whether to wait until the crawl job is completed.
+    /// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion.
+    /// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests.
+    ///
+    /// # Returns:
+    /// * `Any`: The crawl job ID or the crawl results if waiting until completion.
+    ///
+    /// # Raises:
+    /// * `Exception`: If the crawl job initiation or monitoring fails.
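+    ///
+    /// # Example
+    ///
+    /// A hypothetical sketch (not part of the original docs); `app` stands in for an
+    /// already initialized `FirecrawlApp`:
+    ///
+    /// ```no_run
+    /// # async fn example(app: &firecrawl::FirecrawlApp) -> Result<(), firecrawl::FirecrawlError> {
+    /// // Wait for the crawl to finish, polling every 2 seconds, with no idempotency key.
+    /// let pages = app.crawl_url("https://example.com", None, true, 2, None).await?;
+    /// println!("{}", pages);
+    /// # Ok(())
+    /// # }
+    /// ```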
+    pub async fn crawl_url(
+        &self,
+        url: &str,
+        params: Option<Value>,
+        wait_until_done: bool,
+        poll_interval: u64,
+        idempotency_key: Option<String>,
+    ) -> Result<Value, FirecrawlError> {
+        let headers = self.prepare_headers(idempotency_key);
+        let mut json_data = json!({"url": url});
+        if let Some(params) = params {
+            for (key, value) in params.as_object().unwrap() {
+                json_data[key] = value.clone();
+            }
+        }
+
+        let response = self
+            .client
+            .post(&format!("{}{}/crawl", self.api_url, API_VERSION))
+            .headers(headers.clone())
+            .json(&json_data)
+            .send()
+            .await
+            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+
+        let response_json = self.handle_response(response, "start crawl job").await?;
+        let job_id = response_json["jobId"].as_str().unwrap().to_string();
+
+        if wait_until_done {
+            self.monitor_job_status(&job_id, headers, poll_interval)
+                .await
+        } else {
+            Ok(json!({"jobId": job_id}))
+        }
+    }
+
+    /// Check the status of a crawl job using the Firecrawl API.
+    ///
+    /// # Arguments:
+    /// * `job_id` (str): The ID of the crawl job.
+    ///
+    /// # Returns:
+    /// * `Any`: The status of the crawl job.
+    ///
+    /// # Raises:
+    /// * `Exception`: If the status check request fails.
+    pub async fn check_crawl_status(&self, job_id: &str) -> Result<Value, FirecrawlError> {
+        let headers = self.prepare_headers(None);
+        let response = self
+            .client
+            .get(&format!(
+                "{}{}/crawl/status/{}",
+                self.api_url, API_VERSION, job_id
+            ))
+            .headers(headers)
+            .send()
+            .await
+            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+
+        self.handle_response(response, "check crawl status").await
+    }
+
+    /// Monitor the status of a crawl job until completion.
+    ///
+    /// # Arguments:
+    /// * `job_id` (str): The ID of the crawl job.
+    /// * `headers` (Dict[str, str]): The headers to include in the status check requests.
+    /// * `poll_interval` (int): Seconds between status checks.
+    ///
+    /// # Returns:
+    /// * `Any`: The crawl results if the job is completed successfully.
+    ///
+    /// # Raises:
+    /// Exception: If the job fails or an error occurs during status checks.
+    async fn monitor_job_status(
+        &self,
+        job_id: &str,
+        headers: reqwest::header::HeaderMap,
+        poll_interval: u64,
+    ) -> Result<Value, FirecrawlError> {
+        loop {
+            let response = self
+                .client
+                .get(&format!(
+                    "{}{}/crawl/status/{}",
+                    self.api_url, API_VERSION, job_id
+                ))
+                .headers(headers.clone())
+                .send()
+                .await
+                .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+
+            let status_data = self.handle_response(response, "check crawl status").await?;
+            match status_data["status"].as_str() {
+                Some("completed") => {
+                    if status_data["data"].is_object() {
+                        return Ok(status_data["data"].clone());
+                    } else {
+                        return Err(FirecrawlError::CrawlJobFailed(
+                            "Crawl job completed but no data was returned".to_string(),
+                        ));
+                    }
+                }
+                Some("active") | Some("paused") | Some("pending") | Some("queued")
+                | Some("waiting") => {
+                    thread::sleep(Duration::from_secs(poll_interval));
+                }
+                Some(status) => {
+                    return Err(FirecrawlError::CrawlJobFailed(format!(
+                        "Crawl job failed or was stopped. Status: {}",
+                        status
+                    )));
+                }
+                None => {
+                    return Err(FirecrawlError::CrawlJobFailed(
+                        "Unexpected response: no status field".to_string(),
+                    ));
+                }
+            }
+        }
+    }
+
+    /// Prepare the headers for API requests.
+    ///
+    /// # Arguments:
+    /// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests.
+    ///
+    /// # Returns:
+    /// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
+    fn prepare_headers(&self, idempotency_key: Option<String>) -> reqwest::header::HeaderMap {
+        let mut headers = reqwest::header::HeaderMap::new();
+        headers.insert("Content-Type", "application/json".parse().unwrap());
+        headers.insert(
+            "Authorization",
+            format!("Bearer {}", self.api_key).parse().unwrap(),
+        );
+        if let Some(key) = idempotency_key {
+            headers.insert("x-idempotency-key", key.parse().unwrap());
+        }
+        headers
+    }
+
+    /// Handle errors from API responses.
+    ///
+    /// # Arguments:
+    /// * `response` (reqwest::Response): The response object from the API request.
+    /// * `action` (str): Description of the action that was being performed.
+    ///
+    /// # Raises:
+    /// Exception: An exception with a message containing the status code and error details from the response.
+    async fn handle_response(
+        &self,
+        response: Response,
+        action: &str,
+    ) -> Result<Value, FirecrawlError> {
+        if response.status().is_success() {
+            let response_json: Value = response
+                .json()
+                .await
+                .map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
+            if response_json["success"].as_bool().unwrap_or(false) {
+                Ok(response_json["data"].clone())
+            } else {
+                Err(FirecrawlError::HttpRequestFailed(format!(
+                    "Failed to {}: {}",
+                    action, response_json["error"]
+                )))
+            }
+        } else {
+            let status_code = response.status().as_u16();
+            let error_message = response
+                .json::<Value>()
+                .await
+                .unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
+            let message = match status_code {
+                402 => format!(
+                    "Payment Required: Failed to {}. {}",
+                    action, error_message["error"]
+                ),
+                408 => format!(
+                    "Request Timeout: Failed to {} as the request timed out. {}",
+                    action, error_message["error"]
+                ),
+                409 => format!(
+                    "Conflict: Failed to {} due to a conflict. {}",
+                    action, error_message["error"]
+                ),
+                500 => format!(
+                    "Internal Server Error: Failed to {}. {}",
+                    action, error_message["error"]
+                ),
+                _ => format!(
+                    "Unexpected error during {}: Status code {}.
{}", + action, status_code, error_message["error"] + ), + }; + Err(FirecrawlError::HttpRequestFailed(message)) + } + } +} diff --git a/apps/go-sdk/firecrawl/.env.example b/apps/rust-sdk/tests/.env.example similarity index 50% rename from apps/go-sdk/firecrawl/.env.example rename to apps/rust-sdk/tests/.env.example index 772a6243..5aa1cb11 100644 --- a/apps/go-sdk/firecrawl/.env.example +++ b/apps/rust-sdk/tests/.env.example @@ -1,2 +1,2 @@ API_URL=http://localhost:3002 -TEST_API_KEY=fc-YOUR-API-KEY +TEST_API_KEY=fc-YOUR_API_KEY diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs new file mode 100644 index 00000000..ac9dc1d3 --- /dev/null +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -0,0 +1,174 @@ +use assert_matches::assert_matches; +use dotenv::dotenv; +use firecrawl::FirecrawlApp; +use serde_json::json; +use std::env; +use std::time::Duration; +use tokio::time::sleep; + +#[tokio::test] +async fn test_no_api_key() { + dotenv().ok(); + let api_url = env::var("API_URL").expect("API_URL environment variable is not set"); + assert_matches!(FirecrawlApp::new(None, Some(api_url)), Err(e) if e.to_string() == "API key not provided"); +} + +#[tokio::test] +async fn test_blocklisted_url() { + dotenv().ok(); + let api_url = env::var("API_URL").unwrap(); + let api_key = env::var("TEST_API_KEY").unwrap(); + let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap(); + let blocklisted_url = "https://facebook.com/fake-test"; + let result = app.scrape_url(blocklisted_url, None).await; + + assert_matches!( + result, + Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions") + ); +} + +#[tokio::test] +async fn test_successful_response_with_valid_preview_token() { + dotenv().ok(); + let api_url = env::var("API_URL").unwrap(); + let app = FirecrawlApp::new( + Some("this_is_just_a_preview_token".to_string()), + Some(api_url), + ) + .unwrap(); + let result = app + .scrape_url("https://roastmywebsite.ai", None) + .await + .unwrap(); + assert!(result.as_object().unwrap().contains_key("content")); + assert!(result["content"].as_str().unwrap().contains("_Roast_")); +} + +#[tokio::test] +async fn test_scrape_url_e2e() { + dotenv().ok(); + let api_url = env::var("API_URL").unwrap(); + let api_key = env::var("TEST_API_KEY").unwrap(); + let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap(); + let result = app + .scrape_url("https://roastmywebsite.ai", None) + .await + .unwrap(); + assert!(result.as_object().unwrap().contains_key("content")); + assert!(result.as_object().unwrap().contains_key("markdown")); + assert!(result.as_object().unwrap().contains_key("metadata")); + assert!(!result.as_object().unwrap().contains_key("html")); + assert!(result["content"].as_str().unwrap().contains("_Roast_")); +} + +#[tokio::test] +async fn test_successful_response_with_valid_api_key_and_include_html() { + dotenv().ok(); + let api_url = env::var("API_URL").unwrap(); + let api_key = env::var("TEST_API_KEY").unwrap(); + let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap(); + let params = json!({ + "pageOptions": { + "includeHtml": true + } + }); + let result = app + .scrape_url("https://roastmywebsite.ai", Some(params)) + .await + .unwrap(); + assert!(result.as_object().unwrap().contains_key("content")); + assert!(result.as_object().unwrap().contains_key("markdown")); + assert!(result.as_object().unwrap().contains_key("html")); + assert!(result.as_object().unwrap().contains_key("metadata")); + 
assert!(result["content"].as_str().unwrap().contains("_Roast_")); + assert!(result["markdown"].as_str().unwrap().contains("_Roast_")); + assert!(result["html"].as_str().unwrap().contains(" { describe("Scraping website tests with a dataset", () => { it("Should scrape the website and prompt it against OpenAI", async () => { + let totalTimeTaken = 0; let passedTests = 0; const batchSize = 15; // Adjusted to comply with the rate limit of 15 per minute const batchPromises = []; @@ -51,11 +52,16 @@ describe("Scraping Checkup (E2E)", () => { const batchPromise = Promise.all( batch.map(async (websiteData: WebsiteData) => { try { + const startTime = new Date().getTime(); const scrapedContent = await request(TEST_URL || "") - .post("/v0/scrape") + .post("/v1/scrape") .set("Content-Type", "application/json") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .send({ url: websiteData.website, pageOptions: { onlyMainContent: true } }); + .send({ url: websiteData.website }); + + const endTime = new Date().getTime(); + const timeTaken = endTime - startTime; + totalTimeTaken += timeTaken; if (scrapedContent.statusCode !== 200) { console.error(`Failed to scrape ${websiteData.website} ${scrapedContent.statusCode}`); @@ -165,6 +171,7 @@ describe("Scraping Checkup (E2E)", () => { const timeTaken = (endTime - startTime) / 1000; console.log(`Score: ${score}%`); console.log(`Total tokens: ${totalTokens}`); + console.log(`Total time taken: ${totalTimeTaken} miliseconds`); await logErrors(errorLog, timeTaken, totalTokens, score, websitesData.length); diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index abf7fd78..a1549e24 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,6 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; -import "dotenv/config"; +import { configDotenv } from "dotenv"; +configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -9,7 +10,8 @@ class SupabaseService { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; // Only initialize the Supabase client if both URL and Service Token are provided. - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null console.warn( "Authentication is disabled. Supabase client will not be initialized." @@ -36,7 +38,8 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { console.debug( "Attempted to access Supabase client when it's not configured." ); diff --git a/apps/ui/ingestion-ui/README.md b/apps/ui/ingestion-ui/README.md index e6b49b95..61f9f983 100644 --- a/apps/ui/ingestion-ui/README.md +++ b/apps/ui/ingestion-ui/README.md @@ -20,7 +20,7 @@ This template provides an easy way to spin up a UI for Firecrawl using React. It ``` 2. 
Set up your Firecrawl API key: - Open `src/components/FirecrawlComponent.tsx` and replace the placeholder API key: + Open `src/components/ingestion.tsx` and replace the placeholder API key: ```typescript const FIRECRAWL_API_KEY = "your-api-key-here"; @@ -36,7 +36,7 @@ This template provides an easy way to spin up a UI for Firecrawl using React. It ## Customization -The main Firecrawl component is located in `src/components/FirecrawlComponent.tsx`. You can modify this file to customize the UI or add additional features. +The main Firecrawl component is located in `src/components/ingestion.tsx`. You can modify this file to customize the UI or add additional features. ## Security Considerations diff --git a/apps/ui/ingestion-ui/package-lock.json b/apps/ui/ingestion-ui/package-lock.json index 7038a1f2..e48e99b8 100644 --- a/apps/ui/ingestion-ui/package-lock.json +++ b/apps/ui/ingestion-ui/package-lock.json @@ -11,6 +11,7 @@ "@radix-ui/react-checkbox": "^1.1.1", "@radix-ui/react-collapsible": "^1.1.0", "@radix-ui/react-label": "^2.1.0", + "@radix-ui/react-radio-group": "^1.2.0", "@radix-ui/react-slot": "^1.1.0", "class-variance-authority": "^0.7.0", "clsx": "^2.1.1", @@ -1192,6 +1193,32 @@ } } }, + "node_modules/@radix-ui/react-collection": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.0.tgz", + "integrity": "sha512-GZsZslMJEyo1VKm5L1ZJY8tGDxZNPAoUeQUIbKeJfoi7Q4kmig5AsgLMYYuyYbfjd8fBmFORAIwYAkXMnXZgZw==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.0", + "@radix-ui/react-context": "1.1.0", + "@radix-ui/react-primitive": "2.0.0", + "@radix-ui/react-slot": "1.1.0" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-compose-refs": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@radix-ui/react-compose-refs/-/react-compose-refs-1.1.0.tgz", @@ -1220,6 +1247,21 @@ } } }, + "node_modules/@radix-ui/react-direction": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@radix-ui/react-direction/-/react-direction-1.1.0.tgz", + "integrity": "sha512-BUuBvgThEiAXh2DWu93XsT+a3aWrGqolGlqqw5VU1kG7p/ZH2cuDlM1sRLNnY3QcBS69UIz2mcKhMxDsdewhjg==", + "license": "MIT", + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-id": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@radix-ui/react-id/-/react-id-1.1.0.tgz", @@ -1304,6 +1346,69 @@ } } }, + "node_modules/@radix-ui/react-radio-group": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@radix-ui/react-radio-group/-/react-radio-group-1.2.0.tgz", + "integrity": "sha512-yv+oiLaicYMBpqgfpSPw6q+RyXlLdIpQWDHZbUKURxe+nEh53hFXPPlfhfQQtYkS5MMK/5IWIa76SksleQZSzw==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.0", + "@radix-ui/react-compose-refs": "1.1.0", + "@radix-ui/react-context": "1.1.0", + "@radix-ui/react-direction": "1.1.0", + "@radix-ui/react-presence": "1.1.0", + "@radix-ui/react-primitive": "2.0.0", + "@radix-ui/react-roving-focus": "1.1.0", + "@radix-ui/react-use-controllable-state": "1.1.0", + 
"@radix-ui/react-use-previous": "1.1.0", + "@radix-ui/react-use-size": "1.1.0" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-roving-focus": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.0.tgz", + "integrity": "sha512-EA6AMGeq9AEeQDeSH0aZgG198qkfHSbvWTf1HvoDmOB5bBG/qTxjYMWUKMnYiV6J/iP/J8MEFSuB2zRU2n7ODA==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.0", + "@radix-ui/react-collection": "1.1.0", + "@radix-ui/react-compose-refs": "1.1.0", + "@radix-ui/react-context": "1.1.0", + "@radix-ui/react-direction": "1.1.0", + "@radix-ui/react-id": "1.1.0", + "@radix-ui/react-primitive": "2.0.0", + "@radix-ui/react-use-callback-ref": "1.1.0", + "@radix-ui/react-use-controllable-state": "1.1.0" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-slot": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.1.0.tgz", diff --git a/apps/ui/ingestion-ui/package.json b/apps/ui/ingestion-ui/package.json index 48009648..01a754b2 100644 --- a/apps/ui/ingestion-ui/package.json +++ b/apps/ui/ingestion-ui/package.json @@ -13,6 +13,7 @@ "@radix-ui/react-checkbox": "^1.1.1", "@radix-ui/react-collapsible": "^1.1.0", "@radix-ui/react-label": "^2.1.0", + "@radix-ui/react-radio-group": "^1.2.0", "@radix-ui/react-slot": "^1.1.0", "class-variance-authority": "^0.7.0", "clsx": "^2.1.1", diff --git a/apps/ui/ingestion-ui/src/App.tsx b/apps/ui/ingestion-ui/src/App.tsx index eb0e6954..b80a5ad8 100644 --- a/apps/ui/ingestion-ui/src/App.tsx +++ b/apps/ui/ingestion-ui/src/App.tsx @@ -1,9 +1,35 @@ +import { useState } from "react"; import FirecrawlComponent from "./components/ingestion"; +import FirecrawlComponentV1 from "./components/ingestionV1"; +import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group"; +import { Label } from "@/components/ui/label"; function App() { + const [selectedComponent, setSelectedComponent] = useState<"v0" | "v1">("v1"); + return ( <> - +
+ setSelectedComponent(value as "v0" | "v1")} + className="flex space-x-6 mt-6" + > +
+ + +
+
+ + +
+
+
+ {selectedComponent === "v1" ? ( + + ) : ( + + )} ); } diff --git a/apps/ui/ingestion-ui/src/components/ingestionV1.tsx b/apps/ui/ingestion-ui/src/components/ingestionV1.tsx new file mode 100644 index 00000000..b34c0d6b --- /dev/null +++ b/apps/ui/ingestion-ui/src/components/ingestionV1.tsx @@ -0,0 +1,603 @@ +import { useState, ChangeEvent, FormEvent, useEffect } from "react"; +import { + Card, + CardHeader, + CardTitle, + CardContent, + CardFooter, +} from "@/components/ui/card"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { Checkbox } from "@/components/ui/checkbox"; +import { Label } from "@/components/ui/label"; +import { + Collapsible, + CollapsibleContent, + CollapsibleTrigger, +} from "@/components/ui/collapsible"; +import { ChevronDown, ChevronLeft, ChevronRight } from "lucide-react"; + +//! Hardcoded values (not recommended for production) +//! Highly recommended to move all Firecrawl API calls to the backend (e.g. Next.js API route) +const FIRECRAWL_API_URL = "https://api.firecrawl.dev"; // Replace with your actual API URL whether it is local or using Firecrawl Cloud +const FIRECRAWL_API_KEY = "fc-YOUR_API_KEY"; // Replace with your actual API key + +interface FormData { + url: string; + crawlSubPages: boolean; + search: string; + limit: string; + maxDepth: string; + excludePaths: string; + includePaths: string; + extractMainContent: boolean; +} + +interface CrawlerOptions { + includes?: string[]; + excludes?: string[]; + maxDepth?: number; + limit?: number; + returnOnlyUrls: boolean; +} + +interface ScrapeOptions { + formats?: string[]; + onlyMainContent?: boolean; +} + +interface PageOptions { + onlyMainContent: boolean; +} + +interface RequestBody { + url: string; + crawlerOptions?: CrawlerOptions; + pageOptions?: PageOptions; + search?: string; + excludePaths?: string[]; + includePaths?: string[]; + maxDepth?: number; + limit?: number; + scrapeOptions?: ScrapeOptions; + formats?: string[]; +} + +interface ScrapeResultMetadata { + title: string; + description: string; + language: string; + sourceURL: string; + pageStatusCode: number; + pageError?: string; + [key: string]: string | number | undefined; +} + +interface ScrapeResultData { + markdown: string; + content: string; + html: string; + rawHtml: string; + metadata: ScrapeResultMetadata; + llm_extraction: Record; + warning?: string; +} + +interface ScrapeResult { + success: boolean; + data: ScrapeResultData; +} + +export default function FirecrawlComponentV1() { + const [formData, setFormData] = useState({ + url: "", + crawlSubPages: false, + search: "", + limit: "", + maxDepth: "", + excludePaths: "", + includePaths: "", + extractMainContent: false, + }); + const [loading, setLoading] = useState(false); + const [scrapingSelectedLoading, setScrapingSelectedLoading] = + useState(false); + const [crawledUrls, setCrawledUrls] = useState([]); + const [selectedUrls, setSelectedUrls] = useState([]); + const [scrapeResults, setScrapeResults] = useState< + Record + >({}); + const [isCollapsibleOpen, setIsCollapsibleOpen] = useState(true); + const [crawlStatus, setCrawlStatus] = useState<{ + current: number; + total: number | null; + }>({ current: 0, total: null }); + const [elapsedTime, setElapsedTime] = useState(0); + const [showCrawlStatus, setShowCrawlStatus] = useState(false); + const [isScraping, setIsScraping] = useState(false); + const [currentPage, setCurrentPage] = useState(1); + const urlsPerPage = 10; + + useEffect(() => { + let timer: NodeJS.Timeout; + if 
(loading) { + setShowCrawlStatus(true); + timer = setInterval(() => { + setElapsedTime((prevTime) => prevTime + 1); + }, 1000); + } + return () => { + if (timer) clearInterval(timer); + }; + }, [loading]); + + const handleChange = (e: ChangeEvent) => { + const { name, value, type, checked } = e.target; + setFormData((prevData) => { + const newData = { + ...prevData, + [name]: type === "checkbox" ? checked : value, + }; + + // Automatically check "Crawl Sub-pages" if limit or search have content + if (name === "limit" || name === "search") { + newData.crawlSubPages = !!value || !!newData.limit || !!newData.search; + } + + return newData; + }); + }; + + const handleSubmit = async (e: FormEvent) => { + e.preventDefault(); + setLoading(true); + setIsCollapsibleOpen(false); + setElapsedTime(0); + setCrawlStatus({ current: 0, total: null }); + setIsScraping(!formData.crawlSubPages); + setCrawledUrls([]); + setSelectedUrls([]); + setScrapeResults({}); + setScrapingSelectedLoading(false); + setShowCrawlStatus(false); + + try { + const endpoint = `${FIRECRAWL_API_URL}/v1/${ + formData.crawlSubPages ? "map" : "scrape" + }`; + + const requestBody: RequestBody = formData.crawlSubPages + ? { + url: formData.url, + search: formData.search || undefined, + limit: formData.limit ? parseInt(formData.limit) : undefined, + } + : { + url: formData.url, + formats: ["markdown"], + }; + + const response = await fetch(endpoint, { + method: "POST", + headers: { + Authorization: `Bearer ${FIRECRAWL_API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(requestBody), + }); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const data = await response.json(); + if (formData.crawlSubPages) { + if (data.success === true && Array.isArray(data.links)) { + setCrawledUrls(data.links); + setSelectedUrls(data.links); + setCrawlStatus({ + current: data.links.length, + total: data.links.length, + }); + + // Set scrape results with the links + const linkResults: Record = {}; + data.links.forEach((link: string) => { + linkResults[link] = { + success: true, + data: { + metadata: { + sourceURL: link, + title: "", + description: "", + language: "", + pageStatusCode: 200, + }, + markdown: "", + content: "", + html: "", + rawHtml: "", + llm_extraction: {}, + }, + }; + }); + } else { + console.error("Unexpected response format from map endpoint"); + console.log(data); + } + } else { + setScrapeResults({ [formData.url]: data }); + setCrawlStatus({ current: 1, total: 1 }); + } + } catch (error) { + console.error("Error:", error); + setScrapeResults({ + error: { + success: false, + data: { + metadata: { + pageError: "Error occurred while fetching data", + title: "", + description: "", + language: "", + sourceURL: "", + pageStatusCode: 0, + }, + markdown: "", + content: "", + html: "", + rawHtml: "", + llm_extraction: {}, + }, + }, + }); + } finally { + setLoading(false); + } + }; + + const handleScrapeSelected = async () => { + setLoading(true); + setElapsedTime(0); + setCrawlStatus({ current: 0, total: selectedUrls.length }); + setIsScraping(true); + setScrapingSelectedLoading(true); + const newScrapeResults: Record = {}; + + for (const [index, url] of selectedUrls.entries()) { + try { + const response = await fetch(`${FIRECRAWL_API_URL}/v1/scrape`, { + method: "POST", + headers: { + Authorization: `Bearer ${FIRECRAWL_API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + url: url, + formats: ["markdown"], + }), + }); + + if (!response.ok) { + throw 
new Error(`HTTP error! status: ${response.status}`); + } + + const data: ScrapeResult = await response.json(); + newScrapeResults[url] = data; + setCrawlStatus((prev) => ({ ...prev, current: index + 1 })); + setScrapeResults({ ...scrapeResults, ...newScrapeResults }); + } catch (error) { + console.error(`Error scraping ${url}:`, error); + newScrapeResults[url] = { + success: false, + data: { + markdown: "", + content: "", + html: "", + rawHtml: "", + metadata: { + title: "", + description: "", + language: "", + sourceURL: url, + pageStatusCode: 0, + pageError: (error as Error).message, + }, + llm_extraction: {}, + }, + }; + } + } + + setLoading(false); + setIsScraping(false); + }; + + const handlePageChange = (newPage: number) => { + setCurrentPage(newPage); + }; + + const paginatedUrls = crawledUrls.slice( + (currentPage - 1) * urlsPerPage, + currentPage * urlsPerPage + ); + + return ( +
+ + + + Extract web content (V1) + + Powered by Firecrawl 🔥 + + +
+ Use this component to quickly give your users the ability to connect + their AI apps to web data with Firecrawl. Learn more on the{" "} + + Firecrawl docs! + +
+ + +
+
+ + +
+ + + + + +
+ + setFormData((prev) => ({ + ...prev, + crawlSubPages: checked, + })) + } + /> + +
+ +
+
+ + +
+
+ + +
+
+
+
+
+ {showCrawlStatus && ( +
+
+ {!isScraping && + crawledUrls.length > 0 && + !scrapingSelectedLoading && ( + <> + { + if (checked) { + setSelectedUrls([...crawledUrls]); + } else { + setSelectedUrls([]); + } + }} + /> + + + )} +
+
+ {isScraping + ? `Scraped ${crawlStatus.current} page(s) in ${elapsedTime}s` + : `Crawled ${crawlStatus.current} pages in ${elapsedTime}s`} +
+
+ )} + + {crawledUrls.length > 0 && + !scrapingSelectedLoading && + !isScraping && ( + <> +
    + {paginatedUrls.map((url, index) => ( +
  • + + setSelectedUrls((prev) => + prev.includes(url) + ? prev.filter((u) => u !== url) + : [...prev, url] + ) + } + /> + + {url.length > 70 ? `${url.slice(0, 70)}...` : url} + +
  • + ))} +
+
+ + + Page {currentPage} of{" "} + {Math.ceil(crawledUrls.length / urlsPerPage)} + + +
+ + )} +
+ + {crawledUrls.length > 0 && !scrapingSelectedLoading && ( + + )} + + + + {Object.keys(scrapeResults).length > 0 && ( +
+

Scrape Results

+

+ You can do whatever you want with the scrape results. Here is a + basic showcase of the markdown. +

+
+ {Object.entries(scrapeResults).map(([url, result]) => ( + + + {result.data.metadata.title} + + {url + .replace(/^(https?:\/\/)?(www\.)?/, "") + .replace(/\/$/, "")} + + + +
+ {result.success ? ( + <> +
+                          {result.data.markdown.trim()}
+                        
+ + ) : ( + <> +

+ Failed to scrape this URL +

+

+ {result.toString()} +

+ + )} +
+
+
+ ))} +
+
+ )} +
+ ); +} diff --git a/apps/ui/ingestion-ui/src/components/ui/radio-group.tsx b/apps/ui/ingestion-ui/src/components/ui/radio-group.tsx new file mode 100644 index 00000000..43b43b48 --- /dev/null +++ b/apps/ui/ingestion-ui/src/components/ui/radio-group.tsx @@ -0,0 +1,42 @@ +import * as React from "react" +import * as RadioGroupPrimitive from "@radix-ui/react-radio-group" +import { Circle } from "lucide-react" + +import { cn } from "@/lib/utils" + +const RadioGroup = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => { + return ( + + ) +}) +RadioGroup.displayName = RadioGroupPrimitive.Root.displayName + +const RadioGroupItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => { + return ( + + + + + + ) +}) +RadioGroupItem.displayName = RadioGroupPrimitive.Item.displayName + +export { RadioGroup, RadioGroupItem } diff --git a/docker-compose.yaml b/docker-compose.yaml index 8c160f4a..24b51762 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -15,7 +15,6 @@ x-common-service: &common-service - OPENAI_BASE_URL=${OPENAI_BASE_URL} - MODEL_NAME=${MODEL_NAME:-gpt-4o} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} - - SERPER_API_KEY=${SERPER_API_KEY} - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} - LOGTAIL_KEY=${LOGTAIL_KEY} - BULL_AUTH_KEY=${BULL_AUTH_KEY} diff --git a/examples/kubernetes/cluster-install/secret.yaml b/examples/kubernetes/cluster-install/secret.yaml index 2be96320..6d8eed3b 100644 --- a/examples/kubernetes/cluster-install/secret.yaml +++ b/examples/kubernetes/cluster-install/secret.yaml @@ -6,7 +6,6 @@ type: Opaque data: OPENAI_API_KEY: "" SLACK_WEBHOOK_URL: "" - SERPER_API_KEY: "" LLAMAPARSE_API_KEY: "" LOGTAIL_KEY: "" BULL_AUTH_KEY: "" diff --git a/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb new file mode 100644 index 00000000..ee14f147 --- /dev/null +++ b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Web Scraping and Extraction with Firecrawl and Claude\n", + "\n", + "This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import json\n", + "from firecrawl import FirecrawlApp\n", + "from anthropic import Anthropic\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Set Up API Keys and URL" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "URL to scrape: https://mendable.ai\n" + ] + } + ], + "source": [ + "# Retrieve API keys from environment variables\n", + "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Set the URL to scrape\n", + "url = \"https://mendable.ai\" # Replace with the actual URL you want to scrape\n", + "\n", + "print(f\"URL to scrape: {url}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Initialize Firecrawl and Anthropic Clients" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Firecrawl and Anthropic clients initialized.\n" + ] + } + ], + "source": [ + "# Initialize FirecrawlApp and Anthropic client\n", + "firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n", + "anthropic_client = Anthropic(api_key=anthropic_api_key)\n", + "\n", + "print(\"Firecrawl and Anthropic clients initialized.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Scrape the URL using Firecrawl" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Page content scraped. Length: 16199 characters\n" + ] + } + ], + "source": [ + "# Scrape the URL using Firecrawl\n", + "page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n", + "\n", + "print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Prepare the Prompt for Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt prepared for Claude.\n" + ] + } + ], + "source": [ + "# Prepare the prompt for Claude\n", + "prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n", + "1. The title of the page\n", + "2. Whether the company is part of Y Combinator (YC)\n", + "3. 
Whether the company/product is open source\n", + "\n", + "Return the information in JSON format with the following schema:\n", + "{{\n", + " \"main_header_title\": string,\n", + " \"is_yc_company\": boolean,\n", + " \"is_open_source\": boolean\n", + "}}\n", + "\n", + "Webpage content:\n", + "{page_content['content']}\n", + "\n", + "Return only the JSON, nothing else.\"\"\"\n", + "\n", + "print(\"Prompt prepared for Claude.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Query Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Claude response received.\n" + ] + } + ], + "source": [ + "# Query Claude\n", + "response = anthropic_client.messages.create(\n", + " model=\"claude-3-opus-20240229\",\n", + " max_tokens=1000,\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + ")\n", + "\n", + "print(\"Claude response received.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Parse and Display the Result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"title\": \"Just in time answers for Sales and Support\",\n", + " \"is_yc_company\": true,\n", + " \"is_open_source\": false\n", + "}\n" + ] + } + ], + "source": [ + "# Parse and print the result\n", + "result = json.loads(response.content[0].text)\n", + "print(json.dumps(result, indent=2))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/img/firecrawl_logo.png b/img/firecrawl_logo.png new file mode 100644 index 00000000..bd723222 Binary files /dev/null and b/img/firecrawl_logo.png differ
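As a quick reference for the Rust SDK changes above, here is a minimal usage sketch mirroring the e2e tests in `apps/rust-sdk/tests/e2e_with_auth.rs`. It is not part of the patch: it assumes the `firecrawl` crate built from this repo plus `tokio`, `dotenv`, and `serde_json` as dependencies, and the same `API_URL`/`TEST_API_KEY` variables shown in `tests/.env.example`. Error messages constructed in `handle_response` surface through the `Err` variant returned by `scrape_url`.

```rust
use dotenv::dotenv;
use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;

#[tokio::main]
async fn main() {
    dotenv().ok();
    // Same environment variables the e2e tests rely on.
    let api_url = env::var("API_URL").expect("API_URL environment variable is not set");
    let api_key = env::var("TEST_API_KEY").expect("TEST_API_KEY environment variable is not set");

    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).expect("failed to build client");

    // Mirrors the includeHtml test case above.
    let params = json!({
        "pageOptions": { "includeHtml": true }
    });

    match app.scrape_url("https://roastmywebsite.ai", Some(params)).await {
        // On success, handle_response returns the "data" field of the API response.
        Ok(data) => println!("markdown:\n{}", data["markdown"]),
        // On failure, the status-code-specific message built in handle_response is returned.
        Err(e) => eprintln!("scrape failed: {}", e),
    }
}
```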