Merge remote-tracking branch 'origin/v1/node-sdk' into v1/python-sdk

This commit is contained in:
rafaelsideguide 2024-08-21 12:09:53 -03:00
commit af0e47a30e
118 changed files with 15145 additions and 2520 deletions

View File

@ -1,20 +0,0 @@
name: Check Redis
on:
schedule:
- cron: '*/5 * * * *'
env:
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
jobs:
clean-jobs:
runs-on: ubuntu-latest
steps:
- name: Send GET request to check queues
run: |
response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/redis-health)
if [ "$response" -ne 200 ]; then
echo "Failed to check queues. Response: $response"
exit 1
fi
echo "Successfully checked queues. Response: $response"

View File

@ -1,7 +1,7 @@
name: Fly Deploy Direct name: Fly Deploy Direct
on: on:
schedule: schedule:
- cron: '0 */6 * * *' - cron: '0 */2 * * *'
env: env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

View File

@ -169,6 +169,41 @@ jobs:
run: npm run test run: npm run test
working-directory: ./apps/js-sdk/firecrawl working-directory: ./apps/js-sdk/firecrawl
go-sdk-tests:
name: Go SDK Tests
needs: pre-deploy-e2e-tests
runs-on: ubuntu-latest
services:
redis:
image: redis
ports:
- 6379:6379
steps:
- uses: actions/checkout@v3
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: "go.mod"
- name: Install pnpm
run: npm install -g pnpm
- name: Install dependencies
run: pnpm install
working-directory: ./apps/api
- name: Start the application
run: npm start &
working-directory: ./apps/api
id: start_app
- name: Start workers
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Install dependencies for Go SDK
run: go mod tidy
working-directory: ./apps/go-sdk
- name: Run tests for Go SDK
run: go test -v ./... -timeout 180s
working-directory: ./apps/go-sdk/firecrawl
deploy: deploy:
name: Deploy app name: Deploy app
runs-on: ubuntu-latest runs-on: ubuntu-latest

6
.gitmodules vendored Normal file
View File

@ -0,0 +1,6 @@
[submodule "apps/go-sdk/firecrawl"]
path = apps/go-sdk/firecrawl
url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
path = apps/go-sdk/examples
url = https://github.com/mendableai/firecrawl-go-examples

View File

@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@ -1,36 +1,76 @@
## Self-hosting Firecrawl # Self-hosting Firecrawl
_We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version._ #### Contributor?
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally so you can run it on your own and contribute.
## Getting Started If you're contributing, note that the process is similar to other open-source repos, i.e., fork Firecrawl, make changes, run tests, PR.
First, clone this repository and copy the example env file from the API folder `.env.example` to `.env`. If you have any questions or would like help getting on board, join our Discord community [here](https://discord.gg/gSmWdAkdwd) for more information or submit an issue on Github [here](https://github.com/mendableai/firecrawl/issues/new/choose)!
### Steps ## Why?
1. Clone the repository: Self-hosting Firecrawl is particularly beneficial for organizations with stringent security policies that require data to remain within controlled environments. Here are some key reasons to consider self-hosting:
```bash - **Enhanced Security and Compliance:** By self-hosting, you ensure that all data handling and processing complies with internal and external regulations, keeping sensitive information within your secure infrastructure. Note that Firecrawl is a Mendable product and relies on SOC2 Type2 certification, which means that the platform adheres to high industry standards for managing data security.
git clone https://github.com/mendableai/firecrawl.git - **Customizable Services:** Self-hosting allows you to tailor the services, such as the Playwright service, to meet specific needs or handle particular use cases that may not be supported by the standard cloud offering.
cd firecrawl - **Learning and Community Contribution:** By setting up and maintaining your own instance, you gain a deeper understanding of how Firecrawl works, which can also lead to more meaningful contributions to the project.
cp ./apps/api/.env.example ./.env
```
2. For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` in `.env` to not use the database authentication: ### Considerations
```plaintext However, there are some limitations and additional responsibilities to be aware of:
USE_DB_AUTHENTICATION=false
```
3. Update the Redis URL in the .env file to align with the Docker configuration: 1. **Limited Access to Fire-engine:** Currently, self-hosted instances of Firecrawl do not have access to Fire-engine, which includes advanced features for handling IP blocks, robot detection mechanisms, and more. This means that while you can manage basic scraping tasks, more complex scenarios might require additional configuration or might not be supported.
2. **Manual Configuration Required:** If you need to use scraping methods beyond the basic fetch and Playwright options, you will need to manually configure these in the `.env` file. This requires a deeper understanding of the technologies and might involve more setup time.
```plaintext Self-hosting Firecrawl is ideal for those who need full control over their scraping and data processing environments but comes with the trade-off of additional maintenance and configuration efforts.
REDIS_URL=redis://redis:6379
```
4. #### Option: Running with TypeScript Playwright Service ## Steps
1. First, start by installing the dependencies
- Docker [instructions](https://docs.docker.com/get-docker/)
2. Set environment variables
Create a `.env` file in the root directory. You can copy over the template from `apps/api/.env.example`.
To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features).
`.env:`
```
# ===== Required ENVS ======
NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://redis:6379
REDIS_RATE_LIMIT_URL=redis://redis:6379
## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=false
# ===== Optional ENVS ======
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= #Set if you'd like to use ScrapingBee to handle JS blocking
OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
```
3. *(Optional) Running with TypeScript Playwright Service*
* Update the `docker-compose.yml` file to change the Playwright service: * Update the `docker-compose.yml` file to change the Playwright service:
@ -49,16 +89,91 @@ First, clone this repository and copy the example env file from the API folder `
``` ```
* Don't forget to set the proxy server in your `.env` file as needed. * Don't forget to set the proxy server in your `.env` file as needed.
5. Build and run the Docker containers:
4. Build and run the Docker containers:
```bash ```bash
docker compose build docker compose build
docker compose up docker compose up
``` ```
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`. This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
You should be able to see the Bull Queue Manager UI on `http://localhost:3002/admin/@/queues`.
5. *(Optional)* Test the API
If you'd like to test the crawl endpoint, you can run this:
```bash
curl -X POST http://localhost:3002/v0/crawl \
-H 'Content-Type: application/json' \
-d '{
"url": "https://mendable.ai"
}'
```
## Troubleshooting
This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.
### Supabase client is not configured
**Symptom:**
```bash
[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Attempted to access Supabase client when it's not configured.
[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Error inserting scrape event: Error: Supabase client is not configured.
```
**Explanation:**
This error occurs because the Supabase client setup is not completed. You should be able to scrape and crawl with no problems. Right now it's not possible to configure Supabase in self-hosted instances.
### You're bypassing authentication
**Symptom:**
```bash
[YYYY-MM-DDTHH:MM:SS.SSSz]WARN - You're bypassing authentication
```
**Explanation:**
This warning occurs because the Supabase client setup is not completed. You should be able to scrape and crawl with no problems. Right now it's not possible to configure Supabase in self-hosted instances.
### Docker containers fail to start
**Symptom:**
Docker containers exit unexpectedly or fail to start.
**Solution:**
Check the Docker logs for any error messages using the command:
```bash
docker logs [container_name]
```
- Ensure all required environment variables are set correctly in the .env file.
- Verify that all Docker services defined in docker-compose.yml are correctly configured and the necessary images are available.
### Connection issues with Redis
**Symptom:**
Errors related to connecting to Redis, such as timeouts or "Connection refused".
**Solution:**
- Ensure that the Redis service is up and running in your Docker environment.
- Verify that the REDIS_URL and REDIS_RATE_LIMIT_URL in your .env file point to the correct Redis instance, and ensure that they match the Redis URL used in the `docker-compose.yaml` file (`redis://redis:6379`).
- Check network settings and firewall rules that may block the connection to the Redis port.
### API endpoint does not respond
**Symptom:**
API requests to the Firecrawl instance timeout or return no response.
**Solution:**
- Ensure that the Firecrawl service is running by checking the Docker container status.
- Verify that the PORT and HOST settings in your .env file are correct and that no other service is using the same port.
- Check the network configuration to ensure that the host is accessible from the client making the API request.
By addressing these common issues, you can ensure a smoother setup and operation of your self-hosted Firecrawl instance.
## Install Firecrawl on a Kubernetes Cluster (Simple Version) ## Install Firecrawl on a Kubernetes Cluster (Simple Version)
Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster. Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.

View File

@ -2,8 +2,8 @@
NUM_WORKERS_PER_QUEUE=8 NUM_WORKERS_PER_QUEUE=8
PORT=3002 PORT=3002
HOST=0.0.0.0 HOST=0.0.0.0
REDIS_URL=redis://localhost:6379 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
REDIS_RATE_LIMIT_URL=redis://localhost:6379 REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
## To turn on DB authentication, you need to set up supabase. ## To turn on DB authentication, you need to set up supabase.
@ -17,18 +17,27 @@ SUPABASE_URL=
SUPABASE_SERVICE_TOKEN= SUPABASE_SERVICE_TOKEN=
# Other Optionals # Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key # use if you've set up authentication and want to test with a real API key
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit TEST_API_KEY=
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit # set if you'd like to test the scraping rate limit
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking RATE_LIMIT_TEST_API_KEY_SCRAPE=
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) # set if you'd like to test the crawling rate limit
BULL_AUTH_KEY= @ RATE_LIMIT_TEST_API_KEY_CRAWL=
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail # set if you'd like to use scraping Be to handle JS blocking
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs SCRAPING_BEE_API_KEY=
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api # add for LLM dependednt features (image alt generation, etc.)
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages OPENAI_API_KEY=
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs BULL_AUTH_KEY=@
POSTHOG_HOST= # set if you'd like to send posthog events like job logs # use if you're configuring basic logging with logtail
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you'd like to send slack server health status messages
SLACK_WEBHOOK_URL=
# set if you'd like to send posthog events like job logs
POSTHOG_API_KEY=
# set if you'd like to send posthog events like job logs
POSTHOG_HOST=
STRIPE_PRICE_ID_STANDARD= STRIPE_PRICE_ID_STANDARD=
STRIPE_PRICE_ID_SCALE= STRIPE_PRICE_ID_SCALE=
@ -43,7 +52,8 @@ STRIPE_PRICE_ID_GROWTH_YEARLY=
HYPERDX_API_KEY= HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1 HDX_NODE_BETA_MODE=1
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta # set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL=
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request) # Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
PROXY_SERVER= PROXY_SERVER=

2
apps/api/.gitignore vendored
View File

@ -6,3 +6,5 @@ dump.rdb
/mongo-data /mongo-data
/.next/ /.next/
.rdb

View File

@ -24,8 +24,8 @@ kill_timeout = '30s'
[http_service.concurrency] [http_service.concurrency]
type = "requests" type = "requests"
hard_limit = 100 # hard_limit = 100
soft_limit = 50 soft_limit = 100
[[http_service.checks]] [[http_service.checks]]
grace_period = "10s" grace_period = "10s"
@ -51,12 +51,13 @@ kill_timeout = '30s'
[services.concurrency] [services.concurrency]
type = 'connections' type = 'connections'
hard_limit = 25 # hard_limit = 25
soft_limit = 20 soft_limit = 100
[[vm]] [[vm]]
size = 'performance-1x' size = 'performance-2x'
processes = ['app','worker'] processes = ['app','worker']
memory = 8192

View File

@ -24,8 +24,8 @@ kill_timeout = '30s'
[http_service.concurrency] [http_service.concurrency]
type = "requests" type = "requests"
hard_limit = 200 # hard_limit = 200
soft_limit = 75 soft_limit = 200
[[http_service.checks]] [[http_service.checks]]
grace_period = "20s" grace_period = "20s"
@ -50,8 +50,8 @@ kill_timeout = '30s'
[services.concurrency] [services.concurrency]
type = 'connections' type = 'connections'
hard_limit = 30 # hard_limit = 30
soft_limit = 12 soft_limit = 200
[[vm]] [[vm]]
size = 'performance-4x' size = 'performance-4x'

924
apps/api/openapi-v0.json Normal file
View File

@ -0,0 +1,924 @@
{
"openapi": "3.0.0",
"info": {
"title": "Firecrawl API",
"version": "0.0.0",
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
"contact": {
"name": "Firecrawl Support",
"url": "https://firecrawl.dev/support",
"email": "support@firecrawl.dev"
}
},
"servers": [
{
"url": "https://api.firecrawl.dev/v0"
}
],
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The URL to scrape"
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
},
"extractorOptions": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScrapeResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl": {
"post": {
"summary": "Crawl multiple URLs based on options",
"operationId": "crawlUrls",
"tags": ["Crawling"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The base URL to start crawling from"
},
"crawlerOptions": {
"type": "object",
"properties": {
"includes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to include"
},
"excludes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to exclude"
},
"generateImgAltText": {
"type": "boolean",
"description": "Generate alt text for images using LLMs (must have a paid plan)",
"default": false
},
"returnOnlyUrls": {
"type": "boolean",
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false
},
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
"ignoreSitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling",
"default": false
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
},
"allowBackwardCrawling": {
"type": "boolean",
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
"default": false
},
"allowExternalContentLinks": {
"type": "boolean",
"description": "Allows the crawler to follow links to external websites.",
"default": false
}
}
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CrawlResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/search": {
"post": {
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
"operationId": "searchGoogle",
"tags": ["Search"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"query": {
"type": "string",
"format": "uri",
"description": "The query to search for"
},
"pageOptions": {
"type": "object",
"properties": {
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"fetchPageContent": {
"type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
}
}
},
"searchOptions": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"description": "Maximum number of results. Max is 20 during beta."
}
}
}
},
"required": ["query"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SearchResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/status/{jobId}": {
"get": {
"tags": ["Crawl"],
"summary": "Get the status of a crawl job",
"operationId": "getCrawlStatus",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Status of the job (completed, active, failed, paused)"
},
"current": {
"type": "integer",
"description": "Current page number"
},
"total": {
"type": "integer",
"description": "Total number of pages"
},
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
}
},
"components": {
"securitySchemes": {
"bearerAuth": {
"type": "http",
"scheme": "bearer"
}
},
"schemas": {
"ScrapeResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
}
},
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
}
}
},
"SearchResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
}
}
}
}
}
}
},
"CrawlResponse": {
"type": "object",
"properties": {
"jobId": {
"type": "string"
}
}
}
}
},
"security": [
{
"bearerAuth": []
}
]
}

View File

@ -18,8 +18,8 @@
"paths": { "paths": {
"/scrape": { "/scrape": {
"post": { "post": {
"summary": "Scrape a single URL and optionally extract information using an LLM", "summary": "Scrape a single URL",
"operationId": "scrapeAndExtractFromUrl", "operationId": "scrape",
"tags": ["Scraping"], "tags": ["Scraping"],
"security": [ "security": [
{ {
@ -38,89 +38,47 @@
"format": "uri", "format": "uri",
"description": "The URL to scrape" "description": "The URL to scrape"
}, },
"pageOptions": { "formats": {
"type": "object", "type": "array",
"properties": { "items": {
"type": "string",
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
},
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
"default": ["markdown"]
},
"headers": { "headers": {
"type": "object", "type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
}, },
"includeHtml": { "includeTags": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
}, },
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
}, },
"onlyMainContent": { "excludeTags": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
}, },
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
}, },
"replaceAllPathsWithAbsolutePaths": { "onlyMainContent": {
"type": "boolean", "type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": true
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
},
"extractorOptions": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
}, },
"timeout": { "timeout": {
"type": "integer", "type": "integer",
"description": "Timeout in milliseconds for the request", "description": "Timeout in milliseconds for the request",
"default": 30000 "default": 30000
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
} }
}, },
"required": ["url"] "required": ["url"]
@ -317,6 +275,11 @@
"description": "Include a screenshot of the top of the page that you are scraping.", "description": "Include a screenshot of the top of the page that you are scraping.",
"default": false "default": false
}, },
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": { "waitFor": {
"type": "integer", "type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content", "description": "Wait x amount of milliseconds for the page to load to fetch content",
@ -731,24 +694,42 @@
"success": { "success": {
"type": "boolean" "type": "boolean"
}, },
"warning": {
"type": "string",
"nullable": true,
"description": "Warning message to let you know of any issues."
},
"data": { "data": {
"type": "object", "type": "object",
"properties": { "properties": {
"markdown": { "markdown": {
"type": "string" "type": "string",
}, "nullable": true,
"content": { "description": "Markdown content of the page if the `markdown` format was specified (default)"
"type": "string"
}, },
"html": { "html": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true" "description": "HTML version of the content on page if the `html` format was specified"
}, },
"rawHtml": { "rawHtml": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true" "description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -770,27 +751,16 @@
"<any other metadata> ": { "<any other metadata> ": {
"type": "string" "type": "string"
}, },
"pageStatusCode": { "statusCode": {
"type": "integer", "type": "integer",
"description": "The status code of the page" "description": "The status code of the page"
}, },
"pageError": { "error": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "The error message of the page" "description": "The error message of the page"
} }
} }
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
} }
} }
} }
@ -800,24 +770,33 @@
"type": "object", "type": "object",
"properties": { "properties": {
"markdown": { "markdown": {
"type": "string" "type": "string",
}, "nullable": true,
"content": { "description": "Markdown content of the page if the `markdown` format was specified (default)"
"type": "string"
}, },
"html": { "html": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true" "description": "HTML version of the content on page if the `html` format was specified"
}, },
"rawHtml": { "rawHtml": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true" "description": "Raw HTML content of the page if the `rawHtml` format was specified"
}, },
"index": { "links": {
"type": "integer", "type": "array",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." "items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -839,11 +818,11 @@
"<any other metadata> ": { "<any other metadata> ": {
"type": "string" "type": "string"
}, },
"pageStatusCode": { "statusCode": {
"type": "integer", "type": "integer",
"description": "The status code of the page" "description": "The status code of the page"
}, },
"pageError": { "error": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "The error message of the page" "description": "The error message of the page"
@ -861,16 +840,34 @@
"data": { "data": {
"type": "array", "type": "array",
"items": { "items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": { "markdown": {
"type": "string" "type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
}, },
"content": { "html": {
"type": "string" "type": "string",
"nullable": true,
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -888,7 +885,18 @@
"sourceURL": { "sourceURL": {
"type": "string", "type": "string",
"format": "uri" "format": "uri"
} },
"<any other metadata> ": {
"type": "string"
},
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
} }
} }
} }
@ -899,8 +907,15 @@
"CrawlResponse": { "CrawlResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
"jobId": { "success": {
"type": "boolean"
},
"id": {
"type": "string" "type": "string"
},
"url": {
"type": "string",
"format": "uri"
} }
} }
} }

View File

@ -57,6 +57,8 @@
"@nangohq/node": "^0.40.8", "@nangohq/node": "^0.40.8",
"@sentry/node": "^8.13.0", "@sentry/node": "^8.13.0",
"@supabase/supabase-js": "^2.44.2", "@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ajv": "^8.16.0", "ajv": "^8.16.0",
"async": "^3.2.5", "async": "^3.2.5",
"async-mutex": "^0.5.0", "async-mutex": "^0.5.0",
@ -71,6 +73,7 @@
"date-fns": "^3.6.0", "date-fns": "^3.6.0",
"dotenv": "^16.3.1", "dotenv": "^16.3.1",
"express-rate-limit": "^7.3.1", "express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2",
"form-data": "^4.0.0", "form-data": "^4.0.0",
"glob": "^10.4.2", "glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5", "gpt3-tokenizer": "^1.1.5",
@ -93,6 +96,7 @@
"promptable": "^0.0.10", "promptable": "^0.0.10",
"puppeteer": "^22.12.1", "puppeteer": "^22.12.1",
"rate-limiter-flexible": "2.4.2", "rate-limiter-flexible": "2.4.2",
"redlock": "5.0.0-beta.2",
"resend": "^3.4.0", "resend": "^3.4.0",
"robots-parser": "^3.0.1", "robots-parser": "^3.0.1",
"scrapingbee": "^1.7.4", "scrapingbee": "^1.7.4",
@ -104,8 +108,9 @@
"unstructured-client": "^0.11.3", "unstructured-client": "^0.11.3",
"uuid": "^10.0.0", "uuid": "^10.0.0",
"wordpos": "^2.1.0", "wordpos": "^2.1.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2", "xml2js": "^0.6.2",
"zod": "^3.23.4", "zod": "^3.23.8",
"zod-to-json-schema": "^3.23.1" "zod-to-json-schema": "^3.23.1"
}, },
"nodemonConfig": { "nodemonConfig": {

112
apps/api/pnpm-lock.yaml generated
View File

@ -41,6 +41,12 @@ importers:
'@supabase/supabase-js': '@supabase/supabase-js':
specifier: ^2.44.2 specifier: ^2.44.2
version: 2.44.2 version: 2.44.2
'@types/express-ws':
specifier: ^3.0.4
version: 3.0.4
'@types/ws':
specifier: ^8.5.12
version: 8.5.12
ajv: ajv:
specifier: ^8.16.0 specifier: ^8.16.0
version: 8.16.0 version: 8.16.0
@ -83,6 +89,9 @@ importers:
express-rate-limit: express-rate-limit:
specifier: ^7.3.1 specifier: ^7.3.1
version: 7.3.1(express@4.19.2) version: 7.3.1(express@4.19.2)
express-ws:
specifier: ^5.0.2
version: 5.0.2(express@4.19.2)
form-data: form-data:
specifier: ^4.0.0 specifier: ^4.0.0
version: 4.0.0 version: 4.0.0
@ -106,7 +115,7 @@ importers:
version: 0.0.28 version: 0.0.28
langchain: langchain:
specifier: ^0.2.8 specifier: ^0.2.8
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1) version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
languagedetect: languagedetect:
specifier: ^2.0.0 specifier: ^2.0.0
version: 2.0.0 version: 2.0.0
@ -149,6 +158,9 @@ importers:
rate-limiter-flexible: rate-limiter-flexible:
specifier: 2.4.2 specifier: 2.4.2
version: 2.4.2 version: 2.4.2
redlock:
specifier: 5.0.0-beta.2
version: 5.0.0-beta.2
resend: resend:
specifier: ^3.4.0 specifier: ^3.4.0
version: 3.4.0 version: 3.4.0
@ -182,11 +194,14 @@ importers:
wordpos: wordpos:
specifier: ^2.1.0 specifier: ^2.1.0
version: 2.1.0 version: 2.1.0
ws:
specifier: ^8.18.0
version: 8.18.0
xml2js: xml2js:
specifier: ^0.6.2 specifier: ^0.6.2
version: 0.6.2 version: 0.6.2
zod: zod:
specifier: ^3.23.4 specifier: ^3.23.8
version: 3.23.8 version: 3.23.8
zod-to-json-schema: zod-to-json-schema:
specifier: ^3.23.1 specifier: ^3.23.1
@ -1556,6 +1571,9 @@ packages:
'@types/express-serve-static-core@4.19.3': '@types/express-serve-static-core@4.19.3':
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==} resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
'@types/express-ws@3.0.4':
resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==}
'@types/express@4.17.21': '@types/express@4.17.21':
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==} resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
@ -1658,8 +1676,8 @@ packages:
'@types/whatwg-url@11.0.5': '@types/whatwg-url@11.0.5':
resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==} resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==}
'@types/ws@8.5.10': '@types/ws@8.5.12':
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==} resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==}
'@types/yargs-parser@21.0.3': '@types/yargs-parser@21.0.3':
resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==} resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==}
@ -2413,6 +2431,12 @@ packages:
peerDependencies: peerDependencies:
express: 4 || 5 || ^5.0.0-beta.1 express: 4 || 5 || ^5.0.0-beta.1
express-ws@5.0.2:
resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==}
engines: {node: '>=4.5.0'}
peerDependencies:
express: ^4.0.0 || ^5.0.0-alpha.1
express@4.19.2: express@4.19.2:
resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==} resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==}
engines: {node: '>= 0.10.0'} engines: {node: '>= 0.10.0'}
@ -3950,6 +3974,10 @@ packages:
redis@4.6.14: redis@4.6.14:
resolution: {integrity: sha512-GrNg/e33HtsQwNXL7kJT+iNFPSwE1IPmd7wzV3j4f2z0EYxZfZE7FVTmUysgAtqQQtg5NXF5SNLR9OdO/UHOfw==} resolution: {integrity: sha512-GrNg/e33HtsQwNXL7kJT+iNFPSwE1IPmd7wzV3j4f2z0EYxZfZE7FVTmUysgAtqQQtg5NXF5SNLR9OdO/UHOfw==}
redlock@5.0.0-beta.2:
resolution: {integrity: sha512-2RDWXg5jgRptDrB1w9O/JgSZC0j7y4SlaXnor93H/UJm/QyDiFgBKNtrh0TI6oCXqYSaSoXxFh6Sd3VtYfhRXw==}
engines: {node: '>=12'}
regenerator-runtime@0.14.1: regenerator-runtime@0.14.1:
resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==} resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==}
@ -4540,8 +4568,20 @@ packages:
resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==} resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==}
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
ws@8.17.1: ws@7.5.10:
resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==} resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==}
engines: {node: '>=8.3.0'}
peerDependencies:
bufferutil: ^4.0.1
utf-8-validate: ^5.0.2
peerDependenciesMeta:
bufferutil:
optional: true
utf-8-validate:
optional: true
ws@8.18.0:
resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==}
engines: {node: '>=10.0.0'} engines: {node: '>=10.0.0'}
peerDependencies: peerDependencies:
bufferutil: ^4.0.1 bufferutil: ^4.0.1
@ -5178,13 +5218,13 @@ snapshots:
'@js-sdsl/ordered-map@4.4.2': {} '@js-sdsl/ordered-map@4.4.2': {}
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)': '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
dependencies: dependencies:
ansi-styles: 5.2.0 ansi-styles: 5.2.0
camelcase: 6.3.0 camelcase: 6.3.0
decamelize: 1.2.0 decamelize: 1.2.0
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
ml-distance: 4.0.1 ml-distance: 4.0.1
mustache: 4.2.0 mustache: 4.2.0
p-queue: 6.6.2 p-queue: 6.6.2
@ -5196,9 +5236,9 @@ snapshots:
- langchain - langchain
- openai - openai
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))': '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
dependencies: dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
openai: 4.52.2 openai: 4.52.2
zod: 3.23.8 zod: 3.23.8
@ -5207,9 +5247,9 @@ snapshots:
- encoding - encoding
- langchain - langchain
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)': '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
dependencies: dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
transitivePeerDependencies: transitivePeerDependencies:
- langchain - langchain
@ -6367,8 +6407,8 @@ snapshots:
dependencies: dependencies:
'@supabase/node-fetch': 2.6.15 '@supabase/node-fetch': 2.6.15
'@types/phoenix': 1.6.5 '@types/phoenix': 1.6.5
'@types/ws': 8.5.10 '@types/ws': 8.5.12
ws: 8.17.1 ws: 8.18.0
transitivePeerDependencies: transitivePeerDependencies:
- bufferutil - bufferutil
- utf-8-validate - utf-8-validate
@ -6465,6 +6505,12 @@ snapshots:
'@types/range-parser': 1.2.7 '@types/range-parser': 1.2.7
'@types/send': 0.17.4 '@types/send': 0.17.4
'@types/express-ws@3.0.4':
dependencies:
'@types/express': 4.17.21
'@types/express-serve-static-core': 4.19.3
'@types/ws': 8.5.12
'@types/express@4.17.21': '@types/express@4.17.21':
dependencies: dependencies:
'@types/body-parser': 1.19.5 '@types/body-parser': 1.19.5
@ -6588,7 +6634,7 @@ snapshots:
dependencies: dependencies:
'@types/webidl-conversions': 7.0.3 '@types/webidl-conversions': 7.0.3
'@types/ws@8.5.10': '@types/ws@8.5.12':
dependencies: dependencies:
'@types/node': 20.14.1 '@types/node': 20.14.1
@ -7329,6 +7375,14 @@ snapshots:
dependencies: dependencies:
express: 4.19.2 express: 4.19.2
express-ws@5.0.2(express@4.19.2):
dependencies:
express: 4.19.2
ws: 7.5.10
transitivePeerDependencies:
- bufferutil
- utf-8-validate
express@4.19.2: express@4.19.2:
dependencies: dependencies:
accepts: 1.3.8 accepts: 1.3.8
@ -8241,17 +8295,17 @@ snapshots:
kleur@3.0.3: {} kleur@3.0.3: {}
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1): langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies: dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)) '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
binary-extensions: 2.3.0 binary-extensions: 2.3.0
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
js-yaml: 4.1.0 js-yaml: 4.1.0
jsonpointer: 5.0.1 jsonpointer: 5.0.1
langchainhub: 0.0.11 langchainhub: 0.0.11
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
ml-distance: 4.0.1 ml-distance: 4.0.1
openapi-types: 12.1.3 openapi-types: 12.1.3
p-retry: 4.6.2 p-retry: 4.6.2
@ -8271,14 +8325,14 @@ snapshots:
pdf-parse: 1.1.1 pdf-parse: 1.1.1
puppeteer: 22.12.1(typescript@5.4.5) puppeteer: 22.12.1(typescript@5.4.5)
redis: 4.6.14 redis: 4.6.14
ws: 8.17.1 ws: 8.18.0
transitivePeerDependencies: transitivePeerDependencies:
- encoding - encoding
- openai - openai
langchainhub@0.0.11: {} langchainhub@0.0.11: {}
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2): langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2):
dependencies: dependencies:
'@types/uuid': 9.0.8 '@types/uuid': 9.0.8
commander: 10.0.1 commander: 10.0.1
@ -8287,8 +8341,8 @@ snapshots:
p-retry: 4.6.2 p-retry: 4.6.2
uuid: 9.0.1 uuid: 9.0.1
optionalDependencies: optionalDependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1) langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
openai: 4.52.2 openai: 4.52.2
languagedetect@2.0.0: {} languagedetect@2.0.0: {}
@ -8992,7 +9046,7 @@ snapshots:
chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070) chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070)
debug: 4.3.5 debug: 4.3.5
devtools-protocol: 0.0.1299070 devtools-protocol: 0.0.1299070
ws: 8.17.1 ws: 8.18.0
transitivePeerDependencies: transitivePeerDependencies:
- bufferutil - bufferutil
- supports-color - supports-color
@ -9098,6 +9152,10 @@ snapshots:
'@redis/search': 1.1.6(@redis/client@1.5.16) '@redis/search': 1.1.6(@redis/client@1.5.16)
'@redis/time-series': 1.0.5(@redis/client@1.5.16) '@redis/time-series': 1.0.5(@redis/client@1.5.16)
redlock@5.0.0-beta.2:
dependencies:
node-abort-controller: 3.1.1
regenerator-runtime@0.14.1: {} regenerator-runtime@0.14.1: {}
require-directory@2.1.1: {} require-directory@2.1.1: {}
@ -9670,7 +9728,9 @@ snapshots:
imurmurhash: 0.1.4 imurmurhash: 0.1.4
signal-exit: 4.1.0 signal-exit: 4.1.0
ws@8.17.1: {} ws@7.5.10: {}
ws@8.18.0: {}
xml2js@0.6.2: xml2js@0.6.2:
dependencies: dependencies:

View File

@ -0,0 +1,609 @@
import request from "supertest";
import dotenv from "dotenv";
import {
ScrapeOptions,
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
// Load variables from a local .env file into process.env; the tests below
// read process.env.TEST_API_KEY to build their Authorization headers.
dotenv.config();
// Base URL that every request in this suite is sent to.
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for v1 API Routes", () => {
// Set the USE_DB_AUTHENTICATION flag in this process before any test runs.
beforeAll(() => {
  process.env["USE_DB_AUTHENTICATION"] = "true";
});

// Remove the flag again once every test in the suite has finished.
afterAll(() => {
  delete process.env["USE_DB_AUTHENTICATION"];
});
describe("GET /is-production", () => {
  // No Authorization header is sent; the endpoint is expected to answer 200
  // with an `isProduction` field in the body.
  it.concurrent("should return the production status", async () => {
    const res: ScrapeResponseRequestTest = await request(TEST_URL).get("/is-production");

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("isProduction");
  });
});
describe("POST /v1/scrape", () => {
// A POST without any Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const res: ScrapeResponseRequestTest = await request(TEST_URL).post("/v1/scrape");

  expect(res.statusCode).toBe(401);
});
// A blocklisted URL must be refused with 403 and the fixed policy message.
it.concurrent("should throw error for blocklisted URL", async () => {
  const payload: ScrapeRequest = { url: "https://facebook.com/fake-test" };
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;

  const res = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toBe(
    "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
  );
});
// A syntactically valid but unknown bearer token must be rejected with 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const payload = { url: "https://firecrawl.dev" };

  const res: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer invalid-api-key`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(401);
});
// Happy path with default options: the response carries markdown and metadata
// (no `content`, no `html`), and the metadata fields are compared against
// fixed expected values.
it.concurrent(
  "should return a successful response with a valid API key",
  async () => {
    const payload: ScrapeRequest = { url: "https://roastmywebsite.ai" };
    // The page's description and og:description are expected to be identical.
    const expectedDescription =
      "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️";

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    // Narrow the body type so `data` can be accessed below.
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = res.body;

    expect(data).not.toHaveProperty("content");
    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("metadata");
    expect(data).not.toHaveProperty("html");
    expect(data.markdown).toContain("_Roast_");

    const meta = data.metadata;
    expect(meta.error).toBeUndefined();
    expect(meta.title).toBe("Roast My Website");
    expect(meta.description).toBe(expectedDescription);
    expect(meta.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
    expect(meta.robots).toBe("follow, index");
    expect(meta.ogTitle).toBe("Roast My Website");
    expect(meta.ogDescription).toBe(expectedDescription);
    expect(meta.ogUrl).toBe("https://www.roastmywebsite.ai");
    expect(meta.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
    expect(meta.ogLocaleAlternate).toStrictEqual([]);
    expect(meta.ogSiteName).toBe("Roast My Website");
    expect(meta.sourceURL).toBe("https://roastmywebsite.ai");
    expect(meta.statusCode).toBe(200);
  },
  30000 // 30 seconds timeout
);
// Requesting both "markdown" and "html" formats must return both fields.
it.concurrent(
  "should return a successful response with a valid API key and includeHtml set to true",
  async () => {
    const payload: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["markdown", "html"],
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("data");
    // Narrow the body type so `data` can be accessed below.
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = res.body;

    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("html");
    expect(data).toHaveProperty("metadata");
    expect(data.markdown).toContain("_Roast_");
    expect(data.html).toContain("<h1");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
// A URL ending in .pdf must be scraped into markdown containing the paper's text.
// The response is fully received once the request promise resolves, so no
// additional waiting is needed before asserting on it.
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
  const scrapeRequest: ScrapeRequest = {
    url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post('/v1/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send(scrapeRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('data');
  // Narrow the body type so `data` can be accessed below.
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  // Same property checks as the extension-less PDF test.
  expect(response.body.data).toHaveProperty('markdown');
  expect(response.body.data).toHaveProperty('metadata');
  expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
  expect(response.body.data.metadata.statusCode).toBe(200);
  expect(response.body.data.metadata.error).toBeUndefined();
}, 60000);
// The same PDF served from a URL without a ".pdf" suffix must still be
// scraped into markdown.
// The response is fully received once the request promise resolves, so no
// additional waiting is needed before asserting on it.
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
  const scrapeRequest: ScrapeRequest = {
    url: "https://arxiv.org/pdf/astro-ph/9301001"
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post('/v1/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send(scrapeRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('data');
  // Narrow the body type so `data` can be accessed below.
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  expect(response.body.data).toHaveProperty('markdown');
  expect(response.body.data).toHaveProperty('metadata');
  expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
  expect(response.body.data.metadata.statusCode).toBe(200);
  expect(response.body.data.metadata.error).toBeUndefined();
}, 60000);
// Scrapes the same page twice: first without `excludeTags` to prove the nav
// link and footer text are present, then with `excludeTags` to prove they are
// stripped. The title names the option actually sent (`excludeTags`).
it.concurrent("should return a successful response with a valid API key with excludeTags option", async () => {
  // Baseline request: full page, nothing excluded.
  const baselineRequest: ScrapeRequest = {
    url: "https://www.scrapethissite.com/",
    onlyMainContent: false // default is true
  };
  const baselineResponse: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(baselineRequest);

  expect(baselineResponse.statusCode).toBe(200);
  expect(baselineResponse.body).toHaveProperty("data");
  // Narrow the body type so `data` can be accessed below.
  if (!("data" in baselineResponse.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  expect(baselineResponse.body.data).toHaveProperty("markdown");
  expect(baselineResponse.body.data).toHaveProperty("metadata");
  expect(baselineResponse.body.data).not.toHaveProperty("html");
  expect(baselineResponse.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
  expect(baselineResponse.body.data.markdown).toContain("Hartley Brody 2023"); // #footer

  // Second request: same page with the nav, footer and <strong> elements excluded.
  const excludeTagsRequest: ScrapeRequest = {
    url: "https://www.scrapethissite.com/",
    excludeTags: ['.nav', '#footer', 'strong'],
    onlyMainContent: false // default is true
  };
  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(excludeTagsRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");
  // Narrow the body type so `data` can be accessed below.
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  expect(response.body.data).toHaveProperty("markdown");
  expect(response.body.data).toHaveProperty("metadata");
  expect(response.body.data).not.toHaveProperty("html");
  expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); // #footer
  expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // .nav
}, 30000);
// Upstream error pages: for each status code below, the scrape call itself is
// expected to succeed (HTTP 200) and to report the upstream status in
// data.metadata.statusCode. One test is registered per status code, with the
// same title the hand-written tests used.
// The response is fully received once the request promise resolves, so no
// additional waiting is needed before asserting on it.
for (const upstreamStatus of [400, 401, 403, 404, 405, 500]) {
  it.concurrent(`should return a successful response for a scrape with ${upstreamStatus} page`, async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post('/v1/scrape')
      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
      .set('Content-Type', 'application/json')
      .send({ url: `https://httpstat.us/${upstreamStatus}` });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    // Narrow the body type so `data` can be accessed below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data).toHaveProperty('markdown');
    expect(response.body.data).toHaveProperty('metadata');
    expect(response.body.data.metadata.statusCode).toBe(upstreamStatus);
  }, 60000);
}
// A request carrying `timeout: 1000` is expected to come back as 408.
// The Jest timeout for this test is 3000 ms.
it.concurrent(
  "should return a timeout error when scraping takes longer than the specified timeout",
  async () => {
    const payload = { url: "https://firecrawl.dev", timeout: 1000 };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(408);
  },
  3000
);
// Requesting only "html" and "rawHtml" must return both of them and no
// markdown. The title is distinct from the markdown+html test so that a
// failure report identifies which of the two failed.
it.concurrent(
  "should return a successful response with a valid API key and html and rawHtml formats",
  async () => {
    const scrapeRequest: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["html", "rawHtml"],
    };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // Narrow the body type so `data` can be accessed below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data).not.toHaveProperty("markdown");
    expect(response.body.data).toHaveProperty("html");
    expect(response.body.data).toHaveProperty("rawHtml");
    expect(response.body.data).toHaveProperty("metadata");
    expect(response.body.data.html).toContain("<h1");
    expect(response.body.data.rawHtml).toContain("<html");
    expect(response.body.data.metadata.statusCode).toBe(200);
    expect(response.body.data.metadata.error).toBeUndefined();
  },
  30000
);
// Sends `waitFor: 5000` with the markdown format and checks that only
// markdown and metadata come back, with "PagerDuty" present in the markdown.
it.concurrent(
  "should return a successful response with waitFor",
  async () => {
    const payload: ScrapeRequest = {
      url: "https://ycombinator.com/companies",
      formats: ["markdown"],
      waitFor: 5000,
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("data");
    // Narrow the body type so `data` can be accessed below.
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = res.body;

    expect(data).toHaveProperty("markdown");
    expect(data).not.toHaveProperty("html");
    expect(data).not.toHaveProperty("links");
    expect(data).not.toHaveProperty("rawHtml");
    expect(data).toHaveProperty("metadata");
    expect(data.markdown).toContain("PagerDuty");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
// Requesting only the "links" format must return a links collection that
// includes https://firecrawl.dev, and no html/rawHtml.
it.concurrent(
  "should return a successful response with a valid links on page",
  async () => {
    const payload: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["links"],
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("data");
    // Narrow the body type so `data` can be accessed below.
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = res.body;

    expect(data).not.toHaveProperty("html");
    expect(data).not.toHaveProperty("rawHtml");
    expect(data).toHaveProperty("links");
    expect(data).toHaveProperty("metadata");
    expect(data.links).toContain("https://firecrawl.dev");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
});
describe("POST /v1/map", () => {
// A POST without any Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const res: ScrapeResponseRequestTest = await request(TEST_URL).post("/v1/map");

  expect(res.statusCode).toBe(401);
});
// An unknown bearer token must be rejected with 401.
it.concurrent(
  "should return an error response with an invalid API key",
  async () => {
    const payload = { url: "https://firecrawl.dev" };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .set("Authorization", `Bearer invalid-api-key`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(401);
  }
);
// Happy path: mapping a site returns `success: true` and a non-empty links array.
it.concurrent(
  "should return a successful response with a valid API key",
  async () => {
    const payload = { url: "https://roastmywebsite.ai" };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("success", true);
    expect(res.body).toHaveProperty("links");
    // Narrow the body type so `links` can be accessed below.
    if (!("links" in res.body)) {
      throw new Error("Expected response body to have 'links' property");
    }
    const foundLinks = res.body.links as unknown[];
    expect(Array.isArray(foundLinks)).toBe(true);
    expect(foundLinks.length).toBeGreaterThan(0);
  }
);
// With a `search` term, the first returned link is expected to be the pricing page.
it.concurrent(
  "should return a successful response with a valid API key and search",
  async () => {
    const payload = {
      url: "https://usemotion.com",
      search: "pricing",
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("success", true);
    expect(res.body).toHaveProperty("links");
    // Narrow the body type so `links` can be accessed below.
    if (!("links" in res.body)) {
      throw new Error("Expected response body to have 'links' property");
    }
    const foundLinks = res.body.links as unknown[];
    expect(Array.isArray(foundLinks)).toBe(true);
    expect(foundLinks.length).toBeGreaterThan(0);
    expect(foundLinks[0]).toContain("usemotion.com/pricing");
  }
);
// With `includeSubdomains: true` and search "docs", the first returned link is
// expected to be on the docs subdomain. The title names the option actually
// sent in the request (`includeSubdomains`).
it.concurrent("should return a successful response with a valid API key and search and includeSubdomains", async () => {
  const mapRequest = {
    url: "https://firecrawl.dev",
    search: "docs",
    includeSubdomains: true
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/map")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(mapRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("success", true);
  expect(response.body).toHaveProperty("links");
  // Narrow the body type so `links` can be accessed below.
  if (!("links" in response.body)) {
    throw new Error("Expected response body to have 'links' property");
  }
  const links = response.body.links as unknown[];
  expect(Array.isArray(links)).toBe(true);
  expect(links.length).toBeGreaterThan(0);
  expect(links[0]).toContain("docs.firecrawl.dev");
});
// Same as the includeSubdomains test, but starting from the "www." host: the
// docs subdomain is still expected to be the first link. The title names the
// option actually sent in the request (`includeSubdomains`).
it.concurrent("should return a successful response with a valid API key and search and includeSubdomains and www", async () => {
  const mapRequest = {
    url: "https://www.firecrawl.dev",
    search: "docs",
    includeSubdomains: true
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/map")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(mapRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("success", true);
  expect(response.body).toHaveProperty("links");
  // Narrow the body type so `links` can be accessed below.
  if (!("links" in response.body)) {
    throw new Error("Expected response body to have 'links' property");
  }
  const links = response.body.links as unknown[];
  expect(Array.isArray(links)).toBe(true);
  expect(links.length).toBeGreaterThan(0);
  expect(links[0]).toContain("docs.firecrawl.dev");
}, 10000);
// With `includeSubdomains: false`, the first returned link must not be on the
// docs subdomain. The title names the option actually sent in the request
// (`includeSubdomains`).
it.concurrent("should return a successful response with a valid API key and search and not includeSubdomains and www", async () => {
  const mapRequest = {
    url: "https://www.firecrawl.dev",
    search: "docs",
    includeSubdomains: false
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/map")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(mapRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("success", true);
  expect(response.body).toHaveProperty("links");
  // Narrow the body type so `links` can be accessed below.
  if (!("links" in response.body)) {
    throw new Error("Expected response body to have 'links' property");
  }
  const links = response.body.links as unknown[];
  expect(Array.isArray(links)).toBe(true);
  expect(links.length).toBeGreaterThan(0);
  expect(links[0]).not.toContain("docs.firecrawl.dev");
});
// A string that is not a URL must be rejected with 400 and an error body.
it.concurrent(
  "should return an error for invalid URL",
  async () => {
    const payload = {
      url: "invalid-url",
      includeSubdomains: true,
      search: "test",
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(400);
    expect(res.body).toHaveProperty("success", false);
    expect(res.body).toHaveProperty("error");
  }
);
});
});

View File

@ -1,11 +1,15 @@
import request from "supertest"; import request from "supertest";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types"; import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config(); dotenv.config();
const TEST_URL = "http://127.0.0.1:3002"; const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for API Routes", () => { describe("E2E Tests for v0 API Routes", () => {
beforeAll(() => { beforeAll(() => {
process.env.USE_DB_AUTHENTICATION = "true"; process.env.USE_DB_AUTHENTICATION = "true";
}); });
@ -24,20 +28,27 @@ describe("E2E Tests for API Routes", () => {
describe("POST /v0/scrape", () => { describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => { it.concurrent("should require authorization", async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL).post("/v0/scrape"); const response: FirecrawlScrapeResponse = await request(TEST_URL).post(
"/v0/scrape"
);
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
it.concurrent("should return an error response with an invalid API key", async () => { it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer invalid-api-key`) .set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" }); .send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); }
);
it.concurrent("should return a successful response with a valid API key", async () => { it.concurrent(
"should return a successful response with a valid API key",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -52,21 +63,36 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).toContain("_Roast_"); expect(response.body.data.content).toContain("_Roast_");
expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.pageError).toBeUndefined();
expect(response.body.data.metadata.title).toBe("Roast My Website"); expect(response.body.data.metadata.title).toBe("Roast My Website");
expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); expect(response.body.data.metadata.description).toBe(
expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
);
expect(response.body.data.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
);
expect(response.body.data.metadata.robots).toBe("follow, index"); expect(response.body.data.metadata.robots).toBe("follow, index");
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); expect(response.body.data.metadata.ogDescription).toBe(
expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai"); "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png"); );
expect(response.body.data.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
);
expect(response.body.data.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
);
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai"); expect(response.body.data.metadata.sourceURL).toBe(
"https://roastmywebsite.ai"
);
expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageStatusCode).toBe(200);
}, 30000); // 30 seconds timeout },
30000
); // 30 seconds timeout
it.concurrent(
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { "should return a successful response with a valid API key and includeHtml set to true",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -86,44 +112,61 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.html).toContain("<h1"); expect(response.body.data.html).toContain("<h1");
expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout },
30000
); // 30 seconds timeout
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { it.concurrent(
"should return a successful response for a valid scrape with PDF file",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
await new Promise((r) => setTimeout(r, 6000)); await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
);
expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds },
60000
); // 60 seconds
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { it.concurrent(
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
await new Promise((r) => setTimeout(r, 6000)); await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
);
expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds },
60000
); // 60 seconds
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { it.concurrent(
const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL) "should return a successful response with a valid API key with removeTags option",
async () => {
const responseWithoutRemoveTags: FirecrawlScrapeResponse =
await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -134,16 +177,27 @@ describe("E2E Tests for API Routes", () => {
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); expect(responseWithoutRemoveTags.body.data.content).toContain(
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer "Scrape This Site"
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav );
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong expect(responseWithoutRemoveTags.body.data.content).toContain(
"Lessons and Videos"
); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain(
"[Sandbox]("
); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain(
"web scraping"
); // strong
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); .send({
url: "https://www.scrapethissite.com/",
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
});
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data"); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("content");
@ -154,121 +208,157 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
expect(response.body.data.content).not.toContain("web scraping"); // strong expect(response.body.data.content).not.toContain("web scraping"); // strong
}, 30000); // 30 seconds timeout },
30000
); // 30 seconds timeout
it.concurrent('should return a successful response for a scrape with 400 page', async () => { it.concurrent(
"should return a successful response for a scrape with 400 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://httpstat.us/400' }); .send({ url: "https://httpstat.us/400" });
await new Promise((r) => setTimeout(r, 5000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(400); expect(response.body.data.metadata.pageStatusCode).toBe(400);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
}, 60000); // 60 seconds "bad request"
);
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 401 page', async () => { it.concurrent(
"should return a successful response for a scrape with 401 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://httpstat.us/401' }); .send({ url: "https://httpstat.us/401" });
await new Promise((r) => setTimeout(r, 5000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(401); expect(response.body.data.metadata.pageStatusCode).toBe(401);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
}, 60000); // 60 seconds "unauthorized"
);
},
60000
); // 60 seconds
it.concurrent("should return a successful response for a scrape with 403 page", async () => { it.concurrent(
"should return a successful response for a scrape with 403 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://httpstat.us/403' }); .send({ url: "https://httpstat.us/403" });
await new Promise((r) => setTimeout(r, 5000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(403); expect(response.body.data.metadata.pageStatusCode).toBe(403);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
}, 60000); // 60 seconds "forbidden"
);
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 404 page', async () => { it.concurrent(
"should return a successful response for a scrape with 404 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://httpstat.us/404' }); .send({ url: "https://httpstat.us/404" });
await new Promise((r) => setTimeout(r, 5000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(404); expect(response.body.data.metadata.pageStatusCode).toBe(404);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found"); },
}, 60000); // 60 seconds 60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 405 page', async () => { it.concurrent(
"should return a successful response for a scrape with 405 page",
async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://httpstat.us/405' }); .send({ url: "https://httpstat.us/405" });
await new Promise((r) => setTimeout(r, 5000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(405); expect(response.body.data.metadata.pageStatusCode).toBe(405);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed"); },
}, 60000); // 60 seconds 60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 500 page', async () => { it.concurrent(
"should return a successful response for a scrape with 500 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape') .post("/v0/scrape")
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set("Content-Type", "application/json")
.send({ url: 'https://httpstat.us/500' }); .send({ url: "https://httpstat.us/500" });
await new Promise((r) => setTimeout(r, 5000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(500); expect(response.body.data.metadata.pageStatusCode).toBe(500);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error"); },
}, 60000); // 60 seconds 60000
); // 60 seconds
}); });
describe("POST /v0/crawl", () => { describe("POST /v0/crawl", () => {
it.concurrent("should require authorization", async () => { it.concurrent("should require authorization", async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl"); const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
"/v0/crawl"
);
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
it.concurrent("should return an error response with an invalid API key", async () => { it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL) const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer invalid-api-key`) .set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" }); .send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); }
);
it.concurrent("should return a successful response with a valid API key for crawl", async () => { it.concurrent(
"should return a successful response with a valid API key for crawl",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL) const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -279,9 +369,12 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.jobId).toMatch( expect(response.body.jobId).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
); );
}); }
);
it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { it.concurrent(
"should return a successful response with a valid API key and valid includes option",
async () => {
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -311,7 +404,10 @@ describe("E2E Tests for API Routes", () => {
} }
} }
const completedResponse = response; await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map( const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL (item: any) => item.metadata?.sourceURL
@ -329,11 +425,19 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); 200
}, 180000); // 180 seconds );
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
},
180000
); // 180 seconds
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { it.concurrent(
"should return a successful response with a valid API key and valid excludes option",
async () => {
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -363,7 +467,12 @@ describe("E2E Tests for API Routes", () => {
} }
} }
const completedResponse: FirecrawlCrawlStatusResponse = response; await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse: FirecrawlCrawlStatusResponse = await request(
TEST_URL
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map( const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL (item: any) => item.metadata?.sourceURL
@ -372,9 +481,13 @@ describe("E2E Tests for API Routes", () => {
urls.forEach((url: string) => { urls.forEach((url: string) => {
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
}); });
}, 90000); // 90 seconds },
90000
); // 90 seconds
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { it.concurrent(
"should return a successful response with max depth option for a valid crawl job",
async () => {
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -403,7 +516,9 @@ describe("E2E Tests for API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
} }
} }
const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL) const completedResponse: FirecrawlCrawlStatusResponse = await request(
TEST_URL
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
@ -414,8 +529,12 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); 200
);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
const urls = completedResponse.body.data.map( const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL (item: any) => item.metadata?.sourceURL
); );
@ -423,29 +542,43 @@ describe("E2E Tests for API Routes", () => {
// Check if all URLs have a maximum depth of 1 // Check if all URLs have a maximum depth of 1
urls.forEach((url: string) => { urls.forEach((url: string) => {
const pathSplits = new URL(url).pathname.split('/'); const pathSplits = new URL(url).pathname.split("/");
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); const depth =
pathSplits.length -
(pathSplits[0].length === 0 &&
pathSplits[pathSplits.length - 1].length === 0
? 1
: 0);
expect(depth).toBeLessThanOrEqual(2); expect(depth).toBeLessThanOrEqual(2);
}); });
}, 180000); },
180000
);
}); });
describe("POST /v0/crawlWebsitePreview", () => { describe("POST /v0/crawlWebsitePreview", () => {
it.concurrent("should require authorization", async () => { it.concurrent("should require authorization", async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview"); const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
"/v0/crawlWebsitePreview"
);
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
it.concurrent("should return an error response with an invalid API key", async () => { it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL) const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawlWebsitePreview") .post("/v0/crawlWebsitePreview")
.set("Authorization", `Bearer invalid-api-key`) .set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" }); .send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); }
);
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { it.concurrent(
"should return a timeout error when scraping takes longer than the specified timeout",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL) const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -453,7 +586,9 @@ describe("E2E Tests for API Routes", () => {
.send({ url: "https://firecrawl.dev", timeout: 1000 }); .send({ url: "https://firecrawl.dev", timeout: 1000 });
expect(response.statusCode).toBe(408); expect(response.statusCode).toBe(408);
}, 3000); },
3000
);
}); });
describe("POST /v0/search", () => { describe("POST /v0/search", () => {
@ -462,16 +597,21 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
it.concurrent("should return an error response with an invalid API key", async () => { it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/search") .post("/v0/search")
.set("Authorization", `Bearer invalid-api-key`) .set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ query: "test" }); .send({ query: "test" });
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); }
);
it.concurrent("should return a successful response with a valid API key for search", async () => { it.concurrent(
"should return a successful response with a valid API key for search",
async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/search") .post("/v0/search")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -481,7 +621,9 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("success"); expect(response.body).toHaveProperty("success");
expect(response.body.success).toBe(true); expect(response.body.success).toBe(true);
expect(response.body).toHaveProperty("data"); expect(response.body).toHaveProperty("data");
}, 30000); // 30 seconds timeout },
60000
); // 60 seconds timeout
}); });
describe("GET /v0/crawl/status/:jobId", () => { describe("GET /v0/crawl/status/:jobId", () => {
@ -490,21 +632,29 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
it.concurrent("should return an error response with an invalid API key", async () => { it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.get("/v0/crawl/status/123") .get("/v0/crawl/status/123")
.set("Authorization", `Bearer invalid-api-key`); .set("Authorization", `Bearer invalid-api-key`);
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); }
);
it.concurrent("should return Job not found for invalid job ID", async () => { it.concurrent(
"should return Job not found for invalid job ID",
async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.get("/v0/crawl/status/invalidJobId") .get("/v0/crawl/status/invalidJobId")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(404); expect(response.statusCode).toBe(404);
}); }
);
it.concurrent("should return a successful crawl status response for a valid crawl job", async () => { it.concurrent(
"should return a successful crawl status response for a valid crawl job",
async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -513,7 +663,6 @@ describe("E2E Tests for API Routes", () => {
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false; let isCompleted = false;
let completedResponse;
while (!isCompleted) { while (!isCompleted) {
const response = await request(TEST_URL) const response = await request(TEST_URL)
@ -524,11 +673,16 @@ describe("E2E Tests for API Routes", () => {
if (response.body.status === "completed") { if (response.body.status === "completed") {
isCompleted = true; isCompleted = true;
completedResponse = response;
} else { } else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
} }
} }
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body).toHaveProperty("data");
@ -536,15 +690,24 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); 200
);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(doc => const childrenLinks = completedResponse.body.data.filter(
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") (doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
doc.metadata.sourceURL.includes("mendable.ai/blog")
); );
expect(childrenLinks.length).toBe(completedResponse.body.data.length); expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 180000); // 120 seconds },
180000
); // 120 seconds
// TODO: review the test below // TODO: review the test below
// it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { // it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
@ -592,7 +755,9 @@ describe("E2E Tests for API Routes", () => {
// expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); // expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
// }, 180000); // 120 seconds // }, 180000); // 120 seconds
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { it.concurrent(
"If someone cancels a crawl job, it should turn into failed status",
async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -619,18 +784,41 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("failed"); expect(completedResponse.body.status).toBe("failed");
expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data).toBeNull();
let isNullOrEmptyArray = false;
if (
completedResponse.body.data === null ||
completedResponse.body.data.length === 0
) {
isNullOrEmptyArray = true;
}
expect(isNullOrEmptyArray).toBe(true);
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
expect(completedResponse.body).toHaveProperty("partial_data"); expect(completedResponse.body).toHaveProperty("partial_data");
expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); expect(completedResponse.body.partial_data[0]).toHaveProperty(
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); "content"
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); );
expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.partial_data[0]).toHaveProperty(
expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); "markdown"
}, 60000); // 60 seconds );
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"metadata"
);
expect(
completedResponse.body.partial_data[0].metadata.pageStatusCode
).toBe(200);
expect(
completedResponse.body.partial_data[0].metadata.pageError
).toBeUndefined();
},
60000
); // 60 seconds
}); });
describe("POST /v0/scrape with LLM Extraction", () => { describe("POST /v0/scrape with LLM Extraction", () => {
it.concurrent("should extract data using LLM extraction mode", async () => { it.concurrent(
"should extract data using LLM extraction mode",
async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -677,63 +865,100 @@ describe("E2E Tests for API Routes", () => {
expect(llmExtraction).toHaveProperty("is_open_source"); expect(llmExtraction).toHaveProperty("is_open_source");
expect(llmExtraction.is_open_source).toBe(false); expect(llmExtraction.is_open_source).toBe(false);
expect(typeof llmExtraction.is_open_source).toBe("boolean"); expect(typeof llmExtraction.is_open_source).toBe("boolean");
}, 60000); // 60 secs },
60000
); // 60 secs
}); });
describe("POST /v0/crawl with fast mode", () => { describe("POST /v0/map", () => {
it.concurrent("should complete the crawl under 20 seconds", async () => { it.concurrent(
const startTime = Date.now(); "should return a list of links for mendable.ai without subdomains included",
async () => {
const crawlResponse = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/crawl") .post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ .send({
url: "https://flutterbricks.com", url: "https://mendable.ai",
crawlerOptions: {
mode: "fast"
}
}); });
expect(crawlResponse.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
expect(response.body.links).not.toContain("https://docs.mendable.ai");
expect(Array.isArray(response.body.links)).toBe(true);
expect(response.body.links.length).toBeGreaterThan(0);
},
60000
); // 60 secs
const jobId = crawlResponse.body.jobId; it.concurrent(
let statusResponse; "should return a list of links for a given URL with subdomains included",
let isFinished = false; async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://python.langchain.com",
includeSubdomains: true,
});
while (!isFinished) { expect(response.statusCode).toBe(200);
statusResponse = await request(TEST_URL) expect(response.body).toHaveProperty("success", true);
.get(`/v0/crawl/status/${jobId}`) expect(response.body).toHaveProperty("links");
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(Array.isArray(response.body.links)).toBe(true);
expect(response.body.links.length).toBeGreaterThan(0);
},
60000
); // 60 secs
expect(statusResponse.statusCode).toBe(200); it.concurrent(
isFinished = statusResponse.body.status === "completed"; "should return a list of links for a given URL with subdomains and search",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://python.langchain.com",
includeSubdomains: true,
search: "agents",
});
if (!isFinished) { expect(response.statusCode).toBe(200);
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again expect(response.body).toHaveProperty("success", true);
} expect(response.body).toHaveProperty("links");
} expect(response.body.links).toContain(
"https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html"
);
expect(Array.isArray(response.body.links)).toBe(true);
expect(response.body.links.length).toBeGreaterThan(0);
response.body.links.forEach((link) => {
expect(link).toContain("python.langchain.com");
});
},
60000
); // 60 secs
// const endTime = Date.now(); it.concurrent(
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds "should handle invalid URL input gracefully",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "invalid-url",
includeSubdomains: true,
search: "agents",
});
// console.log(`Time elapsed: ${timeElapsed} seconds`); expect(response.statusCode).toBe(400);
expect(response.body).toHaveProperty("success", false);
expect(statusResponse.body.status).toBe("completed"); expect(response.body).toHaveProperty("details");
expect(statusResponse.body).toHaveProperty("data"); },
expect(statusResponse.body.data[0]).toHaveProperty("content"); 60000
expect(statusResponse.body.data[0]).toHaveProperty("markdown"); ); // 60 secs
expect(statusResponse.body.data[0]).toHaveProperty("metadata");
expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
const results = statusResponse.body.data;
// results.forEach((result, i) => {
// console.log(result.metadata.sourceURL);
// });
expect(results.length).toBeGreaterThanOrEqual(10);
expect(results.length).toBeLessThanOrEqual(15);
}, 20000);
}); });
}); });

View File

@ -1,6 +1,6 @@
import { crawlController } from '../crawl' import { crawlController } from '../v0/crawl'
import { Request, Response } from 'express'; import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct import { authenticateUser } from '../v0/auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create'; import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate'; import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid'; import { v4 as uuidv4 } from 'uuid';

View File

@ -1,69 +0,0 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
/**
 * Reports the state of a crawl job identified by `req.params.jobId`.
 *
 * The caller is authenticated with the `CrawlStatus` rate-limiter mode first.
 * A job flagged through the `cancelled:<jobId>` Redis key is reported as
 * "failed" with `data: null`; its partial documents are still returned.
 *
 * Responses:
 * - the status/error pair from `authenticateUser` when authentication fails
 * - 404 when the queue has no job with that id
 * - 200 with status, progress counters, `data` and `partial_data`
 * - 500 with the error message when anything throws
 */
export async function crawlStatusController(req: Request, res: Response) {
  try {
    const auth = await authenticateUser(req, res, RateLimiterMode.CrawlStatus);
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    }

    const jobId = req.params.jobId;
    const job = await getWebScraperQueue().getJob(jobId);
    if (!job) {
      return res.status(404).json({ error: "Job not found" });
    }

    const redisClient = await getWebScraperQueue().client;
    const isCancelled = await redisClient.exists("cancelled:" + jobId);

    // Anything that is not an object (e.g. a bare number) is treated as
    // "no progress reported yet", so every counter falls back to its default.
    const reported = job.progress;
    const progressFields = (typeof reported === "object" ? reported : {}) as {
      current: number;
      current_url: string;
      total: number;
      current_step: string;
      partialDocs: any[];
    };
    const {
      current = 0,
      current_url = "",
      total = 0,
      current_step = "",
      partialDocs = [],
    } = progressFields;

    // Documents stored in the database take precedence over the queue's
    // return value when DB authentication is enabled.
    let data = job.returnvalue;
    if (process.env.USE_DB_AUTHENTICATION === "true") {
      const storedJob = await supabaseGetJobById(jobId);
      if (storedJob) {
        data = storedJob.docs;
      }
    }

    const jobStatus = await job.getState();
    const finishedCleanly = jobStatus === "completed" && !isCancelled;

    res.json({
      status: isCancelled ? "failed" : jobStatus,
      // progress: job.progress(),
      current,
      current_url,
      current_step,
      total,
      data: data && !isCancelled ? data : null,
      partial_data: finishedCleanly ? [] : partialDocs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -1,110 +0,0 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../src/scraper/WebScraper";
import { billTeam } from "../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
/**
 * Handles a crawl request: authenticates the caller, enforces the optional
 * idempotency key, checks credits, validates the URL and then either scrapes
 * a single URL inline or enqueues a crawl job.
 *
 * Responses:
 * - the status/error pair from `authenticateUser` when authentication fails
 * - 409 when the `x-idempotency-key` header fails validation
 * - 402 when `checkTeamCredits` reports failure
 * - 400 when `req.body.url` is missing
 * - 403 when `isUrlBlocked` rejects the URL
 * - 200 `{ success, documents }` for an inline `single_urls` request
 * - 200 `{ jobId }` once the job has been queued and logged
 * - 500 with the error message when anything throws
 */
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // Awaited so that, if the helper returns a promise, a rejection is
        // handled by this catch instead of becoming an unhandled rejection.
        await createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    const url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res.status(403).json({
        error:
          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      });
    }

    const mode = req.body.mode ?? "crawl";
    // Request-supplied options override the defaults key by key.
    const crawlerOptions = {
      ...defaultCrawlerOptions,
      ...req.body.crawlerOptions,
    };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

    if (mode === "single_urls" && !url.includes(",")) {
      // NOTE: do we need this?
      try {
        const provider = new WebScraperDataProvider();
        await provider.setOptions({
          jobId: uuidv4(),
          mode: "single_urls",
          urls: [url],
          crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
          pageOptions: pageOptions,
        });

        // No queue job exists on this inline path, so progress has nowhere to
        // be reported. The previous callback called `job.updateProgress`, but
        // `job` is a `const` declared further down, so any progress tick
        // raised a ReferenceError (temporal dead zone).
        const docs = await provider.getDocuments(false, () => {});
        return res.json({
          success: true,
          documents: docs,
        });
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const job = await addWebScraperJob({
      url: url,
      mode: mode, // already defaulted to "crawl" above
      crawlerOptions: crawlerOptions,
      team_id: team_id,
      pageOptions: pageOptions,
      origin: req.body.origin ?? defaultOrigin,
    });

    await logCrawl(job.id.toString(), team_id);

    return res.json({ jobId: job.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -1,46 +0,0 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";
/**
 * Queues a small preview crawl for the website demo.
 *
 * The caller is authenticated with the `Preview` rate-limiter mode. The job is
 * always queued under the "preview" team with origin "website-preview", and
 * `limit` / `maxCrawledLinks` are forced to 5 regardless of what the request
 * asked for.
 *
 * Responses:
 * - the status/error pair from `authenticateUser` when authentication fails
 * - 400 when `req.body.url` is missing
 * - 403 when `isUrlBlocked` rejects the URL
 * - 200 `{ jobId }` once the job has been queued
 * - 500 with the error message when anything throws
 */
export async function crawlPreviewController(req: Request, res: Response) {
  try {
    const auth = await authenticateUser(req, res, RateLimiterMode.Preview);
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    }

    // authenticate on supabase
    const targetUrl = req.body.url;
    if (!targetUrl) {
      return res.status(400).json({ error: "Url is required" });
    }
    if (isUrlBlocked(targetUrl)) {
      return res.status(403).json({
        error:
          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      });
    }

    const requestedCrawlerOptions = req.body.crawlerOptions ?? {};
    const requestedPageOptions = req.body.pageOptions ?? {
      onlyMainContent: false,
      includeHtml: false,
      removeTags: [],
    };

    // The caps are spread last so they override any caller-supplied values.
    const previewJob = await addWebScraperJob({
      url: targetUrl,
      mode: req.body.mode ?? "crawl", // fix for single urls not working
      crawlerOptions: {
        ...requestedCrawlerOptions,
        limit: 5,
        maxCrawledLinks: 5,
      },
      team_id: "preview",
      pageOptions: requestedPageOptions,
      origin: "website-preview",
    });

    res.json({ jobId: previewJob.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -1,59 +0,0 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
/**
 * Reports the state of a preview crawl job identified by `req.params.jobId`.
 *
 * No authentication is performed in this handler. Queue states that mean
 * "not finished yet" (waiting, delayed, waiting-children, unknown,
 * prioritized) are all reported to the client as "active".
 *
 * Responses:
 * - 404 when the queue has no job with that id
 * - 200 with status, progress counters, `data` and `partial_data`
 * - 500 with the error message when anything throws
 */
export async function crawlJobStatusPreviewController(
  req: Request,
  res: Response
) {
  try {
    const jobId = req.params.jobId;
    const job = await getWebScraperQueue().getJob(jobId);
    if (!job) {
      return res.status(404).json({ error: "Job not found" });
    }

    // Anything that is not an object (e.g. a bare number) is treated as
    // "no progress reported yet", so every counter falls back to its default.
    const reported = job.progress;
    const progressFields = (typeof reported === "object" ? reported : {}) as {
      current: number;
      current_url: string;
      total: number;
      current_step: string;
      partialDocs: any[];
    };
    const {
      current = 0,
      current_url = "",
      total = 0,
      current_step = "",
      partialDocs = [],
    } = progressFields;

    // Documents stored in the database take precedence over the queue's
    // return value when DB authentication is enabled.
    let data = job.returnvalue;
    if (process.env.USE_DB_AUTHENTICATION === "true") {
      const storedJob = await supabaseGetJobById(jobId);
      if (storedJob) {
        data = storedJob.docs;
      }
    }

    const pendingStates: string[] = [
      "waiting",
      "delayed",
      "waiting-children",
      "unknown",
      "prioritized",
    ];
    const queueState = await job.getState();
    const jobStatus = pendingStates.includes(queueState) ? "active" : queueState;

    res.json({
      status: jobStatus,
      // progress: job.progress(),
      current,
      current_url,
      current_step,
      total,
      data: data ? data : null,
      partial_data: jobStatus === "completed" ? [] : partialDocs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -1,9 +1,9 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { Job } from "bullmq"; import { Job } from "bullmq";
import { Logger } from "../../lib/logger"; import { Logger } from "../../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../services/alerts"; import { checkAlerts } from "../../../services/alerts";
export async function cleanBefore24hCompleteJobsController( export async function cleanBefore24hCompleteJobsController(
req: Request, req: Request,
@ -11,13 +11,13 @@ export async function cleanBefore24hCompleteJobsController(
) { ) {
Logger.info("🐂 Cleaning jobs older than 24h"); Logger.info("🐂 Cleaning jobs older than 24h");
try { try {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const batchSize = 10; const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = []; const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) { for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push( completedJobsPromises.push(
webScraperQueue.getJobs( scrapeQueue.getJobs(
["completed"], ["completed"],
i * batchSize, i * batchSize,
i * batchSize + batchSize, i * batchSize + batchSize,
@ -68,10 +68,10 @@ export async function checkQueuesController(req: Request, res: Response) {
// Use this as a "health check" that way we dont destroy the server // Use this as a "health check" that way we dont destroy the server
export async function queuesController(req: Request, res: Response) { export async function queuesController(req: Request, res: Response) {
try { try {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const [webScraperActive] = await Promise.all([ const [webScraperActive] = await Promise.all([
webScraperQueue.getActiveCount(), scrapeQueue.getActiveCount(),
]); ]);
const noActiveJobs = webScraperActive === 0; const noActiveJobs = webScraperActive === 0;

View File

@ -1,8 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import Redis from "ioredis"; import Redis from "ioredis";
import { Logger } from "../../lib/logger"; import { Logger } from "../../../lib/logger";
import { sendSlackWebhook } from "../../services/alerts/slack"; import { redisRateLimitClient } from "../../../services/rate-limiter";
import { redisRateLimitClient } from "../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) { export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => { const retryOperation = async (operation, retries = 3) => {
@ -63,22 +62,22 @@ export async function redisHealthController(req: Request, res: Response) {
Logger.info( Logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}` `Redis instances health check: ${JSON.stringify(healthStatus)}`
); );
await sendSlackWebhook( // await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${JSON.stringify( // `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
healthStatus // healthStatus
)}`, // )}`,
true // true
); // );
return res return res
.status(500) .status(500)
.json({ status: "unhealthy", details: healthStatus }); .json({ status: "unhealthy", details: healthStatus });
} }
} catch (error) { } catch (error) {
Logger.error(`Redis health check failed: ${error}`); Logger.error(`Redis health check failed: ${error}`);
await sendSlackWebhook( // await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${error.message}`, // `[REDIS DOWN] Redis instances health check: ${error.message}`,
true // true
); // );
return res return res
.status(500) .status(500)
.json({ status: "unhealthy", message: error.message }); .json({ status: "unhealthy", message: error.message });

View File

@ -1,26 +1,77 @@
import { parseApi } from "../../src/lib/parseApi"; import { parseApi } from "../../../src/lib/parseApi";
import { getRateLimiter, } from "../../src/services/rate-limiter"; import { getRateLimiter } from "../../../src/services/rate-limiter";
import { AuthResponse, NotificationType, RateLimiterMode } from "../../src/types"; import {
import { supabase_service } from "../../src/services/supabase"; AuthResponse,
import { withAuth } from "../../src/lib/withAuth"; NotificationType,
RateLimiterMode,
} from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { withAuth } from "../../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible"; import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry'; import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification"; import { sendNotification } from "../../services/notification/email_notification";
import { Logger } from "../lib/logger"; import { Logger } from "../../lib/logger";
import { redlock } from "../../../src/services/redlock";
import { getValue } from "../../../src/services/redis";
import { setValue } from "../../../src/services/redis";
import { validate } from "uuid";
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> { function normalizedApiIsUuid(potentialUuid: string): boolean {
// Check if the string is a valid UUID
return validate(potentialUuid);
}
export async function authenticateUser(
req,
res,
mode?: RateLimiterMode
): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode); return withAuth(supaAuthenticateUser)(req, res, mode);
} }
function setTrace(team_id: string, api_key: string) { function setTrace(team_id: string, api_key: string) {
try { try {
setTraceAttributes({ setTraceAttributes({
team_id, team_id,
api_key api_key,
}); });
} catch (error) { } catch (error) {
Logger.error(`Error setting trace attributes: ${error.message}`); Logger.error(`Error setting trace attributes: ${error.message}`);
} }
}
/**
 * Looks up the team and price id behind an API key through the
 * `get_key_and_price_id_2` RPC.
 *
 * @param normalizedApi - API key passed to the RPC as `api_key`.
 * @returns `{ success: true, teamId, priceId }` taken from the first returned
 *   row; `{ success: false, status: 500 }` when the RPC reports an error;
 *   `{ success: false, status: 401 }` when the RPC returns no rows.
 */
async function getKeyAndPriceId(normalizedApi: string): Promise<{
  success: boolean;
  teamId?: string;
  priceId?: string;
  error?: string;
  status?: number;
}> {
  const { data, error } = await supabase_service.rpc("get_key_and_price_id_2", {
    api_key: normalizedApi,
  });

  if (error) {
    Logger.error(`RPC ERROR (get_key_and_price_id_2): ${error.message}`);
    return {
      success: false,
      error:
        "The server seems overloaded. Please contact hello@firecrawl.com if you aren't sending too many requests at once.",
      status: 500,
    };
  }

  if (!data || data.length === 0) {
    // `error` is always null on this path (the branch above returned), so it
    // must not be dereferenced here: doing so turned every unknown key into a
    // thrown TypeError instead of a 401.
    Logger.warn(
      "Error fetching api key: get_key_and_price_id_2 returned no rows"
    );
    // TODO: change this error code ?
    return {
      success: false,
      error: "Unauthorized: Invalid token",
      status: 401,
    };
  }

  return {
    success: true,
    teamId: data[0].team_id,
    priceId: data[0].price_id,
  };
}
export async function supaAuthenticateUser( export async function supaAuthenticateUser(
req, req,
@ -51,20 +102,83 @@ export async function supaAuthenticateUser(
const iptoken = incomingIP + token; const iptoken = incomingIP + token;
let rateLimiter: RateLimiterRedis; let rateLimiter: RateLimiterRedis;
let subscriptionData: { team_id: string, plan: string } | null = null; let subscriptionData: { team_id: string; plan: string } | null = null;
let normalizedApi: string; let normalizedApi: string;
let team_id: string; let cacheKey = "";
let redLockKey = "";
const lockTTL = 15000; // 10 seconds
let teamId: string | null = null;
let priceId: string | null = null;
if (token == "this_is_just_a_preview_token") { if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
team_id = "preview"; teamId = "preview";
} else { } else {
normalizedApi = parseApi(token); normalizedApi = parseApi(token);
if (!normalizedApiIsUuid(normalizedApi)) {
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
const { data, error } = await supabase_service.rpc( cacheKey = `api_key:${normalizedApi}`;
'get_key_and_price_id_2', { api_key: normalizedApi }
try {
const teamIdPriceId = await getValue(cacheKey);
if (teamIdPriceId) {
const { team_id, price_id } = JSON.parse(teamIdPriceId);
teamId = team_id;
priceId = price_id;
} else {
const {
success,
teamId: tId,
priceId: pId,
error,
status,
} = await getKeyAndPriceId(normalizedApi);
if (!success) {
return { success, error, status };
}
teamId = tId;
priceId = pId;
await setValue(
cacheKey,
JSON.stringify({ team_id: teamId, price_id: priceId }),
10
); );
}
} catch (error) {
Logger.error(`Error with auth function: ${error.message}`);
// const {
// success,
// teamId: tId,
// priceId: pId,
// error: e,
// status,
// } = await getKeyAndPriceId(normalizedApi);
// if (!success) {
// return { success, error: e, status };
// }
// teamId = tId;
// priceId = pId;
// const {
// success,
// teamId: tId,
// priceId: pId,
// error: e,
// status,
// } = await getKeyAndPriceId(normalizedApi);
// if (!success) {
// return { success, error: e, status };
// }
// teamId = tId;
// priceId = pId;
}
// get_key_and_price_id_2 rpc definition: // get_key_and_price_id_2 rpc definition:
// create or replace function get_key_and_price_id_2(api_key uuid) // create or replace function get_key_and_price_id_2(api_key uuid)
// returns table(key uuid, team_id uuid, price_id text) as $$ // returns table(key uuid, team_id uuid, price_id text) as $$
@ -82,41 +196,34 @@ export async function supaAuthenticateUser(
// end; // end;
// $$ language plpgsql; // $$ language plpgsql;
if (error) { const plan = getPlanByPriceId(priceId);
Logger.warn(`Error fetching key and price_id: ${error.message}`);
} else {
// console.log('Key and Price ID:', data);
}
if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
const internal_team_id = data[0].team_id;
team_id = internal_team_id;
const plan = getPlanByPriceId(data[0].price_id);
// HyperDX Logging // HyperDX Logging
setTrace(team_id, normalizedApi); setTrace(teamId, normalizedApi);
subscriptionData = { subscriptionData = {
team_id: team_id, team_id: teamId,
plan: plan plan: plan,
} };
switch (mode) { switch (mode) {
case RateLimiterMode.Crawl: case RateLimiterMode.Crawl:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan); rateLimiter = getRateLimiter(
RateLimiterMode.Crawl,
token,
subscriptionData.plan
);
break; break;
case RateLimiterMode.Scrape: case RateLimiterMode.Scrape:
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan); rateLimiter = getRateLimiter(
RateLimiterMode.Scrape,
token,
subscriptionData.plan
);
break; break;
case RateLimiterMode.Search: case RateLimiterMode.Search:
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan); rateLimiter = getRateLimiter(
RateLimiterMode.Search,
token,
subscriptionData.plan
);
break; break;
case RateLimiterMode.CrawlStatus: case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
@ -134,7 +241,8 @@ export async function supaAuthenticateUser(
} }
} }
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id; const team_endpoint_token =
token === "this_is_just_a_preview_token" ? iptoken : teamId;
try { try {
await rateLimiter.consume(team_endpoint_token); await rateLimiter.consume(team_endpoint_token);
@ -147,7 +255,17 @@ export async function supaAuthenticateUser(
const startDate = new Date(); const startDate = new Date();
const endDate = new Date(); const endDate = new Date();
endDate.setDate(endDate.getDate() + 7); endDate.setDate(endDate.getDate() + 7);
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
// Cache longer for 429s
if (teamId && priceId && mode !== RateLimiterMode.Preview) {
await setValue(
cacheKey,
JSON.stringify({ team_id: teamId, price_id: priceId }),
60 // 10 seconds, cache for everything
);
}
return { return {
success: false, success: false,
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
@ -157,7 +275,9 @@ export async function supaAuthenticateUser(
if ( if (
token === "this_is_just_a_preview_token" && token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search) (mode === RateLimiterMode.Scrape ||
mode === RateLimiterMode.Preview ||
mode === RateLimiterMode.Search)
) { ) {
return { success: true, team_id: "preview" }; return { success: true, team_id: "preview" };
// check the origin of the request and make sure its from firecrawl.dev // check the origin of the request and make sure its from firecrawl.dev
@ -181,8 +301,6 @@ export async function supaAuthenticateUser(
.select("*") .select("*")
.eq("key", normalizedApi); .eq("key", normalizedApi);
if (error || !data || data.length === 0) { if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`); Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
return { return {
@ -195,26 +313,32 @@ export async function supaAuthenticateUser(
subscriptionData = data[0]; subscriptionData = data[0];
} }
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""}; return {
success: true,
team_id: subscriptionData.team_id,
plan: subscriptionData.plan ?? "",
};
} }
function getPlanByPriceId(price_id: string) { function getPlanByPriceId(price_id: string) {
switch (price_id) { switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER: case process.env.STRIPE_PRICE_ID_STARTER:
return 'starter'; return "starter";
case process.env.STRIPE_PRICE_ID_STANDARD: case process.env.STRIPE_PRICE_ID_STANDARD:
return 'standard'; return "standard";
case process.env.STRIPE_PRICE_ID_SCALE: case process.env.STRIPE_PRICE_ID_SCALE:
return 'scale'; return "scale";
case process.env.STRIPE_PRICE_ID_HOBBY: case process.env.STRIPE_PRICE_ID_HOBBY:
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
return 'hobby'; return "hobby";
case process.env.STRIPE_PRICE_ID_STANDARD_NEW: case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
return 'standardnew'; return "standardnew";
case process.env.STRIPE_PRICE_ID_GROWTH: case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
return 'growth'; return "growth";
case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY:
return "growthdouble";
default: default:
return 'free'; return "free";
} }
} }

View File

@ -1,10 +1,9 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { authenticateUser } from "./auth"; import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabase_service } from "../../../src/services/supabase";
import { supabase_service } from "../../src/services/supabase"; import { Logger } from "../../../src/lib/logger";
import { billTeam } from "../../src/services/billing/credit_billing"; import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import { Logger } from "../../src/lib/logger";
export async function crawlCancelController(req: Request, res: Response) { export async function crawlCancelController(req: Request, res: Response) {
try { try {
@ -18,8 +17,9 @@ export async function crawlCancelController(req: Request, res: Response) {
if (!success) { if (!success) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) { const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" }); return res.status(404).json({ error: "Job not found" });
} }
@ -39,27 +39,9 @@ export async function crawlCancelController(req: Request, res: Response) {
} }
} }
const jobState = await job.getState();
let progress = job.progress;
if(typeof progress !== 'object') {
progress = {
partialDocs: []
}
}
const {
partialDocs = []
} = progress as { partialDocs: any[] };
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
Logger.info("Billing team for partial docs...");
// Note: the credits that we will bill them here might be lower than the actual
// due to promises that are not yet resolved
await billTeam(team_id, partialDocs.length);
}
try { try {
await (await getWebScraperQueue().client).set("cancelled:" + job.id, "true", "EX", 60 * 60); sc.cancelled = true;
await job.discard(); await saveCrawl(req.params.jobId, sc);
} catch (error) { } catch (error) {
Logger.error(error); Logger.error(error);
} }

View File

@ -0,0 +1,60 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
/**
 * Reports the aggregate status of a crawl and the documents scraped so far.
 *
 * Flow: authenticate the caller, load the stored crawl, verify it belongs to
 * the caller's team, then resolve every scrape job registered for the crawl
 * and fold their individual states into a single status.
 *
 * Responses:
 *  - auth failure -> status/error returned by `authenticateUser`
 *  - 404 when no crawl is stored under `req.params.jobId`
 *  - 403 when the crawl belongs to a different team
 *  - 200 with `{ status, current, total, data, partial_data }`
 *  - 500 with the error message on any unexpected failure
 */
export async function crawlStatusController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
    }

    if (sc.team_id !== team_id) {
      return res.status(403).json({ error: "Forbidden" });
    }

    const jobIDs = await getCrawlJobs(req.params.jobId);
    const useDb = process.env.USE_DB_AUTHENTICATION === "true";

    const lookedUp = await Promise.all(
      jobIDs.map(async (jobId) => {
        const job = await getScrapeQueue().getJob(jobId);
        // The queue may no longer hold this job (removed/evicted). Without this
        // guard the dereferences below throw and the whole request turns into a 500.
        if (!job) {
          return job;
        }
        if (useDb) {
          const supabaseData = await supabaseGetJobById(job.id);
          if (supabaseData) {
            job.returnvalue = supabaseData.docs;
          }
        }
        return job;
      })
    );

    // Keep only jobs that still exist, ordered by creation time.
    const jobs = [];
    for (const job of lookedUp) {
      if (job) {
        jobs.push(job);
      }
    }
    jobs.sort((a, b) => a.timestamp - b.timestamp);

    const jobStatuses = await Promise.all(jobs.map((job) => job.getState()));

    // A cancelled crawl is reported as "failed" regardless of individual job states.
    const jobStatus = sc.cancelled
      ? "failed"
      : jobStatuses.every((x) => x === "completed")
      ? "completed"
      : jobStatuses.some((x) => x === "failed")
      ? "failed"
      : "active";

    // Each job's return value is either a single document or a one-element array.
    const data = jobs.map((job) =>
      Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue
    );

    res.json({
      status: jobStatus,
      current: jobStatuses.filter((x) => x === "completed" || x === "failed").length,
      total: jobs.length,
      data: jobStatus === "completed" ? data : null,
      // Jobs that have not produced a value yet are left out of the partial result.
      partial_data:
        jobStatus === "completed" ? [] : data.filter((x) => x !== null && x !== undefined),
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -0,0 +1,171 @@
import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
/**
 * Starts a crawl and responds with its id (`{ jobId }`).
 *
 * Steps:
 *  1. Authenticate (rate-limit bucket: Crawl).
 *  2. If an `x-idempotency-key` header is present, reject re-use (409) and
 *     record the key.
 *  3. Require at least one credit (402 otherwise).
 *  4. Validate/normalise `req.body.url` (400 on missing or invalid URL,
 *     403 when the URL is on the blocklist).
 *  5. Persist the crawl, then seed the scrape queue either with every sitemap
 *     URL (bulk, priority 20) or with the origin URL alone (priority 15).
 *
 * Any unexpected failure is logged and answered with a 500.
 */
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // Awaited so that an asynchronous failure is actually caught by this
        // try/catch instead of surfacing as an unhandled rejection.
        await createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    // Caller-supplied options override the defaults key by key.
    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
    const origin = req.body.origin ?? defaultOrigin;

    const id = uuidv4();

    await logCrawl(id, team_id);

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
      createdAt: Date.now(),
    };

    const crawler = crawlToCrawler(id, sc);

    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {
      // Best effort: the crawl proceeds without robots.txt data.
    }

    await saveCrawl(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();

    // An empty (or missing) sitemap result is treated like "no sitemap": otherwise
    // the crawl would be created with zero jobs and never scrape anything.
    if (sitemap && sitemap.length > 0) {
      const jobs = sitemap.map((entry) => {
        const uuid = uuidv4();
        return {
          name: uuid,
          data: {
            url: entry.url,
            mode: "single_urls",
            crawlerOptions: crawlerOptions,
            team_id: team_id,
            pageOptions: pageOptions,
            origin,
            crawl_id: id,
            sitemapped: true,
          },
          opts: {
            jobId: uuid,
            priority: 20,
          },
        };
      });

      // Lock and register every URL before enqueueing the jobs.
      await lockURLs(id, jobs.map((x) => x.data.url));
      await addCrawlJobs(id, jobs.map((x) => x.opts.jobId));
      await getScrapeQueue().addBulk(jobs);
    } else {
      await lockURL(id, sc, url);
      const job = await addScrapeJob(
        {
          url,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id: team_id,
          pageOptions: pageOptions,
          origin,
          crawl_id: id,
        },
        {
          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
        }
      );
      await addCrawlJob(id, job.id);
    }

    res.json({ jobId: id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -0,0 +1,135 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
/**
 * Starts a preview crawl (always attributed to the "preview" team, origin
 * "website-preview") and responds with `{ jobId }`.
 *
 * Steps: authenticate with the Preview rate limiter, validate/normalise
 * `req.body.url` (400 on missing/invalid, 403 on blocklisted URLs), persist
 * the crawl, then enqueue one scrape job per sitemap URL or, when there is no
 * usable sitemap, a single job for the origin URL.
 *
 * Any unexpected failure is logged and answered with a 500.
 */
export async function crawlPreviewController(req: Request, res: Response) {
  try {
    const { success, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Preview
    );

    const team_id = "preview";

    if (!success) {
      return res.status(status).json({ error });
    }

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    const id = uuidv4();

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
      createdAt: Date.now(),
    };

    // robots.txt must be fetched through a crawler instance. The previous code
    // called `this.getRobotsTxt()`, but `this` is undefined in a module-level
    // function, so the call always threw and robots was never stored.
    try {
      sc.robots = await crawlToCrawler(id, sc).getRobotsTxt();
    } catch (_) {
      // Best effort: the crawl proceeds without robots.txt data.
    }

    await saveCrawl(id, sc);

    // Built after robots has been recorded so the crawler sees the complete
    // stored crawl (same ordering as before this fix).
    const crawler = crawlToCrawler(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();

    // An empty (or missing) sitemap result is treated like "no sitemap": otherwise
    // the crawl would be created with zero jobs and never scrape anything.
    if (sitemap && sitemap.length > 0) {
      for (const sitemapUrl of sitemap.map((x) => x.url)) {
        await lockURL(id, sc, sitemapUrl);
        const job = await addScrapeJob({
          url: sitemapUrl,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id: team_id,
          pageOptions: pageOptions,
          origin: "website-preview",
          crawl_id: id,
          sitemapped: true,
        });
        await addCrawlJob(id, job.id);
      }
    } else {
      await lockURL(id, sc, url);
      const job = await addScrapeJob({
        url,
        mode: "single_urls",
        crawlerOptions: crawlerOptions,
        team_id: team_id,
        pageOptions: pageOptions,
        origin: "website-preview",
        crawl_id: id,
      });
      await addCrawlJob(id, job.id);
    }

    res.json({ jobId: id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -1,5 +1,5 @@
import { AuthResponse, RateLimiterMode } from "../types"; import { AuthResponse, RateLimiterMode } from "../../types";
import { Request, Response } from "express"; import { Request, Response } from "express";
import { authenticateUser } from "./auth"; import { authenticateUser } from "./auth";

View File

@ -1,17 +1,17 @@
import { ExtractorOptions, PageOptions } from './../lib/entities'; import { ExtractorOptions, PageOptions } from './../../lib/entities';
import { Request, Response } from "express"; import { Request, Response } from "express";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth"; import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types"; import { RateLimiterMode } from "../../types";
import { logJob } from "../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { Document } from "../lib/entities"; import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../lib/default-values';
import { addScrapeJob } from '../services/queue-jobs'; import { addScrapeJob } from '../../services/queue-jobs';
import { scrapeQueueEvents } from '../services/queue-service'; import { scrapeQueueEvents } from '../../services/queue-service';
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from '../lib/logger'; import { Logger } from '../../lib/logger';
export async function scrapeHelper( export async function scrapeHelper(
jobId: string, jobId: string,
@ -45,7 +45,7 @@ export async function scrapeHelper(
pageOptions, pageOptions,
extractorOptions, extractorOptions,
origin: req.body.origin ?? defaultOrigin, origin: req.body.origin ?? defaultOrigin,
}); }, {}, jobId);
let doc; let doc;
try { try {
@ -62,6 +62,8 @@ export async function scrapeHelper(
} }
} }
await job.remove();
if (!doc) { if (!doc) {
console.error("!!! PANIC DOC IS", doc, job); console.error("!!! PANIC DOC IS", doc, job);
return { success: true, error: "No page found", returnCode: 200, data: doc }; return { success: true, error: "No page found", returnCode: 200, data: doc };
@ -121,13 +123,7 @@ export async function scrapeController(req: Request, res: Response) {
}; };
// Async check saves 500ms in average case
// Don't async check in llm extraction mode as it could be expensive
if (extractorOptions.mode.includes("llm-extraction")) {
await checkCredits(); await checkCredits();
} else {
checkCredits();
}
const jobId = uuidv4(); const jobId = uuidv4();

View File

@ -1,14 +1,15 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper"; import { WebScraperDataProvider } from "../../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth"; import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types"; import { RateLimiterMode } from "../../types";
import { logJob } from "../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities"; import { PageOptions, SearchOptions } from "../../lib/entities";
import { search } from "../search"; import { search } from "../../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger"; import { Logger } from "../../lib/logger";
import { getScrapeQueue, scrapeQueueEvents } from "../../services/queue-service";
export async function searchHelper( export async function searchHelper(
jobId: string, jobId: string,
@ -75,26 +76,28 @@ export async function searchHelper(
// filter out social media links // filter out social media links
const jobDatas = res.map(x => {
const a = new WebScraperDataProvider(); const url = x.url;
await a.setOptions({ const uuid = uuidv4();
jobId, return {
name: uuid,
data: {
url,
mode: "single_urls", mode: "single_urls",
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), crawlerOptions: crawlerOptions,
crawlerOptions: { team_id: team_id,
...crawlerOptions, pageOptions: pageOptions,
}, },
pageOptions: { opts: {
...pageOptions, jobId: uuid,
onlyMainContent: pageOptions?.onlyMainContent ?? true, priority: 10,
fetchPageContent: pageOptions?.fetchPageContent ?? true, }
includeHtml: pageOptions?.includeHtml ?? false, };
removeTags: pageOptions?.removeTags ?? [], })
fallback: false,
},
});
const docs = await a.getDocuments(false); const jobs = await getScrapeQueue().addBulk(jobDatas);
const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);
if (docs.length === 0) { if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 }; return { success: true, error: "No search results found", returnCode: 200 };
@ -109,19 +112,6 @@ export async function searchHelper(
return { success: true, error: "No page found", returnCode: 200, data: docs }; return { success: true, error: "No page found", returnCode: 200, data: docs };
} }
const billingResult = await billTeam(
team_id,
filteredDocs.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
return { return {
success: true, success: true,
data: filteredDocs, data: filteredDocs,
@ -150,7 +140,7 @@ export async function searchController(req: Request, res: Response) {
}; };
const origin = req.body.origin ?? "api"; const origin = req.body.origin ?? "api";
const searchOptions = req.body.searchOptions ?? { limit: 7 }; const searchOptions = req.body.searchOptions ?? { limit: 5 };
const jobId = uuidv4(); const jobId = uuidv4();

View File

@ -0,0 +1,54 @@
import { Request, Response } from "express";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
/**
 * Unauthenticated status endpoint for preview crawls.
 *
 * Loads the stored crawl for `req.params.jobId` (404 when absent), resolves
 * every scrape job registered for it and folds their states into one status.
 * Responds with `{ status, current, total, data, partial_data }`, or with a
 * 500 and the error message on any unexpected failure.
 */
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
  try {
    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
    }

    const jobIDs = await getCrawlJobs(req.params.jobId);
    const useDb = process.env.USE_DB_AUTHENTICATION === "true";

    const lookedUp = await Promise.all(
      jobIDs.map(async (jobId) => {
        const job = await getScrapeQueue().getJob(jobId);
        // The queue may no longer hold this job (removed/evicted). Without this
        // guard the dereferences below throw and the whole request turns into a 500.
        if (!job) {
          return job;
        }
        if (useDb) {
          const supabaseData = await supabaseGetJobById(job.id);
          if (supabaseData) {
            job.returnvalue = supabaseData.docs;
          }
        }
        return job;
      })
    );

    // Keep only jobs that still exist, ordered by creation time.
    const jobs = [];
    for (const job of lookedUp) {
      if (job) {
        jobs.push(job);
      }
    }
    jobs.sort((a, b) => a.timestamp - b.timestamp);

    const jobStatuses = await Promise.all(jobs.map((job) => job.getState()));

    // A cancelled crawl is reported as "failed" regardless of individual job states.
    const jobStatus = sc.cancelled
      ? "failed"
      : jobStatuses.every((x) => x === "completed")
      ? "completed"
      : jobStatuses.some((x) => x === "failed")
      ? "failed"
      : "active";

    // Each job's return value is either a single document or a one-element array.
    const data = jobs.map((job) =>
      Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue
    );

    res.json({
      status: jobStatus,
      current: jobStatuses.filter((x) => x === "completed" || x === "failed").length,
      total: jobs.length,
      data: jobStatus === "completed" ? data : null,
      // Jobs that have not produced a value yet are left out of the partial result.
      partial_data:
        jobStatus === "completed" ? [] : data.filter((x) => x !== null && x !== undefined),
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

View File

@ -26,13 +26,7 @@ export async function supaAuthenticateUser(
req, req,
res, res,
mode?: RateLimiterMode mode?: RateLimiterMode
): Promise<{ ): Promise<AuthResponse> {
success: boolean;
team_id?: string;
error?: string;
status?: number;
plan?: string;
}> {
const authHeader = req.headers.authorization; const authHeader = req.headers.authorization;
if (!authHeader) { if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 }; return { success: false, error: "Unauthorized", status: 401 };
@ -106,7 +100,7 @@ export async function supaAuthenticateUser(
setTrace(team_id, normalizedApi); setTrace(team_id, normalizedApi);
subscriptionData = { subscriptionData = {
team_id: team_id, team_id: team_id,
plan: plan plan: plan,
} }
switch (mode) { switch (mode) {
case RateLimiterMode.Crawl: case RateLimiterMode.Crawl:
@ -121,6 +115,9 @@ export async function supaAuthenticateUser(
case RateLimiterMode.CrawlStatus: case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break; break;
case RateLimiterMode.Map:
rateLimiter = getRateLimiter(RateLimiterMode.Map, token);
break;
case RateLimiterMode.Preview: case RateLimiterMode.Preview:
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
@ -157,7 +154,7 @@ export async function supaAuthenticateUser(
if ( if (
token === "this_is_just_a_preview_token" && token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search) (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
) { ) {
return { success: true, team_id: "preview" }; return { success: true, team_id: "preview" };
// check the origin of the request and make sure its from firecrawl.dev // check the origin of the request and make sure its from firecrawl.dev
@ -195,7 +192,12 @@ export async function supaAuthenticateUser(
subscriptionData = data[0]; subscriptionData = data[0];
} }
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""}; return {
success: true,
team_id: subscriptionData.team_id,
plan: subscriptionData.plan ?? "",
api_key: normalizedApi
};
} }
function getPlanByPriceId(price_id: string) { function getPlanByPriceId(price_id: string) {
switch (price_id) { switch (price_id) {

View File

@ -0,0 +1,148 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../v0/auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
import { getScrapeQueue, scrapeQueueEvents } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
// Payloads exchanged over the crawl-status WebSocket. Every payload is
// JSON-serialised and discriminated by its `type` field.

// Failure report; passed as the close reason when the socket is shut down
// because of an error (job not found, forbidden, auth failure, unexpected error).
type ErrorMessage = {
type: "error",
error: string,
}
// Snapshot of the crawl status, sent once after the connection is set up.
type CatchupMessage = {
type: "catchup",
data: CrawlStatusResponse,
}
// A single scraped document, sent when a scrape job of the crawl completes.
type DocumentMessage = {
type: "document",
data: Document,
}
// Passed as the close reason when the socket is closed normally.
type DoneMessage = { type: "done" }
// Union of everything `send` and `close` accept.
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
/**
 * Serialises `msg` as JSON and writes it to the socket.
 *
 * Returns a promise that resolves with `null` once the socket's send callback
 * reports success and rejects with the error it reports otherwise. When the
 * socket's `readyState` is not 1, nothing is sent and `undefined` is returned.
 */
function send(ws: WebSocket, msg: Message) {
  if (ws.readyState !== 1) {
    return undefined;
  }

  return new Promise((resolve, reject) => {
    const onSent = (err?: Error): void => {
      if (err) {
        reject(err);
        return;
      }
      resolve(null);
    };
    ws.send(JSON.stringify(msg), onSent);
  });
}
/**
 * Closes the socket with `code`, using the JSON-serialised `msg` as the close
 * reason. Does nothing unless the socket's `readyState` is 0 or 1.
 */
function close(ws: WebSocket, code: number, msg: Message) {
  const stillUsable = ws.readyState <= 1;
  if (!stillUsable) {
    return;
  }
  const reason = JSON.stringify(msg);
  ws.close(code, reason);
}
/**
 * Streams the progress of a crawl over an already-authenticated WebSocket.
 *
 * Protocol:
 *  - closes with 1008 / 3003 and an error payload when the crawl does not
 *    exist or belongs to another team;
 *  - sends one "catchup" message containing the current status and every
 *    document finished so far;
 *  - afterwards sends one "document" message per scrape job of this crawl that
 *    completes, and closes with 1000 / "done" once the crawl is finished.
 *
 * The queue listener is detached when the crawl finishes, when the status is
 * already terminal at catch-up time, and when the socket closes.
 */
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return close(ws, 1008, { type: "error", error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return close(ws, 3003, { type: "error", error: "Forbidden" });
  }

  // Jobs already covered by the catch-up message; they are not re-sent as documents.
  let doneJobIDs = [];

  const detach = () => {
    scrapeQueueEvents.removeListener("completed", completedListener);
  };

  const completedListener = async e => {
    // This runs as an event handler: nothing awaits it, so any error must be
    // handled here or it becomes an unhandled rejection.
    try {
      // Socket is closing/closed: stop listening instead of doing queue lookups.
      if (ws.readyState > 1) {
        detach();
        return;
      }

      const job = await getScrapeQueue().getJob(e.jobId);
      // The job may already be gone from the queue, or belong to another crawl.
      if (!job || job.data.crawl_id !== req.params.jobId) return;
      if (doneJobIDs.includes(job.id)) return;

      const j = await getJob(job.id);
      if (!j || !j.returnvalue) {
        // FAILED (or no longer available) -- nothing to send.
        return;
      }

      await send(ws, {
        type: "document",
        data: legacyDocumentConverter(j.returnvalue),
      });

      if (await isCrawlFinishedLocked(req.params.jobId)) {
        await new Promise((resolve) => setTimeout(() => resolve(true), 5000)); // wait for last events to pour in
        detach();
        close(ws, 1000, { type: "done" });
      }
    } catch (err) {
      Logger.error(err);
    }
  };

  // TODO: handle failed jobs
  // Registered before the catch-up snapshot is taken so completions that happen
  // while the snapshot is being built are not missed.
  scrapeQueueEvents.addListener("completed", completedListener);
  // Do not keep listening to the queue after the client has gone away.
  ws.once("close", detach);

  doneJobIDs = await getDoneJobsOrdered(req.params.jobId);

  const jobIDs = await getCrawlJobs(req.params.jobId);
  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled
    ? "cancelled"
    : jobStatuses.every(x => x === "completed")
    ? "completed"
    : jobStatuses.some(x => x === "failed")
    ? "failed"
    : "scraping";
  const doneJobs = await getJobs(doneJobIDs);
  const data = doneJobs.map(x => x.returnvalue);

  // Awaited so a failed write propagates to the caller's error handling
  // instead of being dropped as an unhandled rejection.
  await send(ws, {
    type: "catchup",
    data: {
      status,
      totalCount: jobIDs.length,
      creditsUsed: jobIDs.length,
      expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
      data: data.map(x => legacyDocumentConverter(x)),
    }
  });

  if (status !== "scraping") {
    detach();
    return close(ws, 1000, { type: "done" });
  }
}
/**
 * WebSocket entry point for crawl status: authenticates the request, attaches
 * the resulting team/plan to `req.auth`, and delegates to `crawlStatusWS`.
 *
 * An authentication failure closes the socket with code 3000 and the auth
 * error. Any exception is logged together with a freshly generated exception
 * id, and the socket is closed with code 1011 and a generic message carrying
 * that id.
 */
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  try {
    const auth = await authenticateUser(
      req,
      null,
      RateLimiterMode.CrawlStatus,
    );

    if (!auth.success) {
      return close(ws, 3000, {
        type: "error",
        error: auth.error,
      });
    }

    req.auth = { team_id: auth.team_id, plan: auth.plan };

    await crawlStatusWS(ws, req);
  } catch (err) {
    const id = uuidv4();

    // Error instances serialise to "{}", so spell out their fields in that case.
    const serialised = JSON.stringify(err);
    const verbose =
      serialised === "{}" && err instanceof Error
        ? JSON.stringify({
            message: err.message,
            name: err.name,
            stack: err.stack,
          })
        : serialised;

    Logger.error(
      "Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose
    );

    return close(ws, 1011, {
      type: "error",
      error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
    });
  }
}

View File

@ -1,89 +1,115 @@
import { Request, Response } from "express"; import { Response } from "express";
import { authenticateUser } from "./auth"; import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { RateLimiterMode } from "../../../src/types"; import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { addWebScraperJob } from "../../../src/services/queue-jobs"; import { getScrapeQueue } from "../../services/queue-service";
import { getWebScraperQueue } from "../../../src/services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
import { Logger } from "../../../src/lib/logger";
import { v4 as uuidv4 } from "uuid";
export async function crawlStatusController(req: Request, res: Response) { export async function getJob(id: string) {
// TODO: validate req.params.jobId const job = await getScrapeQueue().getJob(id);
if (!job) return job;
try { if (process.env.USE_DB_AUTHENTICATION === "true") {
const { success, team_id, error, status } = await authenticateUser( const supabaseData = await supabaseGetJobById(id);
req,
res, if (supabaseData) {
RateLimiterMode.CrawlStatus job.returnvalue = supabaseData.docs;
); }
if (!success) {
return res.status(status).json({ error });
} }
// const job = await getWebScraperQueue().getJob(req.params.jobId); job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
// if (!job) {
// return res.status(404).json({ error: "Job not found" });
// }
// const { current, current_url, total, current_step, partialDocs } = await job.progress(); return job;
}
// let data = job.returnvalue;
// if (process.env.USE_DB_AUTHENTICATION === "true") { export async function getJobs(ids: string[]) {
// const supabaseData = await supabaseGetJobById(req.params.jobId); const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
// if (supabaseData) { if (process.env.USE_DB_AUTHENTICATION === "true") {
// data = supabaseData.docs; const supabaseData = await supabaseGetJobsById(ids);
// }
// } supabaseData.forEach(x => {
const job = jobs.find(y => y.id === x.job_id);
// const jobStatus = await job.getState(); if (job) {
job.returnvalue = x.docs;
// mock: }
const id = uuidv4(); })
const result = { }
totalCount: 100,
creditsUsed: 2, jobs.forEach(job => {
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(), job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
status: "scraping", // scraping, completed, failed });
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
data: [{ return jobs;
markdown: "test", }
content: "test",
html: "test", export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
rawHtml: "test", const sc = await getCrawl(req.params.jobId);
linksOnPage: ["test1", "test2"], if (!sc) {
screenshot: "test", return res.status(404).json({ success: false, error: "Job not found" });
metadata: { }
title: "test",
description: "test", if (sc.team_id !== req.auth.team_id) {
language: "test", return res.status(403).json({ success: false, error: "Forbidden" });
sourceURL: "test", }
statusCode: 200,
error: "test" const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
} const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
},
{ const jobIDs = await getCrawlJobs(req.params.jobId);
markdown: "test", const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
content: "test", const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
html: "test", const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
rawHtml: "test", const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
linksOnPage: ["test1", "test2"],
screenshot: "test", let doneJobs = [];
metadata: {
title: "test", if (end === undefined) { // determine 10 megabyte limit
description: "test", let bytes = 0;
language: "test", const bytesLimit = 10485760; // 10 MiB in bytes
sourceURL: "test", const factor = 100; // chunking for faster retrieval
statusCode: 200,
error: "test" for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
} // get current chunk and retrieve jobs
}] const currentIDs = doneJobsOrder.slice(i, i+factor);
} const jobs = await getJobs(currentIDs);
res.status(200).json(result); // iterate through jobs and add them one them one to the byte counter
} catch (error) { // both loops will break once we cross the byte counter
Logger.error(error); for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
return res.status(500).json({ error: error.message }); const job = jobs[ii];
} doneJobs.push(job);
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
}
}
// if we ran over the bytes limit, remove the last document
if (bytes > bytesLimit) {
doneJobs.splice(doneJobs.length - 1, 1);
}
} else {
doneJobs = await getJobs(doneJobsOrder);
}
const data = doneJobs.map(x => x.returnvalue);
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());
if (typeof req.query.limit === "string") {
nextURL.searchParams.set("limit", req.query.limit);
}
res.status(200).json({
status,
totalCount: jobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
next:
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
? undefined
: nextURL.href,
data: data.map(x => legacyDocumentConverter(x)),
});
} }

View File

@ -1,139 +1,126 @@
import { Request, Response } from "express"; import { Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger"; import {
import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; CrawlRequest,
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJob,
addCrawlJobs,
crawlToCrawler,
lockURL,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { Logger } from "../../lib/logger";
export async function crawlController(req: Request, res: Response) { export async function crawlController(
// expected req.body req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
res: Response<CrawlResponse>
// req.body = { ) {
// url: string req.body = crawlRequestSchema.parse(req.body);
// crawlerOptions: {
// includePaths: string[]
// excludePaths: string[]
// maxDepth: number
// limit: number
// allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
// allowExternalLinks: boolean
// ignoreSitemap: number
// }
// scrapeOptions: Exclude<Scrape, "url">
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ error: "Idempotency key already used" });
}
try {
createIdempotencyKey(req);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
try {
url = checkAndUpdateURL(url);
} catch (error) {
return res.status(400).json({ error: 'Invalid Url' });
}
// TODO: add job to queue
const id = uuidv4(); const id = uuidv4();
return res.status(200).json({ jobId: id, url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}` });
// const mode = req.body.mode ?? "crawl"; await logCrawl(id, req.auth.team_id);
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; const { remainingCredits } = req.account;
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // TODO: Get rid of crawlerOptions
// try { const crawlerOptions = legacyCrawlerOptions(req.body);
// const a = new WebScraperDataProvider(); const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => { crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({ const sc: StoredCrawl = {
// url: url, originUrl: req.body.url,
// mode: mode ?? "crawl", // fix for single urls not working crawlerOptions,
// crawlerOptions: crawlerOptions, pageOptions,
// team_id: team_id, team_id: req.auth.team_id,
// pageOptions: pageOptions, createdAt: Date.now(),
// origin: req.body.origin ?? defaultOrigin, };
// });
// await logCrawl(job.id.toString(), team_id); const crawler = crawlToCrawler(id, sc);
// res.json({ jobId: job.id }); try {
} catch (error) { sc.robots = await crawler.getRobotsTxt();
Logger.error(error); } catch (e) {
return res.status(500).json({ error: error.message }); Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`
);
} }
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();
if (sitemap !== null) {
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
team_id: req.auth.team_id,
crawlerOptions,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);
} else {
await lockURL(id, sc, req.body.url);
const job = await addScrapeJob(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
pageOptions: pageOptions,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
},
{
priority: 15,
}
);
await addCrawlJob(id, job.id);
}
return res.status(200).json({
success: true,
id,
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
});
} }

View File

@ -1,128 +1,94 @@
import { Request, Response } from "express"; import { Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger"; import {
import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
import {
checkAndUpdateURLForMap,
isSameDomain,
isSameSubdomain,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
export async function mapController(req: Request, res: Response) { configDotenv();
// expected req.body
// req.body = { export async function mapController(
// url: string req: RequestWithAuth<{}, MapResponse, MapRequest>,
// ignoreSitemap: true?? res: Response<MapResponse>
// other crawler options? ) {
// } req.body = mapRequestSchema.parse(req.body);
const id = uuidv4();
let links: string[] = [req.body.url];
try { const sc: StoredCrawl = {
const { success, team_id, error, status } = await authenticateUser( originUrl: req.body.url,
req, crawlerOptions: legacyCrawlerOptions(req.body),
res, pageOptions: {},
RateLimiterMode.Crawl team_id: req.auth.team_id,
); createdAt: Date.now(),
if (!success) { };
return res.status(status).json({ error });
}
// if (req.headers["x-idempotency-key"]) { const crawler = crawlToCrawler(id, sc);
// const isIdempotencyValid = await validateIdempotencyKey(req);
// if (!isIdempotencyValid) {
// return res.status(409).json({ error: "Idempotency key already used" });
// }
// try {
// createIdempotencyKey(req);
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = const sitemap =
// await checkTeamCredits(team_id, 1); req.body.ignoreSitemap
// if (!creditsCheckSuccess) { ? null
// return res.status(402).json({ error: "Insufficient credits" }); : await crawler.tryGetSitemap();
// }
let url = req.body.url; if (sitemap !== null) {
if (!url) { sitemap.map((x) => {
return res.status(400).json({ error: "Url is required" }); links.push(x.url);
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
}); });
} }
try { let urlWithoutWww = req.body.url.replace("www.", "");
url = checkAndUpdateURL(url);
} catch (error) { let mapUrl = req.body.search
return res.status(400).json({ error: 'Invalid Url' }); ? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
numResults: 50,
});
if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
} else {
mapResults.map((x) => {
links.push(x.url);
});
}
} }
return res.status(200).json({ urls: [ "test1", "test2" ] }); links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
// const mode = req.body.mode ?? "crawl";
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // allows for subdomains to be included
// try { links = links.filter((x) => isSameDomain(x, req.body.url));
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => { // if includeSubdomains is false, filter out subdomains
// job.progress({ if (!req.body.includeSubdomains) {
// current: progress.current, links = links.filter((x) => isSameSubdomain(x, req.body.url));
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
} }
// remove duplicates that could be due to http/https or www
links = [...new Set(links)];
await billTeam(req.auth.team_id, 1);
return res.status(200).json({
success: true,
links,
});
} }

View File

@ -1,253 +1,105 @@
// import { ExtractorOptions, PageOptions } from './../../lib/entities';
import { Request, Response } from "express"; import { Request, Response } from "express";
// import { WebScraperDataProvider } from "../../scraper/WebScraper";
// import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../types";
// import { logJob } from "../../services/logging/log_job";
// import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
// import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
// import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../../src/lib/default-values';
// import { v4 as uuidv4 } from "uuid";
import { Logger } from '../../lib/logger'; import { Logger } from '../../lib/logger';
import { checkAndUpdateURL } from '../../lib/validateUrl'; import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from 'uuid';
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob } from "../../services/queue-jobs";
import { scrapeQueueEvents } from '../../services/queue-service';
import { logJob } from "../../services/logging/log_job";
export async function scrapeController(req: Request, res: Response) { export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
let url = req.body.url; req.body = scrapeRequestSchema.parse(req.body);
if (!url) {
return { success: false, error: "Url is required", returnCode: 400 };
}
if (isUrlBlocked(url)) {
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
}
try {
url = checkAndUpdateURL(url);
} catch (error) {
return { success: false, error: "Invalid URL", returnCode: 400 };
}
// TODO: check req.body
// mockup req.body
// req.body = {
// url: "test",
// headers: {
// "x-key": "test"
// },
// formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
// includeTags: ["test"],
// excludeTags: ["test"],
// onlyMainContent: false,
// timeout: 30000,
// waitFor: number
// }
try {
let earlyReturn = false; let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan } = await authenticateUser( const origin = req.body.origin;
req, const timeout = req.body.timeout;
res, const pageOptions = legacyScrapeOptions(req.body);
RateLimiterMode.Scrape const jobId = uuidv4();
);
if (!success) { const startTime = new Date().getTime();
return res.status(status).json({ error }); const job = await addScrapeJob({
url: req.body.url,
mode: "single_urls",
crawlerOptions: {},
team_id: req.auth.team_id,
pageOptions,
extractorOptions: {},
origin: req.body.origin,
}, {}, jobId);
let doc: any | undefined;
try {
doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; // 60 seconds timeout
} catch (e) {
Logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && e.message.startsWith("Job wait")) {
return res.status(408).json({
success: false,
error: "Request timed out",
});
} else {
return res.status(500).json({
success: false,
error: "Internal server error",
});
}
} }
// check credits await job.remove();
const result = { if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
return res.status(200).json({
success: true, success: true,
warning: "test", warning: "No page found",
data: { data: doc
markdown: "test", });
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}
} }
return res.status(200).json(result); delete doc.index;
delete doc.provider;
// const crawlerOptions = req.body.crawlerOptions ?? {}; const endTime = new Date().getTime();
// const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; const timeTakenInSeconds = (endTime - startTime) / 1000;
// const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
// const origin = req.body.origin ?? defaultOrigin;
// let timeout = req.body.timeout ?? defaultTimeout;
// if (extractorOptions.mode.includes("llm-extraction")) { let creditsToBeBilled = 1; // Assuming 1 credit per document
// pageOptions.onlyMainContent = true; if (earlyReturn) {
// timeout = req.body.timeout ?? 90000; // Don't bill if we're early returning
// } return;
// const checkCredits = async () => {
// try {
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// earlyReturn = true;
// return res.status(402).json({ error: "Insufficient credits" });
// }
// } catch (error) {
// Logger.error(error);
// earlyReturn = true;
// return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
// }
// };
// await checkCredits();
// const jobId = uuidv4();
// const startTime = new Date().getTime();
// const result = await scrapeHelper(
// jobId,
// req,
// team_id,
// crawlerOptions,
// pageOptions,
// extractorOptions,
// timeout,
// plan
// );
// const endTime = new Date().getTime();
// const timeTakenInSeconds = (endTime - startTime) / 1000;
// const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
// if (result.success) {
// let creditsToBeBilled = 1; // Assuming 1 credit per document
// const creditsPerLLMExtract = 50;
// if (extractorOptions.mode.includes("llm-extraction")) {
// // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
// creditsToBeBilled += creditsPerLLMExtract;
// }
// let startTimeBilling = new Date().getTime();
// if (earlyReturn) {
// // Don't bill if we're early returning
// return;
// }
// const billingResult = await billTeam(
// team_id,
// creditsToBeBilled
// );
// if (!billingResult.success) {
// return res.status(402).json({
// success: false,
// error: "Failed to bill team. Insufficient credits or subscription not found.",
// });
// }
// }
// logJob({
// job_id: jobId,
// success: result.success,
// message: result.error,
// num_docs: 1,
// docs: [result.data],
// time_taken: timeTakenInSeconds,
// team_id: team_id,
// mode: "scrape",
// url: req.body.url,
// crawlerOptions: crawlerOptions,
// pageOptions: pageOptions,
// origin: origin,
// extractor_options: extractorOptions,
// num_tokens: numTokens,
// });
// return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
} }
const billingResult = await billTeam(
req.auth.team_id,
creditsToBeBilled
);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error: "Failed to bill team. Insufficient credits or subscription not found.",
});
}
logJob({
job_id: jobId,
success: true,
message: "Scrape completed",
num_docs: 1,
docs: [doc],
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "scrape",
url: req.body.url,
crawlerOptions: {},
pageOptions: pageOptions,
origin: origin,
extractor_options: { mode: "markdown" },
num_tokens: numTokens,
});
return res.status(200).json({
success: true,
data: legacyDocumentConverter(doc),
});
} }
// export async function scrapeHelper(
// jobId: string,
// req: Request,
// team_id: string,
// crawlerOptions: any,
// pageOptions: PageOptions,
// extractorOptions: ExtractorOptions,
// timeout: number,
// plan?: string
// ): Promise<{
// success: boolean;
// error?: string;
// data?: Document;
// returnCode: number;
// }> {
// const url = req.body.url;
// if (!url) {
// return { success: false, error: "Url is required", returnCode: 400 };
// }
// if (isUrlBlocked(url)) {
// return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
// }
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId,
// mode: "single_urls",
// urls: [url],
// crawlerOptions: {
// ...crawlerOptions,
// },
// pageOptions: pageOptions,
// extractorOptions: extractorOptions,
// });
// const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
// setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
// );
// const docsPromise = a.getDocuments(false);
// let docs;
// try {
// docs = await Promise.race([docsPromise, timeoutPromise]);
// } catch (error) {
// return error;
// }
// // make sure doc.content is not empty
// let filteredDocs = docs.filter(
// (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
// );
// if (filteredDocs.length === 0) {
// return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
// }
// // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
// if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
// filteredDocs.forEach(doc => {
// delete doc.rawHtml;
// });
// }
// return {
// success: true,
// data: filteredDocs[0],
// returnCode: 200,
// };
// }

View File

@ -0,0 +1,306 @@
import { Request } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";
/**
 * Output formats a caller may request for a scraped page.
 * Lists the same six values accepted by the `formats` enum in `scrapeOptions`.
 */
export type Format =
| "markdown"
| "html"
| "rawHtml"
| "links"
| "screenshot"
| "screenshot@fullPage";
/**
 * Shared URL validator for request bodies.
 *
 * Pre-processing: a string input that does not start with a `scheme://`
 * prefix gets one added -- `http` is prepended when the input already starts
 * with `://`, otherwise `http://` is prepended. Non-string inputs are passed
 * through unchanged and left for the string schema to reject.
 *
 * Validation: the value must be a string, parse as a URL, start with
 * `http://` or `https://`, and must not be flagged by `isUrlBlocked`.
 */
const url = z.preprocess(
  (raw) => {
    // Leave non-strings and inputs that already carry a scheme untouched.
    if (typeof raw !== "string" || /^([^.:]+:\/\/)/.test(raw)) {
      return raw;
    }
    return raw.startsWith("://") ? "http" + raw : "http://" + raw;
  },
  z
    .string()
    .url()
    .regex(/^https?:\/\//, "URL uses unsupported protocol")
    .refine(
      (candidate) => !isUrlBlocked(candidate),
      "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
    )
);
/** Message reported by the `.strict()` schemas in this file when a body contains unknown keys. */
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";

/**
 * Per-page scraping options.
 *
 * Extended by `scrapeRequestSchema` and embedded (minus `timeout`) in
 * `crawlRequestSchema`. Unknown keys are rejected with `strictMessage`.
 */
export const scrapeOptions = z
  .object({
    // Requested output formats; only markdown is produced when omitted.
    formats: z
      .array(
        z.enum([
          "markdown",
          "html",
          "rawHtml",
          "links",
          "screenshot",
          "screenshot@fullPage",
        ])
      )
      .optional()
      .default(["markdown"]),
    headers: z.record(z.string(), z.string()).optional(),
    includeTags: z.array(z.string()).optional(),
    excludeTags: z.array(z.string()).optional(),
    onlyMainContent: z.boolean().default(true),
    // Positive safe integer; presumably milliseconds -- TODO confirm the unit and default.
    timeout: z.number().int().positive().finite().safe().default(30000),
    // Non-negative safe integer; 0 by default.
    waitFor: z.number().int().nonnegative().finite().safe().default(0),
    parsePDF: z.boolean().default(true),
  })
  .strict(strictMessage);

/** Parsed (defaults applied) shape of `scrapeOptions`. */
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
/**
 * Body schema for a scrape request: every field of `scrapeOptions` plus the
 * target `url` (validated by the shared `url` schema) and an `origin` label
 * that defaults to "api". Unknown keys are rejected with `strictMessage`.
 */
export const scrapeRequestSchema = scrapeOptions.extend({
url,
origin: z.string().optional().default("api"),
}).strict(strictMessage);
// Earlier hand-written shape, kept for reference; the inferred type below is the one in use.
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
/** Parsed (defaults applied) scrape request body. */
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
/**
 * Crawl-wide options shared by the crawl and map request schemas (both extend
 * this object). Every field has a default, so an empty object parses
 * successfully. Unknown keys are rejected with `strictMessage`.
 */
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),
maxDepth: z.number().default(10), // default?
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
// Defaults to true here; `mapRequestSchema` overrides the default to false.
ignoreSitemap: z.boolean().default(true),
}).strict(strictMessage);
// Earlier hand-written shape, kept for reference; the inferred type below is the one in use.
// export type CrawlerOptions = {
// includePaths?: string[];
// excludePaths?: string[];
// maxDepth?: number;
// limit?: number;
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
// allowExternalLinks?: boolean;
// ignoreSitemap?: boolean;
// };
/** Parsed (defaults applied) shape of `crawlerOptions`. */
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
/**
 * Body schema for a crawl request: the crawler options flattened at the top
 * level, plus the start `url`, an `origin` label (default "api"), nested
 * `scrapeOptions` without `timeout` (default `{}`), and an optional `webhook`
 * URL. Unknown keys are rejected with `strictMessage`.
 */
export const crawlRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
webhook: z.string().url().optional(),
// Redeclares `limit` with the same type and default already defined on `crawlerOptions`.
limit: z.number().default(10000),
}).strict(strictMessage);
// Earlier hand-written shape, kept for reference; the inferred type below is the one in use.
// export type CrawlRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
// };
/** Parsed (defaults applied) crawl request body. */
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
/**
 * Body schema for a map request: the crawler options plus the target `url`,
 * an `origin` label (default "api"), `includeSubdomains` (default true), an
 * optional `search` string, and `ignoreSitemap` with its default flipped to
 * false. Unknown keys are rejected with `strictMessage`.
 */
export const mapRequestSchema = crawlerOptions.extend({
// NOTE(review): this is a plain URL check, not the shared `url` validator used by the
// scrape and crawl schemas, so no scheme defaulting, http(s)-only check or blocklist
// check happens at schema level -- confirm that is intentional.
url: z.string().url(),
origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
}).strict(strictMessage);
// Earlier hand-written shape, kept for reference; the inferred type below is the one in use.
// export type MapRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// };
/** Parsed (defaults applied) map request body. */
export type MapRequest = z.infer<typeof mapRequestSchema>;
/**
 * A scraped page as returned by the v1 API. This is the shape produced by
 * `legacyDocumentConverter`. All content fields are optional; `metadata` is
 * always present, but each of its fields is optional.
 */
export type Document = {
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
screenshot?: string;
metadata: {
title?: string;
description?: string;
language?: string;
keywords?: string;
robots?: string;
ogTitle?: string;
ogDescription?: string;
ogUrl?: string;
ogImage?: string;
ogAudio?: string;
ogDeterminer?: string;
ogLocale?: string;
ogLocaleAlternate?: string[];
ogSiteName?: string;
ogVideo?: string;
dcTermsCreated?: string;
dcDateCreated?: string;
dcDate?: string;
dcTermsType?: string;
dcType?: string;
dcTermsAudience?: string;
dcTermsSubject?: string;
dcSubject?: string;
dcDescription?: string;
dcTermsKeywords?: string;
modifiedTime?: string;
publishedTime?: string;
articleTag?: string;
articleSection?: string;
sourceURL?: string;
// `statusCode` and `error` are filled from the legacy `pageStatusCode` / `pageError`
// metadata fields by `legacyDocumentConverter`.
statusCode?: number;
error?: string;
};
};
/**
 * Failure variant shared by the response unions in this file; discriminated
 * by `success: false`. `details` is untyped extra information.
 */
export type ErrorResponse = {
success: false;
error: string;
details?: any;
};
/**
 * Response body of a scrape request: either an `ErrorResponse` or a success
 * object carrying the scraped `Document` and an optional `warning`.
 */
export type ScrapeResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document;
};
/**
 * An HTTP status code paired with a `ScrapeResponse` body and an optional
 * error string.
 * NOTE(review): presumably the response shape seen by request-level tests -- confirm against usage.
 */
export interface ScrapeResponseRequestTest {
statusCode: number;
body: ScrapeResponse;
error?: string;
}
/**
 * Response body of a crawl request: either an `ErrorResponse` or a success
 * object with the crawl `id` and the `url` of the crawl.
 */
export type CrawlResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
};
/**
 * Response body of a map request: either an `ErrorResponse` or a success
 * object with the list of discovered `links`.
 */
export type MapResponse =
| ErrorResponse
| {
success: true;
links: string[];
};
/** Route parameters of the crawl status endpoint. */
export type CrawlStatusParams = {
jobId: string;
};
/**
 * Response body of the crawl status endpoint: either an `ErrorResponse` or a
 * progress object.
 *
 * Unlike the other success shapes in this file, the progress variant has no
 * `success: true` discriminant; callers distinguish it by the presence of
 * `status`. `next`, when present, is a URL string; `expiresAt` is a string.
 */
export type CrawlStatusResponse =
| ErrorResponse
| {
status: "scraping" | "completed" | "failed" | "cancelled";
totalCount: number;
creditsUsed: number;
expiresAt: string;
next?: string;
data: Document[];
};
/** Authentication context attached to a request: the caller's team and plan. */
type AuthObject = {
team_id: string;
plan: string;
};
/** Account context attached to a request: the number of credits the caller has left. */
type Account = {
remainingCredits: number;
};
/**
 * Express request on which the authentication (`auth`) and account
 * (`account`) context may or may not be present yet.
 *
 * The type parameters are forwarded positionally to Express's
 * `Request<Params, ResBody, ReqBody>`: the second parameter types the
 * response body and the third types `req.body`. They are named in that order
 * here so the declaration matches how Express interprets them.
 */
export interface RequestWithMaybeAuth<
  ReqParams = {},
  ResBody = undefined,
  ReqBody = undefined
> extends Request<ReqParams, ResBody, ReqBody> {
  auth?: AuthObject;
  account?: Account;
}
/**
 * Express request that is guaranteed to carry authentication context
 * (`auth`); the account context (`account`) is still optional.
 *
 * The type parameters are forwarded positionally to Express's
 * `Request<Params, ResBody, ReqBody>`: the second parameter types the
 * response body and the third types `req.body`, e.g.
 * `RequestWithAuth<{}, CrawlResponse, CrawlRequest>`. They are named in that
 * order here so the declaration matches how Express interprets them.
 */
export interface RequestWithAuth<
  ReqParams = {},
  ResBody = undefined,
  ReqBody = undefined,
> extends Request<ReqParams, ResBody, ReqBody> {
  auth: AuthObject;
  account?: Account;
}
/**
 * Translates parsed v1 crawler options into the option names used by the
 * legacy crawler.
 *
 * @param x - Parsed v1 crawler options.
 * @returns A new object with the legacy option names. `limit` is exposed both
 *   as `maxCrawledLinks` and as `limit`; `generateImgAltText` is always
 *   `false`; `ignoreSitemap` is forwarded unchanged.
 */
export function legacyCrawlerOptions(x: CrawlerOptions) {
  return {
    includes: x.includePaths,
    excludes: x.excludePaths,
    maxCrawledLinks: x.limit,
    // NOTE(review): confirm the legacy crawler reads `maxCrawledDepth` (rather than `maxDepth`).
    maxCrawledDepth: x.maxDepth,
    limit: x.limit,
    generateImgAltText: false,
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
    // The crawl controller decides whether to fetch the sitemap by reading
    // `ignoreSitemap` from this object; without forwarding it the request
    // flag (and its schema default) would be silently dropped.
    ignoreSitemap: x.ignoreSitemap,
  };
}
/**
 * Translates v1 scrape options into legacy page options.
 *
 * Each entry of `x.formats` switches on the matching legacy boolean flag;
 * the remaining fields are copied across under their legacy names.
 *
 * @param x v1 scrape options.
 * @returns The equivalent legacy page options.
 */
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
  const wants = (format: ScrapeOptions["formats"][number]): boolean =>
    x.formats.includes(format);

  const pageOptions: PageOptions = {
    includeMarkdown: wants("markdown"),
    includeHtml: wants("html"),
    includeRawHtml: wants("rawHtml"),
    onlyIncludeTags: x.includeTags,
    removeTags: x.excludeTags,
    onlyMainContent: x.onlyMainContent,
    waitFor: x.waitFor,
    includeLinks: wants("links"),
    screenshot: wants("screenshot"),
    fullPageScreenshot: wants("screenshot@fullPage"),
    parsePDF: x.parsePDF,
  };
  return pageOptions;
}
/**
 * Converts a legacy scraper document into the v1 `Document` shape.
 *
 * Side effect: a truthy `metadata.screenshot` / `metadata.fullPageScreenshot`
 * is moved onto `doc` itself and deleted from `doc.metadata`, so the input
 * object is mutated.
 *
 * In the returned metadata, `pageError` / `pageStatusCode` are blanked and
 * their values are republished as `error` / `statusCode`.
 *
 * @param doc Legacy document; `doc.metadata` must be an object.
 * @returns The converted document.
 */
export function legacyDocumentConverter(doc: any): Document {
  // Hoist screenshot fields out of metadata onto the document.
  for (const field of ["screenshot", "fullPageScreenshot"]) {
    if (doc.metadata[field]) {
      doc[field] = doc.metadata[field];
      delete doc.metadata[field];
    }
  }

  const metadata = {
    ...doc.metadata,
    pageError: undefined,
    pageStatusCode: undefined,
    error: doc.metadata.pageError,
    statusCode: doc.metadata.pageStatusCode,
  };

  return {
    markdown: doc.markdown,
    links: doc.linksOnPage,
    rawHtml: doc.rawHtml,
    html: doc.html,
    // A regular screenshot wins over a full-page one when both exist.
    screenshot: doc.screenshot ?? doc.fullPageScreenshot,
    metadata,
  };
}

View File

@ -2,7 +2,7 @@ import express from "express";
import bodyParser from "body-parser"; import bodyParser from "body-parser";
import cors from "cors"; import cors from "cors";
import "dotenv/config"; import "dotenv/config";
import { getScrapeQueue, getWebScraperQueue } from "./services/queue-service"; import { getScrapeQueue } from "./services/queue-service";
import { v0Router } from "./routes/v0"; import { v0Router } from "./routes/v0";
import { initSDK } from "@hyperdx/node-opentelemetry"; import { initSDK } from "@hyperdx/node-opentelemetry";
import cluster from "cluster"; import cluster from "cluster";
@ -14,6 +14,8 @@ import http from 'node:http';
import https from 'node:https'; import https from 'node:https';
import CacheableLookup from 'cacheable-lookup'; import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1"; import { v1Router } from "./routes/v1";
import expressWs from "express-ws";
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -46,7 +48,8 @@ if (cluster.isMaster) {
} }
}); });
} else { } else {
const app = express(); const ws = expressWs(express());
const app = ws.app;
global.isProduction = process.env.IS_PRODUCTION === "true"; global.isProduction = process.env.IS_PRODUCTION === "true";
@ -59,7 +62,7 @@ if (cluster.isMaster) {
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
queues: [new BullAdapter(getWebScraperQueue()), new BullAdapter(getScrapeQueue())], queues: [new BullAdapter(getScrapeQueue())],
serverAdapter: serverAdapter, serverAdapter: serverAdapter,
}); });
@ -79,7 +82,7 @@ if (cluster.isMaster) {
// register router // register router
app.use(v0Router); app.use(v0Router);
app.use(v1Router); app.use("/v1", v1Router);
app.use(adminRouter); app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002; const DEFAULT_PORT = process.env.PORT ?? 3002;
@ -106,9 +109,9 @@ if (cluster.isMaster) {
app.get(`/serverHealthCheck`, async (req, res) => { app.get(`/serverHealthCheck`, async (req, res) => {
try { try {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const [waitingJobs] = await Promise.all([ const [waitingJobs] = await Promise.all([
webScraperQueue.getWaitingCount(), scrapeQueue.getWaitingCount(),
]); ]);
const noWaitingJobs = waitingJobs === 0; const noWaitingJobs = waitingJobs === 0;
@ -128,9 +131,9 @@ if (cluster.isMaster) {
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
const getWaitingJobsCount = async () => { const getWaitingJobsCount = async () => {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const [waitingJobsCount] = await Promise.all([ const [waitingJobsCount] = await Promise.all([
webScraperQueue.getWaitingCount(), scrapeQueue.getWaitingCount(),
]); ]);
return waitingJobsCount; return waitingJobsCount;
@ -183,11 +186,12 @@ if (cluster.isMaster) {
Logger.info(`Worker ${process.pid} started`); Logger.info(`Worker ${process.pid} started`);
} }
// const wsq = getWebScraperQueue(); // const sq = getScrapeQueue();
// sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
// sq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
// sq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
// wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
// wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
// wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
// wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

View File

@ -0,0 +1,32 @@
import { checkTeamCredits } from "../services/billing/credit_billing";
import { Logger } from "./logger";
/**
 * Outcome of checkCredits: a numeric status plus an error message, where
 * `error` is null when the check passed.
 */
type checkCreditsResponse = {
status: number;
error: string | null;
}
/**
 * Checks that a team can afford one credit.
 *
 * @param team_id Team whose balance is checked.
 * @returns `{ status: 200, error: null }` when the check succeeds,
 *   status 402 when `checkTeamCredits` reports failure, and status 500
 *   (after logging) when the check itself throws.
 */
export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
  let outcome: checkCreditsResponse = { status: 200, error: null };

  try {
    const check = await checkTeamCredits(team_id, 1);
    if (!check.success) {
      outcome = { status: 402, error: "Insufficient credits" };
    }
  } catch (error) {
    Logger.error(error);
    outcome = {
      status: 500,
      error: "Error checking team credits. Please contact hello@firecrawl.com for help.",
    };
  }

  return outcome;
};

View File

@ -0,0 +1,123 @@
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";
/**
 * Crawl state that saveCrawl serializes to JSON under the `crawl:<id>` key
 * and getCrawl parses back.
 */
export type StoredCrawl = {
// URL the crawl starts from (passed to the crawler as `initialUrl`).
originUrl: string;
// Untyped crawler settings; crawlToCrawler and lockURL read fields from it.
crawlerOptions: any;
pageOptions: any;
team_id: string;
// Optional robots.txt text; crawlToCrawler feeds it to `importRobotsTxt`.
robots?: string;
cancelled?: boolean;
// NOTE(review): numeric timestamp, presumably epoch milliseconds — confirm at the call site.
createdAt: number;
};
/**
 * Stores a crawl as JSON under `crawl:<id>`.
 *
 * A 24-hour expiry is applied with the `NX` flag, i.e. only when the key has
 * no expiry yet.
 *
 * @param id Crawl identifier.
 * @param crawl Crawl state to persist.
 */
export async function saveCrawl(id: string, crawl: StoredCrawl) {
  const crawlKey = "crawl:" + id;
  const oneDayInSeconds = 24 * 60 * 60;

  await redisConnection.set(crawlKey, JSON.stringify(crawl));
  await redisConnection.expire(crawlKey, oneDayInSeconds, "NX");
}
/**
 * Loads the crawl stored under `crawl:<id>`.
 *
 * @param id Crawl identifier.
 * @returns The parsed crawl, or null when the key holds no value.
 */
export async function getCrawl(id: string): Promise<StoredCrawl | null> {
  const serialized = await redisConnection.get("crawl:" + id);
  return serialized === null ? null : JSON.parse(serialized);
}
/**
 * Computes when the `crawl:<id>` key expires, truncated to a whole second.
 *
 * The expiry is the time at which the call started plus the key's remaining
 * TTL as reported by `PTTL`. The sum is done on the epoch timestamp rather
 * than through `Date#setMilliseconds`, which works on local-time fields and
 * would be off by the DST shift whenever the TTL spans a transition in the
 * server's time zone.
 *
 * Negative TTLs (the sentinels `PTTL` uses for "no expiry" and "no such
 * key") are treated as zero, so the result is then the current time.
 *
 * @param id Crawl identifier.
 * @returns The expiry instant with milliseconds set to zero.
 */
export async function getCrawlExpiry(id: string): Promise<Date> {
  const startedAtMs = Date.now();
  const ttlMs = await redisConnection.pttl("crawl:" + id);
  const expiresAtMs = startedAtMs + Math.max(ttlMs, 0);
  return new Date(Math.floor(expiresAtMs / 1000) * 1000);
}
/**
 * Adds one job id to the crawl's `crawl:<id>:jobs` set and gives the set a
 * 24-hour expiry if it has none (`NX`).
 *
 * @param id Crawl identifier.
 * @param job_id Job to register.
 */
export async function addCrawlJob(id: string, job_id: string) {
  const jobsKey = "crawl:" + id + ":jobs";
  const oneDayInSeconds = 24 * 60 * 60;

  await redisConnection.sadd(jobsKey, job_id);
  await redisConnection.expire(jobsKey, oneDayInSeconds, "NX");
}
/**
 * Adds several job ids to the crawl's `crawl:<id>:jobs` set and gives the
 * set a 24-hour expiry if it has none (`NX`).
 *
 * An empty `job_ids` is a no-op: spreading it would issue `SADD` with no
 * members, which Redis rejects as a wrong-arity command.
 *
 * @param id Crawl identifier.
 * @param job_ids Jobs to register; may be empty.
 */
export async function addCrawlJobs(id: string, job_ids: string[]) {
  if (job_ids.length === 0) {
    return;
  }

  const jobsKey = "crawl:" + id + ":jobs";
  await redisConnection.sadd(jobsKey, ...job_ids);
  await redisConnection.expire(jobsKey, 24 * 60 * 60, "NX");
}
/**
 * Records a job as done for a crawl.
 *
 * The job id is added to the `jobs_done` set and pushed onto the head of the
 * `jobs_done_ordered` list; both keys then get a 24-hour expiry if they have
 * none (`NX`).
 *
 * @param id Crawl identifier.
 * @param job_id Job that finished.
 */
export async function addCrawlJobDone(id: string, job_id: string) {
  const doneSetKey = "crawl:" + id + ":jobs_done";
  const doneListKey = "crawl:" + id + ":jobs_done_ordered";
  const oneDayInSeconds = 24 * 60 * 60;

  await redisConnection.sadd(doneSetKey, job_id);
  await redisConnection.lpush(doneListKey, job_id);
  await redisConnection.expire(doneSetKey, oneDayInSeconds, "NX");
  await redisConnection.expire(doneListKey, oneDayInSeconds, "NX");
}
/**
 * Returns the length of the crawl's `jobs_done_ordered` list.
 *
 * @param id Crawl identifier.
 */
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
  const doneListKey = "crawl:" + id + ":jobs_done_ordered";
  return redisConnection.llen(doneListKey);
}
/**
 * Reads a slice of the crawl's `jobs_done_ordered` list via `LRANGE`.
 *
 * @param id Crawl identifier.
 * @param start First index of the range (default 0).
 * @param end Last index of the range (default -1, the final element).
 * @returns The job ids in the requested range.
 */
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
  const doneListKey = "crawl:" + id + ":jobs_done_ordered";
  return redisConnection.lrange(doneListKey, start, end);
}
/**
 * Reports whether every registered job of a crawl is done, i.e. whether the
 * `jobs_done` set and the `jobs` set have the same cardinality.
 *
 * The two counts are read one after the other, not atomically.
 *
 * @param id Crawl identifier.
 */
export async function isCrawlFinished(id: string) {
  const keyPrefix = "crawl:" + id;
  const doneCount = await redisConnection.scard(keyPrefix + ":jobs_done");
  const totalCount = await redisConnection.scard(keyPrefix + ":jobs");
  return doneCount === totalCount;
}
/**
 * Checks for the `crawl:<id>:finish` marker that finishCrawl sets.
 *
 * @param id Crawl identifier.
 * @returns Whatever the client's `exists` call resolves to for that key,
 *   passed through unchanged.
 */
export async function isCrawlFinishedLocked(id: string) {
  const finishKey = "crawl:" + id + ":finish";
  return redisConnection.exists(finishKey);
}
/**
 * Tries to claim the "finished" marker of a crawl.
 *
 * When isCrawlFinished reports that all jobs are done, the
 * `crawl:<id>:finish` key is set with `SETNX`. Only the caller whose `SETNX`
 * actually created the key gets `true`, and that caller also gives the key a
 * 24-hour expiry.
 *
 * The "crawl not finished yet" path now returns an explicit `false`;
 * previously it fell off the end of the function and resolved to
 * `undefined`.
 *
 * @param id Crawl identifier.
 * @returns `true` only for the call that created the marker, else `false`.
 */
export async function finishCrawl(id: string): Promise<boolean> {
  if (!(await isCrawlFinished(id))) {
    return false;
  }

  const finishKey = "crawl:" + id + ":finish";
  const claimed = (await redisConnection.setnx(finishKey, "yes")) === 1;
  if (claimed) {
    await redisConnection.expire(finishKey, 24 * 60 * 60);
  }
  return claimed;
}
/**
 * Lists the members of the crawl's `crawl:<id>:jobs` set.
 *
 * @param id Crawl identifier.
 */
export async function getCrawlJobs(id: string): Promise<string[]> {
  const jobsKey = "crawl:" + id + ":jobs";
  return redisConnection.smembers(jobsKey);
}
/**
 * Claims a URL for a crawl by adding it to the `crawl:<id>:visited` set.
 *
 * When `sc.crawlerOptions.limit` is a number and the visited set has already
 * reached it, nothing is added and `false` is returned. Otherwise the URL is
 * added and the set gets a 24-hour expiry if it has none (`NX`).
 *
 * The size check and the insert are separate commands, not one atomic step.
 *
 * @param id Crawl identifier.
 * @param sc Stored crawl whose options may carry a numeric `limit`.
 * @param url URL to claim.
 * @returns `true` when the URL was newly added, `false` when it was already
 *   present or the limit was reached.
 */
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
  const visitedKey = "crawl:" + id + ":visited";
  const limit = sc.crawlerOptions?.limit;

  if (typeof limit === "number") {
    const visitedCount = await redisConnection.scard(visitedKey);
    if (visitedCount >= limit) {
      return false;
    }
  }

  const addedCount = await redisConnection.sadd(visitedKey, url);
  await redisConnection.expire(visitedKey, 24 * 60 * 60, "NX");
  return addedCount !== 0;
}
/**
 * Claims several URLs at once by adding them to the `crawl:<id>:visited`
 * set, then gives the set a 24-hour expiry if it has none (`NX`).
 *
 * NOTE: does not check limit. only use if limit is checked beforehand e.g.
 * with sitemap
 *
 * An empty `urls` returns `false` without touching Redis: spreading it would
 * issue `SADD` with no members, which Redis rejects as a wrong-arity command.
 *
 * @param id Crawl identifier.
 * @param urls URLs to claim; may be empty.
 * @returns `true` when at least one URL was newly added.
 */
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
  if (urls.length === 0) {
    return false;
  }

  const visitedKey = "crawl:" + id + ":visited";
  const addedCount = await redisConnection.sadd(visitedKey, ...urls);
  await redisConnection.expire(visitedKey, 24 * 60 * 60, "NX");
  return addedCount !== 0;
}
/**
 * Builds a WebCrawler from stored crawl state.
 *
 * Options missing from `sc.crawlerOptions` fall back to the defaults written
 * out below. When `sc.robots` is set it is imported into the crawler on a
 * best-effort basis: an import failure is swallowed and the crawler is
 * returned anyway.
 *
 * @param id Crawl identifier, used as the crawler's `jobId`.
 * @param sc Stored crawl state.
 * @returns The configured crawler.
 */
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
  const options = sc.crawlerOptions;

  const crawler = new WebCrawler({
    jobId: id,
    initialUrl: sc.originUrl,
    includes: options?.includes ?? [],
    excludes: options?.excludes ?? [],
    maxCrawledLinks: options?.maxCrawledLinks ?? 1000,
    // The depth limit is read from `maxDepth`, not `maxCrawledDepth`.
    maxCrawledDepth: options?.maxDepth ?? 10,
    limit: options?.limit ?? 10000,
    generateImgAltText: options?.generateImgAltText ?? false,
    allowBackwardCrawling: options?.allowBackwardCrawling ?? false,
    allowExternalContentLinks: options?.allowExternalContentLinks ?? false,
  });

  if (sc.robots === undefined) {
    return crawler;
  }

  try {
    crawler.importRobotsTxt(sc.robots);
  } catch (_) {}

  return crawler;
}

View File

@ -7,6 +7,7 @@ export const defaultPageOptions = {
includeHtml: false, includeHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
fullPageScreenshot: false,
parsePDF: true parsePDF: true
}; };

View File

@ -11,6 +11,7 @@ export interface Progress {
} }
export type PageOptions = { export type PageOptions = {
includeMarkdown?: boolean;
onlyMainContent?: boolean; onlyMainContent?: boolean;
includeHtml?: boolean; includeHtml?: boolean;
includeRawHtml?: boolean; includeRawHtml?: boolean;
@ -18,11 +19,13 @@ export type PageOptions = {
fetchPageContent?: boolean; fetchPageContent?: boolean;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean;
headers?: Record<string, string>; headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean; parsePDF?: boolean;
removeTags?: string | string[]; removeTags?: string | string[];
onlyIncludeTags?: string | string[]; onlyIncludeTags?: string | string[];
includeLinks?: boolean;
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
@ -42,8 +45,8 @@ export type SearchOptions = {
export type CrawlerOptions = { export type CrawlerOptions = {
returnOnlyUrls?: boolean; returnOnlyUrls?: boolean;
includes?: string[]; includes?: string | string[];
excludes?: string[]; excludes?: string | string[];
maxCrawledLinks?: number; maxCrawledLinks?: number;
maxDepth?: number; maxDepth?: number;
limit?: number; limit?: number;
@ -64,6 +67,7 @@ export type WebScraperOptions = {
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;
concurrentRequests?: number; concurrentRequests?: number;
bullJobId?: string; bullJobId?: string;
priority?: number;
}; };
export interface DocumentUrl { export interface DocumentUrl {

View File

@ -46,7 +46,7 @@ export class ScrapeEvents {
}).select().single(); }).select().single();
return (result.data as any).id; return (result.data as any).id;
} catch (error) { } catch (error) {
Logger.error(`Error inserting scrape event: ${error}`); // Logger.error(`Error inserting scrape event: ${error}`);
return null; return null;
} }
} }

View File

@ -17,3 +17,21 @@ export const supabaseGetJobById = async (jobId: string) => {
return data; return data;
} }
/**
 * Fetches the `firecrawl_jobs` rows whose `job_id` is in `jobIds`.
 *
 * @param jobIds Job ids to look up.
 * @returns The matching rows, or an empty array when the query reports an
 *   error or yields no data. The error itself is not surfaced.
 */
export const supabaseGetJobsById = async (jobIds: string[]) => {
  const result = await supabase_service
    .from('firecrawl_jobs')
    .select('*')
    .in('job_id', jobIds);

  if (result.error || !result.data) {
    return [];
  }
  return result.data;
}

View File

@ -0,0 +1,88 @@
import { isSameDomain } from "./validateUrl";
import { isSameSubdomain } from "./validateUrl";
// isSameDomain compares the last two host labels after dropping a leading
// "www.", so any subdomain of the base URL's domain counts as the same domain.
describe("isSameDomain", () => {
  it("should return true for a subdomain", () => {
    const result = isSameDomain("http://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same domain", () => {
    const result = isSameDomain("http://example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for different domains", () => {
    const result = isSameDomain("http://example.com", "http://another.com");
    expect(result).toBe(false);
  });

  it("should return true for a subdomain with different protocols", () => {
    const result = isSameDomain("https://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for invalid URLs", () => {
    const result = isSameDomain("invalid-url", "http://example.com");
    expect(result).toBe(false);
    const result2 = isSameDomain("http://example.com", "invalid-url");
    expect(result2).toBe(false);
  });

  it("should return true for a subdomain with www prefix", () => {
    const result = isSameDomain("http://www.sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  // This title used to sit on the nested-subdomain case below, so the plain
  // www-prefixed domain was never actually exercised.
  it("should return true for the same domain with www prefix", () => {
    const result = isSameDomain("http://www.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for a deeply nested subdomain", () => {
    const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
    expect(result).toBe(true);
  });
});
// isSameSubdomain requires both the domain and the subdomain labels to match;
// a leading "www." is ignored on either side.
describe("isSameSubdomain", () => {
  it("should return false for a subdomain", () => {
    expect(isSameSubdomain("http://example.com", "http://docs.example.com")).toBe(false);
  });

  it("should return true for the same subdomain", () => {
    expect(isSameSubdomain("http://docs.example.com", "http://docs.example.com")).toBe(true);
  });

  it("should return false for different subdomains", () => {
    expect(isSameSubdomain("http://docs.example.com", "http://blog.example.com")).toBe(false);
  });

  it("should return false for different domains", () => {
    expect(isSameSubdomain("http://example.com", "http://another.com")).toBe(false);
  });

  it("should return false for invalid URLs", () => {
    // Either argument being unparsable is enough to fail the comparison.
    expect(isSameSubdomain("invalid-url", "http://example.com")).toBe(false);
    expect(isSameSubdomain("http://example.com", "invalid-url")).toBe(false);
  });

  it("should return true for the same subdomain with different protocols", () => {
    expect(isSameSubdomain("https://docs.example.com", "http://docs.example.com")).toBe(true);
  });

  it("should return true for the same subdomain with www prefix", () => {
    expect(isSameSubdomain("http://www.docs.example.com", "http://docs.example.com")).toBe(true);
  });

  it("should return false for a subdomain with www prefix and different subdomain", () => {
    expect(isSameSubdomain("http://www.docs.example.com", "http://blog.example.com")).toBe(false);
  });
});

View File

@ -1,9 +1,8 @@
const protocolIncluded = (url: string) => { const protocolIncluded = (url: string) => {
// if :// not in the start of the url assume http (maybe https?) // if :// not in the start of the url assume http (maybe https?)
// regex checks if :// appears before any . // regex checks if :// appears before any .
return(/^([^.:]+:\/\/)/.test(url)); return /^([^.:]+:\/\/)/.test(url);
} };
const getURLobj = (s: string) => { const getURLobj = (s: string) => {
// URL fails if we dont include the protocol ie google.com // URL fails if we dont include the protocol ie google.com
@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
}; };
export const checkAndUpdateURL = (url: string) => { export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) { if (!protocolIncluded(url)) {
url = `http://${url}`; url = `http://${url}`;
} }
@ -30,9 +28,95 @@ export const checkAndUpdateURL = (url: string) => {
const typedUrlObj = urlObj as URL; const typedUrlObj = urlObj as URL;
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") { if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL"); throw new Error("Invalid URL");
} }
return { urlObj: typedUrlObj, url: url }; return { urlObj: typedUrlObj, url: url };
};
/**
 * Tells whether two URLs belong to the same domain.
 *
 * A leading "www." is dropped from each hostname, then only the last two
 * dot-separated labels are compared, so every subdomain of a domain matches
 * that domain. Because exactly two labels are kept, hosts under a
 * multi-label suffix (for example `a.co.uk` and `b.co.uk`) also compare
 * equal.
 *
 * @param url URL to test.
 * @param baseUrl URL to compare against.
 * @returns `true` when both URLs parse and share the same last two host
 *   labels; `false` otherwise, including when either URL fails to parse.
 */
export function isSameDomain(url: string, baseUrl: string) {
  const parsedUrl = getURLobj(url);
  const parsedBase = getURLobj(baseUrl);
  if (parsedUrl.error || parsedBase.error) {
    return false;
  }

  const lastTwoLabels = (target: URL) => {
    const host = target.hostname;
    const withoutWww = host.startsWith('www.') ? host.slice(4) : host;
    return withoutWww.split('.').slice(-2).join('.');
  };

  return lastTwoLabels(parsedUrl.urlObj as URL) === lastTwoLabels(parsedBase.urlObj as URL);
}
/**
 * Tells whether two URLs share both their domain and their subdomain.
 *
 * A leading "www." is dropped from each hostname. The last two
 * dot-separated labels are treated as the domain and everything before them
 * as the subdomain; both parts must match.
 *
 * @param url URL to test.
 * @param baseUrl URL to compare against.
 * @returns `true` when both URLs parse and their domain and subdomain parts
 *   are equal; `false` otherwise, including when either URL fails to parse.
 */
export function isSameSubdomain(url: string, baseUrl: string) {
  const parsedUrl = getURLobj(url);
  const parsedBase = getURLobj(baseUrl);
  if (parsedUrl.error || parsedBase.error) {
    return false;
  }

  const splitHost = (target: URL) => {
    const host = target.hostname;
    const labels = (host.startsWith('www.') ? host.slice(4) : host).split('.');
    return {
      domain: labels.slice(-2).join('.'),
      subdomain: labels.slice(0, -2).join('.'),
    };
  };

  const left = splitHost(parsedUrl.urlObj as URL);
  const right = splitHost(parsedBase.urlObj as URL);

  // Both the domain and the subdomain have to agree.
  return left.domain === right.domain && left.subdomain === right.subdomain;
}
/**
 * Validates and normalizes a URL for the map endpoint.
 *
 * Steps: prepend `http://` when no protocol is present, drop one trailing
 * slash, parse and require an http(s) protocol, then drop the query string.
 *
 * When a query string was removed, a trailing slash that it had been hiding
 * is dropped as well, so `https://a.com/docs/?p=1` normalizes to the same
 * string as `https://a.com/docs/`. Previously the slash survived in that
 * case.
 *
 * The returned `urlObj` is parsed before the query is removed, so it still
 * carries the query string.
 *
 * @param url URL to normalize, with or without a protocol.
 * @returns The parsed URL object and the normalized URL string.
 * @throws Error("Invalid URL") when the URL cannot be parsed or is not http(s).
 */
export const checkAndUpdateURLForMap = (url: string) => {
  const dropTrailingSlash = (value: string) =>
    value.endsWith("/") ? value.slice(0, -1) : value;

  if (!protocolIncluded(url)) {
    url = `http://${url}`;
  }
  // remove last slash if present
  url = dropTrailingSlash(url);

  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;
  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // remove any query params; a slash in front of the "?" is trimmed too so
  // the result does not depend on whether a query was present
  const hadQuery = url.includes("?");
  url = url.split("?")[0].trim();
  if (hadQuery) {
    url = dropTrailingSlash(url);
  }

  return { urlObj: typedUrlObj, url: url };
};

View File

@ -12,7 +12,7 @@ import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase"; import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events"; import { ScrapeEvents } from "../lib/scrape-events";
import { getWebScraperQueue } from "../services/queue-service"; import { getScrapeQueue } from "../services/queue-service";
export async function startWebScraperPipeline({ export async function startWebScraperPipeline({
job, job,
@ -27,7 +27,12 @@ export async function startWebScraperPipeline({
mode: job.data.mode, mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
extractorOptions: job.data.extractorOptions, extractorOptions: job.data.extractorOptions,
pageOptions: job.data.pageOptions, pageOptions: {
...job.data.pageOptions,
...(job.data.crawl_id ? ({
includeRawHtml: true,
}): {}),
},
inProgress: (progress) => { inProgress: (progress) => {
Logger.debug(`🐂 Job in progress ${job.id}`); Logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) { if (progress.currentDocument) {
@ -35,7 +40,7 @@ export async function startWebScraperPipeline({
if (partialDocs.length > 50) { if (partialDocs.length > 50) {
partialDocs = partialDocs.slice(-50); partialDocs = partialDocs.slice(-50);
} }
job.updateProgress({ ...progress, partialDocs: partialDocs }); // job.updateProgress({ ...progress, partialDocs: partialDocs });
} }
}, },
onSuccess: (result, mode) => { onSuccess: (result, mode) => {
@ -49,6 +54,7 @@ export async function startWebScraperPipeline({
}, },
team_id: job.data.team_id, team_id: job.data.team_id,
bull_job_id: job.id.toString(), bull_job_id: job.id.toString(),
priority: job.opts.priority,
})) as { success: boolean; message: string; docs: Document[] }; })) as { success: boolean; message: string; docs: Document[] };
} }
export async function runWebScraper({ export async function runWebScraper({
@ -62,6 +68,7 @@ export async function runWebScraper({
onError, onError,
team_id, team_id,
bull_job_id, bull_job_id,
priority,
}: RunWebScraperParams): Promise<RunWebScraperResult> { }: RunWebScraperParams): Promise<RunWebScraperResult> {
try { try {
const provider = new WebScraperDataProvider(); const provider = new WebScraperDataProvider();
@ -74,6 +81,7 @@ export async function runWebScraper({
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, pageOptions: pageOptions,
bullJobId: bull_job_id, bullJobId: bull_job_id,
priority,
}); });
} else { } else {
await provider.setOptions({ await provider.setOptions({
@ -83,6 +91,7 @@ export async function runWebScraper({
extractorOptions, extractorOptions,
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, pageOptions: pageOptions,
priority,
}); });
} }
const docs = (await provider.getDocuments(false, (progress: Progress) => { const docs = (await provider.getDocuments(false, (progress: Progress) => {
@ -104,11 +113,8 @@ export async function runWebScraper({
return { url: doc.metadata.sourceURL }; return { url: doc.metadata.sourceURL };
} }
}) })
: docs.filter((doc) => doc.content.trim().length > 0); : docs;
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + bull_job_id);
if (!isCancelled) {
const billingResult = await billTeam(team_id, filteredDocs.length); const billingResult = await billTeam(team_id, filteredDocs.length);
if (!billingResult.success) { if (!billingResult.success) {
@ -119,7 +125,6 @@ export async function runWebScraper({
docs: [], docs: [],
}; };
} }
}
// This is where the returnvalue from the job is set // This is where the returnvalue from the job is set
onSuccess(filteredDocs, mode); onSuccess(filteredDocs, mode);
@ -141,21 +146,21 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
.eq("job_id", job.id); .eq("job_id", job.id);
if (error) throw new Error(error.message); if (error) throw new Error(error.message);
try { // try {
if (mode === "crawl") { // if (mode === "crawl") {
await job.moveToCompleted(null, token, false); // await job.moveToCompleted(null, token, false);
} else { // } else {
await job.moveToCompleted(result, token, false); // await job.moveToCompleted(result, token, false);
} // }
} catch (error) { // } catch (error) {
// I think the job won't exist here anymore // // I think the job won't exist here anymore
} // }
} else { // } else {
try { // try {
await job.moveToCompleted(result, token, false); // await job.moveToCompleted(result, token, false);
} catch (error) { // } catch (error) {
// I think the job won't exist here anymore // // I think the job won't exist here anymore
} // }
} }
ScrapeEvents.logJobEvent(job, "completed"); ScrapeEvents.logJobEvent(job, "completed");
} catch (error) { } catch (error) {

View File

@ -1,10 +1,10 @@
import express from "express"; import express from "express";
import { redisHealthController } from "../controllers/admin/redis-health"; import { redisHealthController } from "../controllers/v0/admin/redis-health";
import { import {
checkQueuesController, checkQueuesController,
cleanBefore24hCompleteJobsController, cleanBefore24hCompleteJobsController,
queuesController, queuesController,
} from "../controllers/admin/queue"; } from "../controllers/v0/admin/queue";
export const adminRouter = express.Router(); export const adminRouter = express.Router();

View File

@ -1,14 +1,14 @@
import express from "express"; import express from "express";
import { crawlController } from "../../src/controllers/crawl"; import { crawlController } from "../../src/controllers/v0/crawl";
import { crawlStatusController } from "../../src/controllers/crawl-status"; import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
import { scrapeController } from "../../src/controllers/scrape"; import { scrapeController } from "../../src/controllers/v0/scrape";
import { crawlPreviewController } from "../../src/controllers/crawlPreview"; import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
import { crawlJobStatusPreviewController } from "../../src/controllers/status"; import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
import { searchController } from "../../src/controllers/search"; import { searchController } from "../../src/controllers/v0/search";
import { crawlCancelController } from "../../src/controllers/crawl-cancel"; import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
import { keyAuthController } from "../../src/controllers/keyAuth"; import { keyAuthController } from "../../src/controllers/v0/keyAuth";
import { livenessController } from "../controllers/liveness"; import { livenessController } from "../controllers/v0/liveness";
import { readinessController } from "../controllers/readiness"; import { readinessController } from "../controllers/v0/readiness";
export const v0Router = express.Router(); export const v0Router = express.Router();

View File

@ -1,9 +1,21 @@
import express from "express"; import express, { NextFunction, Request, Response } from "express";
import { crawlController } from "../../src/controllers/v1/crawl"; import { crawlController } from "../../src/controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status"; // import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape"; import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status"; import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map"; import { mapController } from "../../src/controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/v1/auth";
import { Logger } from "../lib/logger";
import { createIdempotencyKey } from "../services/idempotency/create";
import { validateIdempotencyKey } from "../services/idempotency/validate";
import { ZodError } from "zod";
import { checkTeamCredits } from "../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import expressWs from "express-ws";
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search"; // import { searchController } from "../../src/controllers/v1/search";
@ -12,23 +24,142 @@ import { mapController } from "../../src/controllers/v1/map";
// import { livenessController } from "../controllers/v1/liveness"; // import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness"; // import { readinessController } from "../controllers/v1/readiness";
/**
 * Builds a middleware that rejects requests whose team cannot afford them.
 *
 * When `minimum` is omitted (or 0) and the request has a body, the required
 * amount is taken from `req.body.limit`, defaulting to 1. The amount is
 * computed into a per-request local. It must not be written back into
 * `minimum`, because that variable is shared by every request handled by
 * this middleware instance — doing so made the first request's `limit`
 * apply to all later requests.
 *
 * On success `req.account.remainingCredits` is populated and `next()` is
 * called. On insufficient credits a 402 JSON error is sent. Any thrown error
 * is forwarded to `next`.
 *
 * @param minimum Fixed number of credits to require; omit to derive it from
 *   the request body.
 * @returns The Express middleware.
 */
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      const required = !minimum && req.body
        ? ((req.body as any)?.limit ?? 1)
        : minimum;

      const { success, remainingCredits } = await checkTeamCredits(req.auth.team_id, required);
      if (!success) {
        return res.status(402).json({ success: false, error: "Insufficient credits" });
      }
      req.account = { remainingCredits };
      next();
    })()
      .catch(err => next(err));
  };
}
/**
 * Builds a middleware that authenticates the request via `authenticateUser`.
 *
 * On success the team id and plan are stored on `req.auth` and `next()` is
 * called. On failure the status and error reported by `authenticateUser` are
 * sent as a JSON error body. Any thrown error is forwarded to `next`.
 *
 * @param rateLimiterMode Mode passed through to `authenticateUser`.
 * @returns The Express middleware.
 */
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    const authenticate = async () => {
      const outcome = await authenticateUser(req, res, rateLimiterMode);

      if (!outcome.success) {
        return res.status(outcome.status).json({ success: false, error: outcome.error });
      }

      req.auth = { team_id: outcome.team_id, plan: outcome.plan };
      next();
    };

    authenticate().catch(err => next(err));
  };
}
/**
 * Enforces the optional `x-idempotency-key` request header.
 *
 * Requests without the header pass straight through. With the header, the
 * key is validated first: an invalid (already used) key produces a 409 JSON
 * error, a valid one is recorded and the request continues. Any thrown
 * error is forwarded to `next`.
 *
 * NOTE(review): `createIdempotencyKey` is called without `await`, so if it
 * is asynchronous its completion and failures are not observed here —
 * confirm that is intended.
 */
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
  const enforce = async () => {
    if (!req.headers["x-idempotency-key"]) {
      next();
      return;
    }

    const keyIsUnused = await validateIdempotencyKey(req);
    if (!keyIsUnused) {
      return res.status(409).json({ success: false, error: "Idempotency key already used" });
    }

    createIdempotencyKey(req);
    next();
  };

  enforce().catch(err => next(err));
}
/**
 * Rejects requests whose `body.url` is on the blocklist with a 403 JSON
 * error; everything else continues to the next handler.
 *
 * The body is read with optional chaining so that a request that arrives
 * without a parsed body continues down the chain instead of throwing a
 * TypeError here.
 */
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
  const url = req.body?.url;
  if (url && isUrlBlocked(url)) {
    return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
  }
  next();
}
/**
 * Adapts a promise-returning controller to an Express handler: the
 * controller is invoked with `(req, res)` and a rejection of its promise is
 * forwarded to `next`.
 *
 * @param controller Async controller to adapt.
 * @returns A handler that returns nothing and never rejects.
 */
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
  return function wrappedController(req, res, next) {
    const pending = controller(req, res);
    pending.catch((reason) => {
      next(reason);
    });
  };
}
expressWs(express());
export const v1Router = express.Router(); export const v1Router = express.Router();
v1Router.post("/v1/scrape", scrapeController); v1Router.post(
v1Router.post("/v1/crawl", crawlController); "/scrape",
v1Router.get("/v1/crawl/:jobId", crawlStatusController); blocklistMiddleware,
// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController); authMiddleware(RateLimiterMode.Scrape),
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController); checkCreditsMiddleware(1),
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController); wrap(scrapeController)
);
v1Router.post(
"/crawl",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
idempotencyMiddleware,
checkCreditsMiddleware(),
wrap(crawlController)
);
v1Router.post(
"/map",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
wrap(mapController)
);
v1Router.get(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
);
v1Router.ws(
"/crawl/:jobId",
crawlStatusWSController
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/crawl/:jobId", crawlCancelController);
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication // // Auth route for key based authentication
// v1Router.get("/v1/keyAuth", keyAuthController); // v1Router.get("/keyAuth", keyAuthController);
// // Search routes // // Search routes
// v0Router.post("/v1/search", searchController); // v0Router.post("/search", searchController);
// Health/Probe routes // Health/Probe routes
// v1Router.get("/v1/health/liveness", livenessController); // v1Router.get("/health/liveness", livenessController);
// v1Router.get("/v1/health/readiness", readinessController); // v1Router.get("/health/readiness", readinessController);
v1Router.post("/v1/map", mapController); v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
if (err instanceof ZodError) {
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
} else {
const id = uuidv4();
let verbose = JSON.stringify(err);
if (verbose === "{}") {
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
}
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
}
});

175
apps/api/src/run-req.ts Normal file
View File

@ -0,0 +1,175 @@
import axios from "axios";
import { promises as fs } from "fs";
import { v4 as uuidV4 } from "uuid";
/** One start URL from the input file plus the crawl state recorded for it. */
interface Result {
// URL submitted as the crawl target.
start_url: string;
// Job id returned by the crawl request; unset until a crawl was accepted.
job_id?: string;
// UUID generated by sendCrawl for this request; set only after a successful request.
idempotency_key?: string;
// Crawl output stored by getContent: the status response's data on completion, null on timeout.
result_data_jsonb?: any;
}
/**
 * Submits a crawl job for `result.start_url` to the staging v0 crawl endpoint.
 *
 * A fresh UUID is generated per call and sent as the `x-idempotency-key`
 * request header. Previously the key was generated and recorded on `result`
 * but never transmitted, so the recorded key had no effect on the request.
 *
 * @param result Entry whose `start_url` is crawled; on success its
 *   `idempotency_key` is set to the key that was sent.
 * @returns The job id from the response, or `undefined` when the request
 *   failed (the error is logged, not rethrown).
 */
async function sendCrawl(result: Result): Promise<string | undefined> {
  const idempotencyKey = uuidV4();

  const payload = {
    url: result.start_url,
    crawlerOptions: {
      limit: 75,
    },
    pageOptions: {
      includeHtml: true,
      replaceAllPathsWithAbsolutePaths: true,
      waitFor: 1000,
    },
  };

  try {
    const response = await axios.post(
      "https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
      payload,
      {
        headers: {
          "Content-Type": "application/json",
          // NOTE(review): the bearer token is empty in the committed script;
          // presumably an API key has to be filled in before running — confirm.
          Authorization: `Bearer `,
          "x-idempotency-key": idempotencyKey,
        },
      }
    );
    // Record the key only once the request was accepted.
    result.idempotency_key = idempotencyKey;
    return response.data.jobId;
  } catch (error) {
    console.error("Error sending crawl:", error);
    return undefined;
  }
}
/**
 * Polls the staging v0 crawl-status endpoint until the job for `result`
 * reports `"completed"` or the attempt budget is exhausted.
 *
 * @param result Entry whose `job_id` identifies the crawl. On completion
 *   `result_data_jsonb` receives the response's `data`; otherwise it is set
 *   to `null`.
 * @returns `true` when the job completed, `false` when there is no job id or
 *   polling gave up.
 */
async function getContent(result: Result): Promise<boolean> {
  const maxAttempts = 120;

  // Without a job id the status URL would end in "/undefined" and every
  // attempt would be wasted, so give up immediately.
  if (!result.job_id) {
    result.result_data_jsonb = null;
    return false;
  }

  const statusUrl = `https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`;

  for (let attempts = 0; attempts < maxAttempts; attempts++) {
    try {
      const response = await axios.get(statusUrl, {
        headers: {
          "Content-Type": "application/json",
          // NOTE(review): empty bearer token, as in the request that starts the crawl — confirm.
          Authorization: `Bearer `,
        },
      });

      if (response.data.status === "completed") {
        result.result_data_jsonb = response.data.data;
        return true;
      }
    } catch (error) {
      // A failed status request counts as an attempt; keep polling.
      console.error("Error getting content:", error);
    }

    // Randomised pause of 5 to just under 20 seconds between polls.
    const randomSleep = Math.floor(Math.random() * 15000) + 5000;
    await new Promise((resolve) => setTimeout(resolve, randomSleep));
  }

  // Timed out: mark the result as having no data.
  result.result_data_jsonb = null;
  return false;
}
/**
 * Requests a crawl for every entry in `results`, in batches of 100 started
 * one minute apart, and keeps a JSON snapshot of `{ start_url, job_id }` for
 * all entries on disk.
 *
 * Differences from the earlier version of this function:
 * - the returned promise settles only after every batch has finished;
 * - there is no one-minute wait after the final batch;
 * - snapshot writes are serialised instead of racing on the same file;
 * - the progress log reports the real number of entries handled;
 * - the write-error log names the file that is actually written.
 *
 * @param results Entries to crawl; each entry's `job_id` is filled in when
 *   its crawl request is accepted.
 */
async function processResults(results: Result[]): Promise<void> {
  const batchSize = 100;
  const batchIntervalMs = 60 * 1000;
  const outputFile = "results_with_job_id_4000_6000.json";

  let starterCount = 0;

  // Each snapshot write is chained onto the previous one so that concurrent
  // completions never write the output file at the same time.
  let pendingWrite: Promise<void> = Promise.resolve();
  const saveSnapshot = (): Promise<void> => {
    pendingWrite = pendingWrite.then(async () => {
      try {
        const resultWithJobId = results.map((r) => ({
          start_url: r.start_url,
          job_id: r.job_id,
        }));
        await fs.writeFile(outputFile, JSON.stringify(resultWithJobId, null, 4));
      } catch (error) {
        // Swallowed on purpose: a failed snapshot must not break the chain.
        console.error(`Error writing to ${outputFile}:`, error);
      }
    });
    return pendingWrite;
  };

  // Requests one crawl and, if it was accepted, records the job id.
  const processSingleResult = async (result: Result): Promise<void> => {
    const jobId = await sendCrawl(result);
    if (!jobId) {
      return;
    }
    starterCount++;
    console.log(`Job requested count: ${starterCount}`);
    result.job_id = jobId;
    await saveSnapshot();
  };

  // Runs one batch concurrently and logs its outcome; never rejects.
  const runBatch = async (batch: Result[], startIndex: number): Promise<void> => {
    try {
      await Promise.all(batch.map((entry) => processSingleResult(entry)));
      console.log(`Processed ${startIndex + batch.length} results.`);
    } catch (error) {
      console.error(`Error processing batch starting at index ${startIndex}:`, error);
    }
  };

  const inFlight: Promise<void>[] = [];
  for (let i = 0; i < results.length; i += batchSize) {
    // Batches are started without waiting for the previous one to finish;
    // only the start times are spaced out.
    inFlight.push(runBatch(results.slice(i, i + batchSize), i));

    const moreBatchesFollow = i + batchSize < results.length;
    if (moreBatchesFollow) {
      await new Promise((resolve) => setTimeout(resolve, batchIntervalMs));
    }
  }

  await Promise.all(inFlight);
}
/**
 * Loads the list of start URLs from `starturls.json` in the current working
 * directory.
 *
 * @returns The parsed entries, or an empty array when the file cannot be
 *   read, is not valid JSON, or does not contain a JSON array. Problems are
 *   logged rather than thrown.
 */
async function getStartUrls(): Promise<Result[]> {
  try {
    const data = await fs.readFile("starturls.json", "utf-8");
    const parsed: unknown = JSON.parse(data);
    // Callers slice and iterate the result, so anything but an array is unusable.
    if (!Array.isArray(parsed)) {
      console.error("Error reading starturls.json:", new Error("expected a JSON array"));
      return [];
    }
    return parsed;
  } catch (error) {
    console.error("Error reading starturls.json:", error);
    return [];
  }
}
/**
 * Script driver: loads the start URLs, selects the entries at indices
 * 3999 through 5999, and requests crawls for them. The outcome is logged;
 * a failure while processing is logged instead of being rethrown.
 */
async function main() {
  const allStartUrls = await getStartUrls();
  const results: Result[] = allStartUrls.slice(3999, 6000);

  try {
    await processResults(results);
    console.log("All results processed.");
  } catch (error) {
    console.error("Error processing results:", error);
  }
}
// Script entry point. The promise returned by main() is neither awaited nor
// given a rejection handler here.
main();

View File

@ -23,8 +23,8 @@ describe('scrapSingleUrl', () => {
}, 10000); }, 10000);
}); });
it('should return a list of links on the mendable.ai page', async () => { it('should return a list of links on the firecrawl.ai page', async () => {
const url = 'https://mendable.ai'; const url = 'https://flutterbricks.com';
const pageOptions: PageOptions = { includeHtml: true }; const pageOptions: PageOptions = { includeHtml: true };
const result = await scrapSingleUrl("TEST", url, pageOptions); const result = await scrapSingleUrl("TEST", url, pageOptions);
@ -33,5 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => {
expect(result.linksOnPage).toBeDefined(); expect(result.linksOnPage).toBeDefined();
expect(Array.isArray(result.linksOnPage)).toBe(true); expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0); expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://mendable.ai/blog') expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
}, 10000); }, 10000);

View File

@ -1,4 +1,4 @@
import axios from "axios"; import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio"; import cheerio, { load } from "cheerio";
import { URL } from "url"; import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
@ -22,7 +22,7 @@ export class WebCrawler {
private crawledUrls: Map<string, string> = new Map(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; public robots: any;
private generateImgAltText: boolean; private generateImgAltText: boolean;
private allowBackwardCrawling: boolean; private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean; private allowExternalContentLinks: boolean;
@ -66,7 +66,7 @@ export class WebCrawler {
this.allowExternalContentLinks = allowExternalContentLinks ?? false; this.allowExternalContentLinks = allowExternalContentLinks ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
const url = new URL(link.trim(), this.baseUrl); const url = new URL(link.trim(), this.baseUrl);
@ -130,6 +130,25 @@ export class WebCrawler {
.slice(0, limit); .slice(0, limit);
} }
/**
 * Downloads the robots.txt file for this crawl.
 *
 * @returns The response body of the GET request to `robotsTxtUrl`.
 *   Request failures (including the `axiosTimeout` timeout) propagate to the caller.
 */
public async getRobotsTxt(): Promise<string> {
  const { data } = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
  return data;
}
/**
 * Parses robots.txt content and installs the result as this crawler's
 * `robots` rules, replacing whatever was set before.
 *
 * @param txt Raw robots.txt text.
 */
public importRobotsTxt(txt: string) {
  const parsedRules = robotsParser(this.robotsTxtUrl, txt);
  this.robots = parsedRules;
}
/**
 * Tries to build the crawl result from the site's sitemap.
 *
 * Sitemap links found for `initialUrl` are passed through `filterLinks`
 * using this crawler's `limit` and `maxCrawledDepth`.
 *
 * @returns One `{ url, html: "" }` entry per filtered link, or `null` when
 *   no sitemap links were found.
 */
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
  Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
  const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
  if (sitemapLinks.length === 0) {
    return null;
  }

  const keptLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
  return keptLinks.map((link) => ({ url: link, html: "" }));
}
public async start( public async start(
inProgress?: (progress: Progress) => void, inProgress?: (progress: Progress) => void,
pageOptions?: PageOptions, pageOptions?: PageOptions,
@ -142,19 +161,17 @@ export class WebCrawler {
Logger.debug(`Crawler starting with ${this.initialUrl}`); Logger.debug(`Crawler starting with ${this.initialUrl}`);
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); const txt = await this.getRobotsTxt();
this.robots = robotsParser(this.robotsTxtUrl, response.data); this.importRobotsTxt(txt);
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
} }
if (!crawlerOptions?.ignoreSitemap){ if (!crawlerOptions?.ignoreSitemap){
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sm = await this.tryGetSitemap();
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sm !== null) {
if (sitemapLinks.length > 0) { return sm;
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks.map(link => ({ url: link, html: "" }));
} }
} }
@ -241,6 +258,54 @@ export class WebCrawler {
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
/**
 * Decides whether a link found on a page should be followed.
 *
 * @param href The raw `href` value. Values not starting with "http" are
 *   resolved against `baseUrl`.
 * @param url The URL of the page the link was found on.
 * @returns The absolute URL to follow, or `null` when the link is rejected.
 *   An `href` that cannot be parsed as a URL is rejected rather than thrown,
 *   so one malformed link no longer aborts link extraction for the whole page.
 */
public filterURL(href: string, url: string): string | null {
  let fullUrl = href;
  let path: string;
  try {
    if (!href.startsWith("http")) {
      fullUrl = new URL(href, this.baseUrl).toString();
    }
    path = new URL(fullUrl).pathname;
  } catch (error) {
    // The URL constructor throws on malformed input; treat that as "not crawlable".
    Logger.debug(`Skipping unparsable link "${href}": ${error}`);
    return null;
  }

  if (this.isInternalLink(fullUrl)) {
    // INTERNAL LINKS: must carry no section fragment, not be excluded, and be allowed by robots.txt.
    const allowedInternal =
      this.noSections(fullUrl) &&
      !this.matchesExcludes(path) &&
      this.isRobotsAllowed(fullUrl);
    return allowedInternal ? fullUrl : null;
  }

  // EXTERNAL LINKS: only followed from an internal page, and only when external content links are enabled.
  const allowedExternal =
    this.isInternalLink(url) &&
    this.allowExternalContentLinks &&
    !this.isSocialMediaOrEmail(fullUrl) &&
    !this.matchesExcludes(fullUrl, true) &&
    !this.isExternalMainPage(fullUrl);
  return allowedExternal ? fullUrl : null;
}
/**
 * Collects the followable links from a page's HTML.
 *
 * Every `<a>` element with a non-empty `href` is run through `filterURL`;
 * the links it accepts are returned in document order (duplicates are kept).
 *
 * @param html The page markup to scan.
 * @param url The URL of the page, passed on to `filterURL`.
 * @returns The accepted absolute URLs.
 */
public extractLinksFromHTML(html: string, url: string) {
  const $ = load(html);
  const accepted: string[] = [];

  for (const anchor of $("a").toArray()) {
    const href = $(anchor).attr("href");
    if (!href) {
      continue;
    }
    const candidate = this.filterURL(href, url);
    if (candidate !== null) {
      accepted.push(candidate);
    }
  }

  return accepted;
}
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return []; return [];
@ -284,37 +349,7 @@ export class WebCrawler {
links.push({ url, html: content, pageStatusCode, pageError }); links.push({ url, html: content, pageStatusCode, pageError });
} }
$("a").each((_, element) => { links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
const href = $(element).attr("href");
if (href) {
let fullUrl = href;
if (!href.startsWith("http")) {
fullUrl = new URL(href, this.baseUrl).toString();
}
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
!this.matchesExcludes(path) &&
this.isRobotsAllowed(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
} else { // EXTERNAL LINKS
if (
this.isInternalLink(url) &&
this.allowExternalContentLinks &&
!this.isSocialMediaOrEmail(fullUrl) &&
!this.matchesExcludes(fullUrl, true) &&
!this.isExternalMainPage(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
}
});
if (this.visited.size === 1) { if (this.visited.size === 1) {
return links; return links;
@ -420,9 +455,10 @@ export class WebCrawler {
".woff", ".woff",
".ttf", ".ttf",
".woff2", ".woff2",
".webp" ".webp",
".inc"
]; ];
return fileExtensions.some((ext) => url.endsWith(ext)); return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext));
} }
private isSocialMediaOrEmail(url: string): boolean { private isSocialMediaOrEmail(url: string): boolean {
@ -464,24 +500,32 @@ export class WebCrawler {
} }
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) { if (response) {
sitemapLinks = response; sitemapLinks = response;
} }
} }
}
if (sitemapLinks.length === 0) { if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try { try {
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
if (response.status === 200) { if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
} }
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
} }
} }
}
const normalizedUrl = normalizeUrl(url); const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));

View File

@ -16,18 +16,19 @@ import {
replacePathsWithAbsolutePaths, replacePathsWithAbsolutePaths,
} from "./utils/replacePaths"; } from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction"; import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service"; import { getScrapeQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor"; import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger"; import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
export class WebScraperDataProvider { export class WebScraperDataProvider {
private jobId: string; private jobId: string;
private bullJobId: string; private bullJobId: string;
private urls: string[] = [""]; private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
private includes: string[]; private includes: string | string[];
private excludes: string[]; private excludes: string | string[];
private maxCrawledLinks: number; private maxCrawledLinks: number;
private maxCrawledDepth: number = 10; private maxCrawledDepth: number = 10;
private returnOnlyUrls: boolean; private returnOnlyUrls: boolean;
@ -43,6 +44,7 @@ export class WebScraperDataProvider {
private crawlerMode: string = "default"; private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false; private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false; private allowExternalContentLinks: boolean = false;
private priority?: number;
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -71,7 +73,8 @@ export class WebScraperDataProvider {
url, url,
this.pageOptions, this.pageOptions,
this.extractorOptions, this.extractorOptions,
existingHTML existingHTML,
this.priority,
); );
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
@ -87,21 +90,6 @@ export class WebScraperDataProvider {
results[i + index] = result; results[i + index] = result;
}) })
); );
try {
if (this.mode === "crawl" && this.bullJobId) {
const job = await getWebScraperQueue().getJob(this.bullJobId);
const jobStatus = await job.getState();
if (jobStatus === "failed") {
Logger.info(
"Job has failed or has been cancelled by the user. Stopping the job..."
);
return [] as Document[];
}
}
} catch (error) {
Logger.error(error.message);
return [] as Document[];
}
} }
return results.filter((result) => result !== null) as Document[]; return results.filter((result) => result !== null) as Document[];
} }
@ -167,11 +155,29 @@ export class WebScraperDataProvider {
private async handleCrawlMode( private async handleCrawlMode(
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
let includes: string[];
if (Array.isArray(this.includes)) {
if (this.includes[0] != "") {
includes = this.includes;
}
} else {
includes = this.includes.split(',');
}
let excludes: string[];
if (Array.isArray(this.excludes)) {
if (this.excludes[0] != "") {
excludes = this.excludes;
}
} else {
excludes = this.excludes.split(',');
}
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: this.jobId, jobId: this.jobId,
initialUrl: this.urls[0], initialUrl: this.urls[0],
includes: this.includes, includes,
excludes: this.excludes, excludes,
maxCrawledLinks: this.maxCrawledLinks, maxCrawledLinks: this.maxCrawledLinks,
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth), maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
limit: this.limit, limit: this.limit,
@ -287,7 +293,10 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
} }
if (this.pageOptions.includeMarkdown) {
documents = this.applyPathReplacements(documents); documents = this.applyPathReplacements(documents);
}
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
if ( if (
(this.extractorOptions.mode === "llm-extraction" || (this.extractorOptions.mode === "llm-extraction" ||
@ -316,12 +325,31 @@ export class WebScraperDataProvider {
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> { private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
return Promise.all( return Promise.all(
pdfLinks.map(async (pdfLink) => { pdfLinks.map(async (pdfLink) => {
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
type: "scrape",
url: pdfLink,
worker: process.env.FLY_MACHINE_ID,
method: "pdf-scrape",
result: null,
});
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
pdfLink, pdfLink,
this.pageOptions.parsePDF this.pageOptions.parsePDF
); );
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: content.length,
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
error: pageError,
response_code: pageStatusCode,
time_taken: Date.now() - timer,
});
return { return {
content: content, content: content,
markdown: content,
metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
provider: "web-scraper", provider: "web-scraper",
}; };
@ -330,12 +358,32 @@ export class WebScraperDataProvider {
} }
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> { private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
return Promise.all( return Promise.all(
docxLinks.map(async (p) => { docxLinks.map(async (docxLink) => {
const { content, pageStatusCode, pageError } = const timer = Date.now();
await fetchAndProcessDocx(p); const logInsertPromise = ScrapeEvents.insert(this.jobId, {
type: "scrape",
url: docxLink,
worker: process.env.FLY_MACHINE_ID,
method: "docx-scrape",
result: null,
});
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
docxLink
);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: content.length,
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
error: pageError,
response_code: pageStatusCode,
time_taken: Date.now() - timer,
});
return { return {
content, content,
metadata: { sourceURL: p, pageStatusCode, pageError }, metadata: { sourceURL: docxLink, pageStatusCode, pageError },
provider: "web-scraper", provider: "web-scraper",
}; };
}) })
@ -406,6 +454,10 @@ export class WebScraperDataProvider {
const url = new URL(document.metadata.sourceURL); const url = new URL(document.metadata.sourceURL);
const path = url.pathname; const path = url.pathname;
if (!Array.isArray(this.excludes)) {
this.excludes = this.excludes.split(',');
}
if (this.excludes.length > 0 && this.excludes[0] !== "") { if (this.excludes.length > 0 && this.excludes[0] !== "") {
// Check if the link should be excluded // Check if the link should be excluded
if ( if (
@ -417,6 +469,10 @@ export class WebScraperDataProvider {
} }
} }
if (!Array.isArray(this.includes)) {
this.includes = this.includes.split(',');
}
if (this.includes.length > 0 && this.includes[0] !== "") { if (this.includes.length > 0 && this.includes[0] !== "") {
// Check if the link matches the include patterns, if any are specified // Check if the link matches the include patterns, if any are specified
if (this.includes.length > 0) { if (this.includes.length > 0) {
@ -528,14 +584,22 @@ export class WebScraperDataProvider {
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
false; false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); if (typeof options.crawlerOptions?.excludes === 'string') {
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
}
if (typeof options.crawlerOptions?.includes === 'string') {
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
}
this.crawlerMode = options.crawlerOptions?.mode ?? "default"; this.crawlerMode = options.crawlerOptions?.mode ?? "default";
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
this.allowBackwardCrawling = this.allowBackwardCrawling =
options.crawlerOptions?.allowBackwardCrawling ?? false; options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks = this.allowExternalContentLinks =
options.crawlerOptions?.allowExternalContentLinks ?? false; options.crawlerOptions?.allowExternalContentLinks ?? false;
this.priority = options.priority;
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {

View File

@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
* @param url The URL to scrape * @param url The URL to scrape
* @param waitFor The time to wait for the page to load * @param waitFor The time to wait for the page to load
* @param screenshot Whether to take a screenshot * @param screenshot Whether to take a screenshot
* @param fullPageScreenshot Whether to take a full page screenshot
* @param pageOptions The options for the page * @param pageOptions The options for the page
* @param headers The headers to send with the request * @param headers The headers to send with the request
* @param options The options for the request * @param options The options for the request
@ -20,18 +21,22 @@ export async function scrapWithFireEngine({
url, url,
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true }, pageOptions = { parsePDF: true },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
options, options,
priority,
}: { }: {
url: string; url: string;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
priority?: number;
}): Promise<FireEngineResponse> { }): Promise<FireEngineResponse> {
const logParams = { const logParams = {
url, url,
@ -47,8 +52,9 @@ export async function scrapWithFireEngine({
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor; const waitParam = reqParams["params"]?.wait ?? waitFor;
const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright"; const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@ -61,17 +67,20 @@ export async function scrapWithFireEngine({
let engine = engineParam; // do we want fireEngineOptions as first choice? let engine = engineParam; // do we want fireEngineOptions as first choice?
Logger.info( Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
const response = await axios.post( const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
{ {
url: url, url: url,
wait: waitParam, wait: waitParam,
screenshot: screenshotParam, screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions, pageOptions: pageOptions,
priority,
...fireEngineOptionsParam, ...fireEngineOptionsParam,
}, },
{ {

View File

@ -123,17 +123,21 @@ export async function scrapSingleUrl(
jobId: string, jobId: string,
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { pageOptions: PageOptions = {
includeMarkdown: true,
onlyMainContent: true, onlyMainContent: true,
includeHtml: false, includeHtml: false,
includeRawHtml: false, includeRawHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
fullPageScreenshot: false,
headers: undefined, headers: undefined,
includeLinks: true
}, },
extractorOptions: ExtractorOptions = { extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown", mode: "llm-extraction-from-markdown",
}, },
existingHtml: string = "" existingHtml: string = "",
priority?: number,
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -171,11 +175,13 @@ export async function scrapSingleUrl(
url, url,
waitFor: pageOptions.waitFor, waitFor: pageOptions.waitFor,
screenshot: pageOptions.screenshot, screenshot: pageOptions.screenshot,
fullPageScreenshot: pageOptions.fullPageScreenshot,
pageOptions: pageOptions, pageOptions: pageOptions,
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: { fireEngineOptions: {
engine: engine, engine: engine,
} },
priority,
}); });
scraperResponse.text = response.html; scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot; scraperResponse.screenshot = response.screenshot;
@ -306,7 +312,7 @@ export async function scrapSingleUrl(
const scrapersInOrder = getScrapingFallbackOrder( const scrapersInOrder = getScrapingFallbackOrder(
defaultScraper, defaultScraper,
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true, pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
pageOptions && pageOptions.headers && pageOptions.headers !== undefined pageOptions && pageOptions.headers && pageOptions.headers !== undefined
); );
@ -334,8 +340,8 @@ export async function scrapSingleUrl(
pageError = undefined; pageError = undefined;
} }
if (text && text.trim().length >= 100) { if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
break; break;
} }
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) { if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
@ -357,20 +363,22 @@ export async function scrapSingleUrl(
let linksOnPage: string[] | undefined; let linksOnPage: string[] | undefined;
if (pageOptions.includeLinks) {
linksOnPage = extractLinks(rawHtml, urlToScrap); linksOnPage = extractLinks(rawHtml, urlToScrap);
}
let document: Document; let document: Document;
if (screenshot && screenshot.length > 0) { if (screenshot && screenshot.length > 0) {
document = { document = {
content: text, content: text,
markdown: text, markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html" extractorOptions.mode === "llm-extraction-from-raw-html"
? rawHtml ? rawHtml
: undefined, : undefined,
linksOnPage, linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
metadata: { metadata: {
...metadata, ...metadata,
screenshot: screenshot, screenshot: screenshot,
@ -382,7 +390,7 @@ export async function scrapSingleUrl(
} else { } else {
document = { document = {
content: text, content: text,
markdown: text, markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
@ -395,7 +403,7 @@ export async function scrapSingleUrl(
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,
pageError: pageError, pageError: pageError,
}, },
linksOnPage, linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
}; };
} }
@ -409,9 +417,9 @@ export async function scrapSingleUrl(
}); });
return { return {
content: "", content: "",
markdown: "", markdown: pageOptions.includeMarkdown ? "" : undefined,
html: "", html: "",
linksOnPage: [], linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: { metadata: {
sourceURL: urlToScrap, sourceURL: urlToScrap,
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,

View File

@ -19,7 +19,7 @@ export async function getLinksFromSitemap(
try { try {
let content: string; let content: string;
try { try {
if (mode === 'axios') { if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') { } else if (mode === 'fire-engine') {

View File

@ -1,24 +1,11 @@
export const urlSpecificParams = { export const urlSpecificParams = {
"platform.openai.com": { "platform.openai.com": {
params: { defaultScraper: "fire-engine",
wait_browser: "networkidle2", params:{
block_resources: false, wait: 3000,
fireEngineOptions:{
engine: "chrome-cdp"
}, },
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
cookies: {
__cf_bm:
"mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
}, },
}, },
"support.greenpay.me":{ "support.greenpay.me":{
@ -232,4 +219,28 @@ export const urlSpecificParams = {
} }
}, },
}, },
"amazon.com":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "chrome-cdp",
},
},
},
"digikey.com":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "tlsclient",
},
},
},
"zoopla.co.uk":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "chrome-cdp",
},
},
}
}; };

View File

@ -4,15 +4,36 @@ import { createWriteStream } from "node:fs";
import path from "path"; import path from "path";
import os from "os"; import os from "os";
import mammoth from "mammoth"; import mammoth from "mammoth";
import { Logger } from "../../../lib/logger";
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> { export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url); let tempFilePath = '';
const content = await processDocxToText(tempFilePath); let pageStatusCode = 200;
let pageError = '';
let content = '';
try {
const downloadResult = await downloadDocx(url);
tempFilePath = downloadResult.tempFilePath;
pageStatusCode = downloadResult.pageStatusCode;
pageError = downloadResult.pageError;
content = await processDocxToText(tempFilePath);
} catch (error) {
Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
pageStatusCode = 500;
pageError = error.message;
content = '';
} finally {
if (tempFilePath) {
fs.unlinkSync(tempFilePath); // Clean up the temporary file fs.unlinkSync(tempFilePath); // Clean up the temporary file
}
}
return { content, pageStatusCode, pageError }; return { content, pageStatusCode, pageError };
} }
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> { async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
try {
const response = await axios({ const response = await axios({
url, url,
method: "GET", method: "GET",
@ -26,16 +47,33 @@ async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageSt
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined })); writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
writer.on("error", reject); writer.on("error", () => {
Logger.error('Failed to write DOCX file to disk');
reject(new Error('Failed to write DOCX file to disk'));
}); });
});
} catch (error) {
Logger.error(`Failed to download DOCX: ${error.message}`);
return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
}
} }
export async function processDocxToText(filePath: string): Promise<string> { export async function processDocxToText(filePath: string): Promise<string> {
try {
const content = await extractTextFromDocx(filePath); const content = await extractTextFromDocx(filePath);
return content; return content;
} catch (error) {
Logger.error(`Failed to process DOCX to text: ${error.message}`);
return "";
}
} }
async function extractTextFromDocx(filePath: string): Promise<string> { async function extractTextFromDocx(filePath: string): Promise<string> {
try {
const result = await mammoth.extractRawText({ path: filePath }); const result = await mammoth.extractRawText({ path: filePath });
return result.value; return result.value;
} catch (error) {
Logger.error(`Failed to extract text from DOCX: ${error.message}`);
return "";
}
} }

View File

@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
description = soup('meta[name="description"]').attr("content") || null; description = soup('meta[name="description"]').attr("content") || null;
// Assuming the language is part of the URL as per the regex pattern // Assuming the language is part of the URL as per the regex pattern
const pattern = /([a-zA-Z]+-[A-Z]{2})/; language = soup('html').attr('lang') || null;
const match = pattern.exec(url);
language = match ? match[1] : null;
keywords = soup('meta[name="keywords"]').attr("content") || null; keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null;

View File

@ -76,7 +76,6 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
let attempt = 0; let attempt = 0;
const maxAttempts = 10; // Maximum number of attempts const maxAttempts = 10; // Maximum number of attempts
let resultAvailable = false; let resultAvailable = false;
while (attempt < maxAttempts && !resultAvailable) { while (attempt < maxAttempts && !resultAvailable) {
try { try {
resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) }); resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
@ -90,13 +89,22 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
} catch (error) { } catch (error) {
Logger.debug("Error fetching result w/ LlamaIndex"); Logger.debug("Error fetching result w/ LlamaIndex");
attempt++; attempt++;
if (attempt >= maxAttempts) {
Logger.error("Max attempts reached, unable to fetch result.");
break; // Exit the loop if max attempts are reached
}
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently // You may want to handle specific errors differently
} }
} }
if (!resultAvailable) { if (!resultAvailable) {
try {
content = await processPdf(filePath); content = await processPdf(filePath);
} catch (error) {
Logger.error(`Failed to process PDF: ${error}`);
content = "";
}
} }
content = resultResponse.data[resultType]; content = resultResponse.data[resultType];
} catch (error) { } catch (error) {
@ -104,15 +112,29 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
content = await processPdf(filePath); content = await processPdf(filePath);
} }
} else if (parsePDF) { } else if (parsePDF) {
try {
content = await processPdf(filePath); content = await processPdf(filePath);
} catch (error) {
Logger.error(`Failed to process PDF: ${error}`);
content = "";
}
} else { } else {
try {
content = fs.readFileSync(filePath, "utf-8"); content = fs.readFileSync(filePath, "utf-8");
} catch (error) {
Logger.error(`Failed to read PDF file: ${error}`);
content = "";
}
} }
return content; return content;
} }
async function processPdf(file: string) { async function processPdf(file: string) {
try {
const fileContent = fs.readFileSync(file); const fileContent = fs.readFileSync(file);
const data = await pdf(fileContent); const data = await pdf(fileContent);
return data.text; return data.text;
} catch (error) {
throw error;
}
} }

View File

@ -41,10 +41,10 @@ export function extractLinks(html: string, baseUrl: string): string[] {
links.push(href); links.push(href);
} else if (href.startsWith('/')) { } else if (href.startsWith('/')) {
// Relative URL starting with '/', append to origin // Relative URL starting with '/', append to origin
links.push(`${origin}${href}`); links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) { } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL // Relative URL not starting with '/', append to base URL
links.push(`${baseUrl}/${href}`); links.push(new URL(href, baseUrl).href);
} else if (href.startsWith('mailto:')) { } else if (href.startsWith('mailto:')) {
// mailto: links, add as is // mailto: links, add as is
links.push(href); links.push(href);

View File

@ -0,0 +1,44 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
/**
 * Sends a search query to the fire-engine `/search` endpoint.
 *
 * The endpoint base URL is read from `process.env.FIRE_ENGINE_BETA_URL`. When
 * that variable is unset or empty, no request is made and an empty list is
 * returned.
 *
 * @param q - Search query string, sent as `query` in the request body.
 * @param options - Search options forwarded in the request body. `page`
 *   defaults to 1 when omitted.
 *   NOTE(review): `filter` is accepted here but is not included in the request
 *   body — confirm whether the endpoint should receive it.
 * @returns The response body of the request, or `[]` when fire-engine is not
 *   configured or the response carries no body.
 * @throws Any error raised by the axios call propagates to the caller; it is
 *   not caught here.
 */
export async function fireEngineMap(q: string, options: {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  numResults: number;
  page?: number;
}): Promise<SearchResult[]> {
  // Bail out before building the payload when fire-engine is not configured.
  const baseUrl = process.env.FIRE_ENGINE_BETA_URL;
  if (!baseUrl) {
    return [];
  }

  const data = JSON.stringify({
    query: q,
    lang: options.lang,
    country: options.country,
    location: options.location,
    tbs: options.tbs,
    numResults: options.numResults,
    page: options.page ?? 1,
  });

  const config = {
    method: "POST",
    url: `${baseUrl}/search`,
    headers: {
      "Content-Type": "application/json",
    },
    data,
  };

  const response = await axios(config);
  // The original guard was `response && response`, which never reached its
  // fallback; fall back to an empty list when the body is actually missing.
  return response?.data ?? [];
}

View File

@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> { export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
let proxies = null; let proxies = null;
if (proxy) { if (proxy) {
if (proxy.startsWith("https")) { if (proxy.startsWith("https")) {

View File

@ -1,11 +1,9 @@
import { Logger } from "../../src/lib/logger"; import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities"; import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch"; import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { serper_search } from "./serper"; import { serper_search } from "./serper";
export async function search({ export async function search({
query, query,
advanced = false, advanced = false,
@ -30,12 +28,20 @@ export async function search({
proxy?: string; proxy?: string;
sleep_interval?: number; sleep_interval?: number;
timeout?: number; timeout?: number;
}) : Promise<SearchResult[]> { }): Promise<SearchResult[]> {
try { try {
if (process.env.SERPER_API_KEY ) {
return await serper_search(query, {num_results, tbs, filter, lang, country, location}); if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,
tbs,
filter,
lang,
country,
location,
});
} }
return await google_search( return await googleSearch(
query, query,
advanced, advanced,
num_results, num_results,
@ -49,7 +55,6 @@ export async function search({
); );
} catch (error) { } catch (error) {
Logger.error(`Error in search function: ${error}`); Logger.error(`Error in search function: ${error}`);
return [] return [];
} }
// if process.env.SERPER_API_KEY is set, use serper
} }

View File

@ -1,5 +1,5 @@
import { Logger } from "../../../src/lib/logger"; import { Logger } from "../../../src/lib/logger";
import { getWebScraperQueue } from "../queue-service"; import { getScrapeQueue } from "../queue-service";
import { sendSlackWebhook } from "./slack"; import { sendSlackWebhook } from "./slack";
export async function checkAlerts() { export async function checkAlerts() {
@ -13,8 +13,8 @@ export async function checkAlerts() {
Logger.info("Initializing alerts"); Logger.info("Initializing alerts");
const checkActiveJobs = async () => { const checkActiveJobs = async () => {
try { try {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const activeJobs = await webScraperQueue.getActiveCount(); const activeJobs = await scrapeQueue.getActiveCount();
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
Logger.warn( Logger.warn(
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
@ -34,8 +34,8 @@ export async function checkAlerts() {
}; };
const checkWaitingQueue = async () => { const checkWaitingQueue = async () => {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const waitingJobs = await webScraperQueue.getWaitingCount(); const waitingJobs = await scrapeQueue.getWaitingCount();
if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
Logger.warn( Logger.warn(
@ -49,7 +49,7 @@ export async function checkAlerts() {
}; };
const checkAll = async () => { const checkAll = async () => {
await checkActiveJobs(); // await checkActiveJobs();
await checkWaitingQueue(); await checkWaitingQueue();
}; };

View File

@ -3,9 +3,13 @@ import { withAuth } from "../../lib/withAuth";
import { sendNotification } from "../notification/email_notification"; import { sendNotification } from "../notification/email_notification";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger"; import { Logger } from "../../lib/logger";
import { getValue, setValue } from "../redis";
import { redlock } from "../redlock";
const FREE_CREDITS = 500; const FREE_CREDITS = 500;
export async function billTeam(team_id: string, credits: number) { export async function billTeam(team_id: string, credits: number) {
return withAuth(supaBillTeam)(team_id, credits); return withAuth(supaBillTeam)(team_id, credits);
} }
@ -164,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
export async function checkTeamCredits(team_id: string, credits: number) { export async function checkTeamCredits(team_id: string, credits: number) {
return withAuth(supaCheckTeamCredits)(team_id, credits); return withAuth(supaCheckTeamCredits)(team_id, credits);
} }
// if team has enough credits for the operation, return true, else return false // if team has enough credits for the operation, return true, else return false
export async function supaCheckTeamCredits(team_id: string, credits: number) { export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (team_id === "preview") { if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" }; return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
} }
// Retrieve the team's active subscription and check for available coupons concurrently // Retrieve the team's active subscription and check for available coupons concurrently
@ -198,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (subscriptionError || !subscription) { if (subscriptionError || !subscription) {
// If there is no active subscription but there are available coupons // If there is no active subscription but there are available coupons
if (couponCredits >= credits) { if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available" }; return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
} }
const { data: creditUsages, error: creditUsageError } = const { data: creditUsages, error: creditUsageError } =
@ -248,13 +253,26 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return { return {
success: false, success: false,
message: "Insufficient credits, please upgrade!", message: "Insufficient credits, please upgrade!",
remainingCredits: FREE_CREDITS - totalCreditsUsed
}; };
} }
return { success: true, message: "Sufficient credits available" }; return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
} }
let totalCreditsUsed = 0; let totalCreditsUsed = 0;
const cacheKey = `credit_usage_${subscription.id}_${subscription.current_period_start}_${subscription.current_period_end}_lc`;
const redLockKey = `lock_${cacheKey}`;
const lockTTL = 10000; // 10 seconds
try { try {
const lock = await redlock.acquire([redLockKey], lockTTL);
try {
const cachedCreditUsage = await getValue(cacheKey);
if (cachedCreditUsage) {
totalCreditsUsed = parseInt(cachedCreditUsage);
} else {
const { data: creditUsages, error: creditUsageError } = const { data: creditUsages, error: creditUsageError } =
await supabase_service.rpc("get_credit_usage_2", { await supabase_service.rpc("get_credit_usage_2", {
sub_id: subscription.id, sub_id: subscription.id,
@ -268,9 +286,15 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (creditUsages && creditUsages.length > 0) { if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used; totalCreditsUsed = creditUsages[0].total_credits_used;
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
}
}
} finally {
await lock.release();
} }
} catch (error) { } catch (error) {
Logger.error(`Error calculating credit usage: ${error}`); Logger.error(`Error acquiring lock or calculating credit usage: ${error}`);
} }
// Adjust total credits used by subtracting coupon value // Adjust total credits used by subtracting coupon value
@ -299,7 +323,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
subscription.current_period_start, subscription.current_period_start,
subscription.current_period_end subscription.current_period_end
); );
return { success: false, message: "Insufficient credits, please upgrade!" }; return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
} else if (creditUsagePercentage >= 0.8) { } else if (creditUsagePercentage >= 0.8) {
// Send email notification for approaching credit limit // Send email notification for approaching credit limit
await sendNotification( await sendNotification(
@ -310,7 +334,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
); );
} }
return { success: true, message: "Sufficient credits available" }; return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
} }
// Count the total credits used by a team within the current billing period and return the remaining credits. // Count the total credits used by a team within the current billing period and return the remaining credits.

View File

@ -40,10 +40,11 @@ export async function logJob(job: FirecrawlJob) {
extractor_options: job.extractor_options, extractor_options: job.extractor_options,
num_tokens: job.num_tokens, num_tokens: job.num_tokens,
retry: !!job.retry, retry: !!job.retry,
crawl_id: job.crawl_id,
}, },
]); ]);
if (process.env.POSTHOG_API_KEY) { if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
let phLog = { let phLog = {
distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
...(job.team_id !== "preview" && { ...(job.team_id !== "preview" && {

View File

@ -44,9 +44,9 @@ export async function logScrape(
]); ]);
if (error) { if (error) {
Logger.error(`Error logging proxy:\n${error}`); Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
} }
} catch (error) { } catch (error) {
Logger.error(`Error logging proxy:\n${error}`); Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
} }
} }

View File

@ -1,28 +1,15 @@
import { Job, Queue } from "bullmq"; import { Job, Queue } from "bullmq";
import { import { getScrapeQueue } from "./queue-service";
getScrapeQueue,
getWebScraperQueue,
} from "./queue-service";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { WebScraperOptions } from "../types"; import { WebScraperOptions } from "../types";
export async function addWebScraperJob(
webScraperOptions: WebScraperOptions,
options: any = {},
jobId: string = uuidv4(),
): Promise<Job> {
return await getWebScraperQueue().add(jobId, webScraperOptions, {
...options,
jobId,
});
}
export async function addScrapeJob( export async function addScrapeJob(
webScraperOptions: WebScraperOptions, webScraperOptions: WebScraperOptions,
options: any = {}, options: any = {},
jobId: string = uuidv4(), jobId: string = uuidv4(),
): Promise<Job> { ): Promise<Job> {
return await getScrapeQueue().add(jobId, webScraperOptions, { return await getScrapeQueue().add(jobId, webScraperOptions, {
priority: webScraperOptions.crawl_id ? 20 : 10,
...options, ...options,
jobId, jobId,
}); });

View File

@ -2,38 +2,13 @@ import { Queue } from "bullmq";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import IORedis from "ioredis"; import IORedis from "ioredis";
let webScraperQueue: Queue;
let scrapeQueue: Queue; let scrapeQueue: Queue;
export const redisConnection = new IORedis(process.env.REDIS_URL, { export const redisConnection = new IORedis(process.env.REDIS_URL, {
maxRetriesPerRequest: null, maxRetriesPerRequest: null,
}); });
export const webScraperQueueName = "{crawlQueue}";
export const scrapeQueueName = "{scrapeQueue}"; export const scrapeQueueName = "{scrapeQueue}";
export function getWebScraperQueue() {
if (!webScraperQueue) {
webScraperQueue = new Queue(
webScraperQueueName,
{
connection: redisConnection,
}
// {
// settings: {
// lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
// lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
// stalledInterval: 30 * 1000,
// maxStalledCount: 10,
// },
// defaultJobOptions:{
// attempts: 5
// }
// }
);
Logger.info("Web scraper queue created");
}
return webScraperQueue;
}
export function getScrapeQueue() { export function getScrapeQueue() {
if (!scrapeQueue) { if (!scrapeQueue) {
@ -63,4 +38,3 @@ export function getScrapeQueue() {
import { QueueEvents } from 'bullmq'; import { QueueEvents } from 'bullmq';
export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection }); export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection });
export const webScraperQueueEvents = new QueueEvents(webScraperQueueName, { connection: redisConnection });

View File

@ -1,23 +1,24 @@
import "dotenv/config";
import { CustomError } from "../lib/custom-error"; import { CustomError } from "../lib/custom-error";
import { import {
getWebScraperQueue,
getScrapeQueue, getScrapeQueue,
redisConnection, redisConnection,
webScraperQueueName,
scrapeQueueName, scrapeQueueName,
} from "./queue-service"; } from "./queue-service";
import "dotenv/config";
import { logtail } from "./logtail"; import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper"; import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook"; import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job"; import { logJob } from "./logging/log_job";
import { initSDK } from "@hyperdx/node-opentelemetry"; import { initSDK } from "@hyperdx/node-opentelemetry";
import { Job, QueueEvents, tryCatch } from "bullmq"; import { Job } from "bullmq";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { Worker } from "bullmq"; import { Worker } from "bullmq";
import systemMonitor from "./system-monitor"; import systemMonitor from "./system-monitor";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
if (process.env.ENV === "production") { if (process.env.ENV === "production") {
initSDK({ initSDK({
@ -33,30 +34,29 @@ const workerStalledCheckInterval =
const jobLockExtendInterval = const jobLockExtendInterval =
Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000; Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime = const jobLockExtensionTime =
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 15000; Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
const cantAcceptConnectionInterval = const cantAcceptConnectionInterval =
Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000; Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval = const connectionMonitorInterval =
Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10; Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const wsq = getWebScraperQueue();
const sq = getScrapeQueue();
const processJobInternal = async (token: string, job: Job) => { const processJobInternal = async (token: string, job: Job) => {
const extendLockInterval = setInterval(async () => { const extendLockInterval = setInterval(async () => {
Logger.info(`🐂 Worker extending lock on job ${job.id}`);
await job.extendLock(token, jobLockExtensionTime); await job.extendLock(token, jobLockExtensionTime);
}, jobLockExtendInterval); }, jobLockExtendInterval);
try { try {
const result = await processJob(job, token); const result = await processJob(job, token);
const jobState = await job.getState();
if(jobState !== "completed" && jobState !== "failed"){
try{ try{
await job.moveToCompleted(result.docs, token, false); //3rd arg fetchNext if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
}catch(e){ await job.moveToCompleted(null, token, false);
// console.log("Job already completed, error:", e); } else {
await job.moveToCompleted(result.docs, token, false);
} }
}catch(e){
} }
} catch (error) { } catch (error) {
console.log("Job failed, error:", error); console.log("Job failed, error:", error);
@ -110,11 +110,10 @@ const workerFun = async (queueName: string, processJobInternal: (token: string,
} }
}; };
workerFun(webScraperQueueName, processJobInternal);
workerFun(scrapeQueueName, processJobInternal); workerFun(scrapeQueueName, processJobInternal);
async function processJob(job: Job, token: string) { async function processJob(job: Job, token: string) {
Logger.debug(`🐂 Worker taking job ${job.id}`); Logger.info(`🐂 Worker taking job ${job.id}`);
try { try {
job.updateProgress({ job.updateProgress({
@ -131,18 +130,16 @@ async function processJob(job: Job, token: string) {
const end = Date.now(); const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000; const timeTakenInSeconds = (end - start) / 1000;
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + job.id); const rawHtml = docs[0].rawHtml;
if (isCancelled) { if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
await job.discard(); delete docs[0].rawHtml;
await job.moveToFailed(Error("Job cancelled by user"), job.token);
await job.discard();
} }
const data = { const data = {
success, success,
result: { result: {
links: isCancelled ? [] : docs.map((doc) => { links: docs.map((doc) => {
return { return {
content: doc, content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "", source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
@ -150,20 +147,21 @@ async function processJob(job: Job, token: string) {
}), }),
}, },
project_id: job.data.project_id, project_id: job.data.project_id,
error: isCancelled ? "Job cancelled by user" : message /* etc... */, error: message /* etc... */,
docs: isCancelled ? [] : docs, docs,
}; };
if (job.data.mode === "crawl" && !isCancelled) { if (job.data.mode === "crawl") {
await callWebhook(job.data.team_id, job.id as string, data); await callWebhook(job.data.team_id, job.id as string, data, job.data.webhook);
} }
if (job.data.crawl_id) {
await logJob({ await logJob({
job_id: job.id as string, job_id: job.id as string,
success: success && !isCancelled, success: success,
message: isCancelled ? "Job cancelled by user" : message, message: message,
num_docs: isCancelled ? 0 : docs.length, num_docs: docs.length,
docs: isCancelled ? [] : docs, docs: docs,
time_taken: timeTakenInSeconds, time_taken: timeTakenInSeconds,
team_id: job.data.team_id, team_id: job.data.team_id,
mode: job.data.mode, mode: job.data.mode,
@ -171,15 +169,110 @@ async function processJob(job: Job, token: string) {
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
origin: job.data.origin, origin: job.data.origin,
crawl_id: job.data.crawl_id,
}); });
Logger.debug(`🐂 Job done ${job.id}`);
await addCrawlJobDone(job.data.crawl_id, job.id);
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
if (!job.data.sitemapped) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
)
for (const link of links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
const newJob = await addScrapeJob({
url: link,
mode: "single_urls",
crawlerOptions: sc.crawlerOptions,
team_id: sc.team_id,
pageOptions: sc.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
await addCrawlJob(job.data.crawl_id, newJob.id);
}
}
}
}
if (await finishCrawl(job.data.crawl_id)) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
const jobs = (await Promise.all(jobIDs.map(async x => {
if (x === job.id) {
return {
async getState() {
return "completed"
},
timestamp: Date.now(),
returnvalue: docs,
}
}
const j = await getScrapeQueue().getJob(x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(j.id);
if (supabaseData) {
j.returnvalue = supabaseData.docs;
}
}
return j;
}))).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed";
const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
const data = {
success: jobStatus !== "failed",
result: {
links: fullDocs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
await callWebhook(job.data.team_id, job.data.crawl_id, data);
}
}
Logger.info(`🐂 Job done ${job.id}`);
return data; return data;
} catch (error) { } catch (error) {
Logger.error(`🐂 Job errored ${job.id} - ${error}`); Logger.error(`🐂 Job errored ${job.id} - ${error}`);
if (await getWebScraperQueue().isPaused()) {
Logger.debug("🐂Queue is paused, ignoring");
return;
}
if (error instanceof CustomError) { if (error instanceof CustomError) {
// Here we handle the error, then save the failed job // Here we handle the error, then save the failed job
@ -192,6 +285,9 @@ async function processJob(job: Job, token: string) {
}); });
} }
Logger.error(error); Logger.error(error);
if (error.stack) {
Logger.error(error.stack);
}
logtail.error("Overall error ingesting", { logtail.error("Overall error ingesting", {
job_id: job.id, job_id: job.id,
@ -205,9 +301,12 @@ async function processJob(job: Job, token: string) {
error: error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */, "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
}; };
if (job.data.mode === "crawl") {
await callWebhook(job.data.team_id, job.id as string, data); if (job.data.mode === "crawl" || job.data.crawl_id) {
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
} }
if (job.data.crawl_id) {
await logJob({ await logJob({
job_id: job.id as string, job_id: job.id as string,
success: false, success: false,
@ -219,12 +318,34 @@ async function processJob(job: Job, token: string) {
docs: [], docs: [],
time_taken: 0, time_taken: 0,
team_id: job.data.team_id, team_id: job.data.team_id,
mode: "crawl", mode: job.data.mode,
url: job.data.url, url: job.data.url,
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
origin: job.data.origin, origin: job.data.origin,
crawl_id: job.data.crawl_id,
}); });
const sc = await getCrawl(job.data.crawl_id);
await logJob({
job_id: job.data.crawl_id,
success: false,
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: "crawl",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
origin: job.data.origin,
});
}
// done(null, data); // done(null, data);
return data; return data;
} }

View File

@ -14,18 +14,20 @@ const RATE_LIMITS = {
standardNew: 10, standardNew: 10,
standardnew: 10, standardnew: 10,
growth: 50, growth: 50,
growthdouble: 50,
}, },
scrape: { scrape: {
default: 20, default: 20,
free: 5, free: 5,
starter: 20, starter: 20,
standard: 50, standard: 100,
standardOld: 40, standardOld: 40,
scale: 500, scale: 500,
hobby: 10, hobby: 10,
standardNew: 50, standardNew: 100,
standardnew: 50, standardnew: 100,
growth: 500, growth: 1000,
growthdouble: 1000,
}, },
search: { search: {
default: 20, default: 20,
@ -38,6 +40,20 @@ const RATE_LIMITS = {
standardNew: 50, standardNew: 50,
standardnew: 50, standardnew: 50,
growth: 500, growth: 500,
growthdouble: 500,
},
map:{
default: 20,
free: 5,
starter: 20,
standard: 40,
standardOld: 40,
scale: 500,
hobby: 10,
standardNew: 50,
standardnew: 50,
growth: 500,
growthdouble: 500,
}, },
preview: { preview: {
free: 5, free: 5,

View File

@ -0,0 +1,29 @@
import Redlock from "redlock";
import Client from "ioredis";
// Redis connections the lock is taken against. Redlock expects one client per
// independent Redis node or cluster; here a single node is used, addressed by
// the REDIS_RATE_LIMIT_URL environment variable.
const redlockClients = [new Client(process.env.REDIS_RATE_LIMIT_URL)];

// Tuning knobs for lock acquisition.
const redlockSettings = {
  // Expected clock drift, multiplied by the lock TTL to determine drift time.
  // Background: http://redis.io/topics/distlock
  driftFactor: 0.01,
  // Maximum number of attempts to lock a resource before erroring.
  retryCount: 5,
  // Time in ms between attempts.
  retryDelay: 100,
  // Maximum time in ms randomly added to each retry to spread out contention.
  // Background: https://www.awsarchitectureblog.com/2015/03/backoff.html
  retryJitter: 200,
  // Minimum remaining time in ms on a lock before an extension is automatically
  // attempted with the `using` API.
  automaticExtensionThreshold: 500,
};

/** Shared Redlock instance used to take distributed locks across workers. */
export const redlock = new Redlock(redlockClients, redlockSettings);

View File

@ -36,17 +36,9 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(), new SupabaseService(),
{ {
get: function (target, prop, receiver) { get: function (target, prop, receiver) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
Logger.debug(
"Attempted to access Supabase client when it's not configured."
);
}
const client = target.getClient(); const client = target.getClient();
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
if (client === null) { if (client === null) {
Logger.error(
"Attempted to access Supabase client when it's not configured."
);
return () => { return () => {
throw new Error("Supabase client is not configured."); throw new Error("Supabase client is not configured.");
}; };

View File

@ -1,15 +1,15 @@
import { Logger } from "../../src/lib/logger"; import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase"; import { supabase_service } from "./supabase";
export const callWebhook = async (teamId: string, jobId: string,data: any) => { export const callWebhook = async (teamId: string, jobId: string, data: any, specified?: string) => {
try { try {
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId); const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId);
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
let webhookUrl = selfHostedUrl; let webhookUrl = specified ?? selfHostedUrl;
// Only fetch the webhook URL from the database if the self-hosted webhook URL is not set // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
// and the USE_DB_AUTHENTICATION environment variable is set to true // and the USE_DB_AUTHENTICATION environment variable is set to true
if (!selfHostedUrl && useDbAuthentication) { if (!webhookUrl && useDbAuthentication) {
const { data: webhooksData, error } = await supabase_service const { data: webhooksData, error } = await supabase_service
.from("webhooks") .from("webhooks")
.select("url") .select("url")

View File

@ -28,6 +28,9 @@ export interface WebScraperOptions {
extractorOptions?: any; extractorOptions?: any;
team_id: string; team_id: string;
origin?: string; origin?: string;
crawl_id?: string;
sitemapped?: boolean;
webhook?: string;
} }
export interface RunWebScraperParams { export interface RunWebScraperParams {
@ -41,6 +44,7 @@ export interface RunWebScraperParams {
onError: (error: Error) => void; onError: (error: Error) => void;
team_id: string; team_id: string;
bull_job_id: string; bull_job_id: string;
priority?: number;
} }
export interface RunWebScraperResult { export interface RunWebScraperResult {
@ -65,6 +69,7 @@ export interface FirecrawlJob {
extractor_options?: ExtractorOptions, extractor_options?: ExtractorOptions,
num_tokens?: number, num_tokens?: number,
retry?: boolean, retry?: boolean,
crawl_id?: string;
} }
export interface FirecrawlScrapeResponse { export interface FirecrawlScrapeResponse {
@ -101,6 +106,7 @@ export enum RateLimiterMode {
Scrape = "scrape", Scrape = "scrape",
Preview = "preview", Preview = "preview",
Search = "search", Search = "search",
Map = "map",
} }
@ -110,6 +116,7 @@ export interface AuthResponse {
error?: string; error?: string;
status?: number; status?: number;
plan?: string; plan?: string;
api_key?: string;
} }

View File

@ -8,10 +8,6 @@
"sourceMap": true, "sourceMap": true,
"outDir": "./dist/src", "outDir": "./dist/src",
"moduleResolution": "node", "moduleResolution": "node",
"baseUrl": ".",
"paths": {
"*": ["node_modules/*", "src/types/*"],
}
}, },
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
} }

25
apps/go-sdk/examples/.gitignore vendored Normal file
View File

@ -0,0 +1,25 @@
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
#
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, built with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Dependency directories (remove the comment below to include it)
# vendor/
# Go workspace file
go.work
go.work.sum
# env file
.env

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Mendable
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,87 @@
package main
import (
"encoding/json"
"fmt"
"log"
"github.com/google/uuid"
"github.com/mendableai/firecrawl-go"
)
// main walks through the three workflows the SDK example covers: scraping a
// single page, crawling a site, and running LLM extraction against a JSON
// schema. Any failure terminates the program through log.Fatalf.
func main() {
	client, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
	if err != nil {
		log.Fatalf("Failed to create FirecrawlApp: %v", err)
	}

	// --- Scrape a single page and print its markdown ---
	page, err := client.ScrapeURL("firecrawl.dev", nil)
	if err != nil {
		log.Fatalf("Failed to scrape URL: %v", err)
	}
	fmt.Println(page.Markdown)

	// --- Crawl a site, skipping blog pages ---
	crawlKey := uuid.New().String() // optional idempotency key
	crawlerOptions := map[string]any{
		"excludes": []string{"blog/*"},
	}
	crawlRequest := map[string]any{"crawlerOptions": crawlerOptions}
	crawled, err := client.CrawlURL("mendable.ai", crawlRequest, true, 2, crawlKey)
	if err != nil {
		log.Fatalf("Failed to crawl URL: %v", err)
	}
	crawledJSON, err := json.MarshalIndent(crawled, "", " ")
	if err != nil {
		log.Fatalf("Failed to marshal crawl result: %v", err)
	}
	fmt.Println(string(crawledJSON))

	// --- LLM extraction driven by a JSON schema ---
	// Schema of one story entry.
	storySchema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"title":       map[string]string{"type": "string"},
			"points":      map[string]string{"type": "number"},
			"by":          map[string]string{"type": "string"},
			"commentsURL": map[string]string{"type": "string"},
		},
		"required": []string{"title", "points", "by", "commentsURL"},
	}
	// Exactly five stories are requested.
	topStoriesSchema := map[string]any{
		"type":        "array",
		"items":       storySchema,
		"minItems":    5,
		"maxItems":    5,
		"description": "Top 5 stories on Hacker News",
	}
	rootSchema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"top": topStoriesSchema,
		},
		"required": []string{"top"},
	}

	extractionRequest := map[string]any{
		"extractorOptions": firecrawl.ExtractorOptions{
			ExtractionSchema: rootSchema,
			Mode:             "llm-extraction",
		},
		"pageOptions": map[string]any{
			"onlyMainContent": true,
		},
	}
	extracted, err := client.ScrapeURL("https://news.ycombinator.com", extractionRequest)
	if err != nil {
		log.Fatalf("Failed to perform LLM extraction: %v", err)
	}

	// Pretty print the LLM extraction result
	extractedJSON, err := json.MarshalIndent(extracted.LLMExtraction, "", " ")
	if err != nil {
		log.Fatalf("Failed to marshal LLM extraction result: %v", err)
	}
	fmt.Println(string(extractedJSON))
}

View File

@ -0,0 +1,9 @@
module github.com/mendableai/firecrawl-go-examples
go 1.22.5
replace github.com/mendableai/firecrawl => ../
require github.com/google/uuid v1.6.0
require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect

View File

@ -0,0 +1,14 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -0,0 +1,2 @@
API_URL=http://localhost:3002
TEST_API_KEY=fc-YOUR-API-KEY

2
apps/go-sdk/firecrawl/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.env
vendor

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Sideguide Technologies Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,189 @@
# Firecrawl Go SDK
The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
## Installation
To install the Firecrawl Go SDK, run:
```bash
go get github.com/mendableai/firecrawl
```
## Usage
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as the first argument to `NewFirecrawlApp`.
Here's an example of how to use the SDK with error handling:
```go
import (
"fmt"
"log"
"github.com/mendableai/firecrawl/firecrawl"
)
func main() {
// Initialize the FirecrawlApp with your API key
app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY", "https://api.firecrawl.dev")
if err != nil {
log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
}
// Scrape a single URL
url := "https://mendable.ai"
scrapedData, err := app.ScrapeURL(url, nil)
if err != nil {
log.Fatalf("Error occurred while scraping: %v", err)
}
fmt.Println(scrapedData)
// Crawl a website
crawlUrl := "https://mendable.ai"
params := map[string]any{
"pageOptions": map[string]any{
"onlyMainContent": true,
},
}
crawlResult, err := app.CrawlURL(crawlUrl, params, true, 2, "")
if err != nil {
log.Fatalf("Error occurred while crawling: %v", err)
}
fmt.Println(crawlResult)
}
```
### Scraping a URL
To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL and an optional parameter map (pass `nil` for none) and returns the scraped data as a `*FirecrawlDocument` together with an `error`.
```go
url := "https://mendable.ai"
scrapedData, err := app.ScrapeURL(url, nil)
if err != nil {
log.Fatalf("Failed to scrape URL: %v", err)
}
fmt.Println(scrapedData)
```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. Here is how to use it:
```go
jsonSchema := map[string]any{
"type": "object",
"properties": map[string]any{
"top": map[string]any{
"type": "array",
"items": map[string]any{
"type": "object",
"properties": map[string]any{
"title": map[string]string{"type": "string"},
"points": map[string]string{"type": "number"},
"by": map[string]string{"type": "string"},
"commentsURL": map[string]string{"type": "string"},
},
"required": []string{"title", "points", "by", "commentsURL"},
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News",
},
},
"required": []string{"top"},
}
llmExtractionParams := map[string]any{
"extractorOptions": firecrawl.ExtractorOptions{
ExtractionSchema: jsonSchema,
},
}
scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
if err != nil {
log.Fatalf("Failed to perform LLM extraction: %v", err)
}
fmt.Println(scrapeResult)
```
### Search for a query
To search the web, get the most relevant results, scrape each page and return the markdown, use the `Search` method. The method takes the query and an optional parameter map (pass `nil` for none) and returns the search results.
```go
query := "what is mendable?"
searchResult, err := app.Search(query, nil)
if err != nil {
log.Fatalf("Failed to search: %v", err)
}
fmt.Println(searchResult)
```
### Crawling a Website
To crawl a website, use the `CrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
```go
crawlParams := map[string]any{
"crawlerOptions": map[string]any{
"excludes": []string{"blog/*"},
"includes": []string{}, // leave empty for all pages
"limit": 1000,
},
"pageOptions": map[string]any{
"onlyMainContent": true,
},
}
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
if err != nil {
log.Fatalf("Failed to crawl URL: %v", err)
}
fmt.Println(crawlResult)
```
### Checking Crawl Status
To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
```go
status, err := app.CheckCrawlStatus(jobId)
if err != nil {
log.Fatalf("Failed to check crawl status: %v", err)
}
fmt.Println(status)
```
### Canceling a Crawl Job
To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job.
```go
canceled, err := app.CancelCrawlJob(jobId)
if err != nil {
log.Fatalf("Failed to cancel crawl job: %v", err)
}
fmt.Println(canceled)
```
## Error Handling
The SDK surfaces errors returned by the Firecrawl API as Go `error` values. If a request fails, the method returns a non-nil `error` with a descriptive message; always check it before using the result.
## Contributing
Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
## License
The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.

View File

@ -0,0 +1,584 @@
// Package firecrawl provides a client for interacting with the Firecrawl API.
package firecrawl
import (
"bytes"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"os"
"time"
)
// FirecrawlDocumentMetadata represents metadata for a Firecrawl document.
//
// Every field is tagged `omitempty`, so zero values are left out when the
// struct is marshalled to JSON. The OG* fields map to "og…" JSON keys and the
// DC*/DCTerms* fields map to "dc…"/"dcterms…" JSON keys.
type FirecrawlDocumentMetadata struct {
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
Keywords string `json:"keywords,omitempty"`
Robots string `json:"robots,omitempty"`
OGTitle string `json:"ogTitle,omitempty"`
OGDescription string `json:"ogDescription,omitempty"`
OGURL string `json:"ogUrl,omitempty"`
OGImage string `json:"ogImage,omitempty"`
OGAudio string `json:"ogAudio,omitempty"`
OGDeterminer string `json:"ogDeterminer,omitempty"`
OGLocale string `json:"ogLocale,omitempty"`
OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"`
OGSiteName string `json:"ogSiteName,omitempty"`
OGVideo string `json:"ogVideo,omitempty"`
DCTermsCreated string `json:"dctermsCreated,omitempty"`
DCDateCreated string `json:"dcDateCreated,omitempty"`
DCDate string `json:"dcDate,omitempty"`
DCTermsType string `json:"dctermsType,omitempty"`
DCType string `json:"dcType,omitempty"`
DCTermsAudience string `json:"dctermsAudience,omitempty"`
DCTermsSubject string `json:"dctermsSubject,omitempty"`
DCSubject string `json:"dcSubject,omitempty"`
DCDescription string `json:"dcDescription,omitempty"`
DCTermsKeywords string `json:"dctermsKeywords,omitempty"`
ModifiedTime string `json:"modifiedTime,omitempty"`
PublishedTime string `json:"publishedTime,omitempty"`
ArticleTag string `json:"articleTag,omitempty"`
ArticleSection string `json:"articleSection,omitempty"`
// NOTE(review): the three fields below presumably describe the fetch of the
// page itself (origin URL, HTTP status, fetch error) — confirm against the API.
SourceURL string `json:"sourceURL,omitempty"`
PageStatusCode int `json:"pageStatusCode,omitempty"`
PageError string `json:"pageError,omitempty"`
}
// FirecrawlDocument represents a document in Firecrawl.
//
// Content is the only field without `omitempty` and is therefore always
// present in the marshalled JSON; every other field is dropped when empty.
type FirecrawlDocument struct {
ID string `json:"id,omitempty"`
URL string `json:"url,omitempty"`
Content string `json:"content"`
Markdown string `json:"markdown,omitempty"`
HTML string `json:"html,omitempty"`
// Note the snake_case JSON key, unlike the camelCase keys of the other fields.
LLMExtraction map[string]any `json:"llm_extraction,omitempty"`
// Pointers so that an absent timestamp stays nil instead of the zero time.
CreatedAt *time.Time `json:"createdAt,omitempty"`
UpdatedAt *time.Time `json:"updatedAt,omitempty"`
Type string `json:"type,omitempty"`
Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
ChildrenLinks []string `json:"childrenLinks,omitempty"`
Provider string `json:"provider,omitempty"`
Warning string `json:"warning,omitempty"`
Index int `json:"index,omitempty"`
}
// ExtractorOptions represents options for extraction.
//
// All fields are omitted from the JSON payload when empty.
type ExtractorOptions struct {
// Extraction mode; ScrapeURL fills in "llm-extraction" when this is empty.
Mode string `json:"mode,omitempty"`
ExtractionPrompt string `json:"extractionPrompt,omitempty"`
// Typed as `any`, so any JSON-serialisable schema description is accepted
// (e.g. a map[string]any holding a JSON schema).
ExtractionSchema any `json:"extractionSchema,omitempty"`
}
// ScrapeResponse represents the response for scraping operations.
type ScrapeResponse struct {
// ScrapeURL returns Data only when Success is true.
Success bool `json:"success"`
// Pointer, so it is nil when the response carries no "data" key.
Data *FirecrawlDocument `json:"data,omitempty"`
}
// SearchResponse represents the response for searching operations.
type SearchResponse struct {
// Search returns Data only when Success is true.
Success bool `json:"success"`
Data []*FirecrawlDocument `json:"data,omitempty"`
}
// CrawlResponse represents the response for crawling operations.
type CrawlResponse struct {
Success bool `json:"success"`
// NOTE(review): presumably the identifier to pass when checking crawl status
// — confirm against the crawl methods.
JobID string `json:"jobId,omitempty"`
Data []*FirecrawlDocument `json:"data,omitempty"`
}
// JobStatusResponse represents the response for checking crawl job status.
//
// Success and Status are always serialised; the remaining fields are omitted
// when empty. The progress fields use snake_case JSON keys (current_url,
// current_step, partial_data), while JobID uses camelCase ("jobId").
type JobStatusResponse struct {
Success bool `json:"success"`
Status string `json:"status"`
// NOTE(review): Current/Total look like progress counters — confirm against the API.
Current int `json:"current,omitempty"`
CurrentURL string `json:"current_url,omitempty"`
CurrentStep string `json:"current_step,omitempty"`
Total int `json:"total,omitempty"`
JobID string `json:"jobId,omitempty"`
Data []*FirecrawlDocument `json:"data,omitempty"`
PartialData []*FirecrawlDocument `json:"partial_data,omitempty"`
}
// CancelCrawlJobResponse represents the response for canceling a crawl job.
// Both fields are always present in the marshalled JSON (no `omitempty`).
type CancelCrawlJobResponse struct {
Success bool `json:"success"`
Status string `json:"status"`
}
// requestOptions represents options for making requests.
// Instances are built by newRequestOptions and adjusted through the
// withRetries / withBackoff functional options.
type requestOptions struct {
// Retry count for a request; newRequestOptions initialises it to 1.
retries int
// Backoff interval between retries, documented by withBackoff as milliseconds.
backoff int
}
// requestOption is a functional option type for requestOptions.
// Each option receives the options struct by pointer and mutates it in place;
// newRequestOptions applies them in the order given.
type requestOption func(*requestOptions)
// newRequestOptions builds the option set used for a single request.
//
// It starts from the defaults (retries = 1, backoff = 0) and then applies the
// supplied functional options in the order they were passed, so later options
// override earlier ones.
//
// Parameters:
//   - opts: Optional request options.
//
// Returns:
//   - *requestOptions: The defaults with every supplied option applied.
func newRequestOptions(opts ...requestOption) *requestOptions {
	cfg := new(requestOptions)
	cfg.retries = 1
	for i := range opts {
		opts[i](cfg)
	}
	return cfg
}
// withRetries returns a functional option that overrides the retry count.
//
// Parameters:
//   - retries: The number of retries to be performed.
//
// Returns:
//   - requestOption: An option that stores retries on the target requestOptions.
func withRetries(retries int) requestOption {
	setRetries := func(target *requestOptions) {
		target.retries = retries
	}
	return setRetries
}
// withBackoff returns a functional option that overrides the backoff interval.
//
// Parameters:
//   - backoff: The backoff interval (in milliseconds) to be used for retries.
//
// Returns:
//   - requestOption: An option that stores backoff on the target requestOptions.
func withBackoff(backoff int) requestOption {
	setBackoff := func(target *requestOptions) {
		target.backoff = backoff
	}
	return setBackoff
}
// FirecrawlApp represents a client for the Firecrawl API.
// Construct it with NewFirecrawlApp, which resolves the key and URL from the
// arguments or the environment.
type FirecrawlApp struct {
// API key used to authenticate requests.
APIKey string
// Base URL of the API; endpoint paths such as "/v0/scrape" are appended to it.
APIURL string
// HTTP client used for requests; NewFirecrawlApp sets a 60-second timeout.
Client *http.Client
}
// NewFirecrawlApp builds a Firecrawl API client.
//
// Explicit arguments take precedence; an empty argument falls back to the
// corresponding environment variable. A missing API key is an error, while a
// missing API URL falls back to the hosted service.
//
// Parameters:
//   - apiKey: The API key for authenticating with the Firecrawl API. If empty, the FIRECRAWL_API_KEY environment variable is used.
//   - apiURL: The base URL for the Firecrawl API. If empty, the FIRECRAWL_API_URL environment variable is used, defaulting to "https://api.firecrawl.dev".
//
// Returns:
//   - *FirecrawlApp: A client configured with the resolved key, URL and an HTTP client with a 60-second timeout.
//   - error: Non-nil when no API key could be resolved.
func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) {
	key := apiKey
	if key == "" {
		key = os.Getenv("FIRECRAWL_API_KEY")
	}
	if key == "" {
		return nil, fmt.Errorf("no API key provided")
	}

	baseURL := apiURL
	if baseURL == "" {
		baseURL = os.Getenv("FIRECRAWL_API_URL")
	}
	if baseURL == "" {
		baseURL = "https://api.firecrawl.dev"
	}

	return &FirecrawlApp{
		APIKey: key,
		APIURL: baseURL,
		Client: &http.Client{Timeout: 60 * time.Second},
	}, nil
}
// ScrapeURL scrapes the content of the specified URL using the Firecrawl API.
//
// Parameters:
//   - url: The URL to be scraped.
//   - params: Optional parameters for the scrape request (may be nil). Every
//     entry is copied into the request body. The "extractorOptions" entry may
//     be an ExtractorOptions value, a *ExtractorOptions, or any other
//     JSON-serialisable value such as a map[string]any. Typed options get
//     their Mode defaulted to "llm-extraction" when empty; any other value is
//     forwarded unchanged.
//
// Returns:
//   - *FirecrawlDocument: The scraped document data.
//   - error: An error if the request fails, the response cannot be decoded, or
//     the API reports success=false.
func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) {
	headers := app.prepareHeaders("")
	scrapeBody := map[string]any{"url": url}

	// Copy every caller-supplied parameter, extractorOptions included, so that
	// options passed in a form other than the ExtractorOptions struct are not
	// silently dropped from the request. Ranging over a nil map is a no-op.
	for key, value := range params {
		scrapeBody[key] = value
	}

	// Normalise typed extractor options on a private copy; the caller's value
	// is never mutated.
	var typed *ExtractorOptions
	switch opts := params["extractorOptions"].(type) {
	case ExtractorOptions:
		typed = &opts
	case *ExtractorOptions:
		if opts == nil {
			// A nil pointer carries no options; do not send a JSON null.
			delete(scrapeBody, "extractorOptions")
		} else {
			clone := *opts
			typed = &clone
		}
	}
	if typed != nil {
		// A schema value that exposes schema() is replaced by that method's result.
		if schema, ok := typed.ExtractionSchema.(interface{ schema() any }); ok {
			typed.ExtractionSchema = schema.schema()
		}
		if typed.Mode == "" {
			typed.Mode = "llm-extraction"
		}
		scrapeBody["extractorOptions"] = *typed
	}

	resp, err := app.makeRequest(
		http.MethodPost,
		fmt.Sprintf("%s/v0/scrape", app.APIURL),
		scrapeBody,
		headers,
		"scrape URL",
	)
	if err != nil {
		return nil, err
	}

	var scrapeResponse ScrapeResponse
	if err := json.Unmarshal(resp, &scrapeResponse); err != nil {
		return nil, err
	}

	if !scrapeResponse.Success {
		return nil, fmt.Errorf("failed to scrape URL")
	}
	return scrapeResponse.Data, nil
}
// Search sends a query to the Firecrawl search endpoint and returns the matching documents.
//
// Parameters:
//   - query: The search query string.
//   - params: Optional extra fields merged into the request body (may be nil).
//
// Returns:
//   - []*FirecrawlDocument: The documents returned by the API.
//   - error: An error if the request fails, the response cannot be decoded, or the API reports failure.
func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) {
	payload := map[string]any{"query": query}
	for name, value := range params {
		payload[name] = value
	}

	endpoint := fmt.Sprintf("%s/v0/search", app.APIURL)
	raw, err := app.makeRequest(http.MethodPost, endpoint, payload, app.prepareHeaders(""), "search")
	if err != nil {
		return nil, err
	}

	var parsed SearchResponse
	if decodeErr := json.Unmarshal(raw, &parsed); decodeErr != nil {
		return nil, decodeErr
	}

	if !parsed.Success {
		return nil, fmt.Errorf("failed to search")
	}
	return parsed.Data, nil
}
// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
//
// Parameters:
//   - url: The URL to crawl.
//   - params: Optional parameters for the crawl request.
//   - waitUntilDone: If true, the method will wait until the crawl job is completed before returning.
//   - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true.
//   - idempotencyKey: An optional idempotency key to ensure the request is idempotent.
//
// Returns:
//   - any: The job ID (string) if waitUntilDone is false, or the crawl result
//     ([]*FirecrawlDocument) if waitUntilDone is true. It is an untyped nil whenever error is non-nil.
//   - error: An error if the crawl request fails, no job ID is returned, or monitoring fails.
func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
	headers := app.prepareHeaders(idempotencyKey)

	crawlBody := map[string]any{"url": url}
	for k, v := range params {
		crawlBody[k] = v
	}

	resp, err := app.makeRequest(
		http.MethodPost,
		fmt.Sprintf("%s/v0/crawl", app.APIURL),
		crawlBody,
		headers,
		"start crawl job",
		withRetries(3),
		withBackoff(500),
	)
	if err != nil {
		return nil, err
	}

	var crawlResponse CrawlResponse
	if err := json.Unmarshal(resp, &crawlResponse); err != nil {
		return nil, err
	}

	// Validate the job ID before either returning it or polling with it;
	// otherwise monitoring would query the status endpoint with an empty ID.
	if crawlResponse.JobID == "" {
		return nil, fmt.Errorf("failed to get job ID")
	}
	if !waitUntilDone {
		return crawlResponse.JobID, nil
	}

	documents, err := app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval)
	if err != nil {
		// Return an untyped nil so callers comparing the result with nil are not
		// fooled by a nil slice wrapped in a non-nil interface value.
		return nil, err
	}
	return documents, nil
}
// CheckCrawlStatus fetches the current state of a crawl job from the Firecrawl API.
// The request is attempted up to 3 times with a 500 ms base backoff.
//
// Parameters:
//   - jobID: The ID of the crawl job to check.
//
// Returns:
//   - *JobStatusResponse: The decoded status payload.
//   - error: An error if the request fails or the response cannot be decoded.
func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) {
	endpoint := fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID)

	raw, err := app.makeRequest(
		http.MethodGet,
		endpoint,
		nil,
		app.prepareHeaders(""),
		"check crawl status",
		withRetries(3),
		withBackoff(500),
	)
	if err != nil {
		return nil, err
	}

	status := new(JobStatusResponse)
	if decodeErr := json.Unmarshal(raw, status); decodeErr != nil {
		return nil, decodeErr
	}
	return status, nil
}
// CancelCrawlJob asks the Firecrawl API to cancel a crawl job.
//
// Parameters:
//   - jobID: The ID of the crawl job to cancel.
//
// Returns:
//   - string: The status field of the cancellation response.
//   - error: An error if the request fails or the response cannot be decoded.
func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
	endpoint := fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID)

	raw, err := app.makeRequest(http.MethodDelete, endpoint, nil, app.prepareHeaders(""), "cancel crawl job")
	if err != nil {
		return "", err
	}

	var parsed CancelCrawlJobResponse
	if decodeErr := json.Unmarshal(raw, &parsed); decodeErr != nil {
		return "", decodeErr
	}
	return parsed.Status, nil
}
// prepareHeaders builds the header set shared by all API requests: a JSON
// content type and a Bearer authorization header carrying the API key.
//
// Parameters:
//   - idempotencyKey: Value for the "x-idempotency-key" header. An empty string
//     means the header is omitted.
//
// Returns:
//   - map[string]string: A freshly allocated map of header names to values.
func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
	result := make(map[string]string, 3)
	result["Content-Type"] = "application/json"
	result["Authorization"] = "Bearer " + app.APIKey

	if idempotencyKey == "" {
		return result
	}
	result["x-idempotency-key"] = idempotencyKey
	return result
}
// makeRequest makes a request to the specified URL with the provided method, data, headers, and options.
//
// The request is attempted up to options.retries times (at least once). Only a
// 502 Bad Gateway response triggers another attempt; between attempts it waits
// 2^attempt * backoff milliseconds. A new *http.Request is built for every
// attempt because a request body can only be read once, so re-sending the same
// request object would transmit an already-consumed body.
//
// Parameters:
//   - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE").
//   - url: The URL to send the request to.
//   - data: The data to be sent as a JSON request body; nil sends no body.
//   - headers: The headers to be included in the request.
//   - action: A string describing the action being performed, used in error messages.
//   - opts: Optional request options (retries, backoff).
//
// Returns:
//   - []byte: The response body when the final status code is 200.
//   - error: An error if encoding, sending or reading fails, or the status code is not 200.
func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) {
	var body []byte
	if data != nil {
		encoded, err := json.Marshal(data)
		if err != nil {
			return nil, err
		}
		body = encoded
	}

	options := newRequestOptions(opts...)
	// Guard against a zero or negative retry count, which would otherwise skip
	// the loop entirely and leave resp nil.
	attempts := options.retries
	if attempts < 1 {
		attempts = 1
	}

	var resp *http.Response
	for i := 0; i < attempts; i++ {
		req, err := http.NewRequest(method, url, bytes.NewReader(body))
		if err != nil {
			return nil, err
		}
		for key, value := range headers {
			req.Header.Set(key, value)
		}

		resp, err = app.Client.Do(req)
		if err != nil {
			return nil, err
		}

		// Keep this response if it is final: any status other than 502, or no attempts left.
		if resp.StatusCode != http.StatusBadGateway || i == attempts-1 {
			break
		}

		// Release the connection of the discarded response right away instead
		// of deferring every retried body until the function returns.
		resp.Body.Close()
		time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond)
	}
	defer resp.Body.Close()

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	if resp.StatusCode != http.StatusOK {
		return nil, app.handleError(resp.StatusCode, respBody, action)
	}
	return respBody, nil
}
// monitorJobStatus polls the crawl status endpoint until the job finishes.
//
// The loop waits between every poll, including the case where the job reports
// "completed" but has not delivered data yet, so the API is never hit in a
// tight loop.
//
// Parameters:
//   - jobID: The ID of the crawl job to monitor.
//   - headers: The headers to be included in the request.
//   - pollInterval: The interval (in seconds) at which to poll the job status; values below 2 are raised to 2.
//
// Returns:
//   - []*FirecrawlDocument: The crawl result if the job is completed.
//   - error: An error if a status request fails, the response has no status, the
//     job ends in a non-completed state, or a completed job still has no data
//     after several checks.
func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) {
	wait := time.Duration(max(pollInterval, 2)) * time.Second
	attempts := 0

	for {
		resp, err := app.makeRequest(
			http.MethodGet,
			fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
			nil,
			headers,
			"check crawl status",
			withRetries(3),
			withBackoff(500),
		)
		if err != nil {
			return nil, err
		}

		var statusData JobStatusResponse
		if err := json.Unmarshal(resp, &statusData); err != nil {
			return nil, err
		}

		switch status := statusData.Status; status {
		case "":
			return nil, fmt.Errorf("invalid status in response")
		case "completed":
			if statusData.Data != nil {
				return statusData.Data, nil
			}
			// Completed but the data is not available yet: re-check a few times.
			attempts++
			if attempts > 3 {
				return nil, fmt.Errorf("crawl job completed but no data was returned")
			}
		case "active", "paused", "pending", "queued", "waiting":
			// Still in progress; fall through to the wait below.
		default:
			return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
		}

		time.Sleep(wait)
	}
}
// handleError converts a non-200 API response into a descriptive error.
//
// Parameters:
//   - statusCode: The HTTP status code of the response.
//   - body: The response body, expected to be a JSON object with an optional "error" string.
//   - action: A string describing the action being performed.
//
// Returns:
//   - error: An error describing the failure reason. If the body is not valid
//     JSON, the error reports the parse failure together with the status code.
func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error {
	var errorData map[string]any
	if err := json.Unmarshal(body, &errorData); err != nil {
		// Keep the status code: a non-JSON body (e.g. a proxy error page) would otherwise hide it.
		return fmt.Errorf("failed to parse error response (status code %d while trying to %s): %v", statusCode, action, err)
	}

	errorMessage, _ := errorData["error"].(string)
	if errorMessage == "" {
		errorMessage = "No additional error details provided."
	}

	var message string
	switch statusCode {
	case 402:
		message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
	case 408:
		message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
	case 409:
		message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
	case 500:
		message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
	default:
		message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage)
	}

	// The message contains server-provided text, so it must not be used as a
	// format string: a literal '%' would be rendered as "%!x(MISSING)".
	return fmt.Errorf("%s", message)
}

View File

@ -0,0 +1,292 @@
package firecrawl
import (
"log"
"os"
"testing"
"time"
"github.com/google/uuid"
"github.com/joho/godotenv"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// API_URL and TEST_API_KEY hold the endpoint and credentials used by the E2E
// tests. They are read from the environment in init.
var API_URL string
var TEST_API_KEY string

// init loads ../.env when present and then reads the test configuration from
// the environment. A missing .env file is not fatal, so the tests can also run
// where the variables are already exported in the process environment.
func init() {
	if err := godotenv.Load("../.env"); err != nil {
		log.Printf("could not load ../.env (%v); using process environment only", err)
	}
	API_URL = os.Getenv("API_URL")
	TEST_API_KEY = os.Getenv("TEST_API_KEY")
}
// TestNoAPIKey verifies that the constructor fails when no key is passed and
// none is available from the environment.
func TestNoAPIKey(t *testing.T) {
	// NewFirecrawlApp falls back to FIRECRAWL_API_KEY, so clear it for this test.
	t.Setenv("FIRECRAWL_API_KEY", "")

	_, err := NewFirecrawlApp("", API_URL)
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "no API key provided")
}
// TestScrapeURLInvalidAPIKey verifies that scraping with a bad key reports a 401 error.
func TestScrapeURLInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.ScrapeURL("https://firecrawl.dev", nil)
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token")
}
// TestBlocklistedURL verifies that scraping a blocklisted URL is rejected with a 403 error.
func TestBlocklistedURL(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	_, err = app.ScrapeURL("https://facebook.com/fake-test", nil)
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
}
// TestSuccessfulResponseWithValidPreviewToken verifies that the preview token can scrape a page.
func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) {
	app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL)
	require.NoError(t, err)

	response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
	require.NoError(t, err)
	// Stop here on a nil document; the field access below would panic otherwise.
	require.NotNil(t, response)

	assert.Contains(t, response.Content, "_Roast_")
}
// TestScrapeURLE2E verifies a default scrape: content and markdown are
// returned, metadata is present, and HTML is not included.
func TestScrapeURLE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
	require.NoError(t, err)
	// Stop here on a nil document; the field accesses below would panic otherwise.
	require.NotNil(t, response)

	assert.Contains(t, response.Content, "_Roast_")
	// testify expects (expected, actual) order so failure messages read correctly.
	assert.NotEqual(t, "", response.Markdown)
	assert.NotNil(t, response.Metadata)
	assert.Equal(t, "", response.HTML)
}
// TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML verifies that the
// includeHtml page option adds HTML to the scraped document.
func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"pageOptions": map[string]any{
			"includeHtml": true,
		},
	}

	response, err := app.ScrapeURL("https://roastmywebsite.ai", params)
	require.NoError(t, err)
	// Stop here on a nil document; the field accesses below would panic otherwise.
	require.NotNil(t, response)

	assert.Contains(t, response.Content, "_Roast_")
	assert.Contains(t, response.Markdown, "_Roast_")
	assert.Contains(t, response.HTML, "<h1")
	assert.NotNil(t, response.Metadata)
}
// TestSuccessfulResponseForValidScrapeWithPDFFile verifies that a PDF URL is scraped into text.
func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil)
	require.NoError(t, err)
	// Stop here on a nil document; the field accesses below would panic otherwise.
	require.NotNil(t, response)

	assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
	assert.NotNil(t, response.Metadata)
}
// TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension
// verifies that a PDF served from a URL without a ".pdf" suffix is still scraped into text.
func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil)
	// NOTE(review): this pause runs after the response has arrived, so it looks
	// like throttling between tests rather than waiting for a result; confirm.
	time.Sleep(6 * time.Second) // wait for 6 seconds
	require.NoError(t, err)
	// Stop here on a nil document; the field accesses below would panic otherwise.
	require.NotNil(t, response)

	assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
	assert.NotNil(t, response.Metadata)
}
// TestCrawlURLInvalidAPIKey verifies that starting a crawl with a bad key reports a 401 error.
func TestCrawlURLInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token")
}
// TestShouldReturnErrorForBlocklistedURL verifies that crawling a blocklisted URL is rejected with a 403 error.
func TestShouldReturnErrorForBlocklistedURL(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	_, err = app.CrawlURL("https://twitter.com/fake-test", nil, false, 2, "")
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
}
// TestCrawlURLWaitForCompletionE2E verifies that a blocking crawl returns the crawled documents.
func TestCrawlURLWaitForCompletionE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}

	response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, "")
	require.NoError(t, err)
	require.NotNil(t, response)

	// Use require so a wrong type or empty result fails the test instead of
	// panicking on data[0].
	data, ok := response.([]*FirecrawlDocument)
	require.True(t, ok)
	require.Greater(t, len(data), 0)
	assert.Contains(t, data[0].Content, "_Roast_")
}
// TestCrawlURLWithIdempotencyKeyE2E verifies that a crawl succeeds with a fresh
// idempotency key and that reusing the same key is rejected with a conflict.
func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	uniqueIdempotencyKey := uuid.New().String()
	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}

	response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, uniqueIdempotencyKey)
	require.NoError(t, err)
	require.NotNil(t, response)

	// Use require so a wrong type or empty result fails the test instead of
	// panicking on data[0].
	data, ok := response.([]*FirecrawlDocument)
	require.True(t, ok)
	require.Greater(t, len(data), 0)
	assert.Contains(t, data[0].Content, "_Roast_")

	// Second request with the same key must be refused.
	_, err = app.CrawlURL("https://firecrawl.dev", params, true, 2, uniqueIdempotencyKey)
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used")
}
// TestCheckCrawlStatusE2E starts a non-blocking crawl and verifies that its
// status becomes "completed" with data within 30 seconds.
func TestCheckCrawlStatusE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}

	response, err := app.CrawlURL("https://firecrawl.dev", params, false, 2, "")
	require.NoError(t, err)
	require.NotNil(t, response)

	jobID, ok := response.(string)
	require.True(t, ok)
	require.NotEqual(t, "", jobID)

	// Poll instead of sleeping a fixed 30 seconds: the test finishes as soon as
	// the job completes and still makes a final check after the deadline.
	deadline := time.Now().Add(30 * time.Second)
	var statusResponse *JobStatusResponse
	for {
		statusResponse, err = app.CheckCrawlStatus(jobID)
		require.NoError(t, err)
		require.NotNil(t, statusResponse)
		if statusResponse.Status == "completed" || time.Now().After(deadline) {
			break
		}
		time.Sleep(2 * time.Second)
	}

	assert.Equal(t, "completed", statusResponse.Status)
	assert.Greater(t, len(statusResponse.Data), 0)
}
// TestSearchE2E verifies that a search returns more than two documents with content.
func TestSearchE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.Search("test query", nil)
	require.NoError(t, err)
	require.NotNil(t, response)
	// Use require so an empty result fails the test instead of panicking on response[0].
	require.Greater(t, len(response), 2)
	require.NotNil(t, response[0])

	assert.NotEqual(t, "", response[0].Content)
}
// TestSearchInvalidAPIKey verifies that searching with a bad key reports a 401 error.
func TestSearchInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.Search("test query", nil)
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Unexpected error during search: Status code 401. Unauthorized: Invalid token")
}
// TestLLMExtraction verifies that a scrape with extractor options returns
// structured data matching the requested JSON schema.
func TestLLMExtraction(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"extractorOptions": ExtractorOptions{
			Mode:             "llm-extraction",
			ExtractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
			ExtractionSchema: map[string]any{
				"type": "object",
				"properties": map[string]any{
					"company_mission": map[string]string{"type": "string"},
					"supports_sso":    map[string]string{"type": "boolean"},
					"is_open_source":  map[string]string{"type": "boolean"},
				},
				"required": []string{"company_mission", "supports_sso", "is_open_source"},
			},
		},
	}

	response, err := app.ScrapeURL("https://mendable.ai", params)
	require.NoError(t, err)
	// Stop here on a nil document; the field accesses below would panic otherwise.
	require.NotNil(t, response)

	assert.Contains(t, response.LLMExtraction, "company_mission")
	assert.IsType(t, true, response.LLMExtraction["supports_sso"])
	assert.IsType(t, true, response.LLMExtraction["is_open_source"])
}
// TestCancelCrawlJobInvalidAPIKey verifies that cancelling with a bad key reports a 401 error.
func TestCancelCrawlJobInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.CancelCrawlJob("test query")
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Unexpected error during cancel crawl job: Status code 401. Unauthorized: Invalid token")
}
// TestCancelNonExistingCrawlJob verifies that cancelling an unknown job ID reports "Job not found".
func TestCancelNonExistingCrawlJob(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	// A random UUID stands in for a job that was never created.
	jobID := uuid.New().String()
	_, err = app.CancelCrawlJob(jobID)
	// ErrorContains is nil-safe, unlike calling err.Error() after assert.Error.
	assert.ErrorContains(t, err, "Job not found")
}
// TestCancelCrawlJobE2E starts a non-blocking crawl and verifies that it can be cancelled.
func TestCancelCrawlJobE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
	require.NoError(t, err)
	require.NotNil(t, response)

	// Use require so the cancel request is never sent with a missing job ID.
	jobID, ok := response.(string)
	require.True(t, ok)
	require.NotEqual(t, "", jobID)

	status, err := app.CancelCrawlJob(jobID)
	require.NoError(t, err)
	assert.Equal(t, "cancelled", status)
}

View File

@ -0,0 +1,15 @@
module github.com/mendableai/firecrawl-go
go 1.22.5
require (
github.com/google/uuid v1.6.0
github.com/joho/godotenv v1.5.1
github.com/stretchr/testify v1.9.0
)
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

View File

@ -0,0 +1,14 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -1,16 +1,16 @@
import { v4 as uuidv4 } from 'uuid'; import FirecrawlApp from './firecrawl/src/index'; //'@mendable/firecrawl-js';
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from "zod";
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
// Scrape a website: // Scrape a website:
const scrapeResult = await app.scrapeUrl('firecrawl.dev'); const scrapeResult = await app.scrapeUrl('firecrawl.dev');
console.log(scrapeResult.data.content)
if (scrapeResult.data) {
console.log(scrapeResult.data.markdown)
}
// Crawl a website: // Crawl a website:
const idempotencyKey = uuidv4(); // optional const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
console.log(crawlResult) console.log(crawlResult)
const jobId = await crawlResult['jobId']; const jobId = await crawlResult['jobId'];
@ -19,67 +19,15 @@ console.log(jobId);
let job; let job;
while (true) { while (true) {
job = await app.checkCrawlStatus(jobId); job = await app.checkCrawlStatus(jobId);
if (job.status == 'completed') { if (job.status === 'completed') {
break; break;
} }
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
} }
console.log(job.data[0].content); if (job.data) {
console.log(job.data[0].markdown);
// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)
console.log(searchResult)
// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
top: z
.array(
z.object({
title: z.string(),
points: z.number(),
by: z.string(),
commentsURL: z.string(),
})
)
.length(5)
.describe("Top 5 stories on Hacker News"),
});
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: zodSchema },
});
console.log(llmExtractionResult.data.llm_extraction);
// Define schema to extract contents into using json schema
const jsonSchema = {
"type": "object",
"properties": {
"top": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"points": {"type": "number"},
"by": {"type": "string"},
"commentsURL": {"type": "string"}
},
"required": ["title", "points", "by", "commentsURL"]
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News"
}
},
"required": ["top"]
} }
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { const mapResult = await app.map('https://firecrawl.dev');
extractorOptions: { extractionSchema: jsonSchema }, console.log(mapResult)
});
console.log(llmExtractionResult.data.llm_extraction);

View File

@ -1,5 +1,5 @@
import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js'; import FirecrawlApp from './firecrawl/src/index' //'@mendable/firecrawl-js';
import { z } from "zod"; import { CrawlStatusResponse } from './firecrawl/src/index';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@ -7,7 +7,7 @@ const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
const scrapeResult = await app.scrapeUrl('firecrawl.dev'); const scrapeResult = await app.scrapeUrl('firecrawl.dev');
if (scrapeResult.data) { if (scrapeResult.data) {
console.log(scrapeResult.data.content) console.log(scrapeResult.data.markdown)
} }
// Crawl a website: // Crawl a website:
@ -17,9 +17,9 @@ console.log(crawlResult)
const jobId: string = await crawlResult['jobId']; const jobId: string = await crawlResult['jobId'];
console.log(jobId); console.log(jobId);
let job: JobStatusResponse; let job: CrawlStatusResponse;
while (true) { while (true) {
job = await app.checkCrawlStatus(jobId); job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
if (job.status === 'completed') { if (job.status === 'completed') {
break; break;
} }
@ -27,66 +27,8 @@ while (true) {
} }
if (job.data) { if (job.data) {
console.log(job.data[0].content); console.log(job.data[0].markdown);
}
// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)
// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
top: z
.array(
z.object({
title: z.string(),
points: z.number(),
by: z.string(),
commentsURL: z.string(),
})
)
.length(5)
.describe("Top 5 stories on Hacker News"),
});
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: zodSchema },
});
if (llmExtractionResult.data) {
console.log(llmExtractionResult.data.llm_extraction);
}
// Define schema to extract contents into using json schema
const jsonSchema = {
"type": "object",
"properties": {
"top": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"points": {"type": "number"},
"by": {"type": "string"},
"commentsURL": {"type": "string"}
},
"required": ["title", "points", "by", "commentsURL"]
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News"
}
},
"required": ["top"]
}
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: jsonSchema },
});
if (llmExtractionResult.data) {
console.log(llmExtractionResult.data.llm_extraction);
} }
const mapResult = await app.map('https://firecrawl.dev');
console.log(mapResult)

85
apps/js-sdk/exampleV0.js Normal file
View File

@ -0,0 +1,85 @@
import { v4 as uuidv4 } from 'uuid';
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from "zod";
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

// Scrape a website:
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
if (scrapeResult.data) {
  console.log(scrapeResult.data.content)
}

// Crawl a website:
const idempotencyKey = uuidv4(); // optional
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
console.log(crawlResult)

const jobId = crawlResult['jobId'];
console.log(jobId);

// Statuses that mean the crawl is still running; anything else that is not
// 'completed' is treated as a terminal failure so the loop cannot spin forever.
const inProgressStatuses = ['active', 'paused', 'pending', 'queued', 'waiting'];

let job;
while (true) {
  job = await app.checkCrawlStatus(jobId);
  if (job.status === 'completed') {
    break;
  }
  if (!inProgressStatuses.includes(job.status)) {
    throw new Error(`Crawl job failed or was stopped. Status: ${job.status}`);
  }
  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
}

if (job.data && job.data.length > 0) {
  console.log(job.data[0].content);
}

// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)
console.log(searchResult)

// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: zodSchema },
});

if (llmExtractionResult.data) {
  console.log(llmExtractionResult.data.llm_extraction);
}

// Define schema to extract contents into using json schema
const jsonSchema = {
  "type": "object",
  "properties": {
    "top": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": {"type": "string"},
          "points": {"type": "number"},
          "by": {"type": "string"},
          "commentsURL": {"type": "string"}
        },
        "required": ["title", "points", "by", "commentsURL"]
      },
      "minItems": 5,
      "maxItems": 5,
      "description": "Top 5 stories on Hacker News"
    }
  },
  "required": ["top"]
}

llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: jsonSchema },
});

if (llmExtractionResult.data) {
  console.log(llmExtractionResult.data.llm_extraction);
}

95
apps/js-sdk/exampleV0.ts Normal file
View File

@ -0,0 +1,95 @@
import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js';
import { z } from "zod";
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY", version: "v0"});

// Scrape a website:
const scrapeResult = await app.scrapeUrl('firecrawl.dev') as ScrapeResponseV0;

if (scrapeResult.data) {
  console.log(scrapeResult.data.content)
}

// Crawl a website:
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
console.log(crawlResult)

const jobId: string = await crawlResult['jobId'];
console.log(jobId);

// Statuses that mean the crawl is still running; anything else that is not
// 'completed' is treated as a terminal failure so the loop cannot spin forever.
const inProgressStatuses: string[] = ['active', 'paused', 'pending', 'queued', 'waiting'];

let job: CrawlStatusResponseV0;
while (true) {
  job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0;
  if (job.status === 'completed') {
    break;
  }
  if (!inProgressStatuses.includes(job.status)) {
    throw new Error(`Crawl job failed or was stopped. Status: ${job.status}`);
  }
  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
}

// Guard the index access: a completed crawl may still return an empty list.
if (job.data && job.data.length > 0) {
  console.log(job.data[0].content);
}

// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query) as SearchResponseV0;
if (searchResult.data && searchResult.data.length > 0) {
  console.log(searchResult.data[0].content)
}

// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: zodSchema },
}) as ScrapeResponseV0;

// A scrape returns a single document, not a list, so read the field directly.
if (llmExtractionResult.data) {
  console.log(llmExtractionResult.data.llm_extraction);
}

// Define schema to extract contents into using json schema
const jsonSchema = {
  "type": "object",
  "properties": {
    "top": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": {"type": "string"},
          "points": {"type": "number"},
          "by": {"type": "string"},
          "commentsURL": {"type": "string"}
        },
        "required": ["title", "points", "by", "commentsURL"]
      },
      "minItems": 5,
      "maxItems": 5,
      "description": "Top 5 stories on Hacker News"
    }
  },
  "required": ["top"]
}

llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: jsonSchema },
}) as ScrapeResponseV0;

if (llmExtractionResult.data) {
  console.log(llmExtractionResult.data.llm_extraction);
}

View File

@ -128,3 +128,5 @@ dist
.yarn/build-state.yml .yarn/build-state.yml
.yarn/install-state.gz .yarn/install-state.gz
.pnp.* .pnp.*
build

Some files were not shown because too many files have changed in this diff Show More