mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 12:40:39 +08:00
Merge remote-tracking branch 'origin/v1/node-sdk' into v1/python-sdk
This commit is contained in:
commit
af0e47a30e
20
.github/workflows/check-redis.yml
vendored
20
.github/workflows/check-redis.yml
vendored
@ -1,20 +0,0 @@
|
||||
name: Check Redis
|
||||
on:
|
||||
schedule:
|
||||
- cron: '*/5 * * * *'
|
||||
|
||||
env:
|
||||
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||
|
||||
jobs:
|
||||
clean-jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Send GET request to check queues
|
||||
run: |
|
||||
response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/redis-health)
|
||||
if [ "$response" -ne 200 ]; then
|
||||
echo "Failed to check queues. Response: $response"
|
||||
exit 1
|
||||
fi
|
||||
echo "Successfully checked queues. Response: $response"
|
2
.github/workflows/fly-direct.yml
vendored
2
.github/workflows/fly-direct.yml
vendored
@ -1,7 +1,7 @@
|
||||
name: Fly Deploy Direct
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 */6 * * *'
|
||||
- cron: '0 */2 * * *'
|
||||
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
|
35
.github/workflows/fly.yml
vendored
35
.github/workflows/fly.yml
vendored
@ -169,6 +169,41 @@ jobs:
|
||||
run: npm run test
|
||||
working-directory: ./apps/js-sdk/firecrawl
|
||||
|
||||
go-sdk-tests:
|
||||
name: Go SDK Tests
|
||||
needs: pre-deploy-e2e-tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
redis:
|
||||
image: redis
|
||||
ports:
|
||||
- 6379:6379
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: "go.mod"
|
||||
- name: Install pnpm
|
||||
run: npm install -g pnpm
|
||||
- name: Install dependencies
|
||||
run: pnpm install
|
||||
working-directory: ./apps/api
|
||||
- name: Start the application
|
||||
run: npm start &
|
||||
working-directory: ./apps/api
|
||||
id: start_app
|
||||
- name: Start workers
|
||||
run: npm run workers &
|
||||
working-directory: ./apps/api
|
||||
id: start_workers
|
||||
- name: Install dependencies for Go SDK
|
||||
run: go mod tidy
|
||||
working-directory: ./apps/go-sdk
|
||||
- name: Run tests for Go SDK
|
||||
run: go test -v ./... -timeout 180s
|
||||
working-directory: ./apps/go-sdk/firecrawl
|
||||
|
||||
deploy:
|
||||
name: Deploy app
|
||||
runs-on: ubuntu-latest
|
||||
|
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
[submodule "apps/go-sdk/firecrawl"]
|
||||
path = apps/go-sdk/firecrawl
|
||||
url = https://github.com/mendableai/firecrawl-go
|
||||
[submodule "apps/go-sdk/examples"]
|
||||
path = apps/go-sdk/examples
|
||||
url = https://github.com/mendableai/firecrawl-go-examples
|
@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
|
175
SELF_HOST.md
175
SELF_HOST.md
@ -1,36 +1,76 @@
|
||||
## Self-hosting Firecrawl
|
||||
# Self-hosting Firecrawl
|
||||
|
||||
_We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version._
|
||||
#### Contributor?
|
||||
|
||||
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
|
||||
Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project running locally so you can run it on your own and contribute.
|
||||
|
||||
## Getting Started
|
||||
If you're contributing, note that the process is similar to other open-source repos, i.e., fork Firecrawl, make changes, run tests, PR.
|
||||
|
||||
First, clone this repository and copy the example env file from the API folder `.env.example` to `.env`.
|
||||
If you have any questions or would like help getting on board, join our Discord community [here](https://discord.gg/gSmWdAkdwd) for more information or submit an issue on GitHub [here](https://github.com/mendableai/firecrawl/issues/new/choose)!
|
||||
|
||||
### Steps
|
||||
## Why?
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/mendableai/firecrawl.git
|
||||
cd firecrawl
|
||||
cp ./apps/api/.env.example ./.env
|
||||
```
|
||||
|
||||
2. For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` in `.env` to not use the database authentication:
|
||||
|
||||
```plaintext
|
||||
USE_DB_AUTHENTICATION=false
|
||||
```
|
||||
|
||||
3. Update the Redis URL in the .env file to align with the Docker configuration:
|
||||
|
||||
```plaintext
|
||||
REDIS_URL=redis://redis:6379
|
||||
```
|
||||
|
||||
4. #### Option: Running with TypeScript Playwright Service
|
||||
Self-hosting Firecrawl is particularly beneficial for organizations with stringent security policies that require data to remain within controlled environments. Here are some key reasons to consider self-hosting:
|
||||
|
||||
- **Enhanced Security and Compliance:** By self-hosting, you ensure that all data handling and processing complies with internal and external regulations, keeping sensitive information within your secure infrastructure. Note that Firecrawl is a Mendable product and relies on SOC2 Type2 certification, which means that the platform adheres to high industry standards for managing data security.
|
||||
- **Customizable Services:** Self-hosting allows you to tailor the services, such as the Playwright service, to meet specific needs or handle particular use cases that may not be supported by the standard cloud offering.
|
||||
- **Learning and Community Contribution:** By setting up and maintaining your own instance, you gain a deeper understanding of how Firecrawl works, which can also lead to more meaningful contributions to the project.
|
||||
|
||||
### Considerations
|
||||
|
||||
However, there are some limitations and additional responsibilities to be aware of:
|
||||
|
||||
1. **Limited Access to Fire-engine:** Currently, self-hosted instances of Firecrawl do not have access to Fire-engine, which includes advanced features for handling IP blocks, robot detection mechanisms, and more. This means that while you can manage basic scraping tasks, more complex scenarios might require additional configuration or might not be supported.
|
||||
2. **Manual Configuration Required:** If you need to use scraping methods beyond the basic fetch and Playwright options, you will need to manually configure these in the `.env` file. This requires a deeper understanding of the technologies and might involve more setup time.
|
||||
|
||||
Self-hosting Firecrawl is ideal for those who need full control over their scraping and data processing environments but comes with the trade-off of additional maintenance and configuration efforts.
|
||||
|
||||
## Steps
|
||||
|
||||
1. First, start by installing the dependencies
|
||||
|
||||
- Docker [instructions](https://docs.docker.com/get-docker/)
|
||||
|
||||
|
||||
2. Set environment variables
|
||||
|
||||
Create a `.env` file in the root directory. You can copy over the template in `apps/api/.env.example`.
|
||||
|
||||
To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features).
|
||||
|
||||
`.env:`
|
||||
```
|
||||
# ===== Required ENVS ======
|
||||
NUM_WORKERS_PER_QUEUE=8
|
||||
PORT=3002
|
||||
HOST=0.0.0.0
|
||||
REDIS_URL=redis://redis:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://redis:6379
|
||||
|
||||
## To turn on DB authentication, you need to set up supabase.
|
||||
USE_DB_AUTHENTICATION=false
|
||||
|
||||
# ===== Optional ENVS ======
|
||||
|
||||
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
|
||||
SUPABASE_ANON_TOKEN=
|
||||
SUPABASE_URL=
|
||||
SUPABASE_SERVICE_TOKEN=
|
||||
|
||||
# Other Optionals
|
||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use ScrapingBee to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
```
|
||||
|
||||
3. *(Optional) Running with TypeScript Playwright Service*
|
||||
|
||||
* Update the `docker-compose.yml` file to change the Playwright service:
|
||||
|
||||
@ -49,16 +89,91 @@ First, clone this repository and copy the example env file from the API folder `
|
||||
```
|
||||
|
||||
* Don't forget to set the proxy server in your `.env` file as needed.
|
||||
5. Build and run the Docker containers:
|
||||
|
||||
4. Build and run the Docker containers:
|
||||
|
||||
```bash
|
||||
docker compose build
|
||||
docker compose up
|
||||
```
|
||||
|
||||
|
||||
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
|
||||
|
||||
You should be able to see the Bull Queue Manager UI on `http://localhost:3002/admin/@/queues`.
|
||||
|
||||
5. *(Optional)* Test the API
|
||||
|
||||
If you’d like to test the crawl endpoint, you can run this:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3002/v0/crawl \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"url": "https://mendable.ai"
|
||||
}'
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.
|
||||
|
||||
### Supabase client is not configured
|
||||
|
||||
**Symptom:**
|
||||
```bash
|
||||
[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Attempted to access Supabase client when it's not configured.
|
||||
[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Error inserting scrape event: Error: Supabase client is not configured.
|
||||
```
|
||||
|
||||
**Explanation:**
|
||||
This error occurs because the Supabase client setup is not completed. You should be able to scrape and crawl with no problems. Right now it's not possible to configure Supabase in self-hosted instances.
|
||||
|
||||
### You're bypassing authentication
|
||||
|
||||
**Symptom:**
|
||||
```bash
|
||||
[YYYY-MM-DDTHH:MM:SS.SSSz]WARN - You're bypassing authentication
|
||||
```
|
||||
|
||||
**Explanation:**
|
||||
This warning appears because the Supabase client setup is not completed, so requests are served without authentication. You should be able to scrape and crawl with no problems. Right now it's not possible to configure Supabase in self-hosted instances.
|
||||
|
||||
### Docker containers fail to start
|
||||
|
||||
**Symptom:**
|
||||
Docker containers exit unexpectedly or fail to start.
|
||||
|
||||
**Solution:**
|
||||
Check the Docker logs for any error messages using the command:
|
||||
```bash
|
||||
docker logs [container_name]
|
||||
```
|
||||
|
||||
- Ensure all required environment variables are set correctly in the .env file.
|
||||
- Verify that all Docker services defined in docker-compose.yml are correctly configured and the necessary images are available.
|
||||
|
||||
### Connection issues with Redis
|
||||
|
||||
**Symptom:**
|
||||
Errors related to connecting to Redis, such as timeouts or "Connection refused".
|
||||
|
||||
**Solution:**
|
||||
- Ensure that the Redis service is up and running in your Docker environment.
|
||||
- Verify that the REDIS_URL and REDIS_RATE_LIMIT_URL in your .env file point to the correct Redis instance, and ensure that they match the URL used in the `docker-compose.yaml` file (`redis://redis:6379`).
|
||||
- Check network settings and firewall rules that may block the connection to the Redis port.
|
||||
|
||||
### API endpoint does not respond
|
||||
|
||||
**Symptom:**
|
||||
API requests to the Firecrawl instance timeout or return no response.
|
||||
|
||||
**Solution:**
|
||||
- Ensure that the Firecrawl service is running by checking the Docker container status.
|
||||
- Verify that the PORT and HOST settings in your .env file are correct and that no other service is using the same port.
|
||||
- Check the network configuration to ensure that the host is accessible from the client making the API request.
|
||||
|
||||
By addressing these common issues, you can ensure a smoother setup and operation of your self-hosted Firecrawl instance.
|
||||
|
||||
## Install Firecrawl on a Kubernetes Cluster (Simple Version)
|
||||
|
||||
Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
|
||||
Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
|
@ -2,8 +2,8 @@
|
||||
NUM_WORKERS_PER_QUEUE=8
|
||||
PORT=3002
|
||||
HOST=0.0.0.0
|
||||
REDIS_URL=redis://localhost:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://localhost:6379
|
||||
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
|
||||
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
|
||||
|
||||
## To turn on DB authentication, you need to set up supabase.
|
||||
@ -17,18 +17,27 @@ SUPABASE_URL=
|
||||
SUPABASE_SERVICE_TOKEN=
|
||||
|
||||
# Other Optionals
|
||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit
|
||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
# use if you've set up authentication and want to test with a real API key
|
||||
TEST_API_KEY=
|
||||
# set if you'd like to test the scraping rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_SCRAPE=
|
||||
# set if you'd like to test the crawling rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_CRAWL=
|
||||
# set if you'd like to use ScrapingBee to handle JS blocking
|
||||
SCRAPING_BEE_API_KEY=
|
||||
# add for LLM dependent features (image alt generation, etc.)
|
||||
OPENAI_API_KEY=
|
||||
BULL_AUTH_KEY=@
|
||||
# use if you're configuring basic logging with logtail
|
||||
LOGTAIL_KEY=
|
||||
# set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
LLAMAPARSE_API_KEY=
|
||||
# set if you'd like to send slack server health status messages
|
||||
SLACK_WEBHOOK_URL=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
POSTHOG_API_KEY=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST=
|
||||
|
||||
STRIPE_PRICE_ID_STANDARD=
|
||||
STRIPE_PRICE_ID_SCALE=
|
||||
@ -43,7 +52,8 @@ STRIPE_PRICE_ID_GROWTH_YEARLY=
|
||||
HYPERDX_API_KEY=
|
||||
HDX_NODE_BETA_MODE=1
|
||||
|
||||
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
|
||||
# set if you'd like to use the fire engine closed beta
|
||||
FIRE_ENGINE_BETA_URL=
|
||||
|
||||
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
|
||||
PROXY_SERVER=
|
||||
|
2
apps/api/.gitignore
vendored
2
apps/api/.gitignore
vendored
@ -6,3 +6,5 @@ dump.rdb
|
||||
/mongo-data
|
||||
|
||||
/.next/
|
||||
|
||||
.rdb
|
@ -24,8 +24,8 @@ kill_timeout = '30s'
|
||||
|
||||
[http_service.concurrency]
|
||||
type = "requests"
|
||||
hard_limit = 100
|
||||
soft_limit = 50
|
||||
# hard_limit = 100
|
||||
soft_limit = 100
|
||||
|
||||
[[http_service.checks]]
|
||||
grace_period = "10s"
|
||||
@ -51,12 +51,13 @@ kill_timeout = '30s'
|
||||
|
||||
[services.concurrency]
|
||||
type = 'connections'
|
||||
hard_limit = 25
|
||||
soft_limit = 20
|
||||
# hard_limit = 25
|
||||
soft_limit = 100
|
||||
|
||||
[[vm]]
|
||||
size = 'performance-1x'
|
||||
size = 'performance-2x'
|
||||
processes = ['app','worker']
|
||||
memory = 8192
|
||||
|
||||
|
||||
|
||||
|
@ -24,8 +24,8 @@ kill_timeout = '30s'
|
||||
|
||||
[http_service.concurrency]
|
||||
type = "requests"
|
||||
hard_limit = 200
|
||||
soft_limit = 75
|
||||
# hard_limit = 200
|
||||
soft_limit = 200
|
||||
|
||||
[[http_service.checks]]
|
||||
grace_period = "20s"
|
||||
@ -50,8 +50,8 @@ kill_timeout = '30s'
|
||||
|
||||
[services.concurrency]
|
||||
type = 'connections'
|
||||
hard_limit = 30
|
||||
soft_limit = 12
|
||||
# hard_limit = 30
|
||||
soft_limit = 200
|
||||
|
||||
[[vm]]
|
||||
size = 'performance-4x'
|
||||
|
924
apps/api/openapi-v0.json
Normal file
924
apps/api/openapi-v0.json
Normal file
@ -0,0 +1,924 @@
|
||||
{
|
||||
"openapi": "3.0.0",
|
||||
"info": {
|
||||
"title": "Firecrawl API",
|
||||
"version": "0.0.0",
|
||||
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
|
||||
"contact": {
|
||||
"name": "Firecrawl Support",
|
||||
"url": "https://firecrawl.dev/support",
|
||||
"email": "support@firecrawl.dev"
|
||||
}
|
||||
},
|
||||
"servers": [
|
||||
{
|
||||
"url": "https://api.firecrawl.dev/v0"
|
||||
}
|
||||
],
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScrapeResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl": {
|
||||
"post": {
|
||||
"summary": "Crawl multiple URLs based on options",
|
||||
"operationId": "crawlUrls",
|
||||
"tags": ["Crawling"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The base URL to start crawling from"
|
||||
},
|
||||
"crawlerOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"includes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to include"
|
||||
},
|
||||
"excludes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to exclude"
|
||||
},
|
||||
"generateImgAltText": {
|
||||
"type": "boolean",
|
||||
"description": "Generate alt text for images using LLMs (must have a paid plan)",
|
||||
"default": false
|
||||
},
|
||||
"returnOnlyUrls": {
|
||||
"type": "boolean",
|
||||
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
|
||||
"default": false
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["default", "fast"],
|
||||
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
||||
"default": "default"
|
||||
},
|
||||
"ignoreSitemap": {
|
||||
"type": "boolean",
|
||||
"description": "Ignore the website sitemap when crawling",
|
||||
"default": false
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"default": 10000
|
||||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
|
||||
"default": false
|
||||
},
|
||||
"allowExternalContentLinks": {
|
||||
"type": "boolean",
|
||||
"description": "Allows the crawler to follow links to external websites.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CrawlResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search": {
|
||||
"post": {
|
||||
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
|
||||
"operationId": "searchGoogle",
|
||||
"tags": ["Search"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The query to search for"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"fetchPageContent": {
|
||||
"type": "boolean",
|
||||
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
|
||||
"default": true
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"searchOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results. Max is 20 during beta."
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SearchResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/status/{jobId}": {
|
||||
"get": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Get the status of a crawl job",
|
||||
"operationId": "getCrawlStatus",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Status of the job (completed, active, failed, paused)"
|
||||
},
|
||||
"current": {
|
||||
"type": "integer",
|
||||
"description": "Current page number"
|
||||
},
|
||||
"total": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Data returned from the job (null when it is in progress)"
|
||||
},
|
||||
"partial_data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/cancel/{jobId}": {
|
||||
"delete": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Cancel a crawl job",
|
||||
"operationId": "cancelCrawlJob",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Returns cancelled."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components": {
|
||||
"securitySchemes": {
|
||||
"bearerAuth": {
|
||||
"type": "http",
|
||||
"scheme": "bearer"
|
||||
}
|
||||
},
|
||||
"schemas": {
|
||||
"ScrapeResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlStatusResponseObj": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"SearchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
]
|
||||
}
|
@ -18,8 +18,8 @@
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"summary": "Scrape a single URL",
|
||||
"operationId": "scrape",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
@ -38,89 +38,47 @@
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
"formats": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
|
||||
},
|
||||
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
|
||||
"default": ["markdown"]
|
||||
},
|
||||
"extractorOptions": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"excludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": true
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
@ -317,6 +275,11 @@
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
@ -731,24 +694,42 @@
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Warning message to let you know of any issues."
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
@ -770,27 +751,16 @@
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -800,24 +770,33 @@
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
@ -839,11 +818,11 @@
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
@ -861,34 +840,63 @@
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
"markdown": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -899,8 +907,15 @@
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -57,6 +57,8 @@
|
||||
"@nangohq/node": "^0.40.8",
|
||||
"@sentry/node": "^8.13.0",
|
||||
"@supabase/supabase-js": "^2.44.2",
|
||||
"@types/express-ws": "^3.0.4",
|
||||
"@types/ws": "^8.5.12",
|
||||
"ajv": "^8.16.0",
|
||||
"async": "^3.2.5",
|
||||
"async-mutex": "^0.5.0",
|
||||
@ -71,6 +73,7 @@
|
||||
"date-fns": "^3.6.0",
|
||||
"dotenv": "^16.3.1",
|
||||
"express-rate-limit": "^7.3.1",
|
||||
"express-ws": "^5.0.2",
|
||||
"form-data": "^4.0.0",
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
@ -93,6 +96,7 @@
|
||||
"promptable": "^0.0.10",
|
||||
"puppeteer": "^22.12.1",
|
||||
"rate-limiter-flexible": "2.4.2",
|
||||
"redlock": "5.0.0-beta.2",
|
||||
"resend": "^3.4.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
"scrapingbee": "^1.7.4",
|
||||
@ -104,8 +108,9 @@
|
||||
"unstructured-client": "^0.11.3",
|
||||
"uuid": "^10.0.0",
|
||||
"wordpos": "^2.1.0",
|
||||
"ws": "^8.18.0",
|
||||
"xml2js": "^0.6.2",
|
||||
"zod": "^3.23.4",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.1"
|
||||
},
|
||||
"nodemonConfig": {
|
||||
|
112
apps/api/pnpm-lock.yaml
generated
112
apps/api/pnpm-lock.yaml
generated
@ -41,6 +41,12 @@ importers:
|
||||
'@supabase/supabase-js':
|
||||
specifier: ^2.44.2
|
||||
version: 2.44.2
|
||||
'@types/express-ws':
|
||||
specifier: ^3.0.4
|
||||
version: 3.0.4
|
||||
'@types/ws':
|
||||
specifier: ^8.5.12
|
||||
version: 8.5.12
|
||||
ajv:
|
||||
specifier: ^8.16.0
|
||||
version: 8.16.0
|
||||
@ -83,6 +89,9 @@ importers:
|
||||
express-rate-limit:
|
||||
specifier: ^7.3.1
|
||||
version: 7.3.1(express@4.19.2)
|
||||
express-ws:
|
||||
specifier: ^5.0.2
|
||||
version: 5.0.2(express@4.19.2)
|
||||
form-data:
|
||||
specifier: ^4.0.0
|
||||
version: 4.0.0
|
||||
@ -106,7 +115,7 @@ importers:
|
||||
version: 0.0.28
|
||||
langchain:
|
||||
specifier: ^0.2.8
|
||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
|
||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||
languagedetect:
|
||||
specifier: ^2.0.0
|
||||
version: 2.0.0
|
||||
@ -149,6 +158,9 @@ importers:
|
||||
rate-limiter-flexible:
|
||||
specifier: 2.4.2
|
||||
version: 2.4.2
|
||||
redlock:
|
||||
specifier: 5.0.0-beta.2
|
||||
version: 5.0.0-beta.2
|
||||
resend:
|
||||
specifier: ^3.4.0
|
||||
version: 3.4.0
|
||||
@ -182,11 +194,14 @@ importers:
|
||||
wordpos:
|
||||
specifier: ^2.1.0
|
||||
version: 2.1.0
|
||||
ws:
|
||||
specifier: ^8.18.0
|
||||
version: 8.18.0
|
||||
xml2js:
|
||||
specifier: ^0.6.2
|
||||
version: 0.6.2
|
||||
zod:
|
||||
specifier: ^3.23.4
|
||||
specifier: ^3.23.8
|
||||
version: 3.23.8
|
||||
zod-to-json-schema:
|
||||
specifier: ^3.23.1
|
||||
@ -1556,6 +1571,9 @@ packages:
|
||||
'@types/express-serve-static-core@4.19.3':
|
||||
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==}
|
||||
|
||||
'@types/express@4.17.21':
|
||||
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
|
||||
|
||||
@ -1658,8 +1676,8 @@ packages:
|
||||
'@types/whatwg-url@11.0.5':
|
||||
resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==}
|
||||
|
||||
'@types/ws@8.5.10':
|
||||
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
|
||||
'@types/ws@8.5.12':
|
||||
resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==}
|
||||
|
||||
'@types/yargs-parser@21.0.3':
|
||||
resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==}
|
||||
@ -2413,6 +2431,12 @@ packages:
|
||||
peerDependencies:
|
||||
express: 4 || 5 || ^5.0.0-beta.1
|
||||
|
||||
express-ws@5.0.2:
|
||||
resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==}
|
||||
engines: {node: '>=4.5.0'}
|
||||
peerDependencies:
|
||||
express: ^4.0.0 || ^5.0.0-alpha.1
|
||||
|
||||
express@4.19.2:
|
||||
resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==}
|
||||
engines: {node: '>= 0.10.0'}
|
||||
@ -3950,6 +3974,10 @@ packages:
|
||||
redis@4.6.14:
|
||||
resolution: {integrity: sha512-GrNg/e33HtsQwNXL7kJT+iNFPSwE1IPmd7wzV3j4f2z0EYxZfZE7FVTmUysgAtqQQtg5NXF5SNLR9OdO/UHOfw==}
|
||||
|
||||
redlock@5.0.0-beta.2:
|
||||
resolution: {integrity: sha512-2RDWXg5jgRptDrB1w9O/JgSZC0j7y4SlaXnor93H/UJm/QyDiFgBKNtrh0TI6oCXqYSaSoXxFh6Sd3VtYfhRXw==}
|
||||
engines: {node: '>=12'}
|
||||
|
||||
regenerator-runtime@0.14.1:
|
||||
resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==}
|
||||
|
||||
@ -4540,8 +4568,20 @@ packages:
|
||||
resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==}
|
||||
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
|
||||
|
||||
ws@8.17.1:
|
||||
resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==}
|
||||
ws@7.5.10:
|
||||
resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==}
|
||||
engines: {node: '>=8.3.0'}
|
||||
peerDependencies:
|
||||
bufferutil: ^4.0.1
|
||||
utf-8-validate: ^5.0.2
|
||||
peerDependenciesMeta:
|
||||
bufferutil:
|
||||
optional: true
|
||||
utf-8-validate:
|
||||
optional: true
|
||||
|
||||
ws@8.18.0:
|
||||
resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==}
|
||||
engines: {node: '>=10.0.0'}
|
||||
peerDependencies:
|
||||
bufferutil: ^4.0.1
|
||||
@ -5178,13 +5218,13 @@ snapshots:
|
||||
|
||||
'@js-sdsl/ordered-map@4.4.2': {}
|
||||
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
|
||||
dependencies:
|
||||
ansi-styles: 5.2.0
|
||||
camelcase: 6.3.0
|
||||
decamelize: 1.2.0
|
||||
js-tiktoken: 1.0.12
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
ml-distance: 4.0.1
|
||||
mustache: 4.2.0
|
||||
p-queue: 6.6.2
|
||||
@ -5196,9 +5236,9 @@ snapshots:
|
||||
- langchain
|
||||
- openai
|
||||
|
||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))':
|
||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
js-tiktoken: 1.0.12
|
||||
openai: 4.52.2
|
||||
zod: 3.23.8
|
||||
@ -5207,9 +5247,9 @@ snapshots:
|
||||
- encoding
|
||||
- langchain
|
||||
|
||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
|
||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
js-tiktoken: 1.0.12
|
||||
transitivePeerDependencies:
|
||||
- langchain
|
||||
@ -6367,8 +6407,8 @@ snapshots:
|
||||
dependencies:
|
||||
'@supabase/node-fetch': 2.6.15
|
||||
'@types/phoenix': 1.6.5
|
||||
'@types/ws': 8.5.10
|
||||
ws: 8.17.1
|
||||
'@types/ws': 8.5.12
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- utf-8-validate
|
||||
@ -6465,6 +6505,12 @@ snapshots:
|
||||
'@types/range-parser': 1.2.7
|
||||
'@types/send': 0.17.4
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
dependencies:
|
||||
'@types/express': 4.17.21
|
||||
'@types/express-serve-static-core': 4.19.3
|
||||
'@types/ws': 8.5.12
|
||||
|
||||
'@types/express@4.17.21':
|
||||
dependencies:
|
||||
'@types/body-parser': 1.19.5
|
||||
@ -6588,7 +6634,7 @@ snapshots:
|
||||
dependencies:
|
||||
'@types/webidl-conversions': 7.0.3
|
||||
|
||||
'@types/ws@8.5.10':
|
||||
'@types/ws@8.5.12':
|
||||
dependencies:
|
||||
'@types/node': 20.14.1
|
||||
|
||||
@ -7329,6 +7375,14 @@ snapshots:
|
||||
dependencies:
|
||||
express: 4.19.2
|
||||
|
||||
express-ws@5.0.2(express@4.19.2):
|
||||
dependencies:
|
||||
express: 4.19.2
|
||||
ws: 7.5.10
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- utf-8-validate
|
||||
|
||||
express@4.19.2:
|
||||
dependencies:
|
||||
accepts: 1.3.8
|
||||
@ -8241,17 +8295,17 @@ snapshots:
|
||||
|
||||
kleur@3.0.3: {}
|
||||
|
||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1):
|
||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))
|
||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
|
||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
binary-extensions: 2.3.0
|
||||
js-tiktoken: 1.0.12
|
||||
js-yaml: 4.1.0
|
||||
jsonpointer: 5.0.1
|
||||
langchainhub: 0.0.11
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
ml-distance: 4.0.1
|
||||
openapi-types: 12.1.3
|
||||
p-retry: 4.6.2
|
||||
@ -8271,14 +8325,14 @@ snapshots:
|
||||
pdf-parse: 1.1.1
|
||||
puppeteer: 22.12.1(typescript@5.4.5)
|
||||
redis: 4.6.14
|
||||
ws: 8.17.1
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- openai
|
||||
|
||||
langchainhub@0.0.11: {}
|
||||
|
||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2):
|
||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2):
|
||||
dependencies:
|
||||
'@types/uuid': 9.0.8
|
||||
commander: 10.0.1
|
||||
@ -8287,8 +8341,8 @@ snapshots:
|
||||
p-retry: 4.6.2
|
||||
uuid: 9.0.1
|
||||
optionalDependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||
openai: 4.52.2
|
||||
|
||||
languagedetect@2.0.0: {}
|
||||
@ -8992,7 +9046,7 @@ snapshots:
|
||||
chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070)
|
||||
debug: 4.3.5
|
||||
devtools-protocol: 0.0.1299070
|
||||
ws: 8.17.1
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- supports-color
|
||||
@ -9098,6 +9152,10 @@ snapshots:
|
||||
'@redis/search': 1.1.6(@redis/client@1.5.16)
|
||||
'@redis/time-series': 1.0.5(@redis/client@1.5.16)
|
||||
|
||||
redlock@5.0.0-beta.2:
|
||||
dependencies:
|
||||
node-abort-controller: 3.1.1
|
||||
|
||||
regenerator-runtime@0.14.1: {}
|
||||
|
||||
require-directory@2.1.1: {}
|
||||
@ -9670,7 +9728,9 @@ snapshots:
|
||||
imurmurhash: 0.1.4
|
||||
signal-exit: 4.1.0
|
||||
|
||||
ws@8.17.1: {}
|
||||
ws@7.5.10: {}
|
||||
|
||||
ws@8.18.0: {}
|
||||
|
||||
xml2js@0.6.2:
|
||||
dependencies:
|
||||
|
609
apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
Normal file
609
apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
Normal file
@ -0,0 +1,609 @@
|
||||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import {
|
||||
ScrapeOptions,
|
||||
ScrapeRequest,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
dotenv.config();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe("E2E Tests for v1 API Routes", () => {
|
||||
// Name of the environment flag this suite switches on while it runs.
const AUTH_FLAG_ENV_VAR = "USE_DB_AUTHENTICATION";

// Set the flag to "true" before any test in this suite executes.
beforeAll(() => {
  process.env[AUTH_FLAG_ENV_VAR] = "true";
});

// Remove the flag again once the whole suite has finished.
afterAll(() => {
  delete process.env[AUTH_FLAG_ENV_VAR];
});
|
||||
|
||||
// GET /is-production is called without an Authorization header and must
// answer 200 with an `isProduction` field in the body.
describe("GET /is-production", () => {
  it.concurrent("should return the production status", async () => {
    const res: ScrapeResponseRequestTest = await request(TEST_URL).get("/is-production");
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("isProduction");
  });
});
|
||||
|
||||
describe("POST /v1/scrape", () => {
|
||||
// A bare POST (no Authorization header, no body) must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const res: ScrapeResponseRequestTest = await request(TEST_URL).post("/v1/scrape");

  expect(res.statusCode).toBe(401);
});
|
||||
|
||||
// An authenticated scrape of a facebook.com URL is expected to be refused
// with 403 and the fixed "URL is blocked" error message.
it.concurrent("should throw error for blocklisted URL", async () => {
  const payload: ScrapeRequest = { url: "https://facebook.com/fake-test" };
  const expectedError =
    "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.";

  const res = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toBe(expectedError);
});
|
||||
|
||||
// A syntactically valid request carrying an unknown bearer token must get 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const payload = { url: "https://firecrawl.dev" };

  const res: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer invalid-api-key`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(401);
});
|
||||
|
||||
// Default scrape of roastmywebsite.ai: the response must carry markdown and
// metadata (but neither `content` nor `html`), and every checked metadata
// field must equal the value pinned below.
it.concurrent(
  "should return a successful response with a valid API key",
  async () => {
    // The same title/description strings are expected in several metadata fields.
    const siteTitle = "Roast My Website";
    const siteDescription =
      "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️";

    const payload: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    // Narrow the body to the success shape before reading `data`.
    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = body;

    // Shape of the returned document.
    expect(data).not.toHaveProperty("content");
    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("metadata");
    expect(data).not.toHaveProperty("html");
    expect(data.markdown).toContain("_Roast_");

    // Individual metadata fields.
    const { metadata } = data;
    expect(metadata.error).toBeUndefined();
    expect(metadata.title).toBe(siteTitle);
    expect(metadata.description).toBe(siteDescription);
    expect(metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
    expect(metadata.robots).toBe("follow, index");
    expect(metadata.ogTitle).toBe(siteTitle);
    expect(metadata.ogDescription).toBe(siteDescription);
    expect(metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
    expect(metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
    expect(metadata.ogLocaleAlternate).toStrictEqual([]);
    expect(metadata.ogSiteName).toBe(siteTitle);
    expect(metadata.sourceURL).toBe("https://roastmywebsite.ai");
    expect(metadata.statusCode).toBe(200);
  },
  30000 // 30 seconds timeout
);
|
||||
// Requesting the "markdown" and "html" formats must return both fields plus
// metadata for roastmywebsite.ai.
it.concurrent(
  "should return a successful response with a valid API key and includeHtml set to true",
  async () => {
    const payload: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["markdown", "html"],
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the body to the success shape before reading `data`.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("html");
    expect(data).toHaveProperty("metadata");
    expect(data.markdown).toContain("_Roast_");
    expect(data.html).toContain("<h1");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
|
||||
// Scraping a URL that ends in .pdf must return the extracted text as markdown
// together with metadata reporting a 200 status and no error.
//
// The request is fully awaited before any assertion runs, so no extra delay is
// needed afterwards; the jest timeout (60 s) bounds the slow PDF fetch itself.
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
  const scrapeRequest: ScrapeRequest = {
    url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post('/v1/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send(scrapeRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('data');
  // Narrow the body to the success shape before reading `data`.
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  // Check that markdown exists before inspecting its content, so a missing
  // field fails with a clear message (matches the extension-less PDF test).
  expect(response.body.data).toHaveProperty('markdown');
  expect(response.body.data).toHaveProperty('metadata');
  expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
  expect(response.body.data.metadata.statusCode).toBe(200);
  expect(response.body.data.metadata.error).toBeUndefined();
}, 60000);
|
||||
|
||||
// Same PDF as the ".pdf" test, but addressed by a URL without the extension:
// the response must still contain the extracted markdown and metadata
// reporting a 200 status and no error.
//
// The request is fully awaited before any assertion runs, so no extra delay is
// needed afterwards; the jest timeout (60 s) bounds the slow PDF fetch itself.
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
  const scrapeRequest: ScrapeRequest = {
    url: "https://arxiv.org/pdf/astro-ph/9301001"
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post('/v1/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send(scrapeRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('data');
  // Narrow the body to the success shape before reading `data`.
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  expect(response.body.data).toHaveProperty('markdown');
  expect(response.body.data).toHaveProperty('metadata');
  expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
  expect(response.body.data.metadata.statusCode).toBe(200);
  expect(response.body.data.metadata.error).toBeUndefined();
}, 60000);
|
||||
|
||||
// Scrapes the same page twice with onlyMainContent disabled: first without
// excludeTags (nav link and footer text must be present in the markdown), then
// with excludeTags (both must be gone).
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
  // --- Baseline request: nothing excluded -------------------------------
  const baselineRequest: ScrapeRequest = {
    url: "https://www.scrapethissite.com/",
    onlyMainContent: false // default is true
  };
  const baseline: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(baselineRequest);

  expect(baseline.statusCode).toBe(200);
  expect(baseline.body).toHaveProperty("data");
  // Narrow the body to the success shape before reading `data`.
  if (!("data" in baseline.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  const baselineData = baseline.body.data;
  expect(baselineData).toHaveProperty("markdown");
  expect(baselineData).toHaveProperty("metadata");
  expect(baselineData).not.toHaveProperty("html");
  expect(baselineData.markdown).toContain("[FAQ](/faq/)"); // .nav
  expect(baselineData.markdown).toContain("Hartley Brody 2023"); // #footer

  // --- Second request: nav, footer and <strong> excluded ----------------
  const filteredRequest: ScrapeRequest = {
    url: "https://www.scrapethissite.com/",
    excludeTags: ['.nav', '#footer', 'strong'],
    onlyMainContent: false // default is true
  };
  const filtered: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(filteredRequest);

  expect(filtered.statusCode).toBe(200);
  expect(filtered.body).toHaveProperty("data");
  if (!("data" in filtered.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  const filteredData = filtered.body.data;
  expect(filteredData).toHaveProperty("markdown");
  expect(filteredData).toHaveProperty("metadata");
  expect(filteredData).not.toHaveProperty("html");
  expect(filteredData.markdown).not.toContain("Hartley Brody 2023"); // #footer
  expect(filteredData.markdown).not.toContain("[FAQ](/faq/)"); // .nav
}, 30000);
|
||||
|
||||
// Scraping a page that itself answers with an HTTP error status must still
// succeed at the API level (200 with a `data` document); the upstream status
// is reported in `data.metadata.statusCode`.
//
// One concurrent test is registered per status code. The titles are identical
// to the previous hand-written tests ("... scrape with 400 page", etc.).
// The request is fully awaited before the assertions run, so no additional
// delay is needed after it; the 60 s jest timeout bounds the scrape itself.
for (const upstreamStatus of [400, 401, 403, 404, 405, 500]) {
  it.concurrent(`should return a successful response for a scrape with ${upstreamStatus} page`, async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post('/v1/scrape')
      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
      .set('Content-Type', 'application/json')
      .send({ url: `https://httpstat.us/${upstreamStatus}` });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    // Narrow the body to the success shape before reading `data`.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data).toHaveProperty('markdown');
    expect(response.body.data).toHaveProperty('metadata');
    expect(response.body.data.metadata.statusCode).toBe(upstreamStatus);
  }, 60000);
}
|
||||
|
||||
// A scrape of firecrawl.dev with `timeout: 1000` is expected to be answered
// with HTTP 408. The test itself is limited to 3000 ms.
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
  const payload = { url: "https://firecrawl.dev", timeout: 1000 };

  const res: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(408);
}, 3000);
|
||||
|
||||
// Requesting only the "html" and "rawHtml" formats must return both of those
// fields plus metadata, and must NOT include markdown.
//
// The title names the formats under test so that it is unique within this
// describe block; it previously duplicated the title of the markdown + html
// test, which made failure reports ambiguous.
it.concurrent(
  "should return a successful response with a valid API key and html and rawHtml formats",
  async () => {
    const scrapeRequest: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["html","rawHtml"],
    };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // Narrow the body to the success shape before reading `data`.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data).not.toHaveProperty("markdown");
    expect(response.body.data).toHaveProperty("html");
    expect(response.body.data).toHaveProperty("rawHtml");
    expect(response.body.data).toHaveProperty("metadata");
    expect(response.body.data.html).toContain("<h1");
    expect(response.body.data.rawHtml).toContain("<html");
    expect(response.body.data.metadata.statusCode).toBe(200);
    expect(response.body.data.metadata.error).toBeUndefined();
  },
  30000
);
|
||||
|
||||
// Scrape with `waitFor: 5000` and only the "markdown" format: the markdown of
// the YC companies page must mention "PagerDuty", and no html / links /
// rawHtml fields may be returned.
it.concurrent(
  "should return a successful response with waitFor",
  async () => {
    const payload: ScrapeRequest = {
      url: "https://ycombinator.com/companies",
      formats: ["markdown"],
      waitFor: 5000
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the body to the success shape before reading `data`.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).toHaveProperty("markdown");
    expect(data).not.toHaveProperty("html");
    expect(data).not.toHaveProperty("links");
    expect(data).not.toHaveProperty("rawHtml");
    expect(data).toHaveProperty("metadata");
    expect(data.markdown).toContain("PagerDuty");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
|
||||
|
||||
// Requesting only the "links" format must return a `links` collection that
// contains https://firecrawl.dev, plus metadata, and no html / rawHtml.
it.concurrent(
  "should return a successful response with a valid links on page",
  async () => {
    const payload: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["links"],
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the body to the success shape before reading `data`.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).not.toHaveProperty("html");
    expect(data).not.toHaveProperty("rawHtml");
    expect(data).toHaveProperty("links");
    expect(data).toHaveProperty("metadata");
    expect(data.links).toContain("https://firecrawl.dev");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
|
||||
|
||||
|
||||
});
|
||||
|
||||
describe("POST /v1/map", () => {
|
||||
// A bare POST to /v1/map (no Authorization header, no body) must get 401.
it.concurrent("should require authorization", async () => {
  const res: ScrapeResponseRequestTest = await request(TEST_URL).post("/v1/map");

  expect(res.statusCode).toBe(401);
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://roastmywebsite.ai"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://usemotion.com",
|
||||
search: "pricing"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("usemotion.com/pricing");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
}, 10000)
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: false
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
||||
})
|
||||
|
||||
it.concurrent("should return an error for invalid URL", async () => {
|
||||
const mapRequest = {
|
||||
url: "invalid-url",
|
||||
includeSubdomains: true,
|
||||
search: "test",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(400);
|
||||
expect(response.body).toHaveProperty("success", false);
|
||||
expect(response.body).toHaveProperty("error");
|
||||
});
|
||||
});
|
||||
});
|
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
import { crawlController } from '../crawl'
|
||||
import { crawlController } from '../v0/crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { authenticateUser } from '../v0/auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
import { validateIdempotencyKey } from '../../services/idempotency/validate';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
@ -1,69 +0,0 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
/**
 * Reports the state of a crawl job identified by `req.params.jobId`.
 *
 * Authenticates the caller in CrawlStatus mode, loads the job from the web
 * scraper queue, and responds with its status, progress counters, final data
 * and any partial documents. A job with a "cancelled:<jobId>" key in the
 * queue's Redis client is reported as "failed" with no data.
 *
 * Responds with the auth failure status, 404 when the job is unknown, or 500
 * with the error message on any thrown error.
 */
export async function crawlStatusController(req: Request, res: Response) {
  try {
    const {
      success: authenticated,
      error: authError,
      status: authStatus,
    } = await authenticateUser(req, res, RateLimiterMode.CrawlStatus);
    if (!authenticated) {
      return res.status(authStatus).json({ error: authError });
    }

    const job = await getWebScraperQueue().getJob(req.params.jobId);
    if (!job) {
      return res.status(404).json({ error: "Job not found" });
    }

    const redisClient = await getWebScraperQueue().client;
    const cancelledFlag = await redisClient.exists("cancelled:" + req.params.jobId);
    const wasCancelled = Boolean(cancelledFlag);

    // Progress is only usable when it is an object; otherwise fall back to
    // an all-empty progress record.
    let rawProgress = job.progress;
    if (typeof rawProgress !== 'object') {
      rawProgress = {
        current: 0,
        current_url: '',
        total: 0,
        current_step: '',
        partialDocs: [],
      };
    }
    const typedProgress = rawProgress as {
      current: number,
      current_url: string,
      total: number,
      current_step: string,
      partialDocs: any[]
    };
    const {
      current = 0,
      current_url = '',
      total = 0,
      current_step = '',
      partialDocs = [],
    } = typedProgress;

    // Prefer the documents stored in Supabase when DB authentication is enabled.
    let data = job.returnvalue;
    if (process.env.USE_DB_AUTHENTICATION === "true") {
      const storedJob = await supabaseGetJobById(req.params.jobId);
      if (storedJob) {
        data = storedJob.docs;
      }
    }

    const jobStatus = await job.getState();
    const completedNormally = jobStatus === 'completed' && !wasCancelled;

    res.json({
      status: wasCancelled ? "failed" : jobStatus,
      // progress: job.progress(),
      current,
      current_url,
      current_step,
      total,
      data: wasCancelled || !data ? null : data,
      partial_data: completedNormally ? [] : partialDocs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
@ -1,110 +0,0 @@
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../../src/scraper/WebScraper";
|
||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
/**
 * Handles a crawl request.
 *
 * Steps, in order: authenticate the caller (Crawl rate-limit mode), enforce
 * the optional `x-idempotency-key` header, check that the team has at least
 * one credit, validate the URL against the blocklist, then either
 *   - scrape a single URL inline when `mode === "single_urls"` and the URL
 *     contains no comma, responding with `{ success, documents }`, or
 *   - enqueue a web scraper job and respond with `{ jobId }`.
 *
 * Error responses produced here: the auth failure status, 409 (idempotency
 * key reused), 402 (insufficient credits), 400 (missing url), 403 (blocked
 * url) and 500 with the error message for anything thrown.
 */
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // NOTE(review): the call is not awaited; if createIdempotencyKey
        // returns a promise, its rejections bypass this catch — confirm.
        createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    const url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    const mode = req.body.mode ?? "crawl";

    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

    if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
      try {
        const a = new WebScraperDataProvider();
        await a.setOptions({
          jobId: uuidv4(),
          mode: "single_urls",
          urls: [url],
          crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
          pageOptions: pageOptions,
        });

        // No queue job exists on this inline path, so there is nothing to
        // report progress to. The previous callback called
        // `job.updateProgress(...)`, but `job` is a `const` declared further
        // down in this scope, so invoking the callback failed on an
        // uninitialized binding. A no-op keeps the call shape unchanged.
        const docs = await a.getDocuments(false, () => {});
        return res.json({
          success: true,
          documents: docs,
        });
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const job = await addWebScraperJob({
      url: url,
      mode: mode, // already defaulted to "crawl" above
      crawlerOptions: crawlerOptions,
      team_id: team_id,
      pageOptions: pageOptions,
      origin: req.body.origin ?? defaultOrigin,
    });

    await logCrawl(job.id.toString(), team_id);

    res.json({ jobId: job.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
@ -1,46 +0,0 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
/**
 * Starts a preview crawl for the URL in the request body.
 *
 * Authenticates in Preview mode, rejects missing (400) or blocklisted (403)
 * URLs, then enqueues a web scraper job whose crawler options are forced to
 * `limit: 5` / `maxCrawledLinks: 5`, under the fixed team id "preview" and
 * origin "website-preview". Responds with `{ jobId }`, or 500 with the error
 * message when anything throws.
 */
export async function crawlPreviewController(req: Request, res: Response) {
  try {
    const {
      success: authenticated,
      error: authError,
      status: authStatus,
    } = await authenticateUser(req, res, RateLimiterMode.Preview);
    if (!authenticated) {
      return res.status(authStatus).json({ error: authError });
    }

    // authenticate on supabase
    const targetUrl = req.body.url;
    if (!targetUrl) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(targetUrl)) {
      return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
    }

    const requestedMode = req.body.mode ?? "crawl";
    const requestedCrawlerOptions = req.body.crawlerOptions ?? {};
    const requestedPageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    // Caller-supplied crawler options are kept, but the preview caps always win.
    const previewCrawlerOptions = {
      ...requestedCrawlerOptions,
      limit: 5,
      maxCrawledLinks: 5,
    };

    const queuedJob = await addWebScraperJob({
      url: targetUrl,
      mode: requestedMode ?? "crawl", // fix for single urls not working
      crawlerOptions: previewCrawlerOptions,
      team_id: "preview",
      pageOptions: requestedPageOptions,
      origin: "website-preview",
    });

    res.json({ jobId: queuedJob.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
@ -1,59 +0,0 @@
|
||||
import { Request, Response } from "express";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
/**
 * Reports the state of a preview crawl job identified by `req.params.jobId`.
 *
 * No authentication is performed in this handler. The queue states
 * "waiting", "delayed", "waiting-children", "unknown" and "prioritized" are
 * all reported as "active". Responds 404 when the job is unknown, or 500 with
 * the error message when anything throws.
 */
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
  try {
    const job = await getWebScraperQueue().getJob(req.params.jobId);
    if (!job) {
      return res.status(404).json({ error: "Job not found" });
    }

    // Progress is only usable when it is an object; otherwise fall back to
    // an all-empty progress record.
    let rawProgress = job.progress;
    if (typeof rawProgress !== 'object') {
      rawProgress = {
        current: 0,
        current_url: '',
        total: 0,
        current_step: '',
        partialDocs: [],
      };
    }
    const typedProgress = rawProgress as {
      current: number,
      current_url: string,
      total: number,
      current_step: string,
      partialDocs: any[]
    };
    const {
      current = 0,
      current_url = '',
      total = 0,
      current_step = '',
      partialDocs = [],
    } = typedProgress;

    // Prefer the documents stored in Supabase when DB authentication is enabled.
    let data = job.returnvalue;
    if (process.env.USE_DB_AUTHENTICATION === "true") {
      const storedJob = await supabaseGetJobById(req.params.jobId);
      if (storedJob) {
        data = storedJob.docs;
      }
    }

    // Collapse every "not finished, not failed" queue state into "active".
    let jobStatus = await job.getState();
    switch (jobStatus) {
      case 'waiting':
      case 'delayed':
      case 'waiting-children':
      case 'unknown':
      case 'prioritized':
        jobStatus = 'active';
        break;
      default:
        break;
    }

    res.json({
      status: jobStatus,
      // progress: job.progress(),
      current,
      current_url,
      current_step,
      total,
      data: data || null,
      partial_data: jobStatus === 'completed' ? [] : partialDocs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
@ -1,9 +1,9 @@
|
||||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getWebScraperQueue } from "../../services/queue-service";
|
||||
import { checkAlerts } from "../../services/alerts";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { getScrapeQueue } from "../../../services/queue-service";
|
||||
import { checkAlerts } from "../../../services/alerts";
|
||||
|
||||
export async function cleanBefore24hCompleteJobsController(
|
||||
req: Request,
|
||||
@ -11,13 +11,13 @@ export async function cleanBefore24hCompleteJobsController(
|
||||
) {
|
||||
Logger.info("🐂 Cleaning jobs older than 24h");
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const batchSize = 10;
|
||||
const numberOfBatches = 9; // Adjust based on your needs
|
||||
const completedJobsPromises: Promise<Job[]>[] = [];
|
||||
for (let i = 0; i < numberOfBatches; i++) {
|
||||
completedJobsPromises.push(
|
||||
webScraperQueue.getJobs(
|
||||
scrapeQueue.getJobs(
|
||||
["completed"],
|
||||
i * batchSize,
|
||||
i * batchSize + batchSize,
|
||||
@ -68,10 +68,10 @@ export async function checkQueuesController(req: Request, res: Response) {
|
||||
// Use this as a "health check" that way we dont destroy the server
|
||||
export async function queuesController(req: Request, res: Response) {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
|
||||
const [webScraperActive] = await Promise.all([
|
||||
webScraperQueue.getActiveCount(),
|
||||
scrapeQueue.getActiveCount(),
|
||||
]);
|
||||
|
||||
const noActiveJobs = webScraperActive === 0;
|
@ -1,8 +1,7 @@
|
||||
import { Request, Response } from "express";
|
||||
import Redis from "ioredis";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { sendSlackWebhook } from "../../services/alerts/slack";
|
||||
import { redisRateLimitClient } from "../../services/rate-limiter";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../../services/rate-limiter";
|
||||
|
||||
export async function redisHealthController(req: Request, res: Response) {
|
||||
const retryOperation = async (operation, retries = 3) => {
|
||||
@ -63,22 +62,22 @@ export async function redisHealthController(req: Request, res: Response) {
|
||||
Logger.info(
|
||||
`Redis instances health check: ${JSON.stringify(healthStatus)}`
|
||||
);
|
||||
await sendSlackWebhook(
|
||||
`[REDIS DOWN] Redis instances health check: ${JSON.stringify(
|
||||
healthStatus
|
||||
)}`,
|
||||
true
|
||||
);
|
||||
// await sendSlackWebhook(
|
||||
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
|
||||
// healthStatus
|
||||
// )}`,
|
||||
// true
|
||||
// );
|
||||
return res
|
||||
.status(500)
|
||||
.json({ status: "unhealthy", details: healthStatus });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Redis health check failed: ${error}`);
|
||||
await sendSlackWebhook(
|
||||
`[REDIS DOWN] Redis instances health check: ${error.message}`,
|
||||
true
|
||||
);
|
||||
// await sendSlackWebhook(
|
||||
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
|
||||
// true
|
||||
// );
|
||||
return res
|
||||
.status(500)
|
||||
.json({ status: "unhealthy", message: error.message });
|
@ -1,26 +1,77 @@
|
||||
import { parseApi } from "../../src/lib/parseApi";
|
||||
import { getRateLimiter, } from "../../src/services/rate-limiter";
|
||||
import { AuthResponse, NotificationType, RateLimiterMode } from "../../src/types";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { withAuth } from "../../src/lib/withAuth";
|
||||
import { parseApi } from "../../../src/lib/parseApi";
|
||||
import { getRateLimiter } from "../../../src/services/rate-limiter";
|
||||
import {
|
||||
AuthResponse,
|
||||
NotificationType,
|
||||
RateLimiterMode,
|
||||
} from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { withAuth } from "../../../src/lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../../services/notification/email_notification";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { redlock } from "../../../src/services/redlock";
|
||||
import { getValue } from "../../../src/services/redis";
|
||||
import { setValue } from "../../../src/services/redis";
|
||||
import { validate } from "uuid";
|
||||
|
||||
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
|
||||
/**
 * Reports whether the given string passes the `validate` check imported
 * from the "uuid" package.
 */
function normalizedApiIsUuid(potentialUuid: string): boolean {
  const isValidUuid = validate(potentialUuid);
  return isValidUuid;
}
|
||||
export async function authenticateUser(
|
||||
req,
|
||||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<AuthResponse> {
|
||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||
}
|
||||
function setTrace(team_id: string, api_key: string) {
|
||||
try {
|
||||
setTraceAttributes({
|
||||
team_id,
|
||||
api_key
|
||||
api_key,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Resolves an API key to its team id and price id through the
 * `get_key_and_price_id_2` Supabase RPC.
 *
 * @param normalizedApi - API key passed to the RPC as `api_key`.
 * @returns `{ success: true, teamId, priceId }` taken from the first returned
 *   row; `{ success: false, status: 500, error }` when the RPC reports an
 *   error; `{ success: false, status: 401, error }` when no row matches.
 */
async function getKeyAndPriceId(normalizedApi: string): Promise<{
  success: boolean;
  teamId?: string;
  priceId?: string;
  error?: string;
  status?: number;
}> {
  const { data, error } = await supabase_service.rpc("get_key_and_price_id_2", {
    api_key: normalizedApi,
  });

  if (error) {
    Logger.error(`RPC ERROR (get_key_and_price_id_2): ${error.message}`);
    return {
      success: false,
      error:
        "The server seems overloaded. Please contact hello@firecrawl.com if you aren't sending too many requests at once.",
      status: 500,
    };
  }

  if (!data || data.length === 0) {
    // `error` is always falsy here (handled above), so it must not be
    // dereferenced: reading `error.message` on null would throw instead of
    // returning the 401 result below.
    Logger.warn("Error fetching api key: data is empty");
    // TODO: change this error code ?
    return {
      success: false,
      error: "Unauthorized: Invalid token",
      status: 401,
    };
  }

  const [keyRow] = data;
  return {
    success: true,
    teamId: keyRow.team_id,
    priceId: keyRow.price_id,
  };
}
|
||||
export async function supaAuthenticateUser(
|
||||
req,
|
||||
@ -51,20 +102,83 @@ export async function supaAuthenticateUser(
|
||||
const iptoken = incomingIP + token;
|
||||
|
||||
let rateLimiter: RateLimiterRedis;
|
||||
let subscriptionData: { team_id: string, plan: string } | null = null;
|
||||
let subscriptionData: { team_id: string; plan: string } | null = null;
|
||||
let normalizedApi: string;
|
||||
|
||||
let team_id: string;
|
||||
let cacheKey = "";
|
||||
let redLockKey = "";
|
||||
const lockTTL = 15000; // 15 seconds
|
||||
let teamId: string | null = null;
|
||||
let priceId: string | null = null;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
team_id = "preview";
|
||||
teamId = "preview";
|
||||
} else {
|
||||
normalizedApi = parseApi(token);
|
||||
if (!normalizedApiIsUuid(normalizedApi)) {
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
|
||||
cacheKey = `api_key:${normalizedApi}`;
|
||||
|
||||
try {
|
||||
const teamIdPriceId = await getValue(cacheKey);
|
||||
if (teamIdPriceId) {
|
||||
const { team_id, price_id } = JSON.parse(teamIdPriceId);
|
||||
teamId = team_id;
|
||||
priceId = price_id;
|
||||
} else {
|
||||
const {
|
||||
success,
|
||||
teamId: tId,
|
||||
priceId: pId,
|
||||
error,
|
||||
status,
|
||||
} = await getKeyAndPriceId(normalizedApi);
|
||||
if (!success) {
|
||||
return { success, error, status };
|
||||
}
|
||||
teamId = tId;
|
||||
priceId = pId;
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
10
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error with auth function: ${error.message}`);
|
||||
// const {
|
||||
// success,
|
||||
// teamId: tId,
|
||||
// priceId: pId,
|
||||
// error: e,
|
||||
// status,
|
||||
// } = await getKeyAndPriceId(normalizedApi);
|
||||
// if (!success) {
|
||||
// return { success, error: e, status };
|
||||
// }
|
||||
// teamId = tId;
|
||||
// priceId = pId;
|
||||
// const {
|
||||
// success,
|
||||
// teamId: tId,
|
||||
// priceId: pId,
|
||||
// error: e,
|
||||
// status,
|
||||
// } = await getKeyAndPriceId(normalizedApi);
|
||||
// if (!success) {
|
||||
// return { success, error: e, status };
|
||||
// }
|
||||
// teamId = tId;
|
||||
// priceId = pId;
|
||||
}
|
||||
|
||||
const { data, error } = await supabase_service.rpc(
|
||||
'get_key_and_price_id_2', { api_key: normalizedApi }
|
||||
);
|
||||
// get_key_and_price_id_2 rpc definition:
|
||||
// create or replace function get_key_and_price_id_2(api_key uuid)
|
||||
// returns table(key uuid, team_id uuid, price_id text) as $$
|
||||
@ -82,46 +196,39 @@ export async function supaAuthenticateUser(
|
||||
// end;
|
||||
// $$ language plpgsql;
|
||||
|
||||
if (error) {
|
||||
Logger.warn(`Error fetching key and price_id: ${error.message}`);
|
||||
} else {
|
||||
// console.log('Key and Price ID:', data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
const internal_team_id = data[0].team_id;
|
||||
team_id = internal_team_id;
|
||||
|
||||
const plan = getPlanByPriceId(data[0].price_id);
|
||||
const plan = getPlanByPriceId(priceId);
|
||||
// HyperDX Logging
|
||||
setTrace(team_id, normalizedApi);
|
||||
setTrace(teamId, normalizedApi);
|
||||
subscriptionData = {
|
||||
team_id: team_id,
|
||||
plan: plan
|
||||
}
|
||||
team_id: teamId,
|
||||
plan: plan,
|
||||
};
|
||||
switch (mode) {
|
||||
case RateLimiterMode.Crawl:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Crawl,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Scrape:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Scrape,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Search:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Search,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
|
||||
|
||||
case RateLimiterMode.Preview:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
break;
|
||||
@ -134,7 +241,8 @@ export async function supaAuthenticateUser(
|
||||
}
|
||||
}
|
||||
|
||||
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;
|
||||
const team_endpoint_token =
|
||||
token === "this_is_just_a_preview_token" ? iptoken : teamId;
|
||||
|
||||
try {
|
||||
await rateLimiter.consume(team_endpoint_token);
|
||||
@ -147,7 +255,17 @@ export async function supaAuthenticateUser(
|
||||
const startDate = new Date();
|
||||
const endDate = new Date();
|
||||
endDate.setDate(endDate.getDate() + 7);
|
||||
|
||||
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
|
||||
// Cache longer for 429s
|
||||
if (teamId && priceId && mode !== RateLimiterMode.Preview) {
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
60 // longer than the 10 used for the regular cache write
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||
@ -157,7 +275,9 @@ export async function supaAuthenticateUser(
|
||||
|
||||
if (
|
||||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
|
||||
(mode === RateLimiterMode.Scrape ||
|
||||
mode === RateLimiterMode.Preview ||
|
||||
mode === RateLimiterMode.Search)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
// check the origin of the request and make sure its from firecrawl.dev
|
||||
@ -181,8 +301,6 @@ export async function supaAuthenticateUser(
|
||||
.select("*")
|
||||
.eq("key", normalizedApi);
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
@ -195,26 +313,32 @@ export async function supaAuthenticateUser(
|
||||
subscriptionData = data[0];
|
||||
}
|
||||
|
||||
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
|
||||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: subscriptionData.plan ?? "",
|
||||
};
|
||||
}
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return 'starter';
|
||||
return "starter";
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD:
|
||||
return 'standard';
|
||||
return "standard";
|
||||
case process.env.STRIPE_PRICE_ID_SCALE:
|
||||
return 'scale';
|
||||
return "scale";
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY:
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
return 'hobby';
|
||||
return "hobby";
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
return 'standardnew';
|
||||
return "standardnew";
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH:
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
return 'growth';
|
||||
return "growth";
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY:
|
||||
return "growthdouble";
|
||||
default:
|
||||
return 'free';
|
||||
return "free";
|
||||
}
|
||||
}
|
||||
}
|
@ -1,10 +1,9 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -18,8 +17,9 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
if (!job) {
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
@ -39,27 +39,9 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||
}
|
||||
}
|
||||
|
||||
const jobState = await job.getState();
|
||||
let progress = job.progress;
|
||||
if(typeof progress !== 'object') {
|
||||
progress = {
|
||||
partialDocs: []
|
||||
}
|
||||
}
|
||||
const {
|
||||
partialDocs = []
|
||||
} = progress as { partialDocs: any[] };
|
||||
|
||||
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
|
||||
Logger.info("Billing team for partial docs...");
|
||||
// Note: the credits that we will bill them here might be lower than the actual
|
||||
// due to promises that are not yet resolved
|
||||
await billTeam(team_id, partialDocs.length);
|
||||
}
|
||||
|
||||
try {
|
||||
await (await getWebScraperQueue().client).set("cancelled:" + job.id, "true", "EX", 60 * 60);
|
||||
await job.discard();
|
||||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
}
|
60
apps/api/src/controllers/v0/crawl-status.ts
Normal file
60
apps/api/src/controllers/v0/crawl-status.ts
Normal file
@ -0,0 +1,60 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
|
||||
|
||||
/**
 * v0 crawl status endpoint.
 *
 * Authenticates the caller, loads the stored crawl for `req.params.jobId`,
 * verifies it belongs to the caller's team, and aggregates the state of every
 * scrape job registered for the crawl.
 *
 * Responses:
 *  - auth failure: status/error returned by `authenticateUser`
 *  - 404 when the crawl is unknown, 403 when it belongs to another team
 *  - 200 with `{ status, current, total, data, partial_data }`
 *  - 500 with the error message on any unexpected failure
 */
export async function crawlStatusController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
    }

    if (sc.team_id !== team_id) {
      return res.status(403).json({ error: "Forbidden" });
    }

    const jobIDs = await getCrawlJobs(req.params.jobId);

    const fetchedJobs = await Promise.all(
      jobIDs.map(async (jobId) => {
        const job = await getScrapeQueue().getJob(jobId);

        // The queue may no longer hold this job; skip it instead of crashing
        // the whole status request on `job.id`.
        if (!job) {
          return null;
        }

        if (process.env.USE_DB_AUTHENTICATION === "true") {
          const supabaseData = await supabaseGetJobById(job.id);

          if (supabaseData) {
            job.returnvalue = supabaseData.docs;
          }
        }

        return job;
      })
    );

    const jobs = fetchedJobs
      .filter((job) => !!job)
      .sort((a, b) => a.timestamp - b.timestamp);

    const jobStatuses = await Promise.all(jobs.map((job) => job.getState()));

    // A cancelled crawl is reported as failed; otherwise every job must be
    // completed for the crawl to be completed, and one failed job fails it.
    const jobStatus = sc.cancelled
      ? "failed"
      : jobStatuses.every((x) => x === "completed")
        ? "completed"
        : jobStatuses.some((x) => x === "failed")
          ? "failed"
          : "active";

    const data = jobs.map((job) =>
      Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue
    );

    return res.json({
      status: jobStatus,
      current: jobStatuses.filter((x) => x === "completed" || x === "failed").length,
      total: jobs.length,
      data: jobStatus === "completed" ? data : null,
      partial_data: jobStatus === "completed" ? [] : data.filter((x) => x !== null),
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
171
apps/api/src/controllers/v0/crawl.ts
Normal file
171
apps/api/src/controllers/v0/crawl.ts
Normal file
@ -0,0 +1,171 @@
|
||||
import { Request, Response } from "express";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
|
||||
/**
 * v0 crawl endpoint: validates the request and enqueues the initial scrape
 * job(s) for a new crawl.
 *
 * Flow: authenticate -> optional idempotency-key check -> credit check ->
 * URL validation / blocklist -> store the crawl -> enqueue either one job per
 * sitemap URL or a single job for the origin URL.
 *
 * `req.body.mode` is not consulted: every request is queued as a crawl.
 *
 * Responses: 200 `{ jobId }`, 400 (missing/invalid URL), 402 (no credits),
 * 403 (blocked URL), 409 (idempotency key reused), 500 (unexpected error),
 * or the status/error returned by `authenticateUser`.
 */
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // Awaited so that an asynchronous failure is handled by this catch
        // instead of escaping as an unhandled rejection.
        // NOTE(review): assumes createIdempotencyKey may return a promise;
        // awaiting a plain value is harmless.
        await createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    // Request options override the defaults key by key.
    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

    const id = uuidv4();

    await logCrawl(id, team_id);

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
      createdAt: Date.now(),
    };

    const crawler = crawlToCrawler(id, sc);

    // Best effort: a robots.txt that cannot be fetched must not fail the crawl.
    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {}

    await saveCrawl(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();

    if (sitemap !== null) {
      // One scrape job per sitemap entry, enqueued in bulk.
      const jobs = sitemap.map((x) => {
        const url = x.url;
        const uuid = uuidv4();
        return {
          name: uuid,
          data: {
            url,
            mode: "single_urls",
            crawlerOptions: crawlerOptions,
            team_id: team_id,
            pageOptions: pageOptions,
            origin: req.body.origin ?? defaultOrigin,
            crawl_id: id,
            sitemapped: true,
          },
          opts: {
            jobId: uuid,
            priority: 20,
          },
        };
      });

      await lockURLs(id, jobs.map((x) => x.data.url));
      await addCrawlJobs(id, jobs.map((x) => x.opts.jobId));
      await getScrapeQueue().addBulk(jobs);
    } else {
      await lockURL(id, sc, url);
      const job = await addScrapeJob(
        {
          url,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id: team_id,
          pageOptions: pageOptions,
          origin: req.body.origin ?? defaultOrigin,
          crawl_id: id,
        },
        {
          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
        }
      );
      await addCrawlJob(id, job.id);
    }

    res.json({ jobId: id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
135
apps/api/src/controllers/v0/crawlPreview.ts
Normal file
135
apps/api/src/controllers/v0/crawlPreview.ts
Normal file
@ -0,0 +1,135 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
|
||||
/**
 * v0 crawl preview endpoint: same queueing flow as the regular crawl
 * endpoint, but always billed to the fixed "preview" team and tagged with the
 * "website-preview" origin.
 *
 * Responses: 200 `{ jobId }`, 400 (missing/invalid URL), 403 (blocked URL),
 * 500 (unexpected error), or the status/error returned by `authenticateUser`.
 */
export async function crawlPreviewController(req: Request, res: Response) {
  try {
    const { success, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Preview
    );

    if (!success) {
      return res.status(status).json({ error });
    }

    const team_id = "preview";

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    const id = uuidv4();

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
      createdAt: Date.now(),
    };

    const crawler = crawlToCrawler(id, sc);

    // Best effort: a robots.txt that cannot be fetched must not fail the
    // preview. The call goes through the crawler instance; this handler is a
    // plain function, so `this` has no getRobotsTxt method.
    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {}

    await saveCrawl(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();

    if (sitemap !== null) {
      // Sitemap entries are locked and enqueued one at a time.
      for (const url of sitemap.map((x) => x.url)) {
        await lockURL(id, sc, url);
        const job = await addScrapeJob({
          url,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id: team_id,
          pageOptions: pageOptions,
          origin: "website-preview",
          crawl_id: id,
          sitemapped: true,
        });
        await addCrawlJob(id, job.id);
      }
    } else {
      await lockURL(id, sc, url);
      const job = await addScrapeJob({
        url,
        mode: "single_urls",
        crawlerOptions: crawlerOptions,
        team_id: team_id,
        pageOptions: pageOptions,
        origin: "website-preview",
        crawl_id: id,
      });
      await addCrawlJob(id, job.id);
    }

    res.json({ jobId: id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
@ -1,5 +1,5 @@
|
||||
|
||||
import { AuthResponse, RateLimiterMode } from "../types";
|
||||
import { AuthResponse, RateLimiterMode } from "../../types";
|
||||
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
@ -1,17 +1,17 @@
|
||||
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
||||
import { ExtractorOptions, PageOptions } from './../../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||
import { addScrapeJob } from '../services/queue-jobs';
|
||||
import { scrapeQueueEvents } from '../services/queue-service';
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../lib/default-values';
|
||||
import { addScrapeJob } from '../../services/queue-jobs';
|
||||
import { scrapeQueueEvents } from '../../services/queue-service';
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../lib/logger';
|
||||
import { Logger } from '../../lib/logger';
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
@ -45,7 +45,7 @@ export async function scrapeHelper(
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
});
|
||||
}, {}, jobId);
|
||||
|
||||
let doc;
|
||||
try {
|
||||
@ -62,6 +62,8 @@ export async function scrapeHelper(
|
||||
}
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return { success: true, error: "No page found", returnCode: 200, data: doc };
|
||||
@ -121,13 +123,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
};
|
||||
|
||||
|
||||
// Async check saves 500ms in average case
|
||||
// Don't async check in llm extraction mode as it could be expensive
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
await checkCredits();
|
||||
} else {
|
||||
checkCredits();
|
||||
}
|
||||
await checkCredits();
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
@ -1,14 +1,15 @@
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||
import { search } from "../search";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../../lib/entities";
|
||||
import { search } from "../../search";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getScrapeQueue, scrapeQueueEvents } from "../../services/queue-service";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
@ -75,26 +76,28 @@ export async function searchHelper(
|
||||
|
||||
// filter out social media links
|
||||
|
||||
const jobDatas = res.map(x => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 10,
|
||||
}
|
||||
};
|
||||
})
|
||||
|
||||
const jobs = await getScrapeQueue().addBulk(jobDatas);
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId,
|
||||
mode: "single_urls",
|
||||
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||
crawlerOptions: {
|
||||
...crawlerOptions,
|
||||
},
|
||||
pageOptions: {
|
||||
...pageOptions,
|
||||
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
||||
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
||||
includeHtml: pageOptions?.includeHtml ?? false,
|
||||
removeTags: pageOptions?.removeTags ?? [],
|
||||
fallback: false,
|
||||
},
|
||||
});
|
||||
|
||||
const docs = await a.getDocuments(false);
|
||||
const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);
|
||||
|
||||
if (docs.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
@ -109,19 +112,6 @@ export async function searchHelper(
|
||||
return { success: true, error: "No page found", returnCode: 200, data: docs };
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
filteredDocs.length
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
returnCode: 402,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: filteredDocs,
|
||||
@ -150,8 +140,8 @@ export async function searchController(req: Request, res: Response) {
|
||||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 7 };
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 5 };
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
54
apps/api/src/controllers/v0/status.ts
Normal file
54
apps/api/src/controllers/v0/status.ts
Normal file
@ -0,0 +1,54 @@
|
||||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
|
||||
|
||||
/**
 * v0 preview crawl status endpoint.
 *
 * Performs no authentication in this handler: it loads the stored crawl for
 * `req.params.jobId` and aggregates the state of every scrape job registered
 * for it.
 *
 * Responses: 404 when the crawl is unknown, 200 with
 * `{ status, current, total, data, partial_data }`, 500 with the error
 * message on any unexpected failure.
 */
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
  try {
    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
    }

    const jobIDs = await getCrawlJobs(req.params.jobId);

    const fetchedJobs = await Promise.all(
      jobIDs.map(async (jobId) => {
        const job = await getScrapeQueue().getJob(jobId);

        // The queue may no longer hold this job; skip it instead of crashing
        // the whole status request on `job.id`.
        if (!job) {
          return null;
        }

        if (process.env.USE_DB_AUTHENTICATION === "true") {
          const supabaseData = await supabaseGetJobById(job.id);

          if (supabaseData) {
            job.returnvalue = supabaseData.docs;
          }
        }

        return job;
      })
    );

    const jobs = fetchedJobs
      .filter((job) => !!job)
      .sort((a, b) => a.timestamp - b.timestamp);

    const jobStatuses = await Promise.all(jobs.map((job) => job.getState()));

    // A cancelled crawl is reported as failed; otherwise every job must be
    // completed for the crawl to be completed, and one failed job fails it.
    const jobStatus = sc.cancelled
      ? "failed"
      : jobStatuses.every((x) => x === "completed")
        ? "completed"
        : jobStatuses.some((x) => x === "failed")
          ? "failed"
          : "active";

    const data = jobs.map((job) =>
      Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue
    );

    return res.json({
      status: jobStatus,
      current: jobStatuses.filter((x) => x === "completed" || x === "failed").length,
      total: jobs.length,
      data: jobStatus === "completed" ? data : null,
      partial_data: jobStatus === "completed" ? [] : data.filter((x) => x !== null),
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
|
@ -26,13 +26,7 @@ export async function supaAuthenticateUser(
|
||||
req,
|
||||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
team_id?: string;
|
||||
error?: string;
|
||||
status?: number;
|
||||
plan?: string;
|
||||
}> {
|
||||
): Promise<AuthResponse> {
|
||||
const authHeader = req.headers.authorization;
|
||||
if (!authHeader) {
|
||||
return { success: false, error: "Unauthorized", status: 401 };
|
||||
@ -106,7 +100,7 @@ export async function supaAuthenticateUser(
|
||||
setTrace(team_id, normalizedApi);
|
||||
subscriptionData = {
|
||||
team_id: team_id,
|
||||
plan: plan
|
||||
plan: plan,
|
||||
}
|
||||
switch (mode) {
|
||||
case RateLimiterMode.Crawl:
|
||||
@ -121,6 +115,9 @@ export async function supaAuthenticateUser(
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
case RateLimiterMode.Map:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Map, token);
|
||||
break;
|
||||
|
||||
case RateLimiterMode.Preview:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
@ -157,7 +154,7 @@ export async function supaAuthenticateUser(
|
||||
|
||||
if (
|
||||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
|
||||
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
// check the origin of the request and make sure its from firecrawl.dev
|
||||
@ -195,7 +192,12 @@ export async function supaAuthenticateUser(
|
||||
subscriptionData = data[0];
|
||||
}
|
||||
|
||||
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
|
||||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: subscriptionData.plan ?? "",
|
||||
api_key: normalizedApi
|
||||
};
|
||||
}
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
switch (price_id) {
|
||||
|
148
apps/api/src/controllers/v1/crawl-status-ws.ts
Normal file
148
apps/api/src/controllers/v1/crawl-status-ws.ts
Normal file
@ -0,0 +1,148 @@
|
||||
import { authMiddleware } from "../../routes/v1";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { authenticateUser } from "../v0/auth";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { WebSocket } from "ws";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue, scrapeQueueEvents } from "../../services/queue-service";
|
||||
import { getJob, getJobs } from "./crawl-status";
|
||||
|
||||
/** Payload used when the request is rejected or fails; carries the reason. */
type ErrorMessage = { type: "error"; error: string };

/** Snapshot of the crawl status sent once after the socket is accepted. */
type CatchupMessage = { type: "catchup"; data: CrawlStatusResponse };

/** A single scraped document, pushed when its scrape job completes. */
type DocumentMessage = { type: "document"; data: Document };

/** Payload used when the socket is closed normally. */
type DoneMessage = { type: "done" };

/** Every payload this WebSocket endpoint can emit. */
type Message =
  | ErrorMessage
  | CatchupMessage
  | DoneMessage
  | DocumentMessage;
|
||||
|
||||
/**
 * Serialises `msg` as JSON and writes it to the socket.
 *
 * @returns A promise that resolves with `null` once the frame is written and
 *   rejects with the send error, or `undefined` (nothing is sent) when the
 *   socket's readyState is not 1 (OPEN).
 */
function send(ws: WebSocket, msg: Message) {
  if (ws.readyState !== 1) {
    return;
  }

  const payload = JSON.stringify(msg);
  return new Promise((resolve, reject) => {
    ws.send(payload, (err) => {
      if (err) {
        reject(err);
      } else {
        resolve(null);
      }
    });
  });
}
|
||||
|
||||
/**
 * Closes the socket with `code`, using the JSON-serialised `msg` as the close
 * reason. Does nothing when the socket is already closing or closed
 * (readyState greater than 1).
 *
 * A close frame can carry at most 123 bytes of reason text (RFC 6455 limits
 * control-frame payloads to 125 bytes, 2 of which hold the status code), and
 * the `ws` library throws when given more. A payload that does not fit is
 * therefore delivered as a regular message first (only possible while the
 * socket is OPEN) and the socket is then closed without a reason.
 */
function close(ws: WebSocket, code: number, msg: Message) {
  if (ws.readyState > 1) {
    return;
  }

  const reason = JSON.stringify(msg);
  if (Buffer.byteLength(reason) <= 123) {
    ws.close(code, reason);
    return;
  }

  if (ws.readyState === 1) {
    ws.send(reason);
  }
  ws.close(code);
}
|
||||
|
||||
/**
 * Streams the progress of a crawl over an accepted WebSocket.
 *
 * After checking that the crawl exists and belongs to `req.auth.team_id`, it
 * subscribes to the scrape queue's "completed" events, sends one "catchup"
 * message with everything finished so far, and then pushes a "document"
 * message per newly completed job until the crawl is finished.
 *
 * The queue listener is detached when the crawl finishes and also when the
 * socket closes for any other reason, so disconnected clients do not leave
 * listeners behind.
 */
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return close(ws, 1008, { type: "error", error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return close(ws, 3003, { type: "error", error: "Forbidden" });
  }

  let doneJobIDs = [];

  const completedListener = async (e) => {
    // This runs as an event-emitter callback: nothing awaits it, so every
    // failure has to be handled here.
    try {
      const job = await getScrapeQueue().getJob(e.jobId);
      // The job may already be gone from the queue, or belong to another crawl.
      if (!job || job.data.crawl_id !== req.params.jobId) return;
      // Already covered by the catchup message.
      if (doneJobIDs.includes(job.id)) return;

      const j = await getJob(job.id);
      if (!j || !j.returnvalue) {
        // FAILED
        return;
      }

      await send(ws, {
        type: "document",
        data: legacyDocumentConverter(j.returnvalue),
      });

      if (await isCrawlFinishedLocked(req.params.jobId)) {
        await new Promise((resolve) => setTimeout(() => resolve(true), 5000)); // wait for last events to pour in
        scrapeQueueEvents.removeListener("completed", completedListener);
        close(ws, 1000, { type: "done" });
      }
    } catch (error) {
      Logger.error(error);
    }
  };

  // TODO: handle failed jobs

  scrapeQueueEvents.addListener("completed", completedListener);
  // Detach from the shared queue events as soon as the socket goes away.
  ws.once("close", () => {
    scrapeQueueEvents.removeListener("completed", completedListener);
  });

  doneJobIDs = await getDoneJobsOrdered(req.params.jobId);

  const jobIDs = await getCrawlJobs(req.params.jobId);
  const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x)));
  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled
    ? "cancelled"
    : jobStatuses.every((x) => x === "completed")
      ? "completed"
      : jobStatuses.some((x) => x === "failed")
        ? "failed"
        : "scraping";
  const doneJobs = await getJobs(doneJobIDs);
  const data = doneJobs.map((x) => x.returnvalue);

  send(ws, {
    type: "catchup",
    data: {
      status,
      totalCount: jobIDs.length,
      creditsUsed: jobIDs.length,
      expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
      data: data.map((x) => legacyDocumentConverter(x)),
    },
  });

  // Nothing more will arrive for a crawl that is no longer scraping.
  if (status !== "scraping") {
    scrapeQueueEvents.removeListener("completed", completedListener);
    return close(ws, 1000, { type: "done" });
  }
}
|
||||
|
||||
// Basically just middleware and error wrapping
|
||||
/**
 * WebSocket entry point for crawl status: authenticates the request, attaches
 * the resolved team/plan to `req.auth`, and delegates to `crawlStatusWS`.
 *
 * Authentication failures close the socket with code 3000 and the auth error.
 * Any exception is logged under a freshly generated id and the socket is
 * closed with code 1011 and a generic message that quotes that id.
 */
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  try {
    const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus);
    const { success, team_id, error, plan } = auth;

    if (!success) {
      return close(ws, 3000, { type: "error", error });
    }

    req.auth = { team_id, plan };

    await crawlStatusWS(ws, req);
  } catch (err) {
    const id = uuidv4();

    // Error instances stringify to "{}", so spell out their useful fields.
    const serialized = JSON.stringify(err);
    const verbose =
      serialized === "{}" && err instanceof Error
        ? JSON.stringify({
            message: err.message,
            name: err.name,
            stack: err.stack,
          })
        : serialized;

    Logger.error(`Error occurred in WebSocket! (${req.path}) -- ID ${id} -- ${verbose}`);
    return close(ws, 1011, {
      type: "error",
      error: `An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is ${id}`,
    });
  }
}
|
@ -1,89 +1,115 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addWebScraperJob } from "../../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Response } from "express";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
// TODO: validate req.params.jobId
|
||||
export async function getJob(id: string) {
|
||||
const job = await getScrapeQueue().getJob(id);
|
||||
if (!job) return job;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(id);
|
||||
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (supabaseData) {
|
||||
job.returnvalue = supabaseData.docs;
|
||||
}
|
||||
|
||||
// const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
// if (!job) {
|
||||
// return res.status(404).json({ error: "Job not found" });
|
||||
// }
|
||||
|
||||
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
|
||||
// let data = job.returnvalue;
|
||||
// if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
// const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
// if (supabaseData) {
|
||||
// data = supabaseData.docs;
|
||||
// }
|
||||
// }
|
||||
|
||||
// const jobStatus = await job.getState();
|
||||
|
||||
// mock:
|
||||
const id = uuidv4();
|
||||
const result = {
|
||||
totalCount: 100,
|
||||
creditsUsed: 2,
|
||||
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
|
||||
status: "scraping", // scraping, completed, failed
|
||||
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
data: [{
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
},
|
||||
{
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
}]
|
||||
}
|
||||
|
||||
res.status(200).json(result);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
 * Fetches the scrape-queue jobs for the given IDs.
 *
 * IDs for which the queue returns no job are dropped from the result. When
 * the `USE_DB_AUTHENTICATION` environment variable is `"true"`, the rows
 * returned by `supabaseGetJobsById` are matched to jobs by `job_id` and the
 * row's `docs` replaces the job's `returnvalue`. Finally, any `returnvalue`
 * that is an array is unwrapped to its first element.
 *
 * @param ids Scrape job IDs to look up.
 * @returns The jobs that were found, with `returnvalue` normalized as above.
 */
export async function getJobs(ids: string[]) {
  const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);

  if (process.env.USE_DB_AUTHENTICATION === "true") {
    const supabaseData = await supabaseGetJobsById(ids);

    // Index the jobs once instead of scanning the array for every row.
    // Only the first job per ID is kept, matching what Array.find returned.
    const jobsById = new Map();
    for (const job of jobs) {
      if (!jobsById.has(job.id)) {
        jobsById.set(job.id, job);
      }
    }

    for (const row of supabaseData) {
      const job = jobsById.get(row.job_id);
      if (job) {
        job.returnvalue = row.docs;
      }
    }
  }

  for (const job of jobs) {
    job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
  }

  return jobs;
}
|
||||
|
||||
/**
 * Returns the status of a crawl together with one page of its finished
 * documents.
 *
 * Responds 404 when the crawl is unknown and 403 when it belongs to another
 * team. Pagination is driven by the `skip` and `limit` query parameters;
 * values that are missing or do not parse as integers are ignored. Without a
 * usable `limit`, documents are collected until the serialized size reaches
 * roughly 10 MiB. The response carries a `next` URL unless the crawl is no
 * longer scraping and this page ends at the last finished document.
 *
 * @param req Authenticated request; reads `params.jobId`, `query.skip`,
 *            `query.limit` and `auth.team_id`.
 * @param res Response used to send the JSON payload.
 */
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return res.status(404).json({ success: false, error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return res.status(403).json({ success: false, error: "Forbidden" });
  }

  // parseInt yields NaN for non-numeric input; treat that like an absent
  // parameter so NaN never reaches the range lookup or the `next` URL.
  const parseQueryInt = (value: unknown): number | undefined => {
    if (typeof value !== "string") {
      return undefined;
    }
    const parsed = parseInt(value, 10);
    return Number.isNaN(parsed) ? undefined : parsed;
  };

  const start = parseQueryInt(req.query.skip) ?? 0;
  const limit = parseQueryInt(req.query.limit);
  // Inclusive end index of the requested window; undefined means "no limit".
  const end = limit !== undefined ? (start + limit - 1) : undefined;

  const jobIDs = await getCrawlJobs(req.params.jobId);
  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
  const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
  const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);

  let doneJobs = [];

  if (end === undefined) { // determine 10 megabyte limit
    let bytes = 0;
    const bytesLimit = 10485760; // 10 MiB in bytes
    const factor = 100; // chunking for faster retrieval

    for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
      // get current chunk and retrieve jobs
      const currentIDs = doneJobsOrder.slice(i, i + factor);
      const jobs = await getJobs(currentIDs);

      // iterate through jobs and add them one by one to the byte counter
      // both loops will break once we cross the byte limit
      for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
        const job = jobs[ii];
        doneJobs.push(job);
        // String length is used as an approximation of the payload size.
        bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
      }
    }

    // if we ran over the bytes limit, remove the last document -- unless it
    // is the only one: dropping it would return an empty page whose `next`
    // URL repeats the same `skip`, so the client could never advance
    if (bytes > bytesLimit && doneJobs.length > 1) {
      doneJobs.splice(doneJobs.length - 1, 1);
    }
  } else {
    doneJobs = await getJobs(doneJobsOrder);
  }

  const data = doneJobs.map(x => x.returnvalue);

  const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
  nextURL.searchParams.set("skip", (start + data.length).toString());

  if (limit !== undefined) {
    nextURL.searchParams.set("limit", limit.toString());
  }

  res.status(200).json({
    status,
    totalCount: jobIDs.length,
    creditsUsed: jobIDs.length,
    expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
    next:
      status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
        ? undefined
        : nextURL.href,
    data: data.map(x => legacyDocumentConverter(x)),
  });
}
|
||||
|
||||
|
@ -1,139 +1,126 @@
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
|
||||
import { billTeam } from "../../../src/services/billing/credit_billing";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addWebScraperJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import {
|
||||
CrawlRequest,
|
||||
crawlRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyCrawlerOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobs,
|
||||
crawlToCrawler,
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
// expected req.body
|
||||
export async function crawlController(
|
||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||
res: Response<CrawlResponse>
|
||||
) {
|
||||
req.body = crawlRequestSchema.parse(req.body);
|
||||
|
||||
// req.body = {
|
||||
// url: string
|
||||
// crawlerOptions: {
|
||||
// includePaths: string[]
|
||||
// excludePaths: string[]
|
||||
// maxDepth: number
|
||||
// limit: number
|
||||
// allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
|
||||
// allowExternalLinks: boolean
|
||||
// ignoreSitemap: number
|
||||
// }
|
||||
// scrapeOptions: Exclude<Scrape, "url">
|
||||
// }
|
||||
const id = uuidv4();
|
||||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
const { remainingCredits } = req.account;
|
||||
|
||||
// TODO: Get rid of crawlerOptions
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
e
|
||||
)}`
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ error: "Idempotency key already used" });
|
||||
}
|
||||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res
|
||||
.status(403)
|
||||
.json({
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
url = checkAndUpdateURL(url);
|
||||
} catch (error) {
|
||||
return res.status(400).json({ error: 'Invalid Url' });
|
||||
}
|
||||
|
||||
// TODO: add job to queue
|
||||
|
||||
const id = uuidv4();
|
||||
return res.status(200).json({ jobId: id, url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}` });
|
||||
|
||||
// const mode = req.body.mode ?? "crawl";
|
||||
|
||||
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.progress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
// const job = await addWebScraperJob({
|
||||
// url: url,
|
||||
// mode: mode ?? "crawl", // fix for single urls not working
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// team_id: team_id,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: req.body.origin ?? defaultOrigin,
|
||||
// });
|
||||
|
||||
// await logCrawl(job.id.toString(), team_id);
|
||||
|
||||
// res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null) {
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
jobs.map((x) => x.data.url)
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId)
|
||||
);
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
} else {
|
||||
await lockURL(id, sc, req.body.url);
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
},
|
||||
{
|
||||
priority: 15,
|
||||
}
|
||||
);
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
});
|
||||
}
|
||||
|
@ -1,128 +1,94 @@
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
|
||||
import { billTeam } from "../../../src/services/billing/credit_billing";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addWebScraperJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import {
|
||||
legacyCrawlerOptions,
|
||||
mapRequestSchema,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { MapResponse, MapRequest } from "./types";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
checkAndUpdateURLForMap,
|
||||
isSameDomain,
|
||||
isSameSubdomain,
|
||||
} from "../../lib/validateUrl";
|
||||
import { fireEngineMap } from "../../search/fireEngine";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
|
||||
export async function mapController(req: Request, res: Response) {
|
||||
// expected req.body
|
||||
configDotenv();
|
||||
|
||||
// req.body = {
|
||||
// url: string
|
||||
// ignoreSitemap: true??
|
||||
// other crawler options?
|
||||
// }
|
||||
export async function mapController(
|
||||
req: RequestWithAuth<{}, MapResponse, MapRequest>,
|
||||
res: Response<MapResponse>
|
||||
) {
|
||||
req.body = mapRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
let links: string[] = [req.body.url];
|
||||
|
||||
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: legacyCrawlerOptions(req.body),
|
||||
pageOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
// if (req.headers["x-idempotency-key"]) {
|
||||
// const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
// if (!isIdempotencyValid) {
|
||||
// return res.status(409).json({ error: "Idempotency key already used" });
|
||||
// }
|
||||
// try {
|
||||
// createIdempotencyKey(req);
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
// const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
// await checkTeamCredits(team_id, 1);
|
||||
// if (!creditsCheckSuccess) {
|
||||
// return res.status(402).json({ error: "Insufficient credits" });
|
||||
// }
|
||||
const sitemap =
|
||||
req.body.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res
|
||||
.status(403)
|
||||
.json({
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
url = checkAndUpdateURL(url);
|
||||
} catch (error) {
|
||||
return res.status(400).json({ error: 'Invalid Url' });
|
||||
}
|
||||
|
||||
return res.status(200).json({ urls: [ "test1", "test2" ] });
|
||||
|
||||
// const mode = req.body.mode ?? "crawl";
|
||||
|
||||
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.progress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
// const job = await addWebScraperJob({
|
||||
// url: url,
|
||||
// mode: mode ?? "crawl", // fix for single urls not working
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// team_id: team_id,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: req.body.origin ?? defaultOrigin,
|
||||
// });
|
||||
|
||||
// await logCrawl(job.id.toString(), team_id);
|
||||
|
||||
// res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
if (sitemap !== null) {
|
||||
sitemap.map((x) => {
|
||||
links.push(x.url);
|
||||
});
|
||||
}
|
||||
|
||||
let urlWithoutWww = req.body.url.replace("www.", "");
|
||||
|
||||
let mapUrl = req.body.search
|
||||
? `"${req.body.search}" site:${urlWithoutWww}`
|
||||
: `site:${req.body.url}`;
|
||||
// www. seems to exclude subdomains in some cases
|
||||
const mapResults = await fireEngineMap(mapUrl, {
|
||||
numResults: 50,
|
||||
});
|
||||
|
||||
if (mapResults.length > 0) {
|
||||
if (req.body.search) {
|
||||
// Ensure all map results are first, maintaining their order
|
||||
links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
|
||||
} else {
|
||||
mapResults.map((x) => {
|
||||
links.push(x.url);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
|
||||
|
||||
|
||||
|
||||
// allows for subdomains to be included
|
||||
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||
|
||||
// if includeSubdomains is false, filter out subdomains
|
||||
if (!req.body.includeSubdomains) {
|
||||
links = links.filter((x) => isSameSubdomain(x, req.body.url));
|
||||
}
|
||||
|
||||
// remove duplicates that could be due to http/https or www
|
||||
links = [...new Set(links)];
|
||||
|
||||
await billTeam(req.auth.team_id, 1);
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
links,
|
||||
});
|
||||
}
|
||||
|
@ -1,253 +1,105 @@
|
||||
// import { ExtractorOptions, PageOptions } from './../../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
// import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
// import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
// import { logJob } from "../../services/logging/log_job";
|
||||
// import { Document } from "../../lib/entities";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
// import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
|
||||
// import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../../src/lib/default-values';
|
||||
// import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../../lib/logger';
|
||||
import { checkAndUpdateURL } from '../../lib/validateUrl';
|
||||
import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { scrapeQueueEvents } from '../../services/queue-service';
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
|
||||
req.body = scrapeRequestSchema.parse(req.body);
|
||||
let earlyReturn = false;
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
const origin = req.body.origin;
|
||||
const timeout = req.body.timeout;
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const job = await addScrapeJob({
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions,
|
||||
extractorOptions: {},
|
||||
origin: req.body.origin,
|
||||
}, {}, jobId);
|
||||
|
||||
let doc: any | undefined;
|
||||
try {
|
||||
url = checkAndUpdateURL(url);
|
||||
} catch (error) {
|
||||
return { success: false, error: "Invalid URL", returnCode: 400 };
|
||||
}
|
||||
|
||||
// TODO: check req.body
|
||||
// mockup req.body
|
||||
// req.body = {
|
||||
// url: "test",
|
||||
// headers: {
|
||||
// "x-key": "test"
|
||||
// },
|
||||
// formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
|
||||
// includeTags: ["test"],
|
||||
// excludeTags: ["test"],
|
||||
// onlyMainContent: false,
|
||||
// timeout: 30000,
|
||||
// waitFor: number
|
||||
// }
|
||||
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; // 60 seconds timeout
|
||||
} catch (e) {
|
||||
Logger.error(`Error in scrapeController: ${e}`);
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
return res.status(408).json({
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
});
|
||||
} else {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: "Internal server error",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// check credits
|
||||
await job.remove();
|
||||
|
||||
const result = {
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
warning: "test",
|
||||
data: {
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res.status(200).json(result);
|
||||
|
||||
// const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
// const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
// const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
|
||||
// const origin = req.body.origin ?? defaultOrigin;
|
||||
// let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
// if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// pageOptions.onlyMainContent = true;
|
||||
// timeout = req.body.timeout ?? 90000;
|
||||
// }
|
||||
|
||||
// const checkCredits = async () => {
|
||||
// try {
|
||||
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
|
||||
// if (!creditsCheckSuccess) {
|
||||
// earlyReturn = true;
|
||||
// return res.status(402).json({ error: "Insufficient credits" });
|
||||
// }
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// earlyReturn = true;
|
||||
// return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
|
||||
// }
|
||||
// };
|
||||
|
||||
|
||||
// await checkCredits();
|
||||
|
||||
// const jobId = uuidv4();
|
||||
|
||||
// const startTime = new Date().getTime();
|
||||
// const result = await scrapeHelper(
|
||||
// jobId,
|
||||
// req,
|
||||
// team_id,
|
||||
// crawlerOptions,
|
||||
// pageOptions,
|
||||
// extractorOptions,
|
||||
// timeout,
|
||||
// plan
|
||||
// );
|
||||
// const endTime = new Date().getTime();
|
||||
// const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
// const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
// if (result.success) {
|
||||
// let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
// const creditsPerLLMExtract = 50;
|
||||
|
||||
// if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
// creditsToBeBilled += creditsPerLLMExtract;
|
||||
// }
|
||||
|
||||
// let startTimeBilling = new Date().getTime();
|
||||
|
||||
// if (earlyReturn) {
|
||||
// // Don't bill if we're early returning
|
||||
// return;
|
||||
// }
|
||||
// const billingResult = await billTeam(
|
||||
// team_id,
|
||||
// creditsToBeBilled
|
||||
// );
|
||||
// if (!billingResult.success) {
|
||||
// return res.status(402).json({
|
||||
// success: false,
|
||||
// error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
// });
|
||||
// }
|
||||
// }
|
||||
|
||||
// logJob({
|
||||
// job_id: jobId,
|
||||
// success: result.success,
|
||||
// message: result.error,
|
||||
// num_docs: 1,
|
||||
// docs: [result.data],
|
||||
// time_taken: timeTakenInSeconds,
|
||||
// team_id: team_id,
|
||||
// mode: "scrape",
|
||||
// url: req.body.url,
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: origin,
|
||||
// extractor_options: extractorOptions,
|
||||
// num_tokens: numTokens,
|
||||
// });
|
||||
|
||||
|
||||
// return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
warning: "No page found",
|
||||
data: doc
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
// export async function scrapeHelper(
|
||||
// jobId: string,
|
||||
// req: Request,
|
||||
// team_id: string,
|
||||
// crawlerOptions: any,
|
||||
// pageOptions: PageOptions,
|
||||
// extractorOptions: ExtractorOptions,
|
||||
// timeout: number,
|
||||
// plan?: string
|
||||
// ): Promise<{
|
||||
// success: boolean;
|
||||
// error?: string;
|
||||
// data?: Document;
|
||||
// returnCode: number;
|
||||
// }> {
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
// const url = req.body.url;
|
||||
// if (!url) {
|
||||
// return { success: false, error: "Url is required", returnCode: 400 };
|
||||
// }
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
|
||||
// if (isUrlBlocked(url)) {
|
||||
// return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
// }
|
||||
const billingResult = await billTeam(
|
||||
req.auth.team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId,
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: {
|
||||
// ...crawlerOptions,
|
||||
// },
|
||||
// pageOptions: pageOptions,
|
||||
// extractorOptions: extractorOptions,
|
||||
// });
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: true,
|
||||
message: "Scrape completed",
|
||||
num_docs: 1,
|
||||
docs: [doc],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: req.auth.team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
// const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
|
||||
// setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
|
||||
// );
|
||||
|
||||
// const docsPromise = a.getDocuments(false);
|
||||
|
||||
// let docs;
|
||||
// try {
|
||||
// docs = await Promise.race([docsPromise, timeoutPromise]);
|
||||
// } catch (error) {
|
||||
// return error;
|
||||
// }
|
||||
|
||||
// // make sure doc.content is not empty
|
||||
// let filteredDocs = docs.filter(
|
||||
// (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||
// );
|
||||
// if (filteredDocs.length === 0) {
|
||||
// return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
||||
// }
|
||||
|
||||
|
||||
// // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
// if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||
// filteredDocs.forEach(doc => {
|
||||
// delete doc.rawHtml;
|
||||
// });
|
||||
// }
|
||||
|
||||
// return {
|
||||
// success: true,
|
||||
// data: filteredDocs[0],
|
||||
// returnCode: 200,
|
||||
// };
|
||||
// }
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: legacyDocumentConverter(doc),
|
||||
});
|
||||
}
|
306
apps/api/src/controllers/v1/types.ts
Normal file
306
apps/api/src/controllers/v1/types.ts
Normal file
@ -0,0 +1,306 @@
|
||||
import { Request } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { PageOptions } from "../../lib/entities";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
| "html"
|
||||
| "rawHtml"
|
||||
| "links"
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage";
|
||||
|
||||
/**
 * Shared zod validator for request URLs.
 *
 * The preprocess step prepends a scheme to strings that do not already start
 * with `<scheme>://` (where the scheme contains neither `.` nor `:`); any
 * non-string input is handed on unchanged so `z.string()` can reject it.
 * The validated value must be an http(s) URL that `isUrlBlocked` accepts.
 */
const url = z.preprocess(
  (raw) => {
    if (typeof raw !== "string") {
      return raw;
    }

    // A scheme is already present: leave the value alone.
    if (/^([^.:]+:\/\/)/.test(raw)) {
      return raw;
    }

    // "://host" only lacks the scheme name; anything else lacks the whole prefix.
    return raw.startsWith("://") ? `http${raw}` : `http://${raw}`;
  },
  z
    .string()
    .url()
    .regex(/^https?:\/\//, "URL uses unsupported protocol")
    .refine(
      (candidate) => !isUrlBlocked(candidate),
      "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
    )
);
|
||||
|
||||
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
|
||||
|
||||
/**
 * Schema for the per-page scrape options accepted by the v1 API.
 * Keys not listed here are rejected with `strictMessage`.
 */
export const scrapeOptions = z
  .object({
    // Output representations to produce; markdown only when omitted.
    formats: z
      .array(
        z.enum([
          "markdown",
          "html",
          "rawHtml",
          "links",
          "screenshot",
          "screenshot@fullPage",
        ])
      )
      .optional()
      .default(["markdown"]),
    headers: z.record(z.string(), z.string()).optional(),
    includeTags: z.array(z.string()).optional(),
    excludeTags: z.array(z.string()).optional(),
    onlyMainContent: z.boolean().default(true),
    timeout: z.number().int().positive().finite().safe().default(30000), // default?
    waitFor: z.number().int().nonnegative().finite().safe().default(0),
    parsePDF: z.boolean().default(true),
  })
  .strict(strictMessage);
|
||||
|
||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||
|
||||
export const scrapeRequestSchema = scrapeOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type ScrapeRequest = {
|
||||
// url: string;
|
||||
// formats?: Format[];
|
||||
// headers?: { [K: string]: string };
|
||||
// includeTags?: string[];
|
||||
// excludeTags?: string[];
|
||||
// onlyMainContent?: boolean;
|
||||
// timeout?: number;
|
||||
// waitFor?: number;
|
||||
// }
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
|
||||
const crawlerOptions = z.object({
|
||||
includePaths: z.string().array().default([]),
|
||||
excludePaths: z.string().array().default([]),
|
||||
maxDepth: z.number().default(10), // default?
|
||||
limit: z.number().default(10000), // default?
|
||||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
ignoreSitemap: z.boolean().default(true),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlerOptions = {
|
||||
// includePaths?: string[];
|
||||
// excludePaths?: string[];
|
||||
// maxDepth?: number;
|
||||
// limit?: number;
|
||||
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
|
||||
// allowExternalLinks?: boolean;
|
||||
// ignoreSitemap?: boolean;
|
||||
// };
|
||||
|
||||
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
||||
|
||||
export const crawlRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
||||
webhook: z.string().url().optional(),
|
||||
limit: z.number().default(10000),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
|
||||
// };
|
||||
|
||||
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
||||
|
||||
/**
 * Request body schema for the v1 map endpoint: the crawler options plus the
 * target URL and map-specific switches.
 *
 * The URL goes through the shared `url` validator (same as the scrape and
 * crawl request schemas) so that protocol-less input such as "example.com"
 * is normalised instead of rejected, and the http(s)-only and blocklist
 * checks apply here as well.
 */
export const mapRequestSchema = crawlerOptions.extend({
  url,
  origin: z.string().optional().default("api"),
  includeSubdomains: z.boolean().default(true),
  search: z.string().optional(),
  // Overrides the crawler default (true): map uses the sitemap unless told otherwise.
  ignoreSitemap: z.boolean().default(false),
}).strict(strictMessage);
|
||||
|
||||
// export type MapRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// };
|
||||
|
||||
export type MapRequest = z.infer<typeof mapRequestSchema>;
|
||||
|
||||
export type Document = {
|
||||
markdown?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
metadata: {
|
||||
title?: string;
|
||||
description?: string;
|
||||
language?: string;
|
||||
keywords?: string;
|
||||
robots?: string;
|
||||
ogTitle?: string;
|
||||
ogDescription?: string;
|
||||
ogUrl?: string;
|
||||
ogImage?: string;
|
||||
ogAudio?: string;
|
||||
ogDeterminer?: string;
|
||||
ogLocale?: string;
|
||||
ogLocaleAlternate?: string[];
|
||||
ogSiteName?: string;
|
||||
ogVideo?: string;
|
||||
dcTermsCreated?: string;
|
||||
dcDateCreated?: string;
|
||||
dcDate?: string;
|
||||
dcTermsType?: string;
|
||||
dcType?: string;
|
||||
dcTermsAudience?: string;
|
||||
dcTermsSubject?: string;
|
||||
dcSubject?: string;
|
||||
dcDescription?: string;
|
||||
dcTermsKeywords?: string;
|
||||
modifiedTime?: string;
|
||||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type ErrorResponse = {
|
||||
success: false;
|
||||
error: string;
|
||||
details?: any;
|
||||
};
|
||||
|
||||
export type ScrapeResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
warning?: string;
|
||||
data: Document;
|
||||
};
|
||||
|
||||
export interface ScrapeResponseRequestTest {
|
||||
statusCode: number;
|
||||
body: ScrapeResponse;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
export type CrawlResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
id: string;
|
||||
url: string;
|
||||
};
|
||||
|
||||
export type MapResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
links: string[];
|
||||
};
|
||||
|
||||
export type CrawlStatusParams = {
|
||||
jobId: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||
totalCount: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: string;
|
||||
next?: string;
|
||||
data: Document[];
|
||||
};
|
||||
|
||||
type AuthObject = {
|
||||
team_id: string;
|
||||
plan: string;
|
||||
};
|
||||
|
||||
type Account = {
|
||||
remainingCredits: number;
|
||||
};
|
||||
|
||||
export interface RequestWithMaybeAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth?: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface RequestWithAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined,
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
/**
 * Converts v1 crawler options into the option names used by the legacy
 * crawler code.
 *
 * @param x - Parsed v1 crawler options.
 * @returns An object keyed by the legacy option names.
 */
export function legacyCrawlerOptions(x: CrawlerOptions) {
  return {
    includes: x.includePaths,
    excludes: x.excludePaths,
    maxCrawledLinks: x.limit,
    // The legacy CrawlerOptions type calls this `maxDepth`, and that is the key
    // crawlToCrawler reads; without it the requested depth is silently ignored.
    maxDepth: x.maxDepth,
    // Kept for any consumer that already reads the previous key.
    maxCrawledDepth: x.maxDepth,
    limit: x.limit,
    generateImgAltText: false,
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
  };
}
|
||||
|
||||
/**
 * Converts v1 scrape options into the legacy `PageOptions` shape.
 *
 * Each entry of `formats` becomes the matching boolean flag; the remaining
 * options are renamed to their legacy counterparts.
 *
 * @param x - Parsed v1 scrape options (`formats` is always an array after parsing).
 * @returns The equivalent legacy page options.
 */
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
  const wants = (format: string) => (x.formats as string[]).includes(format);

  return {
    includeMarkdown: wants("markdown"),
    includeHtml: wants("html"),
    includeRawHtml: wants("rawHtml"),
    onlyIncludeTags: x.includeTags,
    removeTags: x.excludeTags,
    onlyMainContent: x.onlyMainContent,
    waitFor: x.waitFor,
    // Previously dropped: the schema accepts `headers` and PageOptions has a
    // matching field, so forward them instead of discarding user input.
    headers: x.headers,
    includeLinks: wants("links"),
    screenshot: wants("screenshot"),
    fullPageScreenshot: wants("screenshot@fullPage"),
    parsePDF: x.parsePDF,
  };
}
|
||||
|
||||
/**
 * Converts a legacy scraper document into the v1 `Document` shape.
 *
 * - `linksOnPage` becomes `links`.
 * - A truthy `metadata.screenshot` / `metadata.fullPageScreenshot` is lifted
 *   out of the metadata; the result's `screenshot` prefers the regular one.
 * - `metadata.pageError` / `metadata.pageStatusCode` are exposed as
 *   `metadata.error` / `metadata.statusCode`.
 *
 * The input document is not modified, and a document without `metadata` is
 * treated as having empty metadata rather than throwing.
 *
 * @param doc - Legacy document as produced by the scraper.
 * @returns The v1 representation of `doc`.
 */
export function legacyDocumentConverter(doc: any): Document {
  // Work on a shallow copy so the caller's object stays untouched.
  const metadata = { ...(doc.metadata ?? {}) };

  let screenshot = doc.screenshot;
  if (metadata.screenshot) {
    screenshot = metadata.screenshot;
    delete metadata.screenshot;
  }

  let fullPageScreenshot = doc.fullPageScreenshot;
  if (metadata.fullPageScreenshot) {
    fullPageScreenshot = metadata.fullPageScreenshot;
    delete metadata.fullPageScreenshot;
  }

  return {
    markdown: doc.markdown,
    links: doc.linksOnPage,
    rawHtml: doc.rawHtml,
    html: doc.html,
    screenshot: screenshot ?? fullPageScreenshot,
    metadata: {
      ...metadata,
      // Blank out the legacy keys; their values move to the v1 names below.
      pageError: undefined,
      pageStatusCode: undefined,
      error: metadata.pageError,
      statusCode: metadata.pageStatusCode,
    },
  };
}
|
@ -2,7 +2,7 @@ import express from "express";
|
||||
import bodyParser from "body-parser";
|
||||
import cors from "cors";
|
||||
import "dotenv/config";
|
||||
import { getScrapeQueue, getWebScraperQueue } from "./services/queue-service";
|
||||
import { getScrapeQueue } from "./services/queue-service";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import cluster from "cluster";
|
||||
@ -14,6 +14,8 @@ import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
import { v1Router } from "./routes/v1";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
@ -46,7 +48,8 @@ if (cluster.isMaster) {
|
||||
}
|
||||
});
|
||||
} else {
|
||||
const app = express();
|
||||
const ws = expressWs(express());
|
||||
const app = ws.app;
|
||||
|
||||
global.isProduction = process.env.IS_PRODUCTION === "true";
|
||||
|
||||
@ -59,7 +62,7 @@ if (cluster.isMaster) {
|
||||
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
||||
|
||||
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||
queues: [new BullAdapter(getWebScraperQueue()), new BullAdapter(getScrapeQueue())],
|
||||
queues: [new BullAdapter(getScrapeQueue())],
|
||||
serverAdapter: serverAdapter,
|
||||
});
|
||||
|
||||
@ -79,7 +82,7 @@ if (cluster.isMaster) {
|
||||
|
||||
// register router
|
||||
app.use(v0Router);
|
||||
app.use(v1Router);
|
||||
app.use("/v1", v1Router);
|
||||
app.use(adminRouter);
|
||||
|
||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
@ -106,9 +109,9 @@ if (cluster.isMaster) {
|
||||
|
||||
app.get(`/serverHealthCheck`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const [waitingJobs] = await Promise.all([
|
||||
webScraperQueue.getWaitingCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
]);
|
||||
|
||||
const noWaitingJobs = waitingJobs === 0;
|
||||
@ -128,9 +131,9 @@ if (cluster.isMaster) {
|
||||
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
|
||||
|
||||
const getWaitingJobsCount = async () => {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const [waitingJobsCount] = await Promise.all([
|
||||
webScraperQueue.getWaitingCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
]);
|
||||
|
||||
return waitingJobsCount;
|
||||
@ -183,11 +186,12 @@ if (cluster.isMaster) {
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
}
|
||||
|
||||
// const wsq = getWebScraperQueue();
|
||||
// const sq = getScrapeQueue();
|
||||
|
||||
// sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
// sq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
// sq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
||||
// wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
// wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
// wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
// wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
// wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
32
apps/api/src/lib/checkCredits.ts
Normal file
32
apps/api/src/lib/checkCredits.ts
Normal file
@ -0,0 +1,32 @@
|
||||
import { checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
type checkCreditsResponse = {
|
||||
status: number;
|
||||
error: string | null;
|
||||
}
|
||||
|
||||
/**
 * Checks whether a team can afford one credit.
 *
 * @param team_id - Team whose credits are checked.
 * @returns `{ status: 200, error: null }` when the check succeeds,
 *   `{ status: 402, ... }` when `checkTeamCredits` reports failure, and
 *   `{ status: 500, ... }` when the check itself throws (the error is logged).
 */
export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
  let hasCredits;

  try {
    const { success } = await checkTeamCredits(team_id, 1);
    hasCredits = success;
  } catch (error) {
    Logger.error(error);
    return {
      status: 500,
      error: "Error checking team credits. Please contact hello@firecrawl.com for help."
    };
  }

  if (!hasCredits) {
    return {
      status: 402,
      error: "Insufficient credits"
    };
  }

  return {
    status: 200,
    error: null
  };
};
|
123
apps/api/src/lib/crawl-redis.ts
Normal file
123
apps/api/src/lib/crawl-redis.ts
Normal file
@ -0,0 +1,123 @@
|
||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl: string;
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
robots?: string;
|
||||
cancelled?: boolean;
|
||||
createdAt: number;
|
||||
};
|
||||
|
||||
/**
 * Stores a crawl as JSON under `crawl:<id>` and requests a 24-hour expiry
 * on the key (EXPIRE is called with the `NX` flag).
 */
export async function saveCrawl(id: string, crawl: StoredCrawl) {
  const key = `crawl:${id}`;
  const payload = JSON.stringify(crawl);

  await redisConnection.set(key, payload);
  await redisConnection.expire(key, 24 * 60 * 60, "NX");
}
|
||||
|
||||
/**
 * Loads the crawl stored under `crawl:<id>`.
 *
 * @returns The parsed crawl, or `null` when Redis has no value for the key.
 */
export async function getCrawl(id: string): Promise<StoredCrawl | null> {
  const raw = await redisConnection.get(`crawl:${id}`);
  return raw === null ? null : JSON.parse(raw);
}
|
||||
|
||||
/**
 * Computes when the `crawl:<id>` key will expire, truncated to whole seconds.
 *
 * @param id - Crawl identifier.
 * @returns The expiry time. When Redis reports a negative PTTL (its sentinel
 *   for "no expiry" or "no such key") the current time is returned instead of
 *   a timestamp shifted by the sentinel value.
 */
export async function getCrawlExpiry(id: string): Promise<Date> {
  const ttl = await redisConnection.pttl("crawl:" + id);

  // Negative replies are status codes, not durations.
  const remainingMs = Math.max(ttl, 0);

  // Read the clock after the round trip: the TTL is relative to the moment
  // Redis answered, not to the moment the request was issued.
  const expiry = new Date(Date.now() + remainingMs);
  expiry.setMilliseconds(0);
  return expiry;
}
|
||||
|
||||
/**
 * Adds one job id to the crawl's job set (`crawl:<id>:jobs`) and requests a
 * 24-hour expiry on the set (EXPIRE with the `NX` flag).
 */
export async function addCrawlJob(id: string, job_id: string) {
  const jobsKey = `crawl:${id}:jobs`;

  await redisConnection.sadd(jobsKey, job_id);
  await redisConnection.expire(jobsKey, 24 * 60 * 60, "NX");
}
|
||||
|
||||
/**
 * Adds several job ids to the crawl's job set (`crawl:<id>:jobs`) and
 * requests a 24-hour expiry on the set (EXPIRE with the `NX` flag).
 *
 * An empty `job_ids` array is a no-op: SADD requires at least one member, so
 * sending the command without any would fail.
 */
export async function addCrawlJobs(id: string, job_ids: string[]) {
  if (job_ids.length === 0) {
    return;
  }

  const jobsKey = "crawl:" + id + ":jobs";
  await redisConnection.sadd(jobsKey, ...job_ids);
  await redisConnection.expire(jobsKey, 24 * 60 * 60, "NX");
}
|
||||
|
||||
/**
 * Records a finished job for a crawl in two structures: the set
 * `crawl:<id>:jobs_done` and the list `crawl:<id>:jobs_done_ordered`
 * (pushed at the head). Both keys get a 24-hour expiry request (`NX`).
 */
export async function addCrawlJobDone(id: string, job_id: string) {
  const doneSetKey = `crawl:${id}:jobs_done`;
  const doneListKey = `crawl:${id}:jobs_done_ordered`;
  const dayInSeconds = 24 * 60 * 60;

  await redisConnection.sadd(doneSetKey, job_id);
  await redisConnection.lpush(doneListKey, job_id);
  await redisConnection.expire(doneSetKey, dayInSeconds, "NX");
  await redisConnection.expire(doneListKey, dayInSeconds, "NX");
}
|
||||
|
||||
/** Returns the length of the crawl's ordered done-jobs list. */
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
  const length = await redisConnection.llen(`crawl:${id}:jobs_done_ordered`);
  return length;
}
|
||||
|
||||
/**
 * Returns a slice of the crawl's ordered done-jobs list.
 *
 * @param start - First list index passed to LRANGE (default 0).
 * @param end - Last list index passed to LRANGE (default -1).
 */
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
  const listKey = `crawl:${id}:jobs_done_ordered`;
  const jobIds = await redisConnection.lrange(listKey, start, end);
  return jobIds;
}
|
||||
|
||||
/**
 * Reports whether the number of done jobs equals the number of registered
 * jobs for the crawl (cardinalities of `:jobs_done` and `:jobs`).
 */
export async function isCrawlFinished(id: string) {
  const doneCount = await redisConnection.scard(`crawl:${id}:jobs_done`);
  const totalCount = await redisConnection.scard(`crawl:${id}:jobs`);
  return doneCount === totalCount;
}
|
||||
|
||||
/** Returns the result of EXISTS on the crawl's `:finish` marker key. */
export async function isCrawlFinishedLocked(id: string) {
  const finishKey = `crawl:${id}:finish`;
  const exists = await redisConnection.exists(finishKey);
  return exists;
}
|
||||
|
||||
/**
 * Tries to claim the "finish" marker for a crawl whose jobs are all done.
 *
 * @param id - Crawl identifier.
 * @returns `true` only for the caller that created `crawl:<id>:finish`;
 *   `false` when the crawl is not finished yet or the marker already exists.
 */
export async function finishCrawl(id: string) {
  if (!(await isCrawlFinished(id))) {
    // Previously fell off the end and returned undefined.
    return false;
  }

  // A single SET with NX + EX creates the marker and its 24-hour expiry in one
  // command, so the marker can never be left behind without a TTL.
  const reply = await redisConnection.set(
    "crawl:" + id + ":finish",
    "yes",
    "EX",
    24 * 60 * 60,
    "NX"
  );
  return reply === "OK";
}
|
||||
|
||||
/** Returns every job id stored in the crawl's job set (`crawl:<id>:jobs`). */
export async function getCrawlJobs(id: string): Promise<string[]> {
  const jobIds = await redisConnection.smembers(`crawl:${id}:jobs`);
  return jobIds;
}
|
||||
|
||||
/**
 * Marks a URL as visited for a crawl.
 *
 * @param id - Crawl identifier.
 * @param sc - Stored crawl; a numeric `crawlerOptions.limit` caps the visited set.
 * @param url - URL to record.
 * @returns `false` when the visited set has already reached the limit or the
 *   URL was already present; `true` when the URL was newly added.
 */
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
  const visitedKey = `crawl:${id}:visited`;

  if (typeof sc.crawlerOptions?.limit === "number") {
    const visitedCount = await redisConnection.scard(visitedKey);
    if (visitedCount >= sc.crawlerOptions.limit) {
      return false;
    }
  }

  const added = await redisConnection.sadd(visitedKey, url);
  await redisConnection.expire(visitedKey, 24 * 60 * 60, "NX");
  return added !== 0;
}
|
||||
|
||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
/**
 * Marks several URLs as visited for a crawl in one SADD call.
 *
 * @param id - Crawl identifier.
 * @param urls - URLs to record.
 * @returns `true` when at least one URL was newly added, `false` otherwise.
 *   An empty `urls` array returns `false` without contacting Redis, because
 *   SADD requires at least one member and would fail.
 */
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
  if (urls.length === 0) {
    return false;
  }

  const visitedKey = "crawl:" + id + ":visited";
  const res = (await redisConnection.sadd(visitedKey, ...urls)) !== 0;
  await redisConnection.expire(visitedKey, 24 * 60 * 60, "NX");
  return res;
}
|
||||
|
||||
/**
 * Builds a `WebCrawler` from a stored crawl.
 *
 * Missing crawler options fall back to the defaults written out below. When
 * the stored crawl carries a robots.txt body it is imported into the crawler;
 * a failure while importing is ignored on purpose so the crawler is still
 * returned.
 *
 * @param id - Crawl identifier, used as the crawler's job id.
 * @param sc - Stored crawl description.
 * @returns The configured crawler.
 */
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
  const options = sc.crawlerOptions;

  const crawler = new WebCrawler({
    jobId: id,
    initialUrl: sc.originUrl,
    includes: options?.includes ?? [],
    excludes: options?.excludes ?? [],
    maxCrawledLinks: options?.maxCrawledLinks ?? 1000,
    // Legacy options store the depth as `maxDepth`, while the v1 adapter
    // (legacyCrawlerOptions) writes `maxCrawledDepth`; honour either before
    // falling back to the default so a requested depth is not lost.
    maxCrawledDepth: options?.maxDepth ?? options?.maxCrawledDepth ?? 10,
    limit: options?.limit ?? 10000,
    generateImgAltText: options?.generateImgAltText ?? false,
    allowBackwardCrawling: options?.allowBackwardCrawling ?? false,
    allowExternalContentLinks: options?.allowExternalContentLinks ?? false,
  });

  if (sc.robots !== undefined) {
    try {
      crawler.importRobotsTxt(sc.robots);
    } catch (_) {
      // Best effort: an unusable robots.txt must not prevent crawler creation.
    }
  }

  return crawler;
}
|
@ -7,6 +7,7 @@ export const defaultPageOptions = {
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
parsePDF: true
|
||||
};
|
||||
|
||||
|
@ -11,6 +11,7 @@ export interface Progress {
|
||||
}
|
||||
|
||||
export type PageOptions = {
|
||||
includeMarkdown?: boolean;
|
||||
onlyMainContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
@ -18,11 +19,13 @@ export type PageOptions = {
|
||||
fetchPageContent?: boolean;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
headers?: Record<string, string>;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
onlyIncludeTags?: string | string[];
|
||||
includeLinks?: boolean;
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
@ -42,8 +45,8 @@ export type SearchOptions = {
|
||||
|
||||
export type CrawlerOptions = {
|
||||
returnOnlyUrls?: boolean;
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
includes?: string | string[];
|
||||
excludes?: string | string[];
|
||||
maxCrawledLinks?: number;
|
||||
maxDepth?: number;
|
||||
limit?: number;
|
||||
@ -64,6 +67,7 @@ export type WebScraperOptions = {
|
||||
extractorOptions?: ExtractorOptions;
|
||||
concurrentRequests?: number;
|
||||
bullJobId?: string;
|
||||
priority?: number;
|
||||
};
|
||||
|
||||
export interface DocumentUrl {
|
||||
|
@ -46,7 +46,7 @@ export class ScrapeEvents {
|
||||
}).select().single();
|
||||
return (result.data as any).id;
|
||||
} catch (error) {
|
||||
Logger.error(`Error inserting scrape event: ${error}`);
|
||||
// Logger.error(`Error inserting scrape event: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@ -17,3 +17,21 @@ export const supabaseGetJobById = async (jobId: string) => {
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
 * Fetches the `firecrawl_jobs` rows whose `job_id` is in `jobIds`.
 *
 * @returns The matching rows, or an empty array when the query reports an
 *   error or yields no data.
 */
export const supabaseGetJobsById = async (jobIds: string[]) => {
  const response = await supabase_service
    .from('firecrawl_jobs')
    .select('*')
    .in('job_id', jobIds);

  if (response.error || !response.data) {
    return [];
  }

  return response.data;
}
|
||||
|
||||
|
88
apps/api/src/lib/validateUrl.test.ts
Normal file
88
apps/api/src/lib/validateUrl.test.ts
Normal file
@ -0,0 +1,88 @@
|
||||
import { isSameDomain } from "./validateUrl";
|
||||
import { isSameSubdomain } from "./validateUrl";
|
||||
|
||||
// isSameDomain compares only the last two hostname labels (after dropping a
// leading "www."), so any depth of subdomain counts as the same domain.
describe("isSameDomain", () => {
  it("should return true for a subdomain", () => {
    const result = isSameDomain("http://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same domain", () => {
    const result = isSameDomain("http://example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for different domains", () => {
    const result = isSameDomain("http://example.com", "http://another.com");
    expect(result).toBe(false);
  });

  it("should return true for a subdomain with different protocols", () => {
    const result = isSameDomain("https://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for invalid URLs", () => {
    const result = isSameDomain("invalid-url", "http://example.com");
    expect(result).toBe(false);
    const result2 = isSameDomain("http://example.com", "invalid-url");
    expect(result2).toBe(false);
  });

  it("should return true for a subdomain with www prefix", () => {
    const result = isSameDomain("http://www.sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  // Title corrected: the input has no "www" prefix, it is a multi-level subdomain.
  it("should return true for a deeply nested subdomain", () => {
    const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
    expect(result).toBe(true);
  });
});
|
||||
|
||||
|
||||
|
||||
|
||||
// isSameSubdomain requires both the last two labels and everything before
// them to match, after a leading "www." has been dropped.
describe("isSameSubdomain", () => {
  it("should return false for a subdomain", () => {
    expect(isSameSubdomain("http://example.com", "http://docs.example.com")).toBe(false);
  });

  it("should return true for the same subdomain", () => {
    expect(isSameSubdomain("http://docs.example.com", "http://docs.example.com")).toBe(true);
  });

  it("should return false for different subdomains", () => {
    expect(isSameSubdomain("http://docs.example.com", "http://blog.example.com")).toBe(false);
  });

  it("should return false for different domains", () => {
    expect(isSameSubdomain("http://example.com", "http://another.com")).toBe(false);
  });

  it("should return false for invalid URLs", () => {
    const invalidFirst = isSameSubdomain("invalid-url", "http://example.com");
    expect(invalidFirst).toBe(false);
    const invalidSecond = isSameSubdomain("http://example.com", "invalid-url");
    expect(invalidSecond).toBe(false);
  });

  it("should return true for the same subdomain with different protocols", () => {
    expect(isSameSubdomain("https://docs.example.com", "http://docs.example.com")).toBe(true);
  });

  it("should return true for the same subdomain with www prefix", () => {
    expect(isSameSubdomain("http://www.docs.example.com", "http://docs.example.com")).toBe(true);
  });

  it("should return false for a subdomain with www prefix and different subdomain", () => {
    expect(isSameSubdomain("http://www.docs.example.com", "http://blog.example.com")).toBe(false);
  });
});
|
@ -1,9 +1,8 @@
|
||||
|
||||
const protocolIncluded = (url: string) => {
|
||||
// if :// not in the start of the url assume http (maybe https?)
|
||||
// regex checks if :// appears before any .
|
||||
return(/^([^.:]+:\/\/)/.test(url));
|
||||
}
|
||||
return /^([^.:]+:\/\/)/.test(url);
|
||||
};
|
||||
|
||||
const getURLobj = (s: string) => {
|
||||
// URL fails if we dont include the protocol ie google.com
|
||||
@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
|
||||
};
|
||||
|
||||
export const checkAndUpdateURL = (url: string) => {
|
||||
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
@ -30,9 +28,95 @@ export const checkAndUpdateURL = (url: string) => {
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
/**
 * Same-domain check.
 *
 * Two URLs are considered to share a domain when, after removing a leading
 * "www." from each hostname, their last two dot-separated labels are equal.
 * Subdomains (and www.subdomains) of the base domain therefore match. Note
 * that only the last two labels are compared, so hosts under a two-label
 * suffix such as "a.co.uk" and "b.co.uk" also compare equal.
 *
 * @param url - URL to test.
 * @param baseUrl - URL to compare against.
 * @returns `true` when both URLs parse and share the same last two hostname
 *   labels; `false` otherwise, including when either URL fails to parse.
 */
export function isSameDomain(url: string, baseUrl: string) {
  const parsedUrl = getURLobj(url);
  const parsedBase = getURLobj(baseUrl);

  if (parsedUrl.error || parsedBase.error) {
    return false;
  }

  // Last two labels of the hostname, ignoring a leading "www.".
  const lastTwoLabels = (target: URL) => {
    const host = target.hostname;
    const bareHost = host.startsWith('www.') ? host.slice(4) : host;
    return bareHost.split('.').slice(-2).join('.');
  };

  return lastTwoLabels(parsedUrl.urlObj as URL) === lastTwoLabels(parsedBase.urlObj as URL);
}
|
||||
|
||||
|
||||
/**
 * Same-subdomain check.
 *
 * After removing a leading "www." from each hostname, the hostname is split
 * into its last two labels (the "domain") and everything before them (the
 * "subdomain"); both parts must be equal.
 *
 * @param url - URL to test.
 * @param baseUrl - URL to compare against.
 * @returns `true` when both URLs parse and have equal domain and subdomain
 *   parts; `false` otherwise, including when either URL fails to parse.
 */
export function isSameSubdomain(url: string, baseUrl: string) {
  const parsedUrl = getURLobj(url);
  const parsedBase = getURLobj(baseUrl);

  if (parsedUrl.error || parsedBase.error) {
    return false;
  }

  // Splits a hostname (minus a leading "www.") into domain and subdomain parts.
  const splitHost = (target: URL) => {
    const host = target.hostname;
    const labels = (host.startsWith('www.') ? host.slice(4) : host).split('.');
    return {
      domain: labels.slice(-2).join('.'),
      subdomain: labels.slice(0, -2).join('.'),
    };
  };

  const first = splitHost(parsedUrl.urlObj as URL);
  const second = splitHost(parsedBase.urlObj as URL);

  if (first.domain !== second.domain) {
    return false;
  }
  return first.subdomain === second.subdomain;
}
|
||||
|
||||
|
||||
/**
 * Normalises a URL for the map endpoint.
 *
 * Steps: prepend "http://" when no protocol is present, drop one trailing
 * slash, validate that the result parses and uses http(s), then cut the
 * string at the first "?".
 *
 * @param url - URL as supplied by the caller.
 * @returns `urlObj` (parsed before the query string is removed) and the
 *   normalised `url` string.
 * @throws Error("Invalid URL") when the URL cannot be parsed or is not http(s).
 */
export const checkAndUpdateURLForMap = (url: string) => {
  if (!protocolIncluded(url)) {
    url = `http://${url}`;
  }

  // remove last slash if present
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // remove any query params
  url = url.split("?")[0].trim();

  // Cutting the query can expose a slash the first pass could not see
  // ("host/?q=1" -> "host/"); strip it so the result is normalised consistently.
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  return { urlObj: typedUrlObj, url: url };
};
|
||||
|
||||
|
||||
|
||||
|
@ -12,7 +12,7 @@ import { Document } from "../lib/entities";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { getWebScraperQueue } from "../services/queue-service";
|
||||
import { getScrapeQueue } from "../services/queue-service";
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
@ -27,7 +27,12 @@ export async function startWebScraperPipeline({
|
||||
mode: job.data.mode,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
extractorOptions: job.data.extractorOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
pageOptions: {
|
||||
...job.data.pageOptions,
|
||||
...(job.data.crawl_id ? ({
|
||||
includeRawHtml: true,
|
||||
}): {}),
|
||||
},
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
@ -35,7 +40,7 @@ export async function startWebScraperPipeline({
|
||||
if (partialDocs.length > 50) {
|
||||
partialDocs = partialDocs.slice(-50);
|
||||
}
|
||||
job.updateProgress({ ...progress, partialDocs: partialDocs });
|
||||
// job.updateProgress({ ...progress, partialDocs: partialDocs });
|
||||
}
|
||||
},
|
||||
onSuccess: (result, mode) => {
|
||||
@ -49,6 +54,7 @@ export async function startWebScraperPipeline({
|
||||
},
|
||||
team_id: job.data.team_id,
|
||||
bull_job_id: job.id.toString(),
|
||||
priority: job.opts.priority,
|
||||
})) as { success: boolean; message: string; docs: Document[] };
|
||||
}
|
||||
export async function runWebScraper({
|
||||
@ -62,6 +68,7 @@ export async function runWebScraper({
|
||||
onError,
|
||||
team_id,
|
||||
bull_job_id,
|
||||
priority,
|
||||
}: RunWebScraperParams): Promise<RunWebScraperResult> {
|
||||
try {
|
||||
const provider = new WebScraperDataProvider();
|
||||
@ -74,6 +81,7 @@ export async function runWebScraper({
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
bullJobId: bull_job_id,
|
||||
priority,
|
||||
});
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
@ -83,6 +91,7 @@ export async function runWebScraper({
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
priority,
|
||||
});
|
||||
}
|
||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||
@ -104,21 +113,17 @@ export async function runWebScraper({
|
||||
return { url: doc.metadata.sourceURL };
|
||||
}
|
||||
})
|
||||
: docs.filter((doc) => doc.content.trim().length > 0);
|
||||
: docs;
|
||||
|
||||
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + bull_job_id);
|
||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||
|
||||
if (!isCancelled) {
|
||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||
|
||||
if (!billingResult.success) {
|
||||
// throw new Error("Failed to bill team, no subscription was found");
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to bill team, no subscription was found",
|
||||
docs: [],
|
||||
};
|
||||
}
|
||||
if (!billingResult.success) {
|
||||
// throw new Error("Failed to bill team, no subscription was found");
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to bill team, no subscription was found",
|
||||
docs: [],
|
||||
};
|
||||
}
|
||||
|
||||
// This is where the returnvalue from the job is set
|
||||
@ -141,21 +146,21 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||
.eq("job_id", job.id);
|
||||
|
||||
if (error) throw new Error(error.message);
|
||||
try {
|
||||
if (mode === "crawl") {
|
||||
await job.moveToCompleted(null, token, false);
|
||||
} else {
|
||||
await job.moveToCompleted(result, token, false);
|
||||
}
|
||||
} catch (error) {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
await job.moveToCompleted(result, token, false);
|
||||
} catch (error) {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
// try {
|
||||
// if (mode === "crawl") {
|
||||
// await job.moveToCompleted(null, token, false);
|
||||
// } else {
|
||||
// await job.moveToCompleted(result, token, false);
|
||||
// }
|
||||
// } catch (error) {
|
||||
// // I think the job won't exist here anymore
|
||||
// }
|
||||
// } else {
|
||||
// try {
|
||||
// await job.moveToCompleted(result, token, false);
|
||||
// } catch (error) {
|
||||
// // I think the job won't exist here anymore
|
||||
// }
|
||||
}
|
||||
ScrapeEvents.logJobEvent(job, "completed");
|
||||
} catch (error) {
|
||||
|
@ -1,10 +1,10 @@
|
||||
import express from "express";
|
||||
import { redisHealthController } from "../controllers/admin/redis-health";
|
||||
import { redisHealthController } from "../controllers/v0/admin/redis-health";
|
||||
import {
|
||||
checkQueuesController,
|
||||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/admin/queue";
|
||||
} from "../controllers/v0/admin/queue";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
|
@ -1,14 +1,14 @@
|
||||
import express from "express";
|
||||
import { crawlController } from "../../src/controllers/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
||||
import { searchController } from "../../src/controllers/search";
|
||||
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/keyAuth";
|
||||
import { livenessController } from "../controllers/liveness";
|
||||
import { readinessController } from "../controllers/readiness";
|
||||
import { crawlController } from "../../src/controllers/v0/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v0/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
|
||||
import { searchController } from "../../src/controllers/v0/search";
|
||||
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
|
||||
import { livenessController } from "../controllers/v0/liveness";
|
||||
import { readinessController } from "../controllers/v0/readiness";
|
||||
|
||||
export const v0Router = express.Router();
|
||||
|
||||
|
@ -1,9 +1,21 @@
|
||||
import express from "express";
|
||||
import express, { NextFunction, Request, Response } from "express";
|
||||
import { crawlController } from "../../src/controllers/v1/crawl";
|
||||
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v1/scrape";
|
||||
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
|
||||
import { mapController } from "../../src/controllers/v1/map";
|
||||
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { authenticateUser } from "../controllers/v1/auth";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { createIdempotencyKey } from "../services/idempotency/create";
|
||||
import { validateIdempotencyKey } from "../services/idempotency/validate";
|
||||
import { ZodError } from "zod";
|
||||
import { checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||
// import { searchController } from "../../src/controllers/v1/search";
|
||||
@ -12,23 +24,142 @@ import { mapController } from "../../src/controllers/v1/map";
|
||||
// import { livenessController } from "../controllers/v1/liveness";
|
||||
// import { readinessController } from "../controllers/v1/readiness";
|
||||
|
||||
/**
 * Builds an Express middleware that rejects the request with HTTP 402 when
 * the authenticated team does not have enough credits.
 *
 * @param minimum - Credits required. When omitted (or 0) and the request has
 *                  a body, the requirement is taken from `req.body.limit`,
 *                  falling back to 1.
 * @returns Middleware that stores `{ remainingCredits }` on `req.account` and
 *          calls `next()` on success; any thrown error is forwarded to
 *          `next(err)`.
 */
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      // Resolve the requirement per request. Writing it back to `minimum`
      // would leak the first request's `limit` into every later request,
      // because the closure is shared by all requests on the route.
      let required = minimum;
      if (!required && req.body) {
        required = (req.body as any)?.limit ?? 1;
      }

      const { success, remainingCredits } = await checkTeamCredits(req.auth.team_id, required);
      if (!success) {
        return res.status(402).json({ success: false, error: "Insufficient credits" });
      }

      req.account = { remainingCredits };
      next();
    })()
      .catch(err => next(err));
  };
}
|
||||
|
||||
/**
 * Builds an Express middleware that authenticates the caller for the given
 * rate-limiter mode.
 *
 * On failure the response is sent with the status and error returned by
 * authenticateUser. On success `req.auth` is set to `{ team_id, plan }` and
 * the chain continues. Rejections are forwarded to `next(err)`.
 *
 * @param rateLimiterMode - Mode passed through to authenticateUser.
 */
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
  const authenticate = async (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => {
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      rateLimiterMode,
    );

    if (!success) {
      return res.status(status).json({ success: false, error });
    }

    req.auth = { team_id, plan };
    next();
  };

  return (req, res, next) => {
    authenticate(req, res, next).catch(err => next(err));
  };
}
|
||||
|
||||
/**
 * Express middleware enforcing single use of the "x-idempotency-key" header.
 *
 * Requests without the header pass straight through. With the header, the
 * key is validated; an invalid key yields HTTP 409, otherwise the key is
 * recorded and the chain continues. Rejections are forwarded to `next(err)`.
 */
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
  const enforce = async () => {
    if (!req.headers["x-idempotency-key"]) {
      next();
      return;
    }

    const isIdempotencyValid = await validateIdempotencyKey(req);
    if (!isIdempotencyValid) {
      return res.status(409).json({ success: false, error: "Idempotency key already used" });
    }

    // NOTE(review): the result is not awaited here; if createIdempotencyKey
    // is asynchronous, a failure would not reach next(err) — confirm intent.
    createIdempotencyKey(req);
    next();
  };

  enforce().catch(err => next(err));
}
|
||||
|
||||
/**
 * Express middleware that answers HTTP 403 when the request body carries a
 * `url` that isUrlBlocked rejects; every other request continues down the
 * chain.
 */
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
  const blocked = req.body.url && isUrlBlocked(req.body.url);

  if (!blocked) {
    next();
    return;
  }

  return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
|
||||
|
||||
/**
 * Adapts a promise-returning controller to an Express handler, so that a
 * rejected promise is handed to `next` instead of going unhandled.
 *
 * @param controller - Async controller taking `(req, res)`.
 * @returns Handler that invokes the controller and routes rejections to `next`.
 */
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
  return function wrapped(req, res, next) {
    const pending = controller(req, res);
    pending.catch(err => next(err));
  };
}
|
||||
|
||||
expressWs(express());
|
||||
|
||||
export const v1Router = express.Router();
|
||||
|
||||
v1Router.post("/v1/scrape", scrapeController);
|
||||
v1Router.post("/v1/crawl", crawlController);
|
||||
v1Router.get("/v1/crawl/:jobId", crawlStatusController);
|
||||
// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
|
||||
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
|
||||
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||
// Middleware order matters on every route below: the blocklist check runs
// before authentication, and the credit check runs after it (it reads req.auth).

// POST /scrape — requires at least 1 credit.
v1Router.post("/scrape", blocklistMiddleware, authMiddleware(RateLimiterMode.Scrape), checkCreditsMiddleware(1), wrap(scrapeController));

// POST /crawl — idempotency-key enforcement, then a credit check whose
// requirement comes from the request body (no fixed minimum is passed).
v1Router.post("/crawl", blocklistMiddleware, authMiddleware(RateLimiterMode.Crawl), idempotencyMiddleware, checkCreditsMiddleware(), wrap(crawlController));

// POST /map — requires at least 1 credit.
v1Router.post("/map", blocklistMiddleware, authMiddleware(RateLimiterMode.Map), checkCreditsMiddleware(1), wrap(mapController));

// GET /crawl/:jobId — crawl status over HTTP.
v1Router.get("/crawl/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), wrap(crawlStatusController));

// WS /crawl/:jobId — crawl status over WebSocket; no HTTP middleware is attached here.
v1Router.ws("/crawl/:jobId", crawlStatusWSController);
|
||||
|
||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||
// v1Router.delete("/crawl/:jobId", crawlCancelController);
|
||||
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||
|
||||
// // Auth route for key based authentication
|
||||
// v1Router.get("/v1/keyAuth", keyAuthController);
|
||||
// v1Router.get("/keyAuth", keyAuthController);
|
||||
|
||||
// // Search routes
|
||||
// v0Router.post("/v1/search", searchController);
|
||||
// v0Router.post("/search", searchController);
|
||||
|
||||
// Health/Probe routes
|
||||
// v1Router.get("/v1/health/liveness", livenessController);
|
||||
// v1Router.get("/v1/health/readiness", readinessController);
|
||||
// v1Router.get("/health/liveness", livenessController);
|
||||
// v1Router.get("/health/readiness", readinessController);
|
||||
|
||||
v1Router.post("/v1/map", mapController);
|
||||
// Router-level error handler.
// ZodError  -> 400 with the validation details.
// Otherwise -> 500 with a generated exception ID; the same ID is logged with
//              a serialised form of the error so the two can be correlated.
v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
  if (err instanceof ZodError) {
    res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
    return;
  }

  const id = uuidv4();

  // Error instances usually stringify to "{}" because their fields are not
  // enumerable, so fall back to an explicit projection in that case.
  const serialized = JSON.stringify(err);
  const verbose = serialized === "{}" && err instanceof Error
    ? JSON.stringify({
        message: err.message,
        name: err.name,
        stack: err.stack,
      })
    : serialized;

  Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
  res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
});
|
||||
|
175
apps/api/src/run-req.ts
Normal file
175
apps/api/src/run-req.ts
Normal file
@ -0,0 +1,175 @@
|
||||
import axios from "axios";
|
||||
import { promises as fs } from "fs";
|
||||
import { v4 as uuidV4 } from "uuid";
|
||||
|
||||
/** One crawl target and the state this script accumulates for it. */
interface Result {
  /** URL submitted as the crawl's starting point. */
  start_url: string;
  /** Job id returned by the crawl request, once one has been issued. */
  job_id?: string;
  /** UUID generated in sendCrawl and stored after a successful request. */
  idempotency_key?: string;
  /** Crawl output stored by getContent; null when polling gave up. */
  result_data_jsonb?: any;
}
|
||||
|
||||
/**
 * Submits a crawl job for `result.start_url` to the staging v0 crawl endpoint.
 *
 * On success the generated idempotency key is stored on `result` and the job
 * id from the response is returned. Any failure is logged and reported as
 * `undefined`.
 *
 * NOTE(review): the idempotency key is generated and recorded on `result`
 * but never sent with the request, and the bearer token is empty — confirm
 * whether both are intentional.
 */
async function sendCrawl(result: Result): Promise<string | undefined> {
  const idempotencyKey = uuidV4();

  const payload = {
    url: result.start_url,
    crawlerOptions: {
      limit: 75,
    },
    pageOptions: {
      includeHtml: true,
      replaceAllPathsWithAbsolutePaths: true,
      waitFor: 1000,
    },
  };
  const requestConfig = {
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer `,
    },
  };

  try {
    const { data } = await axios.post(
      "https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
      payload,
      requestConfig
    );
    result.idempotency_key = idempotencyKey;
    return data.jobId;
  } catch (error) {
    console.error("Error sending crawl:", error);
    return undefined;
  }
}
|
||||
|
||||
/**
 * Polls the staging v0 crawl-status endpoint for `result.job_id` until the
 * job reports "completed".
 *
 * Makes at most 120 attempts, sleeping a random 5–20 seconds after each
 * unsuccessful one. Request errors are logged and count as an attempt.
 *
 * @returns true once the job completed (its data is stored in
 *          `result.result_data_jsonb`); false after the attempts run out, in
 *          which case `result.result_data_jsonb` is set to null.
 */
async function getContent(result: Result): Promise<boolean> {
  const maxAttempts = 120;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const { data } = await axios.get(
        `https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`,
        {
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer `,
          },
        }
      );
      if (data.status === "completed") {
        result.result_data_jsonb = data.data;
        return true;
      }
    } catch (error) {
      console.error("Error getting content:", error);
    }

    // Random pause between 5000 ms and just under 20000 ms before retrying.
    const pauseMs = Math.floor(Math.random() * 15000) + 5000;
    await new Promise((resolve) => setTimeout(resolve, pauseMs));
  }

  // Gave up: mark the result as having no data.
  result.result_data_jsonb = null;
  return false;
}
|
||||
|
||||
/**
 * Requests a crawl for every entry in `results`, in batches of 100 started
 * one minute apart, and keeps a JSON snapshot of `{ start_url, job_id }`
 * pairs on disk.
 *
 * Batches are started without waiting for the previous one to finish (only
 * the one-minute pacing separates them), but the returned promise resolves
 * only after every started batch has settled and every snapshot write has
 * completed.
 *
 * @param results - Entries to process; `job_id` is filled in on each entry
 *                  whose crawl request succeeds.
 */
async function processResults(results: Result[]): Promise<void> {
  const batchSize = 100;
  const batchIntervalMs = 60 * 1000;
  const outputFile = "results_with_job_id_4000_6000.json";

  let starterCount = 0;

  // Snapshot writes are chained so only one writeFile runs at a time:
  // concurrent writeFile calls on the same path are not safe.
  let pendingWrite: Promise<void> = Promise.resolve();
  const saveSnapshot = (): Promise<void> => {
    pendingWrite = pendingWrite.then(async () => {
      try {
        const snapshot = results.map(({ start_url, job_id }) => ({ start_url, job_id }));
        await fs.writeFile(outputFile, JSON.stringify(snapshot, null, 4));
      } catch (error) {
        console.error(`Error writing to ${outputFile}:`, error);
      }
    });
    return pendingWrite;
  };

  // Requests one crawl and, if a job id came back, records it and persists.
  const processSingleResult = async (result: Result) => {
    const jobId = await sendCrawl(result);
    if (!jobId) {
      return;
    }
    console.log(`Job requested count: ${starterCount}`);
    starterCount++;
    result.job_id = jobId;
    await saveSnapshot();
  };

  const inFlight: Promise<void>[] = [];
  for (let i = 0; i < results.length; i += batchSize) {
    const batch = results.slice(i, i + batchSize);
    inFlight.push(
      Promise.all(batch.map((result) => processSingleResult(result)))
        .then(() => {
          // Clamp so the final, possibly short, batch is not over-reported.
          console.log(`Processed ${Math.min(i + batchSize, results.length)} results.`);
        })
        .catch((error) => {
          console.error(`Error processing batch starting at index ${i}:`, error);
        })
    );

    // Pace only between batches; there is nothing to wait for after the last.
    if (i + batchSize < results.length) {
      await new Promise((resolve) => setTimeout(resolve, batchIntervalMs));
    }
  }

  // Do not report completion while batches are still running.
  await Promise.all(inFlight);
}
|
||||
|
||||
// Example call
|
||||
|
||||
/**
 * Loads the crawl targets from "starturls.json" in the working directory.
 *
 * @returns The parsed JSON content, or an empty array when the file cannot
 *          be read or parsed (the error is logged).
 */
async function getStartUrls(): Promise<Result[]> {
  try {
    const contents = await fs.readFile("starturls.json", "utf-8");
    const parsed: Result[] = JSON.parse(contents);
    return parsed;
  } catch (readError) {
    console.error("Error reading starturls.json:", readError);
    return [];
  }
}
|
||||
|
||||
/**
 * Script entry point: loads the start URLs, takes the slice from index 3999
 * up to (not including) 6000, and runs it through processResults, logging
 * the outcome.
 */
async function main() {
  const allStartUrls = await getStartUrls();
  const results: Result[] = allStartUrls.slice(3999, 6000);

  try {
    await processResults(results);
    console.log("All results processed.");
  } catch (error) {
    console.error("Error processing results:", error);
  }
}
|
||||
|
||||
main();
|
@ -23,8 +23,8 @@ describe('scrapSingleUrl', () => {
|
||||
}, 10000);
|
||||
});
|
||||
|
||||
it('should return a list of links on the mendable.ai page', async () => {
|
||||
const url = 'https://mendable.ai';
|
||||
it('should return a list of links on the firecrawl.ai page', async () => {
|
||||
const url = 'https://flutterbricks.com';
|
||||
const pageOptions: PageOptions = { includeHtml: true };
|
||||
|
||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||
@ -33,5 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => {
|
||||
expect(result.linksOnPage).toBeDefined();
|
||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||
expect(result.linksOnPage).toContain('https://mendable.ai/blog')
|
||||
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
|
||||
}, 10000);
|
||||
|
@ -1,4 +1,4 @@
|
||||
import axios from "axios";
|
||||
import axios, { AxiosError } from "axios";
|
||||
import cheerio, { load } from "cheerio";
|
||||
import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
@ -22,7 +22,7 @@ export class WebCrawler {
|
||||
private crawledUrls: Map<string, string> = new Map();
|
||||
private limit: number;
|
||||
private robotsTxtUrl: string;
|
||||
private robots: any;
|
||||
public robots: any;
|
||||
private generateImgAltText: boolean;
|
||||
private allowBackwardCrawling: boolean;
|
||||
private allowExternalContentLinks: boolean;
|
||||
@ -66,7 +66,7 @@ export class WebCrawler {
|
||||
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
||||
}
|
||||
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
const url = new URL(link.trim(), this.baseUrl);
|
||||
@ -130,6 +130,25 @@ export class WebCrawler {
|
||||
.slice(0, limit);
|
||||
}
|
||||
|
||||
/**
 * Downloads the robots.txt at `this.robotsTxtUrl`.
 *
 * @returns The response body.
 * @throws Whatever axios throws on a failed or timed-out request; callers
 *         are expected to handle it.
 */
public async getRobotsTxt(): Promise<string> {
  const { data } = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
  return data;
}
|
||||
|
||||
/**
 * Parses robots.txt content and installs the result as `this.robots`.
 *
 * @param txt - Raw robots.txt text, attributed to `this.robotsTxtUrl`.
 */
public importRobotsTxt(txt: string) {
  const parsedRules = robotsParser(this.robotsTxtUrl, txt);
  this.robots = parsedRules;
}
|
||||
|
||||
/**
 * Attempts to seed the crawl from the site's sitemap.
 *
 * @returns The sitemap links that survive filterLinks (using this crawler's
 *          limit and maximum depth), each paired with an empty `html`
 *          string; null when no sitemap links were found.
 */
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
  Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
  const discovered = await this.tryFetchSitemapLinks(this.initialUrl);

  return discovered.length > 0
    ? this.filterLinks(discovered, this.limit, this.maxCrawledDepth).map((url) => ({ url, html: "" }))
    : null;
}
|
||||
|
||||
public async start(
|
||||
inProgress?: (progress: Progress) => void,
|
||||
pageOptions?: PageOptions,
|
||||
@ -142,19 +161,17 @@ export class WebCrawler {
|
||||
Logger.debug(`Crawler starting with ${this.initialUrl}`);
|
||||
// Fetch and parse robots.txt
|
||||
try {
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||
this.robots = robotsParser(this.robotsTxtUrl, response.data);
|
||||
const txt = await this.getRobotsTxt();
|
||||
this.importRobotsTxt(txt);
|
||||
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
}
|
||||
|
||||
if (!crawlerOptions?.ignoreSitemap){
|
||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||
const sm = await this.tryGetSitemap();
|
||||
if (sm !== null) {
|
||||
return sm;
|
||||
}
|
||||
}
|
||||
|
||||
@ -164,7 +181,7 @@ export class WebCrawler {
|
||||
concurrencyLimit,
|
||||
inProgress
|
||||
);
|
||||
|
||||
|
||||
if (
|
||||
urls.length === 0 &&
|
||||
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
||||
@ -241,6 +258,54 @@ export class WebCrawler {
|
||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
|
||||
|
||||
/**
 * Decides whether a link found on a page should be followed.
 *
 * @param href - The link's href attribute; values not starting with "http"
 *               are resolved against `this.baseUrl`.
 * @param url - URL of the page the link was found on.
 * @returns The absolute URL to follow, or null when the link is rejected.
 *          A link that cannot be parsed as a URL is rejected rather than
 *          throwing, so one bad href does not abort extraction for the page.
 */
public filterURL(href: string, url: string): string | null {
  let fullUrl = href;
  let path: string;
  try {
    if (!href.startsWith("http")) {
      fullUrl = new URL(href, this.baseUrl).toString();
    }
    path = new URL(fullUrl).pathname;
  } catch (error) {
    // The URL constructor throws on malformed input; skip such links.
    Logger.debug(`Skipping malformed link: ${href}`);
    return null;
  }

  if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
    const allowed =
      this.noSections(fullUrl) &&
      !this.matchesExcludes(path) &&
      this.isRobotsAllowed(fullUrl);
    return allowed ? fullUrl : null;
  }

  // EXTERNAL LINKS: only followed from an internal page, and only when
  // external content links are enabled.
  const externalAllowed =
    this.isInternalLink(url) &&
    this.allowExternalContentLinks &&
    !this.isSocialMediaOrEmail(fullUrl) &&
    !this.matchesExcludes(fullUrl, true) &&
    !this.isExternalMainPage(fullUrl);
  return externalAllowed ? fullUrl : null;
}
|
||||
|
||||
/**
 * Collects the followable links from a page's HTML.
 *
 * @param html - Page markup to scan for anchor elements.
 * @param url - URL of the page, passed to filterURL for each link.
 * @returns Every anchor href accepted by filterURL, in document order
 *          (duplicates are kept).
 */
public extractLinksFromHTML(html: string, url: string) {
  const $ = load(html);
  const accepted: string[] = [];

  for (const anchor of $("a").toArray()) {
    const href = $(anchor).attr("href");
    if (!href) {
      continue;
    }
    const followable = this.filterURL(href, url);
    if (followable !== null) {
      accepted.push(followable);
    }
  }

  return accepted;
}
|
||||
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
return [];
|
||||
@ -284,37 +349,7 @@ export class WebCrawler {
|
||||
links.push({ url, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
if (href) {
|
||||
let fullUrl = href;
|
||||
if (!href.startsWith("http")) {
|
||||
fullUrl = new URL(href, this.baseUrl).toString();
|
||||
}
|
||||
const urlObj = new URL(fullUrl);
|
||||
const path = urlObj.pathname;
|
||||
|
||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||
if (this.isInternalLink(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
!this.matchesExcludes(path) &&
|
||||
this.isRobotsAllowed(fullUrl)
|
||||
) {
|
||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
} else { // EXTERNAL LINKS
|
||||
if (
|
||||
this.isInternalLink(url) &&
|
||||
this.allowExternalContentLinks &&
|
||||
!this.isSocialMediaOrEmail(fullUrl) &&
|
||||
!this.matchesExcludes(fullUrl, true) &&
|
||||
!this.isExternalMainPage(fullUrl)
|
||||
) {
|
||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
|
||||
|
||||
if (this.visited.size === 1) {
|
||||
return links;
|
||||
@ -420,9 +455,10 @@ export class WebCrawler {
|
||||
".woff",
|
||||
".ttf",
|
||||
".woff2",
|
||||
".webp"
|
||||
".webp",
|
||||
".inc"
|
||||
];
|
||||
return fileExtensions.some((ext) => url.endsWith(ext));
|
||||
return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext));
|
||||
}
|
||||
|
||||
private isSocialMediaOrEmail(url: string): boolean {
|
||||
@ -464,9 +500,13 @@ export class WebCrawler {
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
||||
if (response) {
|
||||
sitemapLinks = response;
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
||||
if (response) {
|
||||
sitemapLinks = response;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -475,11 +515,15 @@ export class WebCrawler {
|
||||
try {
|
||||
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
|
||||
if (response.status === 200) {
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -16,18 +16,19 @@ import {
|
||||
replacePathsWithAbsolutePaths,
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private jobId: string;
|
||||
private bullJobId: string;
|
||||
private urls: string[] = [""];
|
||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||
private includes: string[];
|
||||
private excludes: string[];
|
||||
private includes: string | string[];
|
||||
private excludes: string | string[];
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number = 10;
|
||||
private returnOnlyUrls: boolean;
|
||||
@ -43,6 +44,7 @@ export class WebScraperDataProvider {
|
||||
private crawlerMode: string = "default";
|
||||
private allowBackwardCrawling: boolean = false;
|
||||
private allowExternalContentLinks: boolean = false;
|
||||
private priority?: number;
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
@ -71,7 +73,8 @@ export class WebScraperDataProvider {
|
||||
url,
|
||||
this.pageOptions,
|
||||
this.extractorOptions,
|
||||
existingHTML
|
||||
existingHTML,
|
||||
this.priority,
|
||||
);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
@ -87,21 +90,6 @@ export class WebScraperDataProvider {
|
||||
results[i + index] = result;
|
||||
})
|
||||
);
|
||||
try {
|
||||
if (this.mode === "crawl" && this.bullJobId) {
|
||||
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
||||
const jobStatus = await job.getState();
|
||||
if (jobStatus === "failed") {
|
||||
Logger.info(
|
||||
"Job has failed or has been cancelled by the user. Stopping the job..."
|
||||
);
|
||||
return [] as Document[];
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error.message);
|
||||
return [] as Document[];
|
||||
}
|
||||
}
|
||||
return results.filter((result) => result !== null) as Document[];
|
||||
}
|
||||
@ -167,11 +155,29 @@ export class WebScraperDataProvider {
|
||||
private async handleCrawlMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let includes: string[];
|
||||
if (Array.isArray(this.includes)) {
|
||||
if (this.includes[0] != "") {
|
||||
includes = this.includes;
|
||||
}
|
||||
} else {
|
||||
includes = this.includes.split(',');
|
||||
}
|
||||
|
||||
let excludes: string[];
|
||||
if (Array.isArray(this.excludes)) {
|
||||
if (this.excludes[0] != "") {
|
||||
excludes = this.excludes;
|
||||
}
|
||||
} else {
|
||||
excludes = this.excludes.split(',');
|
||||
}
|
||||
|
||||
const crawler = new WebCrawler({
|
||||
jobId: this.jobId,
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
excludes: this.excludes,
|
||||
includes,
|
||||
excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
||||
limit: this.limit,
|
||||
@ -287,7 +293,10 @@ export class WebScraperDataProvider {
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
}
|
||||
|
||||
documents = this.applyPathReplacements(documents);
|
||||
if (this.pageOptions.includeMarkdown) {
|
||||
documents = this.applyPathReplacements(documents);
|
||||
}
|
||||
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction" ||
|
||||
@ -316,12 +325,31 @@ export class WebScraperDataProvider {
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
pdfLinks.map(async (pdfLink) => {
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
||||
type: "scrape",
|
||||
url: pdfLink,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method: "pdf-scrape",
|
||||
result: null,
|
||||
});
|
||||
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||
pdfLink,
|
||||
this.pageOptions.parsePDF
|
||||
);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: content.length,
|
||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
||||
error: pageError,
|
||||
response_code: pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
return {
|
||||
content: content,
|
||||
markdown: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
@ -330,12 +358,32 @@ export class WebScraperDataProvider {
|
||||
}
|
||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
docxLinks.map(async (p) => {
|
||||
const { content, pageStatusCode, pageError } =
|
||||
await fetchAndProcessDocx(p);
|
||||
docxLinks.map(async (docxLink) => {
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
||||
type: "scrape",
|
||||
url: docxLink,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method: "docx-scrape",
|
||||
result: null,
|
||||
});
|
||||
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
|
||||
docxLink
|
||||
);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: content.length,
|
||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
||||
error: pageError,
|
||||
response_code: pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
content,
|
||||
metadata: { sourceURL: p, pageStatusCode, pageError },
|
||||
metadata: { sourceURL: docxLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
@ -406,6 +454,10 @@ export class WebScraperDataProvider {
|
||||
const url = new URL(document.metadata.sourceURL);
|
||||
const path = url.pathname;
|
||||
|
||||
if (!Array.isArray(this.excludes)) {
|
||||
this.excludes = this.excludes.split(',');
|
||||
}
|
||||
|
||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||
// Check if the link should be excluded
|
||||
if (
|
||||
@ -417,6 +469,10 @@ export class WebScraperDataProvider {
|
||||
}
|
||||
}
|
||||
|
||||
if (!Array.isArray(this.includes)) {
|
||||
this.includes = this.includes.split(',');
|
||||
}
|
||||
|
||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||
// Check if the link matches the include patterns, if any are specified
|
||||
if (this.includes.length > 0) {
|
||||
@ -528,14 +584,22 @@ export class WebScraperDataProvider {
|
||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||
false;
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
|
||||
if (typeof options.crawlerOptions?.excludes === 'string') {
|
||||
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
|
||||
}
|
||||
|
||||
if (typeof options.crawlerOptions?.includes === 'string') {
|
||||
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
|
||||
}
|
||||
|
||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
||||
this.allowBackwardCrawling =
|
||||
options.crawlerOptions?.allowBackwardCrawling ?? false;
|
||||
this.allowExternalContentLinks =
|
||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||
this.priority = options.priority;
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
|
||||
* @param url The URL to scrape
|
||||
* @param waitFor The time to wait for the page to load
|
||||
* @param screenshot Whether to take a screenshot
|
||||
* @param fullPageScreenshot Whether to take a full page screenshot
|
||||
* @param pageOptions The options for the page
|
||||
* @param headers The headers to send with the request
|
||||
* @param options The options for the request
|
||||
@ -20,18 +21,22 @@ export async function scrapWithFireEngine({
|
||||
url,
|
||||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
pageOptions = { parsePDF: true },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
options,
|
||||
priority,
|
||||
}: {
|
||||
url: string;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
options?: any;
|
||||
priority?: number;
|
||||
}): Promise<FireEngineResponse> {
|
||||
const logParams = {
|
||||
url,
|
||||
@ -47,8 +52,9 @@ export async function scrapWithFireEngine({
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
@ -61,17 +67,20 @@ export async function scrapWithFireEngine({
|
||||
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
||||
|
||||
Logger.info(
|
||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
);
|
||||
|
||||
|
||||
const response = await axios.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
headers: headers,
|
||||
pageOptions: pageOptions,
|
||||
priority,
|
||||
...fireEngineOptionsParam,
|
||||
},
|
||||
{
|
||||
|
@ -123,17 +123,21 @@ export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
includeMarkdown: true,
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
headers: undefined,
|
||||
includeLinks: true
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown",
|
||||
},
|
||||
existingHtml: string = ""
|
||||
existingHtml: string = "",
|
||||
priority?: number,
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
@ -171,11 +175,13 @@ export async function scrapSingleUrl(
|
||||
url,
|
||||
waitFor: pageOptions.waitFor,
|
||||
screenshot: pageOptions.screenshot,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||
pageOptions: pageOptions,
|
||||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
}
|
||||
},
|
||||
priority,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
scraperResponse.screenshot = response.screenshot;
|
||||
@ -306,7 +312,7 @@ export async function scrapSingleUrl(
|
||||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
defaultScraper,
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||
);
|
||||
|
||||
@ -334,8 +340,8 @@ export async function scrapSingleUrl(
|
||||
pageError = undefined;
|
||||
}
|
||||
|
||||
if (text && text.trim().length >= 100) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
|
||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
|
||||
@ -357,20 +363,22 @@ export async function scrapSingleUrl(
|
||||
|
||||
let linksOnPage: string[] | undefined;
|
||||
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
if (pageOptions.includeLinks) {
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
}
|
||||
|
||||
let document: Document;
|
||||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
@ -382,7 +390,7 @@ export async function scrapSingleUrl(
|
||||
} else {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
@ -395,7 +403,7 @@ export async function scrapSingleUrl(
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
linksOnPage,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
@ -409,9 +417,9 @@ export async function scrapSingleUrl(
|
||||
});
|
||||
return {
|
||||
content: "",
|
||||
markdown: "",
|
||||
markdown: pageOptions.includeMarkdown ? "" : undefined,
|
||||
html: "",
|
||||
linksOnPage: [],
|
||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
|
@ -19,7 +19,7 @@ export async function getLinksFromSitemap(
|
||||
try {
|
||||
let content: string;
|
||||
try {
|
||||
if (mode === 'axios') {
|
||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
|
@ -1,24 +1,11 @@
|
||||
export const urlSpecificParams = {
|
||||
"platform.openai.com": {
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
cookies: {
|
||||
__cf_bm:
|
||||
"mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
wait: 3000,
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp"
|
||||
},
|
||||
},
|
||||
},
|
||||
"support.greenpay.me":{
|
||||
@ -232,4 +219,28 @@ export const urlSpecificParams = {
|
||||
}
|
||||
},
|
||||
},
|
||||
"amazon.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
},
|
||||
"digikey.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "tlsclient",
|
||||
},
|
||||
},
|
||||
},
|
||||
"zoopla.co.uk":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
}
|
||||
};
|
||||
|
@ -4,38 +4,76 @@ import { createWriteStream } from "node:fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
|
||||
const content = await processDocxToText(tempFilePath);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
let tempFilePath = '';
|
||||
let pageStatusCode = 200;
|
||||
let pageError = '';
|
||||
let content = '';
|
||||
|
||||
try {
|
||||
const downloadResult = await downloadDocx(url);
|
||||
tempFilePath = downloadResult.tempFilePath;
|
||||
pageStatusCode = downloadResult.pageStatusCode;
|
||||
pageError = downloadResult.pageError;
|
||||
content = await processDocxToText(tempFilePath);
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
|
||||
pageStatusCode = 500;
|
||||
pageError = error.message;
|
||||
content = '';
|
||||
} finally {
|
||||
if (tempFilePath) {
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
}
|
||||
}
|
||||
|
||||
return { content, pageStatusCode, pageError };
|
||||
}
|
||||
|
||||
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
try {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
|
||||
response.data.pipe(writer);
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", reject);
|
||||
});
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", () => {
|
||||
Logger.error('Failed to write DOCX file to disk');
|
||||
reject(new Error('Failed to write DOCX file to disk'));
|
||||
});
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to download DOCX: ${error.message}`);
|
||||
return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
export async function processDocxToText(filePath: string): Promise<string> {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
try {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to process DOCX to text: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
async function extractTextFromDocx(filePath: string): Promise<string> {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
try {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to extract text from DOCX: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
description = soup('meta[name="description"]').attr("content") || null;
|
||||
|
||||
// Assuming the language is part of the URL as per the regex pattern
|
||||
const pattern = /([a-zA-Z]+-[A-Z]{2})/;
|
||||
const match = pattern.exec(url);
|
||||
language = match ? match[1] : null;
|
||||
language = soup('html').attr('lang') || null;
|
||||
|
||||
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
||||
robots = soup('meta[name="robots"]').attr("content") || null;
|
||||
|
@ -76,7 +76,6 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
||||
let attempt = 0;
|
||||
const maxAttempts = 10; // Maximum number of attempts
|
||||
let resultAvailable = false;
|
||||
|
||||
while (attempt < maxAttempts && !resultAvailable) {
|
||||
try {
|
||||
resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
|
||||
@ -90,13 +89,22 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
||||
} catch (error) {
|
||||
Logger.debug("Error fetching result w/ LlamaIndex");
|
||||
attempt++;
|
||||
if (attempt >= maxAttempts) {
|
||||
Logger.error("Max attempts reached, unable to fetch result.");
|
||||
break; // Exit the loop if max attempts are reached
|
||||
}
|
||||
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
|
||||
// You may want to handle specific errors differently
|
||||
}
|
||||
}
|
||||
|
||||
if (!resultAvailable) {
|
||||
content = await processPdf(filePath);
|
||||
try {
|
||||
content = await processPdf(filePath);
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to process PDF: ${error}`);
|
||||
content = "";
|
||||
}
|
||||
}
|
||||
content = resultResponse.data[resultType];
|
||||
} catch (error) {
|
||||
@ -104,15 +112,29 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
||||
content = await processPdf(filePath);
|
||||
}
|
||||
} else if (parsePDF) {
|
||||
content = await processPdf(filePath);
|
||||
try {
|
||||
content = await processPdf(filePath);
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to process PDF: ${error}`);
|
||||
content = "";
|
||||
}
|
||||
} else {
|
||||
content = fs.readFileSync(filePath, "utf-8");
|
||||
try {
|
||||
content = fs.readFileSync(filePath, "utf-8");
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to read PDF file: ${error}`);
|
||||
content = "";
|
||||
}
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
async function processPdf(file: string) {
|
||||
const fileContent = fs.readFileSync(file);
|
||||
const data = await pdf(fileContent);
|
||||
return data.text;
|
||||
try {
|
||||
const fileContent = fs.readFileSync(file);
|
||||
const data = await pdf(fileContent);
|
||||
return data.text;
|
||||
} catch (error) {
|
||||
throw error;
|
||||
}
|
||||
}
|
@ -41,10 +41,10 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
||||
links.push(href);
|
||||
} else if (href.startsWith('/')) {
|
||||
// Relative URL starting with '/', append to origin
|
||||
links.push(`${origin}${href}`);
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
||||
// Relative URL not starting with '/', append to base URL
|
||||
links.push(`${baseUrl}/${href}`);
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (href.startsWith('mailto:')) {
|
||||
// mailto: links, add as is
|
||||
links.push(href);
|
||||
|
44
apps/api/src/search/fireEngine.ts
Normal file
44
apps/api/src/search/fireEngine.ts
Normal file
@ -0,0 +1,44 @@
|
||||
import axios from "axios";
|
||||
import dotenv from "dotenv";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
/**
 * Runs a search query against the Fire-Engine `/search` endpoint.
 *
 * The endpoint base URL is read from `process.env.FIRE_ENGINE_BETA_URL`.
 * When that variable is unset or empty, no request is made and an empty
 * list is returned.
 *
 * @param q The search query string.
 * @param options Search options. `lang`, `country`, `location`, `tbs`,
 *   `numResults` and `page` (defaults to 1) are forwarded in the request
 *   body. `filter` is accepted for signature compatibility but is not
 *   forwarded to the endpoint.
 * @returns The response body returned by the endpoint, or an empty array
 *   when the endpoint is not configured or the response carries no body.
 * @throws Whatever `axios` throws for a failed request; errors are not
 *   caught here.
 */
export async function fireEngineMap(q: string, options: {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  numResults: number;
  page?: number;
}): Promise<SearchResult[]> {
  // Bail out before doing any work when Fire-Engine is not configured.
  const baseUrl = process.env.FIRE_ENGINE_BETA_URL;
  if (!baseUrl) {
    return [];
  }

  const data = JSON.stringify({
    query: q,
    lang: options.lang,
    country: options.country,
    location: options.location,
    tbs: options.tbs,
    numResults: options.numResults,
    page: options.page ?? 1,
  });

  const config = {
    method: "POST",
    url: `${baseUrl}/search`,
    headers: {
      "Content-Type": "application/json",
    },
    data: data,
  };

  const response = await axios(config);

  // The previous check (`response && response`) was a tautology and let an
  // absent body leak out as `undefined`; always hand callers an array-typed
  // value instead.
  return response?.data ?? [];
}
|
@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
|
||||
|
||||
|
||||
|
||||
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
|
||||
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
|
||||
let proxies = null;
|
||||
if (proxy) {
|
||||
if (proxy.startsWith("https")) {
|
||||
|
@ -1,11 +1,9 @@
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
import { google_search } from "./googlesearch";
|
||||
import { googleSearch } from "./googlesearch";
|
||||
import { fireEngineMap } from "./fireEngine";
|
||||
import { serper_search } from "./serper";
|
||||
|
||||
|
||||
|
||||
|
||||
export async function search({
|
||||
query,
|
||||
advanced = false,
|
||||
@ -30,12 +28,20 @@ export async function search({
|
||||
proxy?: string;
|
||||
sleep_interval?: number;
|
||||
timeout?: number;
|
||||
}) : Promise<SearchResult[]> {
|
||||
}): Promise<SearchResult[]> {
|
||||
try {
|
||||
if (process.env.SERPER_API_KEY ) {
|
||||
return await serper_search(query, {num_results, tbs, filter, lang, country, location});
|
||||
|
||||
if (process.env.SERPER_API_KEY) {
|
||||
return await serper_search(query, {
|
||||
num_results,
|
||||
tbs,
|
||||
filter,
|
||||
lang,
|
||||
country,
|
||||
location,
|
||||
});
|
||||
}
|
||||
return await google_search(
|
||||
return await googleSearch(
|
||||
query,
|
||||
advanced,
|
||||
num_results,
|
||||
@ -49,7 +55,6 @@ export async function search({
|
||||
);
|
||||
} catch (error) {
|
||||
Logger.error(`Error in search function: ${error}`);
|
||||
return []
|
||||
return [];
|
||||
}
|
||||
// if process.env.SERPER_API_KEY is set, use serper
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getWebScraperQueue } from "../queue-service";
|
||||
import { getScrapeQueue } from "../queue-service";
|
||||
import { sendSlackWebhook } from "./slack";
|
||||
|
||||
export async function checkAlerts() {
|
||||
@ -13,8 +13,8 @@ export async function checkAlerts() {
|
||||
Logger.info("Initializing alerts");
|
||||
const checkActiveJobs = async () => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const activeJobs = await webScraperQueue.getActiveCount();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const activeJobs = await scrapeQueue.getActiveCount();
|
||||
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
|
||||
Logger.warn(
|
||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
|
||||
@ -34,8 +34,8 @@ export async function checkAlerts() {
|
||||
};
|
||||
|
||||
const checkWaitingQueue = async () => {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const waitingJobs = await webScraperQueue.getWaitingCount();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const waitingJobs = await scrapeQueue.getWaitingCount();
|
||||
|
||||
if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
||||
Logger.warn(
|
||||
@ -49,7 +49,7 @@ export async function checkAlerts() {
|
||||
};
|
||||
|
||||
const checkAll = async () => {
|
||||
await checkActiveJobs();
|
||||
// await checkActiveJobs();
|
||||
await checkWaitingQueue();
|
||||
};
|
||||
|
||||
|
@ -3,9 +3,13 @@ import { withAuth } from "../../lib/withAuth";
|
||||
import { sendNotification } from "../notification/email_notification";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getValue, setValue } from "../redis";
|
||||
import { redlock } from "../redlock";
|
||||
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
|
||||
export async function billTeam(team_id: string, credits: number) {
|
||||
return withAuth(supaBillTeam)(team_id, credits);
|
||||
}
|
||||
@ -164,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
|
||||
export async function checkTeamCredits(team_id: string, credits: number) {
|
||||
return withAuth(supaCheckTeamCredits)(team_id, credits);
|
||||
}
|
||||
|
||||
// if team has enough credits for the operation, return true, else return false
|
||||
export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
if (team_id === "preview") {
|
||||
return { success: true, message: "Preview team, no credits used" };
|
||||
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
|
||||
}
|
||||
|
||||
// Retrieve the team's active subscription and check for available coupons concurrently
|
||||
@ -198,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
if (subscriptionError || !subscription) {
|
||||
// If there is no active subscription but there are available coupons
|
||||
if (couponCredits >= credits) {
|
||||
return { success: true, message: "Sufficient credits available" };
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
|
||||
}
|
||||
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
@ -248,29 +253,48 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
return {
|
||||
success: false,
|
||||
message: "Insufficient credits, please upgrade!",
|
||||
remainingCredits: FREE_CREDITS - totalCreditsUsed
|
||||
};
|
||||
}
|
||||
return { success: true, message: "Sufficient credits available" };
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
|
||||
}
|
||||
|
||||
let totalCreditsUsed = 0;
|
||||
const cacheKey = `credit_usage_${subscription.id}_${subscription.current_period_start}_${subscription.current_period_end}_lc`;
|
||||
const redLockKey = `lock_${cacheKey}`;
|
||||
const lockTTL = 10000; // 10 seconds
|
||||
|
||||
try {
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
await supabase_service.rpc("get_credit_usage_2", {
|
||||
sub_id: subscription.id,
|
||||
start_time: subscription.current_period_start,
|
||||
end_time: subscription.current_period_end,
|
||||
});
|
||||
const lock = await redlock.acquire([redLockKey], lockTTL);
|
||||
|
||||
if (creditUsageError) {
|
||||
Logger.error(`Error calculating credit usage: ${creditUsageError}`);
|
||||
}
|
||||
try {
|
||||
const cachedCreditUsage = await getValue(cacheKey);
|
||||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
if (cachedCreditUsage) {
|
||||
totalCreditsUsed = parseInt(cachedCreditUsage);
|
||||
} else {
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
await supabase_service.rpc("get_credit_usage_2", {
|
||||
sub_id: subscription.id,
|
||||
start_time: subscription.current_period_start,
|
||||
end_time: subscription.current_period_end,
|
||||
});
|
||||
|
||||
if (creditUsageError) {
|
||||
Logger.error(`Error calculating credit usage: ${creditUsageError}`);
|
||||
}
|
||||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
|
||||
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
await lock.release();
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error calculating credit usage: ${error}`);
|
||||
Logger.error(`Error acquiring lock or calculating credit usage: ${error}`);
|
||||
}
|
||||
|
||||
// Adjust total credits used by subtracting coupon value
|
||||
@ -299,7 +323,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
subscription.current_period_start,
|
||||
subscription.current_period_end
|
||||
);
|
||||
return { success: false, message: "Insufficient credits, please upgrade!" };
|
||||
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
|
||||
} else if (creditUsagePercentage >= 0.8) {
|
||||
// Send email notification for approaching credit limit
|
||||
await sendNotification(
|
||||
@ -310,7 +334,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
);
|
||||
}
|
||||
|
||||
return { success: true, message: "Sufficient credits available" };
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
|
||||
}
|
||||
|
||||
// Count the total credits used by a team within the current billing period and return the remaining credits.
|
||||
|
@ -40,10 +40,11 @@ export async function logJob(job: FirecrawlJob) {
|
||||
extractor_options: job.extractor_options,
|
||||
num_tokens: job.num_tokens,
|
||||
retry: !!job.retry,
|
||||
crawl_id: job.crawl_id,
|
||||
},
|
||||
]);
|
||||
|
||||
if (process.env.POSTHOG_API_KEY) {
|
||||
if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
|
||||
let phLog = {
|
||||
distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
|
||||
...(job.team_id !== "preview" && {
|
||||
|
@ -44,9 +44,9 @@ export async function logScrape(
|
||||
]);
|
||||
|
||||
if (error) {
|
||||
Logger.error(`Error logging proxy:\n${error}`);
|
||||
Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error logging proxy:\n${error}`);
|
||||
Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
|
||||
}
|
||||
}
|
||||
|
@ -1,28 +1,15 @@
|
||||
import { Job, Queue } from "bullmq";
|
||||
import {
|
||||
getScrapeQueue,
|
||||
getWebScraperQueue,
|
||||
} from "./queue-service";
|
||||
import { getScrapeQueue } from "./queue-service";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { WebScraperOptions } from "../types";
|
||||
|
||||
export async function addWebScraperJob(
|
||||
webScraperOptions: WebScraperOptions,
|
||||
options: any = {},
|
||||
jobId: string = uuidv4(),
|
||||
): Promise<Job> {
|
||||
return await getWebScraperQueue().add(jobId, webScraperOptions, {
|
||||
...options,
|
||||
jobId,
|
||||
});
|
||||
}
|
||||
|
||||
export async function addScrapeJob(
|
||||
webScraperOptions: WebScraperOptions,
|
||||
options: any = {},
|
||||
jobId: string = uuidv4(),
|
||||
): Promise<Job> {
|
||||
return await getScrapeQueue().add(jobId, webScraperOptions, {
|
||||
priority: webScraperOptions.crawl_id ? 20 : 10,
|
||||
...options,
|
||||
jobId,
|
||||
});
|
||||
|
@ -2,38 +2,13 @@ import { Queue } from "bullmq";
|
||||
import { Logger } from "../lib/logger";
|
||||
import IORedis from "ioredis";
|
||||
|
||||
let webScraperQueue: Queue;
|
||||
let scrapeQueue: Queue;
|
||||
|
||||
export const redisConnection = new IORedis(process.env.REDIS_URL, {
|
||||
maxRetriesPerRequest: null,
|
||||
});
|
||||
|
||||
export const webScraperQueueName = "{crawlQueue}";
|
||||
export const scrapeQueueName = "{scrapeQueue}";
|
||||
export function getWebScraperQueue() {
|
||||
if (!webScraperQueue) {
|
||||
webScraperQueue = new Queue(
|
||||
webScraperQueueName,
|
||||
{
|
||||
connection: redisConnection,
|
||||
}
|
||||
// {
|
||||
// settings: {
|
||||
// lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
|
||||
// lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
|
||||
// stalledInterval: 30 * 1000,
|
||||
// maxStalledCount: 10,
|
||||
// },
|
||||
// defaultJobOptions:{
|
||||
// attempts: 5
|
||||
// }
|
||||
// }
|
||||
);
|
||||
Logger.info("Web scraper queue created");
|
||||
}
|
||||
return webScraperQueue;
|
||||
}
|
||||
|
||||
export function getScrapeQueue() {
|
||||
if (!scrapeQueue) {
|
||||
@ -62,5 +37,4 @@ export function getScrapeQueue() {
|
||||
|
||||
import { QueueEvents } from 'bullmq';
|
||||
|
||||
export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection });
|
||||
export const webScraperQueueEvents = new QueueEvents(webScraperQueueName, { connection: redisConnection });
|
||||
export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection });
|
@ -1,23 +1,24 @@
|
||||
import "dotenv/config";
|
||||
import { CustomError } from "../lib/custom-error";
|
||||
import {
|
||||
getWebScraperQueue,
|
||||
getScrapeQueue,
|
||||
redisConnection,
|
||||
webScraperQueueName,
|
||||
scrapeQueueName,
|
||||
} from "./queue-service";
|
||||
import "dotenv/config";
|
||||
import { logtail } from "./logtail";
|
||||
import { startWebScraperPipeline } from "../main/runWebScraper";
|
||||
import { callWebhook } from "./webhook";
|
||||
import { logJob } from "./logging/log_job";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import { Job, QueueEvents, tryCatch } from "bullmq";
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { Worker } from "bullmq";
|
||||
import systemMonitor from "./system-monitor";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis";
|
||||
import { StoredCrawl } from "../lib/crawl-redis";
|
||||
import { addScrapeJob } from "./queue-jobs";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
|
||||
if (process.env.ENV === "production") {
|
||||
initSDK({
|
||||
@ -33,30 +34,29 @@ const workerStalledCheckInterval =
|
||||
const jobLockExtendInterval =
|
||||
Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
|
||||
const jobLockExtensionTime =
|
||||
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 15000;
|
||||
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
|
||||
|
||||
const cantAcceptConnectionInterval =
|
||||
Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
|
||||
const connectionMonitorInterval =
|
||||
Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
|
||||
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
|
||||
const wsq = getWebScraperQueue();
|
||||
const sq = getScrapeQueue();
|
||||
|
||||
const processJobInternal = async (token: string, job: Job) => {
|
||||
const extendLockInterval = setInterval(async () => {
|
||||
Logger.info(`🐂 Worker extending lock on job ${job.id}`);
|
||||
await job.extendLock(token, jobLockExtensionTime);
|
||||
}, jobLockExtendInterval);
|
||||
|
||||
try {
|
||||
const result = await processJob(job, token);
|
||||
const jobState = await job.getState();
|
||||
if(jobState !== "completed" && jobState !== "failed"){
|
||||
try{
|
||||
await job.moveToCompleted(result.docs, token, false); //3rd arg fetchNext
|
||||
}catch(e){
|
||||
// console.log("Job already completed, error:", e);
|
||||
try{
|
||||
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
await job.moveToCompleted(null, token, false);
|
||||
} else {
|
||||
await job.moveToCompleted(result.docs, token, false);
|
||||
}
|
||||
}catch(e){
|
||||
}
|
||||
} catch (error) {
|
||||
console.log("Job failed, error:", error);
|
||||
@ -110,11 +110,10 @@ const workerFun = async (queueName: string, processJobInternal: (token: string,
|
||||
}
|
||||
};
|
||||
|
||||
workerFun(webScraperQueueName, processJobInternal);
|
||||
workerFun(scrapeQueueName, processJobInternal);
|
||||
|
||||
async function processJob(job: Job, token: string) {
|
||||
Logger.debug(`🐂 Worker taking job ${job.id}`);
|
||||
Logger.info(`🐂 Worker taking job ${job.id}`);
|
||||
|
||||
try {
|
||||
job.updateProgress({
|
||||
@ -131,18 +130,16 @@ async function processJob(job: Job, token: string) {
|
||||
const end = Date.now();
|
||||
const timeTakenInSeconds = (end - start) / 1000;
|
||||
|
||||
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + job.id);
|
||||
const rawHtml = docs[0].rawHtml;
|
||||
|
||||
if (isCancelled) {
|
||||
await job.discard();
|
||||
await job.moveToFailed(Error("Job cancelled by user"), job.token);
|
||||
await job.discard();
|
||||
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
|
||||
delete docs[0].rawHtml;
|
||||
}
|
||||
|
||||
const data = {
|
||||
success,
|
||||
result: {
|
||||
links: isCancelled ? [] : docs.map((doc) => {
|
||||
links: docs.map((doc) => {
|
||||
return {
|
||||
content: doc,
|
||||
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
|
||||
@ -150,36 +147,132 @@ async function processJob(job: Job, token: string) {
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
error: isCancelled ? "Job cancelled by user" : message /* etc... */,
|
||||
docs: isCancelled ? [] : docs,
|
||||
error: message /* etc... */,
|
||||
docs,
|
||||
};
|
||||
|
||||
if (job.data.mode === "crawl" && !isCancelled) {
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
if (job.data.mode === "crawl") {
|
||||
await callWebhook(job.data.team_id, job.id as string, data, job.data.webhook);
|
||||
}
|
||||
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: success && !isCancelled,
|
||||
message: isCancelled ? "Job cancelled by user" : message,
|
||||
num_docs: isCancelled ? 0 : docs.length,
|
||||
docs: isCancelled ? [] : docs,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: job.data.team_id,
|
||||
mode: job.data.mode,
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
Logger.debug(`🐂 Job done ${job.id}`);
|
||||
if (job.data.crawl_id) {
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: success,
|
||||
message: message,
|
||||
num_docs: docs.length,
|
||||
docs: docs,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: job.data.team_id,
|
||||
mode: job.data.mode,
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
});
|
||||
|
||||
await addCrawlJobDone(job.data.crawl_id, job.id);
|
||||
|
||||
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
|
||||
|
||||
if (!job.data.sitemapped) {
|
||||
if (!sc.cancelled) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
|
||||
const links = crawler.filterLinks(
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
|
||||
Infinity,
|
||||
sc.crawlerOptions?.maxDepth ?? 10
|
||||
)
|
||||
|
||||
for (const link of links) {
|
||||
if (await lockURL(job.data.crawl_id, sc, link)) {
|
||||
const newJob = await addScrapeJob({
|
||||
url: link,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
team_id: sc.team_id,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
});
|
||||
|
||||
await addCrawlJob(job.data.crawl_id, newJob.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
|
||||
const jobs = (await Promise.all(jobIDs.map(async x => {
|
||||
if (x === job.id) {
|
||||
return {
|
||||
async getState() {
|
||||
return "completed"
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
returnvalue: docs,
|
||||
}
|
||||
}
|
||||
|
||||
const j = await getScrapeQueue().getJob(x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(j.id);
|
||||
|
||||
if (supabaseData) {
|
||||
j.returnvalue = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
return j;
|
||||
}))).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
||||
const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed";
|
||||
|
||||
const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
success: jobStatus === "completed",
|
||||
message: sc.cancelled ? "Cancelled" : message,
|
||||
num_docs: fullDocs.length,
|
||||
docs: [],
|
||||
time_taken: (Date.now() - sc.createdAt) / 1000,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc.originUrl,
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
|
||||
const data = {
|
||||
success: jobStatus !== "failed",
|
||||
result: {
|
||||
links: fullDocs.map((doc) => {
|
||||
return {
|
||||
content: doc,
|
||||
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
|
||||
};
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
error: message /* etc... */,
|
||||
docs: fullDocs,
|
||||
};
|
||||
|
||||
await callWebhook(job.data.team_id, job.data.crawl_id, data);
|
||||
}
|
||||
}
|
||||
|
||||
Logger.info(`🐂 Job done ${job.id}`);
|
||||
return data;
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||
if (await getWebScraperQueue().isPaused()) {
|
||||
Logger.debug("🐂Queue is paused, ignoring");
|
||||
return;
|
||||
}
|
||||
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
@ -192,6 +285,9 @@ async function processJob(job: Job, token: string) {
|
||||
});
|
||||
}
|
||||
Logger.error(error);
|
||||
if (error.stack) {
|
||||
Logger.error(error.stack);
|
||||
}
|
||||
|
||||
logtail.error("Overall error ingesting", {
|
||||
job_id: job.id,
|
||||
@ -205,26 +301,51 @@ async function processJob(job: Job, token: string) {
|
||||
error:
|
||||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||
};
|
||||
if (job.data.mode === "crawl") {
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
|
||||
if (job.data.mode === "crawl" || job.data.crawl_id) {
|
||||
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
|
||||
}
|
||||
|
||||
if (job.data.crawl_id) {
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: false,
|
||||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: job.data.mode,
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
});
|
||||
|
||||
const sc = await getCrawl(job.data.crawl_id);
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
success: false,
|
||||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc ? sc.originUrl : job.data.url,
|
||||
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
|
||||
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
}
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: false,
|
||||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
// done(null, data);
|
||||
return data;
|
||||
}
|
||||
|
@ -14,18 +14,20 @@ const RATE_LIMITS = {
|
||||
standardNew: 10,
|
||||
standardnew: 10,
|
||||
growth: 50,
|
||||
growthdouble: 50,
|
||||
},
|
||||
scrape: {
|
||||
default: 20,
|
||||
free: 5,
|
||||
starter: 20,
|
||||
standard: 50,
|
||||
standard: 100,
|
||||
standardOld: 40,
|
||||
scale: 500,
|
||||
hobby: 10,
|
||||
standardNew: 50,
|
||||
standardnew: 50,
|
||||
growth: 500,
|
||||
standardNew: 100,
|
||||
standardnew: 100,
|
||||
growth: 1000,
|
||||
growthdouble: 1000,
|
||||
},
|
||||
search: {
|
||||
default: 20,
|
||||
@ -38,6 +40,20 @@ const RATE_LIMITS = {
|
||||
standardNew: 50,
|
||||
standardnew: 50,
|
||||
growth: 500,
|
||||
growthdouble: 500,
|
||||
},
|
||||
map:{
|
||||
default: 20,
|
||||
free: 5,
|
||||
starter: 20,
|
||||
standard: 40,
|
||||
standardOld: 40,
|
||||
scale: 500,
|
||||
hobby: 10,
|
||||
standardNew: 50,
|
||||
standardnew: 50,
|
||||
growth: 500,
|
||||
growthdouble: 500,
|
||||
},
|
||||
preview: {
|
||||
free: 5,
|
||||
|
29
apps/api/src/services/redlock.ts
Normal file
29
apps/api/src/services/redlock.ts
Normal file
@ -0,0 +1,29 @@
|
||||
import Redlock from "redlock";
|
||||
import Client from "ioredis";
|
||||
|
||||
export const redlock = new Redlock(
|
||||
// You should have one client for each independent redis node
|
||||
// or cluster.
|
||||
[new Client(process.env.REDIS_RATE_LIMIT_URL)],
|
||||
{
|
||||
// The expected clock drift; for more details see:
|
||||
// http://redis.io/topics/distlock
|
||||
driftFactor: 0.01, // multiplied by lock ttl to determine drift time
|
||||
|
||||
// The max number of times Redlock will attempt to lock a resource
|
||||
// before erroring.
|
||||
retryCount: 5,
|
||||
|
||||
// the time in ms between attempts
|
||||
retryDelay: 100, // time in ms
|
||||
|
||||
// the max time in ms randomly added to retries
|
||||
// to improve performance under high contention
|
||||
// see https://www.awsarchitectureblog.com/2015/03/backoff.html
|
||||
retryJitter: 200, // time in ms
|
||||
|
||||
// The minimum remaining time on a lock before an extension is automatically
|
||||
// attempted with the `using` API.
|
||||
automaticExtensionThreshold: 500, // time in ms
|
||||
}
|
||||
);
|
@ -36,17 +36,9 @@ export const supabase_service: SupabaseClient = new Proxy(
|
||||
new SupabaseService(),
|
||||
{
|
||||
get: function (target, prop, receiver) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
Logger.debug(
|
||||
"Attempted to access Supabase client when it's not configured."
|
||||
);
|
||||
}
|
||||
const client = target.getClient();
|
||||
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
|
||||
if (client === null) {
|
||||
Logger.error(
|
||||
"Attempted to access Supabase client when it's not configured."
|
||||
);
|
||||
return () => {
|
||||
throw new Error("Supabase client is not configured.");
|
||||
};
|
||||
|
@ -1,15 +1,15 @@
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { supabase_service } from "./supabase";
|
||||
|
||||
export const callWebhook = async (teamId: string, jobId: string,data: any) => {
|
||||
export const callWebhook = async (teamId: string, jobId: string, data: any, specified?: string) => {
|
||||
try {
|
||||
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId);
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
let webhookUrl = selfHostedUrl;
|
||||
let webhookUrl = specified ?? selfHostedUrl;
|
||||
|
||||
// Only fetch the webhook URL from the database if the self-hosted webhook URL is not set
|
||||
// Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
|
||||
// and the USE_DB_AUTHENTICATION environment variable is set to true
|
||||
if (!selfHostedUrl && useDbAuthentication) {
|
||||
if (!webhookUrl && useDbAuthentication) {
|
||||
const { data: webhooksData, error } = await supabase_service
|
||||
.from("webhooks")
|
||||
.select("url")
|
||||
|
@ -28,6 +28,9 @@ export interface WebScraperOptions {
|
||||
extractorOptions?: any;
|
||||
team_id: string;
|
||||
origin?: string;
|
||||
crawl_id?: string;
|
||||
sitemapped?: boolean;
|
||||
webhook?: string;
|
||||
}
|
||||
|
||||
export interface RunWebScraperParams {
|
||||
@ -41,6 +44,7 @@ export interface RunWebScraperParams {
|
||||
onError: (error: Error) => void;
|
||||
team_id: string;
|
||||
bull_job_id: string;
|
||||
priority?: number;
|
||||
}
|
||||
|
||||
export interface RunWebScraperResult {
|
||||
@ -65,6 +69,7 @@ export interface FirecrawlJob {
|
||||
extractor_options?: ExtractorOptions,
|
||||
num_tokens?: number,
|
||||
retry?: boolean,
|
||||
crawl_id?: string;
|
||||
}
|
||||
|
||||
export interface FirecrawlScrapeResponse {
|
||||
@ -101,6 +106,7 @@ export enum RateLimiterMode {
|
||||
Scrape = "scrape",
|
||||
Preview = "preview",
|
||||
Search = "search",
|
||||
Map = "map",
|
||||
|
||||
}
|
||||
|
||||
@ -110,6 +116,7 @@ export interface AuthResponse {
|
||||
error?: string;
|
||||
status?: number;
|
||||
plan?: string;
|
||||
api_key?: string;
|
||||
}
|
||||
|
||||
|
||||
|
@ -8,10 +8,6 @@
|
||||
"sourceMap": true,
|
||||
"outDir": "./dist/src",
|
||||
"moduleResolution": "node",
|
||||
"baseUrl": ".",
|
||||
"paths": {
|
||||
"*": ["node_modules/*", "src/types/*"],
|
||||
}
|
||||
},
|
||||
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
|
||||
}
|
||||
|
25
apps/go-sdk/examples/.gitignore
vendored
Normal file
25
apps/go-sdk/examples/.gitignore
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
# If you prefer the allow list template instead of the deny list, see community template:
|
||||
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
|
||||
#
|
||||
# Binaries for programs and plugins
|
||||
*.exe
|
||||
*.exe~
|
||||
*.dll
|
||||
*.so
|
||||
*.dylib
|
||||
|
||||
# Test binary, built with `go test -c`
|
||||
*.test
|
||||
|
||||
# Output of the go coverage tool, specifically when used with LiteIDE
|
||||
*.out
|
||||
|
||||
# Dependency directories (remove the comment below to include it)
|
||||
# vendor/
|
||||
|
||||
# Go workspace file
|
||||
go.work
|
||||
go.work.sum
|
||||
|
||||
# env file
|
||||
.env
|
21
apps/go-sdk/examples/LICENSE
Normal file
21
apps/go-sdk/examples/LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Mendable
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
87
apps/go-sdk/examples/example.go
Normal file
87
apps/go-sdk/examples/example.go
Normal file
@ -0,0 +1,87 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/mendableai/firecrawl-go"
|
||||
)
|
||||
|
||||
func main() {
|
||||
app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create FirecrawlApp: %v", err)
|
||||
}
|
||||
|
||||
// Scrape a website
|
||||
scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to scrape URL: %v", err)
|
||||
}
|
||||
fmt.Println(scrapeResult.Markdown)
|
||||
|
||||
// Crawl a website
|
||||
idempotencyKey := uuid.New().String() // optional idempotency key
|
||||
crawlParams := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to crawl URL: %v", err)
|
||||
}
|
||||
jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to marshal crawl result: %v", err)
|
||||
}
|
||||
fmt.Println(string(jsonCrawlResult))
|
||||
|
||||
// LLM Extraction using JSON schema
|
||||
jsonSchema := map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"top": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"points": map[string]string{"type": "number"},
|
||||
"by": map[string]string{"type": "string"},
|
||||
"commentsURL": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "points", "by", "commentsURL"},
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News",
|
||||
},
|
||||
},
|
||||
"required": []string{"top"},
|
||||
}
|
||||
|
||||
llmExtractionParams := map[string]any{
|
||||
"extractorOptions": firecrawl.ExtractorOptions{
|
||||
ExtractionSchema: jsonSchema,
|
||||
Mode: "llm-extraction",
|
||||
},
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
|
||||
llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to perform LLM extraction: %v", err)
|
||||
}
|
||||
|
||||
// Pretty print the LLM extraction result
|
||||
jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", " ")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to marshal LLM extraction result: %v", err)
|
||||
}
|
||||
fmt.Println(string(jsonResult))
|
||||
}
|
9
apps/go-sdk/examples/go.mod
Normal file
9
apps/go-sdk/examples/go.mod
Normal file
@ -0,0 +1,9 @@
|
||||
module github.com/mendableai/firecrawl-go-examples
|
||||
|
||||
go 1.22.5
|
||||
|
||||
replace github.com/mendableai/firecrawl => ../
|
||||
|
||||
require github.com/google/uuid v1.6.0
|
||||
|
||||
require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect
|
14
apps/go-sdk/examples/go.sum
Normal file
14
apps/go-sdk/examples/go.sum
Normal file
@ -0,0 +1,14 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
|
||||
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
2
apps/go-sdk/firecrawl/.env.example
Normal file
2
apps/go-sdk/firecrawl/.env.example
Normal file
@ -0,0 +1,2 @@
|
||||
API_URL=http://localhost:3002
|
||||
TEST_API_KEY=fc-YOUR-API-KEY
|
2
apps/go-sdk/firecrawl/.gitignore
vendored
Normal file
2
apps/go-sdk/firecrawl/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
.env
|
||||
vendor
|
21
apps/go-sdk/firecrawl/LICENSE
Normal file
21
apps/go-sdk/firecrawl/LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Sideguide Technologies Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
189
apps/go-sdk/firecrawl/README.md
Normal file
189
apps/go-sdk/firecrawl/README.md
Normal file
@ -0,0 +1,189 @@
|
||||
# Firecrawl Go SDK
|
||||
|
||||
The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
|
||||
|
||||
## Installation
|
||||
|
||||
To install the Firecrawl Go SDK, run:
|
||||
|
||||
```bash
|
||||
go get github.com/mendableai/firecrawl
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
|
||||
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to `firecrawl.NewFirecrawlApp`.
|
||||
|
||||
|
||||
Here's an example of how to use the SDK with error handling:
|
||||
|
||||
```go
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/mendableai/firecrawl/firecrawl"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Initialize the FirecrawlApp with your API key
|
||||
app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
|
||||
}
|
||||
|
||||
// Scrape a single URL
|
||||
url := "https://mendable.ai"
|
||||
scrapedData, err := app.ScrapeURL(url, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Error occurred while scraping: %v", err)
|
||||
}
|
||||
fmt.Println(scrapedData)
|
||||
|
||||
// Crawl a website
|
||||
crawlUrl := "https://mendable.ai"
|
||||
params := map[string]any{
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
|
||||
crawlResult, err := app.CrawlURL(crawlUrl, params, true, 2, "")
|
||||
if err != nil {
|
||||
log.Fatalf("Error occurred while crawling: %v", err)
|
||||
}
|
||||
fmt.Println(crawlResult)
|
||||
}
|
||||
```
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL and optional parameters, and returns the scraped data as a `*FirecrawlDocument`.
|
||||
|
||||
```go
|
||||
url := "https://mendable.ai"
|
||||
scrapedData, err := app.ScrapeURL(url, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to scrape URL: %v", err)
|
||||
}
|
||||
fmt.Println(scrapedData)
|
||||
```
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. Here is how to use it:
|
||||
|
||||
```go
|
||||
jsonSchema := map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"top": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"points": map[string]string{"type": "number"},
|
||||
"by": map[string]string{"type": "string"},
|
||||
"commentsURL": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "points", "by", "commentsURL"},
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News",
|
||||
},
|
||||
},
|
||||
"required": []string{"top"},
|
||||
}
|
||||
|
||||
llmExtractionParams := map[string]any{
|
||||
"extractorOptions": firecrawl.ExtractorOptions{
|
||||
ExtractionSchema: jsonSchema,
|
||||
},
|
||||
}
|
||||
|
||||
scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to perform LLM extraction: %v", err)
|
||||
}
|
||||
fmt.Println(scrapeResult)
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
To search the web, get the most relevant results, scrape each page, and return the markdown, use the `Search` method. The method takes the query and optional parameters, and returns the search results.
|
||||
|
||||
|
||||
```go
|
||||
query := "what is mendable?"
|
||||
searchResult, err := app.Search(query, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to search: %v", err)
|
||||
}
|
||||
fmt.Println(searchResult)
|
||||
```
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website, use the `CrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
||||
|
||||
```go
|
||||
crawlParams := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
"includes": []string{}, // leave empty for all pages
|
||||
"limit": 1000,
|
||||
},
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to crawl URL: %v", err)
|
||||
}
|
||||
fmt.Println(crawlResult)
|
||||
```
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
||||
```go
|
||||
status, err := app.CheckCrawlStatus(jobId)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to check crawl status: %v", err)
|
||||
}
|
||||
fmt.Println(status)
|
||||
```
|
||||
|
||||
### Canceling a Crawl Job
|
||||
To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job.
|
||||
|
||||
```go
|
||||
canceled, err := app.CancelCrawlJob(jobId)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to cancel crawl job: %v", err)
|
||||
}
|
||||
fmt.Println(canceled)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and returns them as Go `error` values. If an error occurs during a request, the method returns a non-nil error with a descriptive message.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
|
||||
|
||||
## License
|
||||
|
||||
The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
|
||||
|
||||
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
|
584
apps/go-sdk/firecrawl/firecrawl.go
Normal file
584
apps/go-sdk/firecrawl/firecrawl.go
Normal file
@ -0,0 +1,584 @@
|
||||
// Package firecrawl provides a client for interacting with the Firecrawl API.
|
||||
package firecrawl
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FirecrawlDocumentMetadata describes the metadata attached to a Firecrawl
// document. Every field is tagged omitempty, so zero values are left out of
// the encoded JSON.
type FirecrawlDocumentMetadata struct {
	// General page metadata.
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	Keywords    string `json:"keywords,omitempty"`
	Robots      string `json:"robots,omitempty"`

	// Fields serialized with an "og" key prefix.
	OGTitle           string   `json:"ogTitle,omitempty"`
	OGDescription     string   `json:"ogDescription,omitempty"`
	OGURL             string   `json:"ogUrl,omitempty"`
	OGImage           string   `json:"ogImage,omitempty"`
	OGAudio           string   `json:"ogAudio,omitempty"`
	OGDeterminer      string   `json:"ogDeterminer,omitempty"`
	OGLocale          string   `json:"ogLocale,omitempty"`
	OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"`
	OGSiteName        string   `json:"ogSiteName,omitempty"`
	OGVideo           string   `json:"ogVideo,omitempty"`

	// Fields serialized with a "dc" or "dcterms" key prefix.
	DCTermsCreated  string `json:"dctermsCreated,omitempty"`
	DCDateCreated   string `json:"dcDateCreated,omitempty"`
	DCDate          string `json:"dcDate,omitempty"`
	DCTermsType     string `json:"dctermsType,omitempty"`
	DCType          string `json:"dcType,omitempty"`
	DCTermsAudience string `json:"dctermsAudience,omitempty"`
	DCTermsSubject  string `json:"dctermsSubject,omitempty"`
	DCSubject       string `json:"dcSubject,omitempty"`
	DCDescription   string `json:"dcDescription,omitempty"`
	DCTermsKeywords string `json:"dctermsKeywords,omitempty"`

	// Publication details.
	ModifiedTime   string `json:"modifiedTime,omitempty"`
	PublishedTime  string `json:"publishedTime,omitempty"`
	ArticleTag     string `json:"articleTag,omitempty"`
	ArticleSection string `json:"articleSection,omitempty"`

	// Source URL plus the status code and error reported for the page.
	SourceURL      string `json:"sourceURL,omitempty"`
	PageStatusCode int    `json:"pageStatusCode,omitempty"`
	PageError      string `json:"pageError,omitempty"`
}
|
||||
|
||||
// FirecrawlDocument represents a document in Firecrawl
|
||||
type FirecrawlDocument struct {
|
||||
ID string `json:"id,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Content string `json:"content"`
|
||||
Markdown string `json:"markdown,omitempty"`
|
||||
HTML string `json:"html,omitempty"`
|
||||
LLMExtraction map[string]any `json:"llm_extraction,omitempty"`
|
||||
CreatedAt *time.Time `json:"createdAt,omitempty"`
|
||||
UpdatedAt *time.Time `json:"updatedAt,omitempty"`
|
||||
Type string `json:"type,omitempty"`
|
||||
Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
|
||||
ChildrenLinks []string `json:"childrenLinks,omitempty"`
|
||||
Provider string `json:"provider,omitempty"`
|
||||
Warning string `json:"warning,omitempty"`
|
||||
Index int `json:"index,omitempty"`
|
||||
}
|
||||
|
||||
// ExtractorOptions carries the extraction settings sent with a scrape request.
type ExtractorOptions struct {
	// Mode selects the extraction mode; ScrapeURL fills in "llm-extraction"
	// when it is left empty.
	Mode string `json:"mode,omitempty"`
	// ExtractionPrompt is an optional prompt forwarded to the API.
	ExtractionPrompt string `json:"extractionPrompt,omitempty"`
	// ExtractionSchema is the schema forwarded to the API; any
	// JSON-encodable value is accepted.
	ExtractionSchema any `json:"extractionSchema,omitempty"`
}
|
||||
|
||||
// ScrapeResponse represents the response for scraping operations
|
||||
type ScrapeResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Data *FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// SearchResponse represents the response for searching operations
|
||||
type SearchResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// CrawlResponse represents the response for crawling operations
|
||||
type CrawlResponse struct {
|
||||
Success bool `json:"success"`
|
||||
JobID string `json:"jobId,omitempty"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// JobStatusResponse represents the response for checking crawl job status
|
||||
type JobStatusResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Status string `json:"status"`
|
||||
Current int `json:"current,omitempty"`
|
||||
CurrentURL string `json:"current_url,omitempty"`
|
||||
CurrentStep string `json:"current_step,omitempty"`
|
||||
Total int `json:"total,omitempty"`
|
||||
JobID string `json:"jobId,omitempty"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
PartialData []*FirecrawlDocument `json:"partial_data,omitempty"`
|
||||
}
|
||||
|
||||
// CancelCrawlJobResponse is the JSON envelope decoded from a crawl
// cancellation request.
type CancelCrawlJobResponse struct {
	// Success reports whether the API flagged the request as successful.
	Success bool `json:"success"`
	// Status is the job status string reported after the cancellation.
	Status string `json:"status"`
}
|
||||
|
||||
// requestOptions collects the per-request tuning knobs used by makeRequest.
type requestOptions struct {
	// retries is the number of attempts makeRequest performs.
	retries int
	// backoff is the base delay between attempts, in milliseconds.
	backoff int
}
|
||||
|
||||
// requestOption is a functional option type for requestOptions.
|
||||
type requestOption func(*requestOptions)
|
||||
|
||||
// newRequestOptions creates a new requestOptions instance with the provided options.
|
||||
//
|
||||
// Parameters:
|
||||
// - opts: Optional request options.
|
||||
//
|
||||
// Returns:
|
||||
// - *requestOptions: A new instance of requestOptions with the provided options.
|
||||
func newRequestOptions(opts ...requestOption) *requestOptions {
|
||||
options := &requestOptions{retries: 1}
|
||||
for _, opt := range opts {
|
||||
opt(options)
|
||||
}
|
||||
return options
|
||||
}
|
||||
|
||||
// withRetries sets the number of retries for a request.
|
||||
//
|
||||
// Parameters:
|
||||
// - retries: The number of retries to be performed.
|
||||
//
|
||||
// Returns:
|
||||
// - requestOption: A functional option that sets the number of retries for a request.
|
||||
func withRetries(retries int) requestOption {
|
||||
return func(opts *requestOptions) {
|
||||
opts.retries = retries
|
||||
}
|
||||
}
|
||||
|
||||
// withBackoff sets the backoff interval for a request.
|
||||
//
|
||||
// Parameters:
|
||||
// - backoff: The backoff interval (in milliseconds) to be used for retries.
|
||||
//
|
||||
// Returns:
|
||||
// - requestOption: A functional option that sets the backoff interval for a request.
|
||||
func withBackoff(backoff int) requestOption {
|
||||
return func(opts *requestOptions) {
|
||||
opts.backoff = backoff
|
||||
}
|
||||
}
|
||||
|
||||
// FirecrawlApp represents a client for the Firecrawl API.
|
||||
type FirecrawlApp struct {
|
||||
APIKey string
|
||||
APIURL string
|
||||
Client *http.Client
|
||||
}
|
||||
|
||||
// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL.
|
||||
// If the API key or API URL is not provided, it attempts to retrieve them from environment variables.
|
||||
// If the API key is still not found, it returns an error.
|
||||
//
|
||||
// Parameters:
|
||||
// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable.
|
||||
// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev".
|
||||
//
|
||||
// Returns:
|
||||
// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL.
|
||||
// - error: An error if the API key is not provided or retrieved.
|
||||
func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) {
|
||||
if apiKey == "" {
|
||||
apiKey = os.Getenv("FIRECRAWL_API_KEY")
|
||||
if apiKey == "" {
|
||||
return nil, fmt.Errorf("no API key provided")
|
||||
}
|
||||
}
|
||||
|
||||
if apiURL == "" {
|
||||
apiURL = os.Getenv("FIRECRAWL_API_URL")
|
||||
if apiURL == "" {
|
||||
apiURL = "https://api.firecrawl.dev"
|
||||
}
|
||||
}
|
||||
|
||||
client := &http.Client{
|
||||
Timeout: 60 * time.Second,
|
||||
}
|
||||
|
||||
return &FirecrawlApp{
|
||||
APIKey: apiKey,
|
||||
APIURL: apiURL,
|
||||
Client: client,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ScrapeURL scrapes the content of the specified URL using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - url: The URL to be scraped.
|
||||
// - params: Optional parameters for the scrape request, including extractor options for LLM extraction.
|
||||
//
|
||||
// Returns:
|
||||
// - *FirecrawlDocument: The scraped document data.
|
||||
// - error: An error if the scrape request fails.
|
||||
func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
scrapeBody := map[string]any{"url": url}
|
||||
|
||||
if params != nil {
|
||||
if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok {
|
||||
if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok {
|
||||
extractorOptions.ExtractionSchema = schema.schema()
|
||||
}
|
||||
if extractorOptions.Mode == "" {
|
||||
extractorOptions.Mode = "llm-extraction"
|
||||
}
|
||||
scrapeBody["extractorOptions"] = extractorOptions
|
||||
}
|
||||
|
||||
for key, value := range params {
|
||||
if key != "extractorOptions" {
|
||||
scrapeBody[key] = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/scrape", app.APIURL),
|
||||
scrapeBody,
|
||||
headers,
|
||||
"scrape URL",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var scrapeResponse ScrapeResponse
|
||||
err = json.Unmarshal(resp, &scrapeResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if scrapeResponse.Success {
|
||||
return scrapeResponse.Data, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("failed to scrape URL")
|
||||
}
|
||||
|
||||
// Search performs a search query using the Firecrawl API and returns the search results.
|
||||
//
|
||||
// Parameters:
|
||||
// - query: The search query string.
|
||||
// - params: Optional parameters for the search request.
|
||||
//
|
||||
// Returns:
|
||||
// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results.
|
||||
// - error: An error if the search request fails.
|
||||
func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
searchBody := map[string]any{"query": query}
|
||||
for k, v := range params {
|
||||
searchBody[k] = v
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/search", app.APIURL),
|
||||
searchBody,
|
||||
headers,
|
||||
"search",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var searchResponse SearchResponse
|
||||
err = json.Unmarshal(resp, &searchResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if searchResponse.Success {
|
||||
return searchResponse.Data, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("failed to search")
|
||||
}
|
||||
|
||||
// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - url: The URL to crawl.
|
||||
// - params: Optional parameters for the crawl request.
|
||||
// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning.
|
||||
// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true.
|
||||
// - idempotencyKey: An optional idempotency key to ensure the request is idempotent.
|
||||
//
|
||||
// Returns:
|
||||
// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true.
|
||||
// - error: An error if the crawl request fails.
|
||||
func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
|
||||
headers := app.prepareHeaders(idempotencyKey)
|
||||
crawlBody := map[string]any{"url": url}
|
||||
for k, v := range params {
|
||||
crawlBody[k] = v
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/crawl", app.APIURL),
|
||||
crawlBody,
|
||||
headers,
|
||||
"start crawl job",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var crawlResponse CrawlResponse
|
||||
err = json.Unmarshal(resp, &crawlResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if waitUntilDone {
|
||||
return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval)
|
||||
}
|
||||
|
||||
if crawlResponse.JobID == "" {
|
||||
return nil, fmt.Errorf("failed to get job ID")
|
||||
}
|
||||
|
||||
return crawlResponse.JobID, nil
|
||||
}
|
||||
|
||||
// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to check.
|
||||
//
|
||||
// Returns:
|
||||
// - *JobStatusResponse: The status of the crawl job.
|
||||
// - error: An error if the crawl status check request fails.
|
||||
func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodGet,
|
||||
fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"check crawl status",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var jobStatusResponse JobStatusResponse
|
||||
err = json.Unmarshal(resp, &jobStatusResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &jobStatusResponse, nil
|
||||
}
|
||||
|
||||
// CancelCrawlJob cancels a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to cancel.
|
||||
//
|
||||
// Returns:
|
||||
// - string: The status of the crawl job after cancellation.
|
||||
// - error: An error if the crawl job cancellation request fails.
|
||||
func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodDelete,
|
||||
fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"cancel crawl job",
|
||||
)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var cancelCrawlJobResponse CancelCrawlJobResponse
|
||||
err = json.Unmarshal(resp, &cancelCrawlJobResponse)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return cancelCrawlJobResponse.Status, nil
|
||||
}
|
||||
|
||||
// prepareHeaders prepares the headers for an HTTP request.
|
||||
//
|
||||
// Parameters:
|
||||
// - idempotencyKey: A string representing the idempotency key to be included in the headers.
|
||||
// If the idempotency key is an empty string, it will not be included in the headers.
|
||||
//
|
||||
// Returns:
|
||||
// - map[string]string: A map containing the headers for the HTTP request.
|
||||
func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
|
||||
headers := map[string]string{
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": fmt.Sprintf("Bearer %s", app.APIKey),
|
||||
}
|
||||
if idempotencyKey != "" {
|
||||
headers["x-idempotency-key"] = idempotencyKey
|
||||
}
|
||||
return headers
|
||||
}
|
||||
|
||||
// makeRequest makes a request to the specified URL with the provided method, data, headers, and options.
|
||||
//
|
||||
// Parameters:
|
||||
// - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE").
|
||||
// - url: The URL to send the request to.
|
||||
// - data: The data to be sent in the request body.
|
||||
// - headers: The headers to be included in the request.
|
||||
// - action: A string describing the action being performed.
|
||||
// - opts: Optional request options.
|
||||
//
|
||||
// Returns:
|
||||
// - []byte: The response body from the request.
|
||||
// - error: An error if the request fails.
|
||||
func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) {
|
||||
var body []byte
|
||||
var err error
|
||||
if data != nil {
|
||||
body, err = json.Marshal(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(method, url, bytes.NewBuffer(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for key, value := range headers {
|
||||
req.Header.Set(key, value)
|
||||
}
|
||||
|
||||
var resp *http.Response
|
||||
options := newRequestOptions(opts...)
|
||||
for i := 0; i < options.retries; i++ {
|
||||
resp, err = app.Client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 502 {
|
||||
break
|
||||
}
|
||||
|
||||
time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond)
|
||||
}
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
statusCode := resp.StatusCode
|
||||
if statusCode != 200 {
|
||||
return nil, app.handleError(statusCode, respBody, action)
|
||||
}
|
||||
|
||||
return respBody, nil
|
||||
}
|
||||
|
||||
// monitorJobStatus monitors the status of a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to monitor.
|
||||
// - headers: The headers to be included in the request.
|
||||
// - pollInterval: The interval (in seconds) at which to poll the job status.
|
||||
//
|
||||
// Returns:
|
||||
// - []*FirecrawlDocument: The crawl result if the job is completed.
|
||||
// - error: An error if the crawl status check request fails.
|
||||
func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) {
|
||||
attempts := 0
|
||||
for {
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodGet,
|
||||
fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"check crawl status",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var statusData JobStatusResponse
|
||||
err = json.Unmarshal(resp, &statusData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status := statusData.Status
|
||||
if status == "" {
|
||||
return nil, fmt.Errorf("invalid status in response")
|
||||
}
|
||||
|
||||
if status == "completed" {
|
||||
if statusData.Data != nil {
|
||||
return statusData.Data, nil
|
||||
}
|
||||
attempts++
|
||||
if attempts > 3 {
|
||||
return nil, fmt.Errorf("crawl job completed but no data was returned")
|
||||
}
|
||||
} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" {
|
||||
pollInterval = max(pollInterval, 2)
|
||||
time.Sleep(time.Duration(pollInterval) * time.Second)
|
||||
} else {
|
||||
return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleError handles errors returned by the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - resp: The HTTP response object.
|
||||
// - body: The response body from the HTTP response.
|
||||
// - action: A string describing the action being performed.
|
||||
//
|
||||
// Returns:
|
||||
// - error: An error describing the failure reason.
|
||||
func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error {
|
||||
var errorData map[string]any
|
||||
err := json.Unmarshal(body, &errorData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse error response: %v", err)
|
||||
}
|
||||
|
||||
errorMessage, _ := errorData["error"].(string)
|
||||
if errorMessage == "" {
|
||||
errorMessage = "No additional error details provided."
|
||||
}
|
||||
|
||||
var message string
|
||||
switch statusCode {
|
||||
case 402:
|
||||
message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
|
||||
case 408:
|
||||
message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
|
||||
case 409:
|
||||
message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
|
||||
case 500:
|
||||
message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
|
||||
default:
|
||||
message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage)
|
||||
}
|
||||
|
||||
return fmt.Errorf(message)
|
||||
}
|
292
apps/go-sdk/firecrawl/firecrawl_test.go
Normal file
292
apps/go-sdk/firecrawl/firecrawl_test.go
Normal file
@ -0,0 +1,292 @@
|
||||
package firecrawl
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/joho/godotenv"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// Test configuration, populated from the environment in init.
var (
	// API_URL is read from the API_URL environment variable.
	API_URL string
	// TEST_API_KEY is read from the TEST_API_KEY environment variable.
	TEST_API_KEY string
)
|
||||
|
||||
func init() {
|
||||
err := godotenv.Load("../.env")
|
||||
if err != nil {
|
||||
log.Fatalf("Error loading .env file: %v", err)
|
||||
}
|
||||
API_URL = os.Getenv("API_URL")
|
||||
TEST_API_KEY = os.Getenv("TEST_API_KEY")
|
||||
}
|
||||
|
||||
func TestNoAPIKey(t *testing.T) {
|
||||
_, err := NewFirecrawlApp("", API_URL)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no API key provided")
|
||||
}
|
||||
|
||||
func TestScrapeURLInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.ScrapeURL("https://firecrawl.dev", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestBlocklistedURL(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.ScrapeURL("https://facebook.com/fake-test", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
}
|
||||
|
||||
func TestScrapeURLE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
assert.NotEqual(t, response.Markdown, "")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
assert.Equal(t, response.HTML, "")
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"pageOptions": map[string]any{
|
||||
"includeHtml": true,
|
||||
},
|
||||
}
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", params)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
assert.Contains(t, response.Markdown, "_Roast_")
|
||||
assert.Contains(t, response.HTML, "<h1")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil)
|
||||
time.Sleep(6 * time.Second) // wait for 6 seconds
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestCrawlURLInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestShouldReturnErrorForBlocklistedURL(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.CrawlURL("https://twitter.com/fake-test", nil, false, 2, "")
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
|
||||
}
|
||||
|
||||
func TestCrawlURLWaitForCompletionE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, "")
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
data, ok := response.([]*FirecrawlDocument)
|
||||
assert.True(t, ok)
|
||||
assert.Greater(t, len(data), 0)
|
||||
assert.Contains(t, data[0].Content, "_Roast_")
|
||||
}
|
||||
|
||||
func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
uniqueIdempotencyKey := uuid.New().String()
|
||||
params := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, uniqueIdempotencyKey)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
data, ok := response.([]*FirecrawlDocument)
|
||||
assert.True(t, ok)
|
||||
assert.Greater(t, len(data), 0)
|
||||
assert.Contains(t, data[0].Content, "_Roast_")
|
||||
|
||||
_, err = app.CrawlURL("https://firecrawl.dev", params, true, 2, uniqueIdempotencyKey)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used")
|
||||
}
|
||||
|
||||
func TestCheckCrawlStatusE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
response, err := app.CrawlURL("https://firecrawl.dev", params, false, 2, "")
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
jobID, ok := response.(string)
|
||||
assert.True(t, ok)
|
||||
assert.NotEqual(t, "", jobID)
|
||||
|
||||
time.Sleep(30 * time.Second) // wait for 30 seconds
|
||||
|
||||
statusResponse, err := app.CheckCrawlStatus(jobID)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, statusResponse)
|
||||
|
||||
assert.Equal(t, "completed", statusResponse.Status)
|
||||
assert.Greater(t, len(statusResponse.Data), 0)
|
||||
}
|
||||
|
||||
func TestSearchE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.Search("test query", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Greater(t, len(response), 2)
|
||||
assert.NotEqual(t, response[0].Content, "")
|
||||
}
|
||||
|
||||
func TestSearchInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.Search("test query", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during search: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestLLMExtraction(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"extractorOptions": ExtractorOptions{
|
||||
Mode: "llm-extraction",
|
||||
ExtractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
ExtractionSchema: map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"company_mission": map[string]string{"type": "string"},
|
||||
"supports_sso": map[string]string{"type": "boolean"},
|
||||
"is_open_source": map[string]string{"type": "boolean"},
|
||||
},
|
||||
"required": []string{"company_mission", "supports_sso", "is_open_source"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
response, err := app.ScrapeURL("https://mendable.ai", params)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.LLMExtraction, "company_mission")
|
||||
assert.IsType(t, true, response.LLMExtraction["supports_sso"])
|
||||
assert.IsType(t, true, response.LLMExtraction["is_open_source"])
|
||||
}
|
||||
|
||||
func TestCancelCrawlJobInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.CancelCrawlJob("test query")
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during cancel crawl job: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestCancelNonExistingCrawlJob(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
jobID := uuid.New().String()
|
||||
_, err = app.CancelCrawlJob(jobID)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Job not found")
|
||||
}
|
||||
|
||||
func TestCancelCrawlJobE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
jobID, ok := response.(string)
|
||||
assert.True(t, ok)
|
||||
assert.NotEqual(t, "", jobID)
|
||||
|
||||
status, err := app.CancelCrawlJob(jobID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "cancelled", status)
|
||||
}
|
15
apps/go-sdk/firecrawl/go.mod
Normal file
15
apps/go-sdk/firecrawl/go.mod
Normal file
@ -0,0 +1,15 @@
|
||||
module github.com/mendableai/firecrawl-go
|
||||
|
||||
go 1.22.5
|
||||
|
||||
require (
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/stretchr/testify v1.9.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
14
apps/go-sdk/firecrawl/go.sum
Normal file
14
apps/go-sdk/firecrawl/go.sum
Normal file
@ -0,0 +1,14 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
@ -1,16 +1,16 @@
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import FirecrawlApp from '@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
import FirecrawlApp from './firecrawl/src/index'; //'@mendable/firecrawl-js';
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
console.log(scrapeResult.data.content)
|
||||
|
||||
if (scrapeResult.data) {
|
||||
console.log(scrapeResult.data.markdown)
|
||||
}
|
||||
|
||||
// Crawl a website:
|
||||
const idempotencyKey = uuidv4(); // optional
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId = await crawlResult['jobId'];
|
||||
@ -19,67 +19,15 @@ console.log(jobId);
|
||||
let job;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId);
|
||||
if (job.status == 'completed') {
|
||||
if (job.status === 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
console.log(job.data[0].content);
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
console.log(searchResult)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
if (job.data) {
|
||||
console.log(job.data[0].markdown);
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
const mapResult = await app.map('https://firecrawl.dev');
|
||||
console.log(mapResult)
|
||||
|
@ -1,5 +1,5 @@
|
||||
import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
import FirecrawlApp from './firecrawl/src/index' //'@mendable/firecrawl-js';
|
||||
import { CrawlStatusResponse } from './firecrawl/src/index';
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
@ -7,7 +7,7 @@ const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
|
||||
if (scrapeResult.data) {
|
||||
console.log(scrapeResult.data.content)
|
||||
console.log(scrapeResult.data.markdown)
|
||||
}
|
||||
|
||||
// Crawl a website:
|
||||
@ -17,9 +17,9 @@ console.log(crawlResult)
|
||||
const jobId: string = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job: JobStatusResponse;
|
||||
let job: CrawlStatusResponse;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId);
|
||||
job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
|
||||
if (job.status === 'completed') {
|
||||
break;
|
||||
}
|
||||
@ -27,66 +27,8 @@ while (true) {
|
||||
}
|
||||
|
||||
if (job.data) {
|
||||
console.log(job.data[0].content);
|
||||
}
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
}
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
console.log(job.data[0].markdown);
|
||||
}
|
||||
|
||||
const mapResult = await app.map('https://firecrawl.dev');
|
||||
console.log(mapResult)
|
||||
|
85
apps/js-sdk/exampleV0.js
Normal file
85
apps/js-sdk/exampleV0.js
Normal file
@ -0,0 +1,85 @@
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import FirecrawlApp from '@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
console.log(scrapeResult.data.content)
|
||||
|
||||
// Crawl a website:
|
||||
const idempotencyKey = uuidv4(); // optional
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId);
|
||||
if (job.status == 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
console.log(job.data[0].content);
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
console.log(searchResult)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
95
apps/js-sdk/exampleV0.ts
Normal file
95
apps/js-sdk/exampleV0.ts
Normal file
@ -0,0 +1,95 @@
|
||||
import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY", version: "v0"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev') as ScrapeResponseV0;
|
||||
|
||||
if (scrapeResult.data) {
|
||||
console.log(scrapeResult.data.content)
|
||||
}
|
||||
|
||||
// Crawl a website:
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId: string = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job: CrawlStatusResponseV0;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0;
|
||||
if (job.status === 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
if (job.data) {
|
||||
console.log(job.data[0].content);
|
||||
}
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query) as SearchResponseV0;
|
||||
if (searchResult.data) {
|
||||
console.log(searchResult.data[0].content)
|
||||
}
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data[0].llm_extraction);
|
||||
}
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data[0].llm_extraction);
|
||||
}
|
||||
|
2
apps/js-sdk/firecrawl/.gitignore
vendored
2
apps/js-sdk/firecrawl/.gitignore
vendored
@ -128,3 +128,5 @@ dist
|
||||
.yarn/build-state.yml
|
||||
.yarn/install-state.gz
|
||||
.pnp.*
|
||||
|
||||
build
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user