diff --git a/.github/archive/js-sdk.yml b/.github/archive/js-sdk.yml
index c84bb8b1..7ef096d4 100644
--- a/.github/archive/js-sdk.yml
+++ b/.github/archive/js-sdk.yml
@@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
- LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
- HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:
diff --git a/.github/archive/python-sdk.yml b/.github/archive/python-sdk.yml
index 27449888..bdefeab6 100644
--- a/.github/archive/python-sdk.yml
+++ b/.github/archive/python-sdk.yml
@@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
- LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
- HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:
diff --git a/.github/archive/rust-sdk.yml b/.github/archive/rust-sdk.yml
index 62deeaab..792e06c2 100644
--- a/.github/archive/rust-sdk.yml
+++ b/.github/archive/rust-sdk.yml
@@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
- LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
- HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8a9a74cc..ef7d1cba 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,7 +12,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
- LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -25,7 +24,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
- HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
diff --git a/.github/workflows/deploy-image-staging.yml b/.github/workflows/deploy-image-staging.yml
new file mode 100644
index 00000000..e74aba9a
--- /dev/null
+++ b/.github/workflows/deploy-image-staging.yml
@@ -0,0 +1,32 @@
+name: STAGING Deploy Images to GHCR
+
+env:
+ DOTNET_VERSION: '6.0.x'
+
+on:
+ push:
+ branches:
+ - mog/webscraper-refactor
+ workflow_dispatch:
+
+jobs:
+ push-app-image:
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ working-directory: './apps/api'
+ steps:
+ - name: 'Checkout GitHub Action'
+ uses: actions/checkout@main
+
+ - name: 'Login to GitHub Container Registry'
+ uses: docker/login-action@v1
+ with:
+ registry: ghcr.io
+ username: ${{github.actor}}
+ password: ${{secrets.GITHUB_TOKEN}}
+
+ - name: 'Build Inventory Image'
+ run: |
+ docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest
+ docker push ghcr.io/mendableai/firecrawl-staging:latest
\ No newline at end of file
diff --git a/.github/workflows/deploy-image.yml b/.github/workflows/deploy-image.yml
index 02eb4cc5..c18c7a71 100644
--- a/.github/workflows/deploy-image.yml
+++ b/.github/workflows/deploy-image.yml
@@ -2,6 +2,7 @@ name: Deploy Images to GHCR
env:
DOTNET_VERSION: '6.0.x'
+ SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
on:
push:
@@ -28,5 +29,5 @@ jobs:
- name: 'Build Inventory Image'
run: |
- docker build . --tag ghcr.io/mendableai/firecrawl:latest
+ docker build . --tag ghcr.io/mendableai/firecrawl:latest --secret id=SENTRY_AUTH_TOKEN
docker push ghcr.io/mendableai/firecrawl:latest
\ No newline at end of file
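For context on the `--secret` flag added above: with a recent BuildKit, `--secret id=SENTRY_AUTH_TOKEN` with no explicit `src`/`env` reads the value from the `SENTRY_AUTH_TOKEN` environment variable the workflow now exports, and the Dockerfile consumes it only inside a `RUN --mount=type=secret` step, so the token never lands in an image layer. A minimal sketch of the flow, assuming a recent Docker/Buildx; the build command inside the Dockerfile comment is illustrative, not the repo's actual one:

```bash
# Workflow side: the env var populated from secrets.SENTRY_AUTH_TOKEN is forwarded
# to the build as a BuildKit secret keyed by the same id.
export SENTRY_AUTH_TOKEN="<token from the job env>"
docker build . \
  --secret id=SENTRY_AUTH_TOKEN \
  --tag ghcr.io/mendableai/firecrawl:latest

# Dockerfile side: the secret is mounted at /run/secrets/<id> for that RUN step only, e.g.
#   RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
#       SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)" pnpm run build
```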
diff --git a/.gitignore b/.gitignore
index 4d35cb4a..fc527490 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,6 @@ apps/js-sdk/firecrawl/dist
/examples/crm_lead_enrichment/crm_lead_enrichment_env
/.venv
/examples/claude_web_crawler/firecrawl_env
+/examples/haiku_web_crawler/firecrawl_env
+/examples/sonnet_web_crawler/firecrawl_env
+/examples/internal_link_assitant/firecrawl_env
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2a843aa8..b8c1f0a5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,7 +6,7 @@ If you're contributing, note that the process is similar to other open source re
## Running the project locally
-First, start by installing dependencies
+First, start by installing dependencies:
1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
2. pnpm [instructions](https://pnpm.io/installation)
@@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= @
-LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
@@ -56,12 +55,13 @@ POSTHOG_HOST= # set if you'd like to send posthog events like job logs
First, install the dependencies using pnpm.
```bash
-pnpm install
+# cd apps/api # to make sure you're in the right folder
+pnpm install # make sure you have pnpm version 9+!
```
### Running the project
-You're going to need to open 3 terminals.
+You're going to need to open 3 terminals. Here is [a video guide accurate as of Oct 2024](https://youtu.be/LHqg5QNI4UY).
### Terminal 1 - setting up redis
@@ -77,6 +77,7 @@ Now, navigate to the apps/api/ directory and run:
```bash
pnpm run workers
+# if you are going to use the [llm-extract feature](https://github.com/mendableai/firecrawl/pull/586/), you should also export OPENAI_API_KEY=sk-______
```
This will start the workers who are responsible for processing crawl jobs.
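If you plan to use the llm-extract path mentioned in the comment above, the minimal wiring is just an environment variable set before starting the workers (a sketch assuming a POSIX shell; substitute a real key):

```bash
cd apps/api
export OPENAI_API_KEY=sk-...   # required by the llm-extract feature
pnpm run workers               # the workers read the key from the environment
```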
diff --git a/README.md b/README.md
index dd8a740a..5cbffc39 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ We provide an easy to use API with our hosted version. You can find the playgrou
Check out the following resources to get started:
- [x] **API**: [Documentation](https://docs.firecrawl.dev/api-reference/introduction)
- [x] **SDKs**: [Python](https://docs.firecrawl.dev/sdks/python), [Node](https://docs.firecrawl.dev/sdks/node), [Go](https://docs.firecrawl.dev/sdks/go), [Rust](https://docs.firecrawl.dev/sdks/rust)
-- [x] **LLM Frameworks**: [Langchain (python)](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/), [Langchain (js)](https://js.langchain.com/docs/integrations/document_loaders/web_loaders/firecrawl), [Llama Index](https://docs.llamaindex.ai/en/latest/examples/data_connectors/WebPageDemo/#using-firecrawl-reader), [Crew.ai](https://docs.crewai.com/), [Composio](https://composio.dev/tools/firecrawl/all), [PraisonAI](https://docs.praison.ai/firecrawl/)
+- [x] **LLM Frameworks**: [Langchain (python)](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/), [Langchain (js)](https://js.langchain.com/docs/integrations/document_loaders/web_loaders/firecrawl), [Llama Index](https://docs.llamaindex.ai/en/latest/examples/data_connectors/WebPageDemo/#using-firecrawl-reader), [Crew.ai](https://docs.crewai.com/), [Composio](https://composio.dev/tools/firecrawl/all), [PraisonAI](https://docs.praison.ai/firecrawl/), [Superinterface](https://superinterface.ai/docs/assistants/functions/firecrawl), [Vectorize](https://docs.vectorize.io/integrations/source-connectors/firecrawl)
- [x] **Low-code Frameworks**: [Dify](https://dify.ai/blog/dify-ai-blog-integrated-with-firecrawl), [Langflow](https://docs.langflow.org/), [Flowise AI](https://docs.flowiseai.com/integrations/langchain/document-loaders/firecrawl), [Cargo](https://docs.getcargo.io/integration/firecrawl), [Pipedream](https://pipedream.com/apps/firecrawl/)
- [x] **Others**: [Zapier](https://zapier.com/apps/firecrawl/integrations), [Pabbly Connect](https://www.pabbly.com/connect/integrations/firecrawl/)
- [ ] Want an SDK or Integration? Let us know by opening an issue.
@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Actions**: click, scroll, input, wait and more before extracting data
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
}'
```
+### Batch Scraping Multiple URLs (New)
+
+You can now batch scrape multiple URLs at the same time. It works very similarly to the /crawl endpoint: you submit a batch scrape job and get back a job ID that you can use to check the status of the batch scrape.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
+ -H 'Content-Type: application/json' \
+ -H 'Authorization: Bearer YOUR_API_KEY' \
+ -d '{
+ "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
+ "formats" : ["markdown", "html"]
+ }'
+```
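The returned job ID can then be polled for progress. The status route below mirrors the /crawl pattern and should be confirmed against the API reference; it is included only to complete the example:

```bash
curl -X GET https://api.firecrawl.dev/v1/batch/scrape/YOUR_JOB_ID \
  -H 'Authorization: Bearer YOUR_API_KEY'
```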
### Search (v0) (Beta)
@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
scrapeOptions: {
formats: ['markdown', 'html'],
}
-} as CrawlParams, true, 30) as CrawlStatusResponse;
+} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
if (crawlResponse) {
console.log(crawlResponse)
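On the `as` → `satisfies` change above: `satisfies` checks the literal against `CrawlParams` without asserting away mismatches or widening the inferred type, whereas `as` silences the checker. A standalone sketch, assuming the types are exported from the SDK package as in the README's full example:

```typescript
import type { CrawlParams } from '@mendable/firecrawl-js';

const params = {
  limit: 100,
  scrapeOptions: {
    formats: ['markdown', 'html'],
  },
} satisfies CrawlParams;
// A typo such as `formast` inside scrapeOptions is a compile error with `satisfies`,
// but would typically slip through an `as CrawlParams` assertion.
```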
diff --git a/SELF_HOST.md b/SELF_HOST.md
index 78228485..46e08db9 100644
--- a/SELF_HOST.md
+++ b/SELF_HOST.md
@@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @
-LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
diff --git a/apps/api/.env.example b/apps/api/.env.example
index f3c1dc1b..d54c696d 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -1,5 +1,5 @@
# ===== Required ENVS ======
-NUM_WORKERS_PER_QUEUE=8
+NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true
# ===== Optional ENVS ======
+# SearchApi key. Head to https://searchapi.com/ to get your API key
+SEARCHAPI_API_KEY=
+# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
+SEARCHAPI_ENGINE=
+
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
-SUPABASE_ANON_TOKEN=
-SUPABASE_URL=
+SUPABASE_ANON_TOKEN=
+SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
# Other Optionals
@@ -28,8 +33,6 @@ SCRAPING_BEE_API_KEY=
# add for LLM dependednt features (image alt generation, etc.)
OPENAI_API_KEY=
BULL_AUTH_KEY=@
-# use if you're configuring basic logging with logtail
-LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you'd like to send slack server health status messages
@@ -49,9 +52,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY=
-HYPERDX_API_KEY=
-HDX_NODE_BETA_MODE=1
-
# set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL=
diff --git a/apps/api/.env.local b/apps/api/.env.local
index 17f85935..9fa41498 100644
--- a/apps/api/.env.local
+++ b/apps/api/.env.local
@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
BULL_AUTH_KEY=
LOGTAIL_KEY=
PLAYWRIGHT_MICROSERVICE_URL=
-
+SEARCHAPI_API_KEY=
diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile
index b908f679..adc78927 100644
--- a/apps/api/Dockerfile
+++ b/apps/api/Dockerfile
@@ -19,10 +19,10 @@ RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
# Install Go
FROM golang:1.19 AS go-base
-COPY src/lib/go-html-to-md /app/src/lib/go-html-to-md
+COPY sharedLibs/go-html-to-md /app/sharedLibs/go-html-to-md
# Install Go dependencies and build parser lib
-RUN cd /app/src/lib/go-html-to-md && \
+RUN cd /app/sharedLibs/go-html-to-md && \
go mod tidy && \
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
chmod +x html-to-markdown.so
@@ -30,7 +30,7 @@ RUN cd /app/src/lib/go-html-to-md && \
FROM base
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=build /app /app
-COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src/lib/go-html-to-md/html-to-markdown.so
+COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so
# Start the server by default, this can be overwritten at runtime
EXPOSE 8080
diff --git a/apps/api/jest.setup.js b/apps/api/jest.setup.js
index c158ca42..0b3b09b7 100644
--- a/apps/api/jest.setup.js
+++ b/apps/api/jest.setup.js
@@ -1 +1 @@
-global.fetch = require('jest-fetch-mock');
+// global.fetch = require('jest-fetch-mock');
diff --git a/apps/api/package.json b/apps/api/package.json
index dc62d4c3..53324783 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -32,9 +32,11 @@
"@tsconfig/recommended": "^1.0.3",
"@types/body-parser": "^1.19.2",
"@types/cors": "^2.8.13",
+ "@types/escape-html": "^1.0.4",
"@types/express": "^4.17.17",
"@types/jest": "^29.5.12",
"@types/node": "^20.14.1",
+ "@types/pdf-parse": "^1.1.4",
"body-parser": "^1.20.1",
"express": "^4.18.2",
"jest": "^29.6.3",
@@ -79,6 +81,7 @@
"date-fns": "^3.6.0",
"dotenv": "^16.3.1",
"dotenv-cli": "^7.4.2",
+ "escape-html": "^1.0.3",
"express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2",
"form-data": "^4.0.0",
@@ -93,6 +96,7 @@
"languagedetect": "^2.0.0",
"logsnag": "^1.0.0",
"luxon": "^3.4.3",
+ "marked": "^14.1.2",
"md5": "^2.3.0",
"moment": "^2.29.4",
"mongoose": "^8.4.4",
@@ -115,6 +119,8 @@
"typesense": "^1.5.4",
"unstructured-client": "^0.11.3",
"uuid": "^10.0.0",
+ "winston": "^3.14.2",
+ "winston-transport": "^4.8.0",
"wordpos": "^2.1.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index f07c4ecd..9a1c9a22 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -101,6 +101,9 @@ importers:
dotenv-cli:
specifier: ^7.4.2
version: 7.4.2
+ escape-html:
+ specifier: ^1.0.3
+ version: 1.0.3
express-rate-limit:
specifier: ^7.3.1
version: 7.3.1(express@4.19.2)
@@ -143,6 +146,9 @@ importers:
luxon:
specifier: ^3.4.3
version: 3.4.4
+ marked:
+ specifier: ^14.1.2
+ version: 14.1.2
md5:
specifier: ^2.3.0
version: 2.3.0
@@ -209,6 +215,12 @@ importers:
uuid:
specifier: ^10.0.0
version: 10.0.0
+ winston:
+ specifier: ^3.14.2
+ version: 3.14.2
+ winston-transport:
+ specifier: ^4.8.0
+ version: 4.8.0
wordpos:
specifier: ^2.1.0
version: 2.1.0
@@ -240,6 +252,9 @@ importers:
'@types/cors':
specifier: ^2.8.13
version: 2.8.17
+ '@types/escape-html':
+ specifier: ^1.0.4
+ version: 1.0.4
'@types/express':
specifier: ^4.17.17
version: 4.17.21
@@ -249,6 +264,9 @@ importers:
'@types/node':
specifier: ^20.14.1
version: 20.14.1
+ '@types/pdf-parse':
+ specifier: ^1.1.4
+ version: 1.1.4
body-parser:
specifier: ^1.20.1
version: 1.20.2
@@ -645,6 +663,9 @@ packages:
resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==}
engines: {node: '>=12'}
+ '@dabh/diagnostics@2.0.3':
+ resolution: {integrity: sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==}
+
'@devil7softwares/pos@1.0.2':
resolution: {integrity: sha512-49Ke26+++Ix8C5LChi4uga7aWgMuc5zV1NqjGxXxE7DralZwe+hvUuSJmBDWS+HHZaK9rFzLNdufV4HAvvOxPA==}
engines: {node: '>=0'}
@@ -658,8 +679,8 @@ packages:
engines: {node: '>=16.0.0'}
hasBin: true
- '@grpc/grpc-js@1.10.10':
- resolution: {integrity: sha512-HPa/K5NX6ahMoeBv15njAc/sfF4/jmiXLar9UlC2UfHFKZzsCVLc3wbe7+7qua7w9VPh2/L6EBxyAV7/E8Wftg==}
+ '@grpc/grpc-js@1.12.2':
+ resolution: {integrity: sha512-bgxdZmgTrJZX50OjyVwz3+mNEnCTNkh3cIqGPWVNeW9jX6bn1ZkU80uPd+67/ZpIJIjRQ9qaHCjhavyoWYxumg==}
engines: {node: '>=12.10.0'}
'@grpc/proto-loader@0.7.13':
@@ -903,6 +924,12 @@ packages:
peerDependencies:
'@opentelemetry/api': '>=1.0.0 <1.10.0'
+ '@opentelemetry/core@1.26.0':
+ resolution: {integrity: sha512-1iKxXXE8415Cdv0yjG3G6hQnB5eVEsJce3QaawX8SjDn0mAS0ZM8fAbZZJD4ajvhC15cePvosSCut404KrIIvQ==}
+ engines: {node: '>=14'}
+ peerDependencies:
+ '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
'@opentelemetry/exporter-logs-otlp-http@0.51.1':
resolution: {integrity: sha512-cd6GZ9IqCrmvOJwi1HjRR7o9ihF7xhZTekgxUsoyTsPF+SjKMsLF9ur6HeBYkYhk+YjZ1ken3XUMH47oUTvu8Q==}
engines: {node: '>=14'}
@@ -1317,14 +1344,14 @@ packages:
peerDependencies:
'@opentelemetry/api': '>=1.3.0 <1.9.0'
- '@opentelemetry/propagation-utils@0.30.10':
- resolution: {integrity: sha512-hhTW8pFp9PSyosYzzuUL9rdm7HF97w3OCyElufFHyUnYnKkCBbu8ne2LyF/KSdI/xZ81ubxWZs78hX4S7pLq5g==}
+ '@opentelemetry/propagation-utils@0.30.12':
+ resolution: {integrity: sha512-bgab3q/4dYUutUpQCEaSDa+mLoQJG3vJKeSiGuhM4iZaSpkz8ov0fs1MGil5PfxCo6Hhw3bB3bFYhUtnsfT/Pg==}
engines: {node: '>=14'}
peerDependencies:
'@opentelemetry/api': ^1.0.0
- '@opentelemetry/propagator-aws-xray@1.25.1':
- resolution: {integrity: sha512-soZQdO9EAROMwa9bL2C0VLadbrfRjSA9t7g6X8sL0X1B8V59pzOayYMyTW9qTECn9uuJV98A7qOnJm6KH6yk8w==}
+ '@opentelemetry/propagator-aws-xray@1.26.0':
+ resolution: {integrity: sha512-Sex+JyEZ/xX328TArBqQjh1NZSfNyw5NdASUIi9hnPsnMBMSBaDe7B9JRnXv0swz7niNyAnXa6MY7yOCV76EvA==}
engines: {node: '>=14'}
peerDependencies:
'@opentelemetry/api': '>=1.0.0 <1.10.0'
@@ -1351,14 +1378,14 @@ packages:
peerDependencies:
'@opentelemetry/api': ^1.0.0
- '@opentelemetry/resource-detector-aws@1.5.1':
- resolution: {integrity: sha512-+IUh4gAwJf49vOJM6PIjmgOapRH5zr21ZpFnNU0QZmxRi52AXVhZN7A89pKW6GAQheWnVQLD7iUN87ieYt70tw==}
+ '@opentelemetry/resource-detector-aws@1.7.0':
+ resolution: {integrity: sha512-VxrwUi/9QcVIV+40d/jOKQthfD/E4/ppQ9FsYpDH7qy16cOO5519QOdihCQJYpVNbgDqf6q3hVrCy1f8UuG8YA==}
engines: {node: '>=14'}
peerDependencies:
'@opentelemetry/api': ^1.0.0
- '@opentelemetry/resource-detector-azure@0.2.9':
- resolution: {integrity: sha512-16Z6kyrmszoa7J1uj1kbSAgZuk11K07yEDj6fa3I9XBf8Debi8y4K8ex94kpxbCfEraWagXji3bCWvaq3k4dRg==}
+ '@opentelemetry/resource-detector-azure@0.2.12':
+ resolution: {integrity: sha512-iIarQu6MiCjEEp8dOzmBvCSlRITPFTinFB2oNKAjU6xhx8d7eUcjNOKhBGQTvuCriZrxrEvDaEEY9NfrPQ6uYQ==}
engines: {node: '>=14'}
peerDependencies:
'@opentelemetry/api': ^1.0.0
@@ -1369,8 +1396,8 @@ packages:
peerDependencies:
'@opentelemetry/api': ^1.0.0
- '@opentelemetry/resource-detector-gcp@0.29.10':
- resolution: {integrity: sha512-rm2HKJ9lsdoVvrbmkr9dkOzg3Uk0FksXNxvNBgrCprM1XhMoJwThI5i0h/5sJypISUAJlEeJS6gn6nROj/NpkQ==}
+ '@opentelemetry/resource-detector-gcp@0.29.13':
+ resolution: {integrity: sha512-vdotx+l3Q+89PeyXMgKEGnZ/CwzwMtuMi/ddgD9/5tKZ08DfDGB2Npz9m2oXPHRCjc4Ro6ifMqFlRyzIvgOjhg==}
engines: {node: '>=14'}
peerDependencies:
'@opentelemetry/api': ^1.0.0
@@ -1438,6 +1465,10 @@ packages:
resolution: {integrity: sha512-ZDjMJJQRlyk8A1KZFCc+bCbsyrn1wTwdNt56F7twdfUfnHUZUq77/WfONCj8p72NZOyP7pNTdUWSTYC3GTbuuQ==}
engines: {node: '>=14'}
+ '@opentelemetry/semantic-conventions@1.27.0':
+ resolution: {integrity: sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==}
+ engines: {node: '>=14'}
+
'@opentelemetry/sql-common@0.40.1':
resolution: {integrity: sha512-nSDlnHSqzC3pXn/wZEZVLuAuJ1MYMXPBwtv2qAbCa3847SaHItdE7SzUq/Jtb0KZmh1zfAbNi3AAMjztTT4Ugg==}
engines: {node: '>=14'}
@@ -1574,10 +1605,6 @@ packages:
engines: {node: '>= 10'}
hasBin: true
- '@sentry/core@8.13.0':
- resolution: {integrity: sha512-N9Qg4ZGxZWp8eb2eUUHVVKgjBLtFIjS805nG92s6yJmkvOpKm6mLtcUaT/iDf3Hta6nG+xRkhbE3r+Z4cbXG8w==}
- engines: {node: '>=14.18'}
-
'@sentry/core@8.26.0':
resolution: {integrity: sha512-g/tVmTZD4GNbLFf++hKJfBpcCAtduFEMLnbfa9iT/QEZjlmP+EzY+GsH9bafM5VsNe8DiOUp+kJKWtShzlVdBA==}
engines: {node: '>=14.18'}
@@ -1601,18 +1628,10 @@ packages:
engines: {node: '>=14.18'}
hasBin: true
- '@sentry/types@8.13.0':
- resolution: {integrity: sha512-r63s/H5gvQnQM9tTGBXz2xErUbxZALh4e2Lg/1aHj4zIvGLBjA2z5qWsh6TEZYbpmgAyGShLDr6+rWeUVf9yBQ==}
- engines: {node: '>=14.18'}
-
'@sentry/types@8.26.0':
resolution: {integrity: sha512-zKmh6SWsJh630rpt7a9vP4Cm4m1C2gDTUqUiH565CajCL/4cePpNWYrNwalSqsOSL7B9OrczA1+n6a6XvND+ng==}
engines: {node: '>=14.18'}
- '@sentry/utils@8.13.0':
- resolution: {integrity: sha512-PxV0v9VbGWH9zP37P5w2msLUFDr287nYjoY2XVF+RSolyiTs1CQNI5ZMUO3o4MsSac/dpXxjyrZXQd72t/jRYA==}
- engines: {node: '>=14.18'}
-
'@sentry/utils@8.26.0':
resolution: {integrity: sha512-xvlPU9Hd2BlyT+FhWHGNwnxWqdVRk2AHnDtVcW4Ma0Ri5EwS+uy4Jeik5UkSv8C5RVb9VlxFmS8LN3I1MPJsLw==}
engines: {node: '>=14.18'}
@@ -2013,6 +2032,9 @@ packages:
'@types/cors@2.8.17':
resolution: {integrity: sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==}
+ '@types/escape-html@1.0.4':
+ resolution: {integrity: sha512-qZ72SFTgUAZ5a7Tj6kf2SHLetiH5S6f8G5frB2SPQ3EyF02kxdyBFf4Tz4banE3xCgGnKgWLt//a6VuYHKYJTg==}
+
'@types/express-serve-static-core@4.19.3':
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
@@ -2025,8 +2047,8 @@ packages:
'@types/graceful-fs@4.1.9':
resolution: {integrity: sha512-olP3sd1qOEe5dXTSaFvQG+02VdRXcdytWLAZsAq1PecU8uqQAhkrnbli7DagjtXKW/Bl7YJbUsa8MPcuc8LHEQ==}
- '@types/http-assert@1.5.5':
- resolution: {integrity: sha512-4+tE/lwdAahgZT1g30Jkdm9PzFRde0xwxBNUyRsCitRvCQB90iuA2uJYdUnhnANRcqGXaWOGY4FEoxeElNAK2g==}
+ '@types/http-assert@1.5.6':
+ resolution: {integrity: sha512-TTEwmtjgVbYAzZYWyeHPrrtWnfVkm8tQkP8P21uQifPgMRgjrow3XDEYqucuC8SKZJT7pUnhU/JymvjggxO9vw==}
'@types/http-errors@2.0.4':
resolution: {integrity: sha512-D0CFMMtydbJAegzOyHjtiKPLlvnm3iTZyZRSZoLq2mRhDdmLfIWOCYPfQJ4cu2erKghU++QvjcUjp/5h7hESpA==}
@@ -2073,6 +2095,9 @@ packages:
'@types/node@20.14.1':
resolution: {integrity: sha512-T2MzSGEu+ysB/FkWfqmhV3PLyQlowdptmmgD20C6QxsS8Fmv5SjpZ1ayXaEC0S21/h5UJ9iA6W/5vSNU5l00OA==}
+ '@types/pdf-parse@1.1.4':
+ resolution: {integrity: sha512-+gbBHbNCVGGYw1S9lAIIvrHW47UYOhMIFUsJcMkMrzy1Jf0vulBN3XQIjPgnoOXveMuHnF3b57fXROnY/Or7eg==}
+
'@types/pg-pool@2.0.4':
resolution: {integrity: sha512-qZAvkv1K3QbmHHFYSNRYPkRjOWRLBYrL4B9c+wG0GSVGBw0NtJwPcgx/DSddeDJvRGMHCEQ4VMEVfuJ/0gZ3XQ==}
@@ -2532,6 +2557,15 @@ packages:
color-name@1.1.4:
resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==}
+ color-string@1.9.1:
+ resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==}
+
+ color@3.2.1:
+ resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==}
+
+ colorspace@1.1.4:
+ resolution: {integrity: sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==}
+
combined-stream@1.0.8:
resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
engines: {node: '>= 0.8'}
@@ -2807,6 +2841,9 @@ packages:
emoji-regex@9.2.2:
resolution: {integrity: sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==}
+ enabled@2.0.0:
+ resolution: {integrity: sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==}
+
encodeurl@1.0.2:
resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==}
engines: {node: '>= 0.8'}
@@ -2968,6 +3005,9 @@ packages:
resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==}
hasBin: true
+ fn.name@1.1.0:
+ resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==}
+
follow-redirects@1.15.6:
resolution: {integrity: sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==}
engines: {node: '>=4.0'}
@@ -3030,8 +3070,8 @@ packages:
function-bind@1.1.2:
resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
- gaxios@6.7.0:
- resolution: {integrity: sha512-DSrkyMTfAnAm4ks9Go20QGOcXEyW/NmZhvTYBU2rb4afBB393WIMQPWPEDMl/k8xqiNN9HYq2zao3oWXsdl2Tg==}
+ gaxios@6.7.1:
+ resolution: {integrity: sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==}
engines: {node: '>=14'}
gcp-metadata@6.1.0:
@@ -3250,6 +3290,9 @@ packages:
is-arrayish@0.2.1:
resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==}
+ is-arrayish@0.3.2:
+ resolution: {integrity: sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==}
+
is-binary-path@2.1.0:
resolution: {integrity: sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==}
engines: {node: '>=8'}
@@ -3566,6 +3609,9 @@ packages:
koffi@2.9.0:
resolution: {integrity: sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==}
+ kuler@2.0.0:
+ resolution: {integrity: sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==}
+
langchain@0.2.8:
resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==}
engines: {node: '>=18'}
@@ -3798,6 +3844,10 @@ packages:
resolution: {integrity: sha512-1ulHeNPp6k/LD8H91o7VYFBng5i1BDE7HoKxVbZiGFidS1Rj65qcywLxX+pVfAPoQJEjRdvKcusKwOupHCVOVQ==}
engines: {node: '>= 12.0.0'}
+ logform@2.6.1:
+ resolution: {integrity: sha512-CdaO738xRapbKIMVn2m4F6KTj4j7ooJ8POVnebSgKo3KBz5axNXRAL7ZdRjIV6NOr2Uf4vjtRkxrFETOioCqSA==}
+ engines: {node: '>= 12.0.0'}
+
loglevel@1.9.1:
resolution: {integrity: sha512-hP3I3kCrDIMuRwAwHltphhDM1r8i55H33GgqjXbrisuJhF4kRhW1dNuxsRklp4bXl8DSdLaNLuiL4A/LWRfxvg==}
engines: {node: '>= 0.6.0'}
@@ -3849,6 +3899,11 @@ packages:
engines: {node: '>=12.0.0'}
hasBin: true
+ marked@14.1.2:
+ resolution: {integrity: sha512-f3r0yqpz31VXiDB/wj9GaOB0a2PRLQl6vJmXiFrniNwjkKdvakqJRULhjFKJpxOchlCRiG5fcacoUZY5Xa6PEQ==}
+ engines: {node: '>= 18'}
+ hasBin: true
+
md5@2.3.0:
resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==}
@@ -4118,6 +4173,9 @@ packages:
once@1.4.0:
resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==}
+ one-time@1.0.0:
+ resolution: {integrity: sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==}
+
onetime@5.1.2:
resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==}
engines: {node: '>=6'}
@@ -4368,8 +4426,8 @@ packages:
proto-list@1.2.4:
resolution: {integrity: sha512-vtK/94akxsTMhe0/cbfpR+syPuszcuwhqVjJq26CuNDgFGj682oRBXOP5MJpv2r7JtE8MsiepGIqvvOTBwn2vA==}
- protobufjs@7.3.2:
- resolution: {integrity: sha512-RXyHaACeqXeqAKGLDl68rQKbmObRsTIn4TYVUUug1KfS47YWCo5MacGITEryugIgZqORCvJWEk4l449POg5Txg==}
+ protobufjs@7.4.0:
+ resolution: {integrity: sha512-mRUWCc3KUU4w1jU8sGxICXH/gNS94DvI1gxqDvBzhj1JpcsimQkYiOJfwsPUykUI5ZaspFbSgmBLER8IrQ3tqw==}
engines: {node: '>=12.0.0'}
proxy-addr@2.0.7:
@@ -4645,6 +4703,9 @@ packages:
resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==}
engines: {node: '>=14'}
+ simple-swizzle@0.2.2:
+ resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==}
+
simple-update-notifier@1.1.0:
resolution: {integrity: sha512-VpsrsJSUcJEseSbMHkrsrAVSdvVS5I96Qo1QAQ4FxQ9wXFcB+pjj7FB7/us9+GcgfW4ziHtYMc1J0PLczb55mg==}
engines: {node: '>=8.10.0'}
@@ -4811,6 +4872,9 @@ packages:
text-decoder@1.1.0:
resolution: {integrity: sha512-TmLJNj6UgX8xcUZo4UDStGQtDiTzF7BzWlzn9g7UWrjkpHr5uJTK1ld16wZ3LXb2vb6jH8qU89dW5whuMdXYdw==}
+ text-hex@1.0.0:
+ resolution: {integrity: sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==}
+
through@2.3.8:
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
@@ -5041,8 +5105,12 @@ packages:
engines: {node: '>= 8'}
hasBin: true
- winston-transport@4.7.0:
- resolution: {integrity: sha512-ajBj65K5I7denzer2IYW6+2bNIVqLGDHqDw3Ow8Ohh+vdW+rv4MZ6eiDvHoKhfJFZ2auyN8byXieDDJ96ViONg==}
+ winston-transport@4.8.0:
+ resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==}
+ engines: {node: '>= 12.0.0'}
+
+ winston@3.14.2:
+ resolution: {integrity: sha512-CO8cdpBB2yqzEf8v895L+GNKYJiEq8eKlHU38af3snQBQ+sdAIUepjMSguOIJC7ICbzm0ZI+Af2If4vIJrtmOg==}
engines: {node: '>= 12.0.0'}
wordnet-db@3.1.14:
@@ -5924,6 +5992,12 @@ snapshots:
dependencies:
'@jridgewell/trace-mapping': 0.3.9
+ '@dabh/diagnostics@2.0.3':
+ dependencies:
+ colorspace: 1.1.4
+ enabled: 2.0.0
+ kuler: 2.0.0
+
'@devil7softwares/pos@1.0.2': {}
'@dqbd/tiktoken@1.0.17': {}
@@ -5936,7 +6010,7 @@ snapshots:
shell-quote: 1.8.1
yargs: 17.7.2
- '@grpc/grpc-js@1.10.10':
+ '@grpc/grpc-js@1.12.2':
dependencies:
'@grpc/proto-loader': 0.7.13
'@js-sdsl/ordered-map': 4.4.2
@@ -5945,7 +6019,7 @@ snapshots:
dependencies:
lodash.camelcase: 4.3.0
long: 5.2.3
- protobufjs: 7.3.2
+ protobufjs: 7.4.0
yargs: 17.7.2
'@hyperdx/instrumentation-exception@0.1.0(@opentelemetry/api@1.9.0)':
@@ -5955,9 +6029,9 @@ snapshots:
'@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0)
'@opentelemetry/semantic-conventions': 1.25.1
- '@sentry/core': 8.13.0
- '@sentry/types': 8.13.0
- '@sentry/utils': 8.13.0
+ '@sentry/core': 8.26.0
+ '@sentry/types': 8.26.0
+ '@sentry/utils': 8.26.0
json-stringify-safe: 5.0.1
shimmer: 1.2.1
tslib: 2.6.3
@@ -6008,7 +6082,7 @@ snapshots:
semver: 7.6.2
shimmer: 1.2.1
tslib: 2.6.3
- winston-transport: 4.7.0
+ winston-transport: 4.8.0
transitivePeerDependencies:
- encoding
- supports-color
@@ -6373,10 +6447,10 @@ snapshots:
'@opentelemetry/instrumentation-undici': 0.2.0(@opentelemetry/api@1.9.0)
'@opentelemetry/instrumentation-winston': 0.37.0(@opentelemetry/api@1.9.0)
'@opentelemetry/resource-detector-alibaba-cloud': 0.28.10(@opentelemetry/api@1.9.0)
- '@opentelemetry/resource-detector-aws': 1.5.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/resource-detector-azure': 0.2.9(@opentelemetry/api@1.9.0)
+ '@opentelemetry/resource-detector-aws': 1.7.0(@opentelemetry/api@1.9.0)
+ '@opentelemetry/resource-detector-azure': 0.2.12(@opentelemetry/api@1.9.0)
'@opentelemetry/resource-detector-container': 0.3.11(@opentelemetry/api@1.9.0)
- '@opentelemetry/resource-detector-gcp': 0.29.10(@opentelemetry/api@1.9.0)
+ '@opentelemetry/resource-detector-gcp': 0.29.13(@opentelemetry/api@1.9.0)
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/sdk-node': 0.51.1(@opentelemetry/api@1.9.0)
transitivePeerDependencies:
@@ -6401,6 +6475,11 @@ snapshots:
'@opentelemetry/api': 1.9.0
'@opentelemetry/semantic-conventions': 1.25.1
+ '@opentelemetry/core@1.26.0(@opentelemetry/api@1.9.0)':
+ dependencies:
+ '@opentelemetry/api': 1.9.0
+ '@opentelemetry/semantic-conventions': 1.27.0
+
'@opentelemetry/exporter-logs-otlp-http@0.51.1(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
@@ -6432,7 +6511,7 @@ snapshots:
'@opentelemetry/exporter-trace-otlp-grpc@0.51.1(@opentelemetry/api@1.9.0)':
dependencies:
- '@grpc/grpc-js': 1.10.10
+ '@grpc/grpc-js': 1.12.2
'@opentelemetry/api': 1.9.0
'@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0)
'@opentelemetry/otlp-grpc-exporter-base': 0.51.1(@opentelemetry/api@1.9.0)
@@ -6480,7 +6559,7 @@ snapshots:
dependencies:
'@opentelemetry/api': 1.9.0
'@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/propagator-aws-xray': 1.25.1(@opentelemetry/api@1.9.0)
+ '@opentelemetry/propagator-aws-xray': 1.26.0(@opentelemetry/api@1.9.0)
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/semantic-conventions': 1.25.1
'@types/aws-lambda': 8.10.122
@@ -6492,7 +6571,7 @@ snapshots:
'@opentelemetry/api': 1.9.0
'@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/propagation-utils': 0.30.10(@opentelemetry/api@1.9.0)
+ '@opentelemetry/propagation-utils': 0.30.12(@opentelemetry/api@1.9.0)
'@opentelemetry/semantic-conventions': 1.25.1
transitivePeerDependencies:
- supports-color
@@ -6992,18 +7071,18 @@ snapshots:
'@opentelemetry/otlp-grpc-exporter-base@0.51.1(@opentelemetry/api@1.9.0)':
dependencies:
- '@grpc/grpc-js': 1.10.10
+ '@grpc/grpc-js': 1.12.2
'@opentelemetry/api': 1.9.0
'@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0)
'@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0)
- protobufjs: 7.3.2
+ protobufjs: 7.4.0
'@opentelemetry/otlp-proto-exporter-base@0.51.1(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
'@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0)
'@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0)
- protobufjs: 7.3.2
+ protobufjs: 7.4.0
'@opentelemetry/otlp-transformer@0.51.1(@opentelemetry/api@1.9.0)':
dependencies:
@@ -7015,14 +7094,14 @@ snapshots:
'@opentelemetry/sdk-metrics': 1.24.1(@opentelemetry/api@1.9.0)
'@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/propagation-utils@0.30.10(@opentelemetry/api@1.9.0)':
+ '@opentelemetry/propagation-utils@0.30.12(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
- '@opentelemetry/propagator-aws-xray@1.25.1(@opentelemetry/api@1.9.0)':
+ '@opentelemetry/propagator-aws-xray@1.26.0(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
- '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0)
+ '@opentelemetry/core': 1.26.0(@opentelemetry/api@1.9.0)
'@opentelemetry/propagator-b3@1.24.1(@opentelemetry/api@1.9.0)':
dependencies:
@@ -7042,18 +7121,19 @@ snapshots:
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/semantic-conventions': 1.25.1
- '@opentelemetry/resource-detector-aws@1.5.1(@opentelemetry/api@1.9.0)':
+ '@opentelemetry/resource-detector-aws@1.7.0(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
'@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/semantic-conventions': 1.25.1
+ '@opentelemetry/semantic-conventions': 1.27.0
- '@opentelemetry/resource-detector-azure@0.2.9(@opentelemetry/api@1.9.0)':
+ '@opentelemetry/resource-detector-azure@0.2.12(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
+ '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/semantic-conventions': 1.25.1
+ '@opentelemetry/semantic-conventions': 1.27.0
'@opentelemetry/resource-detector-container@0.3.11(@opentelemetry/api@1.9.0)':
dependencies:
@@ -7061,12 +7141,12 @@ snapshots:
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/semantic-conventions': 1.25.1
- '@opentelemetry/resource-detector-gcp@0.29.10(@opentelemetry/api@1.9.0)':
+ '@opentelemetry/resource-detector-gcp@0.29.13(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
'@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0)
'@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0)
- '@opentelemetry/semantic-conventions': 1.25.1
+ '@opentelemetry/semantic-conventions': 1.27.0
gcp-metadata: 6.1.0
transitivePeerDependencies:
- encoding
@@ -7152,6 +7232,8 @@ snapshots:
'@opentelemetry/semantic-conventions@1.25.1': {}
+ '@opentelemetry/semantic-conventions@1.27.0': {}
+
'@opentelemetry/sql-common@0.40.1(@opentelemetry/api@1.9.0)':
dependencies:
'@opentelemetry/api': 1.9.0
@@ -7291,11 +7373,6 @@ snapshots:
- encoding
- supports-color
- '@sentry/core@8.13.0':
- dependencies:
- '@sentry/types': 8.13.0
- '@sentry/utils': 8.13.0
-
'@sentry/core@8.26.0':
dependencies:
'@sentry/types': 8.26.0
@@ -7359,14 +7436,8 @@ snapshots:
transitivePeerDependencies:
- supports-color
- '@sentry/types@8.13.0': {}
-
'@sentry/types@8.26.0': {}
- '@sentry/utils@8.13.0':
- dependencies:
- '@sentry/types': 8.13.0
-
'@sentry/utils@8.26.0':
dependencies:
'@sentry/types': 8.26.0
@@ -7908,6 +7979,8 @@ snapshots:
dependencies:
'@types/node': 20.14.1
+ '@types/escape-html@1.0.4': {}
+
'@types/express-serve-static-core@4.19.3':
dependencies:
'@types/node': 20.14.1
@@ -7932,7 +8005,7 @@ snapshots:
dependencies:
'@types/node': 20.14.1
- '@types/http-assert@1.5.5': {}
+ '@types/http-assert@1.5.6': {}
'@types/http-errors@2.0.4': {}
@@ -7962,7 +8035,7 @@ snapshots:
'@types/accepts': 1.3.7
'@types/content-disposition': 0.5.8
'@types/cookies': 0.9.0
- '@types/http-assert': 1.5.5
+ '@types/http-assert': 1.5.6
'@types/http-errors': 2.0.4
'@types/keygrip': 1.0.6
'@types/koa-compose': 3.2.8
@@ -7995,6 +8068,8 @@ snapshots:
dependencies:
undici-types: 5.26.5
+ '@types/pdf-parse@1.1.4': {}
+
'@types/pg-pool@2.0.4':
dependencies:
'@types/pg': 8.6.1
@@ -8520,6 +8595,21 @@ snapshots:
color-name@1.1.4: {}
+ color-string@1.9.1:
+ dependencies:
+ color-name: 1.1.4
+ simple-swizzle: 0.2.2
+
+ color@3.2.1:
+ dependencies:
+ color-convert: 1.9.3
+ color-string: 1.9.1
+
+ colorspace@1.1.4:
+ dependencies:
+ color: 3.2.1
+ text-hex: 1.0.0
+
combined-stream@1.0.8:
dependencies:
delayed-stream: 1.0.0
@@ -8749,6 +8839,8 @@ snapshots:
emoji-regex@9.2.2: {}
+ enabled@2.0.0: {}
+
encodeurl@1.0.2: {}
end-of-stream@1.4.4:
@@ -8937,6 +9029,8 @@ snapshots:
flat@5.0.2: {}
+ fn.name@1.1.0: {}
+
follow-redirects@1.15.6: {}
foreground-child@3.2.1:
@@ -8989,20 +9083,20 @@ snapshots:
function-bind@1.1.2: {}
- gaxios@6.7.0:
+ gaxios@6.7.1:
dependencies:
extend: 3.0.2
https-proxy-agent: 7.0.5
is-stream: 2.0.1
node-fetch: 2.7.0
- uuid: 10.0.0
+ uuid: 9.0.1
transitivePeerDependencies:
- encoding
- supports-color
gcp-metadata@6.1.0:
dependencies:
- gaxios: 6.7.0
+ gaxios: 6.7.1
json-bigint: 1.0.0
transitivePeerDependencies:
- encoding
@@ -9275,6 +9369,8 @@ snapshots:
is-arrayish@0.2.1: {}
+ is-arrayish@0.3.2: {}
+
is-binary-path@2.1.0:
dependencies:
binary-extensions: 2.3.0
@@ -9764,6 +9860,8 @@ snapshots:
koffi@2.9.0: {}
+ kuler@2.0.0: {}
+
langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
@@ -9866,6 +9964,15 @@ snapshots:
safe-stable-stringify: 2.4.3
triple-beam: 1.4.1
+ logform@2.6.1:
+ dependencies:
+ '@colors/colors': 1.6.0
+ '@types/triple-beam': 1.3.5
+ fecha: 4.2.3
+ ms: 2.1.3
+ safe-stable-stringify: 2.4.3
+ triple-beam: 1.4.1
+
loglevel@1.9.1: {}
logsnag@1.0.0:
@@ -9923,6 +10030,8 @@ snapshots:
underscore: 1.13.6
xmlbuilder: 10.1.1
+ marked@14.1.2: {}
+
md5@2.3.0:
dependencies:
charenc: 0.0.2
@@ -10191,6 +10300,10 @@ snapshots:
dependencies:
wrappy: 1.0.2
+ one-time@1.0.0:
+ dependencies:
+ fn.name: 1.1.0
+
onetime@5.1.2:
dependencies:
mimic-fn: 2.1.0
@@ -10475,7 +10588,7 @@ snapshots:
proto-list@1.2.4: {}
- protobufjs@7.3.2:
+ protobufjs@7.4.0:
dependencies:
'@protobufjs/aspromise': 1.1.2
'@protobufjs/base64': 1.1.2
@@ -10797,6 +10910,10 @@ snapshots:
signal-exit@4.1.0: {}
+ simple-swizzle@0.2.2:
+ dependencies:
+ is-arrayish: 0.3.2
+
simple-update-notifier@1.1.0:
dependencies:
semver: 7.0.0
@@ -10985,6 +11102,8 @@ snapshots:
dependencies:
b4a: 1.6.6
+ text-hex@1.0.0: {}
+
through@2.3.8: {}
tmpl@1.0.5: {}
@@ -11172,12 +11291,26 @@ snapshots:
dependencies:
isexe: 2.0.0
- winston-transport@4.7.0:
+ winston-transport@4.8.0:
dependencies:
- logform: 2.6.0
- readable-stream: 3.6.2
+ logform: 2.6.1
+ readable-stream: 4.5.2
triple-beam: 1.4.1
+ winston@3.14.2:
+ dependencies:
+ '@colors/colors': 1.6.0
+ '@dabh/diagnostics': 2.0.3
+ async: 3.2.5
+ is-stream: 2.0.1
+ logform: 2.6.0
+ one-time: 1.0.0
+ readable-stream: 3.6.2
+ safe-stable-stringify: 2.4.3
+ stack-trace: 0.0.10
+ triple-beam: 1.4.1
+ winston-transport: 4.8.0
+
wordnet-db@3.1.14: {}
wordpos@2.1.0:
diff --git a/apps/api/requests.http b/apps/api/requests.http
index 3e7bd2b7..809bae7b 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -1,15 +1,15 @@
### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
- "url":"corterix.com"
+ "url":"firecrawl.dev"
}
### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Check Job Status
@@ -18,7 +18,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
### Scrape Website
POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
@@ -45,7 +45,7 @@ content-type: application/json
### Scrape Website
POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
@@ -56,12 +56,12 @@ content-type: application/json
### Check Job Status
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Get Job Result
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
@@ -70,7 +70,7 @@ content-type: application/json
### Check Job Status
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
-Authorization: Bearer
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Get Active Jobs Count
GET http://localhost:3002/serverHealthCheck
diff --git a/apps/api/sharedLibs/go-html-to-md/.gitignore b/apps/api/sharedLibs/go-html-to-md/.gitignore
new file mode 100644
index 00000000..bdab47c6
--- /dev/null
+++ b/apps/api/sharedLibs/go-html-to-md/.gitignore
@@ -0,0 +1,2 @@
+html-to-markdown.so
+html-to-markdown.h
\ No newline at end of file
diff --git a/apps/api/src/lib/go-html-to-md/README.md b/apps/api/sharedLibs/go-html-to-md/README.md
similarity index 100%
rename from apps/api/src/lib/go-html-to-md/README.md
rename to apps/api/sharedLibs/go-html-to-md/README.md
diff --git a/apps/api/src/lib/go-html-to-md/go.mod b/apps/api/sharedLibs/go-html-to-md/go.mod
similarity index 61%
rename from apps/api/src/lib/go-html-to-md/go.mod
rename to apps/api/sharedLibs/go-html-to-md/go.mod
index 0836f441..ad50981f 100644
--- a/apps/api/src/lib/go-html-to-md/go.mod
+++ b/apps/api/sharedLibs/go-html-to-md/go.mod
@@ -2,7 +2,7 @@ module html-to-markdown.go
go 1.19
-require github.com/JohannesKaufmann/html-to-markdown v1.6.0
+require github.com/tomkosm/html-to-markdown v0.0.0-20241031120941-3a729f6b7751
require (
github.com/PuerkitoBio/goquery v1.9.2 // indirect
@@ -12,3 +12,5 @@ require (
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)
+
+replace github.com/JohannesKaufmann/html-to-markdown => github.com/tomkosm/html-to-markdown v0.0.0-20241031120941-3a729f6b7751
diff --git a/apps/api/src/lib/go-html-to-md/go.sum b/apps/api/sharedLibs/go-html-to-md/go.sum
similarity index 67%
rename from apps/api/src/lib/go-html-to-md/go.sum
rename to apps/api/sharedLibs/go-html-to-md/go.sum
index 7961629d..a2c6c64f 100644
--- a/apps/api/src/lib/go-html-to-md/go.sum
+++ b/apps/api/sharedLibs/go-html-to-md/go.sum
@@ -1,12 +1,8 @@
-github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k=
-github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
@@ -15,27 +11,17 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
-github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
-github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
-github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
+github.com/tomkosm/html-to-markdown v0.0.0-20241031120941-3a729f6b7751 h1:l6JdzR2ry727okVeBxnH8nh3SAd7l/0gJTWbK/3UBRY=
+github.com/tomkosm/html-to-markdown v0.0.0-20241031120941-3a729f6b7751/go.mod h1:I2mfsDlV0RelCsTjeYh9mdXdwD2M70rA7LT/y2girik=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
-github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
-golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
-golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
@@ -43,9 +29,6 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
-golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
-golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
-golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -58,25 +41,15 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
-golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
-golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
-golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk=
-golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
-golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
-golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
@@ -84,10 +57,8 @@ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
-gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
diff --git a/apps/api/src/lib/go-html-to-md/html-to-markdown.go b/apps/api/sharedLibs/go-html-to-md/html-to-markdown.go
similarity index 74%
rename from apps/api/src/lib/go-html-to-md/html-to-markdown.go
rename to apps/api/sharedLibs/go-html-to-md/html-to-markdown.go
index 9905a69a..11776fd6 100644
--- a/apps/api/src/lib/go-html-to-md/html-to-markdown.go
+++ b/apps/api/sharedLibs/go-html-to-md/html-to-markdown.go
@@ -2,10 +2,10 @@ package main
import (
"C"
- "log"
+ // "log"
- md "github.com/JohannesKaufmann/html-to-markdown"
- "github.com/JohannesKaufmann/html-to-markdown/plugin"
+ md "github.com/tomkosm/html-to-markdown"
+ "github.com/tomkosm/html-to-markdown/plugin"
)
//export ConvertHTMLToMarkdown
@@ -15,7 +15,7 @@ func ConvertHTMLToMarkdown(html *C.char) *C.char {
markdown, err := converter.ConvertString(C.GoString(html))
if err != nil {
- log.Fatal(err)
+ // log.Fatal(err)
}
return C.CString(markdown)
}
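With `log.Fatal(err)` commented out above, a conversion error now falls through and the function returns whatever is in `markdown` (an empty string). If that is the intended contract, stating it explicitly keeps the cgo boundary predictable. A minimal sketch, assuming the standard converter setup for this library and that the JS caller treats an empty result as a failed conversion; not necessarily the repo's exact configuration:

```go
package main

import "C"

import (
	md "github.com/tomkosm/html-to-markdown"
	"github.com/tomkosm/html-to-markdown/plugin"
)

//export ConvertHTMLToMarkdown
func ConvertHTMLToMarkdown(html *C.char) *C.char {
	converter := md.NewConverter("", true, nil)
	converter.Use(plugin.GitHubFlavored())

	markdown, err := converter.ConvertString(C.GoString(html))
	if err != nil {
		// Don't crash the embedding Node process; return an empty string that
		// the JS caller can treat as a conversion failure. (Sketch only.)
		return C.CString("")
	}
	return C.CString(markdown)
}

func main() {}
```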
diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
index b1708abc..dec77131 100644
--- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => {
expect(crawlInitResponse.statusCode).toBe(200);
expect(crawlInitResponse.body).toHaveProperty("jobId");
- let crawlStatus: string;
+ let crawlStatus: string = "scraping";
let crawlData = [];
while (crawlStatus !== "completed") {
const statusResponse = await request(TEST_URL)
diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
index acb22780..83f676b8 100644
--- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
@@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
process.env.SCRAPING_BEE_API_KEY = "";
process.env.OPENAI_API_KEY = "";
process.env.BULL_AUTH_KEY = "";
- process.env.LOGTAIL_KEY = "";
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
process.env.LLAMAPARSE_API_KEY = "";
process.env.TEST_API_KEY = "";
diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
index a4163472..e1f5f3fa 100644
--- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
@@ -1,7 +1,7 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
- ScrapeRequest,
+ ScrapeRequestInput,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
@@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => {
});
it.concurrent("should throw error for blocklisted URL", async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://facebook.com/fake-test",
};
@@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response with a valid API key",
async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
};
@@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response with a valid API key",
async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/abs/2410.04840",
};
@@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
formats: ["markdown", "html"],
};
@@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => {
30000
);
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
// formats: ["markdown", "html"],
};
@@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => {
}, 60000);
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/pdf/astro-ph/9301001"
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => {
}, 60000);
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://www.scrapethissite.com/",
onlyMainContent: false // default is true
};
@@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
- const scrapeRequestWithRemoveTags: ScrapeRequest = {
+ const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
url: "https://www.scrapethissite.com/",
excludeTags: ['.nav', '#footer', 'strong'],
onlyMainContent: false // default is true
@@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
formats: ["html","rawHtml"],
};
@@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response with waitFor",
async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://ycombinator.com/companies",
formats: ["markdown"],
waitFor: 8000
@@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response with a valid links on page",
async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
formats: ["links"],
};
@@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => {
});
it.concurrent("should throw error for blocklisted URL", async () => {
- const scrapeRequest: ScrapeRequest = {
+ const scrapeRequest: ScrapeRequestInput = {
url: "https://facebook.com/fake-test",
};
@@ -868,7 +868,7 @@ describe("POST /v1/crawl", () => {
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
- expect(urls.length).toBeGreaterThanOrEqual(1);
+ expect(urls.length).toBeGreaterThan(1);
// Check if all URLs have a maximum depth of 1
urls.forEach((url: string) => {
diff --git a/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
new file mode 100644
index 00000000..5c7feb1f
--- /dev/null
+++ b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
@@ -0,0 +1,603 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import {
+ ScrapeRequest,
+ ScrapeResponseRequestTest,
+} from "../../controllers/v1/types";
+
+configDotenv();
+const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
+const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
+
+describe("E2E Tests for v1 API Routes", () => {
+
+ it.concurrent('should return a successful response for a scrape with 403 page', async () => {
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post('/v1/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/403' });
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ expect(response.body.data).toHaveProperty('markdown');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.statusCode).toBe(403);
+ }, 30000);
+
+ it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data).toHaveProperty("markdown");
+
+ expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
+ expect(response.body.data.markdown).toContain("Content with id #content-1");
+ // expect(response.body.data.markdown).toContain("Loading...");
+ expect(response.body.data.markdown).toContain("Click me!");
+ expect(response.body.data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe
+ expect(response.body.data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom
+ expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
+ expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
+ expect(response.body.data.markdown).not.toContain("This content is only visible on mobile");
+ },
+ 30000);
+
+ it.concurrent("should handle 'formats:html' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ formats: ["html"]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+
+ expect(response.body.data).not.toHaveProperty("markdown");
+ expect(response.body.data).toHaveProperty("html");
+
+ expect(response.body.data.html).not.toContain("");
+ expect(response.body.data.html).toContain("
This page is used for end-to-end (e2e) testing with Firecrawl.
");
+ },
+ 30000);
+
+ it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ formats: ["rawHtml"]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data).not.toHaveProperty("markdown");
+ expect(response.body.data).toHaveProperty("rawHtml");
+
+ expect(response.body.data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.");
+ expect(response.body.data.rawHtml).toContain(">Header");
+ },
+ 30000);
+
+ // - TODO: tests for links
+ // - TODO: tests for screenshot
+ // - TODO: tests for screenshot@fullPage
+
+ it.concurrent("should handle 'headers' parameter correctly", async () => {
+ // @ts-ignore
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ headers: { "e2e-header-test": "firecrawl" }
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl");
+ }, 30000);
+
+ it.concurrent("should handle 'includeTags' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ includeTags: ['#content-1']
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data.markdown).not.toContain("This page is used for end-to-end (e2e) testing with Firecrawl.
");
+ expect(response.body.data.markdown).toContain("Content with id #content-1");
+ },
+ 30000);
+
+ it.concurrent("should handle 'excludeTags' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ excludeTags: ['#content-1']
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
+ expect(response.body.data.markdown).not.toContain("Content with id #content-1");
+ },
+ 30000);
+
+ it.concurrent("should handle 'onlyMainContent' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ formats: ["html", "markdown"],
+ onlyMainContent: false
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
+ expect(response.body.data.html).toContain("");
+ },
+ 30000);
+
+ it.concurrent("should handle 'timeout' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ timeout: 500
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(408);
+
+ if (!("error" in response.body)) {
+ throw new Error("Expected response body to have 'error' property");
+ }
+ expect(response.body.error).toBe("Request timed out");
+ expect(response.body.success).toBe(false);
+ }, 30000);
+
+
+ it.concurrent("should handle 'mobile' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ mobile: true
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ expect(response.body.data.markdown).toContain("This content is only visible on mobile");
+ },
+ 30000);
+
+ it.concurrent("should handle 'parsePDF' parameter correctly",
+ async () => {
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'});
+ await new Promise((r) => setTimeout(r, 6000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
+ expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
+
+ const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false });
+ await new Promise((r) => setTimeout(r, 6000));
+
+ expect(responseNoParsePDF.statusCode).toBe(200);
+ expect(responseNoParsePDF.body).toHaveProperty('data');
+ if (!("data" in responseNoParsePDF.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
+ },
+ 30000);
+
+ // it.concurrent("should handle 'location' parameter correctly",
+ // async () => {
+ // const scrapeRequest: ScrapeRequest = {
+ // url: "https://roastmywebsite.ai",
+ // location: {
+ // country: "US",
+ // languages: ["en"]
+ // }
+ // };
+
+ // const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ // .post("/v1/scrape")
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ // .set("Content-Type", "application/json")
+ // .send(scrapeRequest);
+
+ // expect(response.statusCode).toBe(200);
+ // // Add assertions to verify location is handled correctly
+ // },
+ // 30000);
+
+ it.concurrent("should handle 'skipTlsVerification' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: "https://expired.badssl.com/",
+ timeout: 120000
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+ console.log("Error1a")
+ // console.log(response.body)
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
+ console.log("Error?")
+
+ const scrapeRequestWithSkipTlsVerification = {
+ url: "https://expired.badssl.com/",
+ skipTlsVerification: true,
+ timeout: 120000
+
+ } as ScrapeRequest;
+
+ const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequestWithSkipTlsVerification);
+
+ console.log("Error1b")
+ // console.log(responseWithSkipTlsVerification.body)
+ expect(responseWithSkipTlsVerification.statusCode).toBe(200);
+ if (!("data" in responseWithSkipTlsVerification.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ // console.log(responseWithSkipTlsVerification.body.data)
+ expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
+ },
+ 60000);
+
+ it.concurrent("should handle 'removeBase64Images' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ removeBase64Images: true
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ // console.log(response.body.data.markdown)
+ // - TODO: not working for every image
+ // expect(response.body.data.markdown).toContain("Image-Removed");
+ },
+ 30000);
+
+ it.concurrent("should handle 'action wait' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ actions: [{
+ type: "wait",
+ milliseconds: 10000
+ }]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ expect(response.body.data.markdown).not.toContain("Loading...");
+ expect(response.body.data.markdown).toContain("Content loaded after 5 seconds!");
+ },
+ 30000);
+
+ // screenshot
+ it.concurrent("should handle 'action screenshot' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ actions: [{
+ type: "screenshot"
+ }]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ if (!response.body.data.actions?.screenshots) {
+ throw new Error("Expected response body to have screenshots array");
+ }
+ expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0);
+ expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
+
+ // TODO compare screenshot with expected screenshot
+ },
+ 30000);
+
+ it.concurrent("should handle 'action screenshot@fullPage' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ actions: [{
+ type: "screenshot",
+ fullPage: true
+ },
+ {
+ type:"scrape"
+ }]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ // console.log(response.body.data.actions?.screenshots[0])
+ if (!response.body.data.actions?.screenshots) {
+ throw new Error("Expected response body to have screenshots array");
+ }
+ expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0);
+ expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
+
+ if (!response.body.data.actions?.scrapes) {
+ throw new Error("Expected response body to have scrapes array");
+ }
+ expect(response.body.data.actions.scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/");
+ expect(response.body.data.actions.scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
+ // TODO compare screenshot with expected full page screenshot
+ },
+ 30000);
+
+ it.concurrent("should handle 'action click' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ actions: [{
+ type: "click",
+ selector: "#click-me"
+ }]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+ expect(response.body.data.markdown).not.toContain("Click me!");
+ expect(response.body.data.markdown).toContain("Text changed after click!");
+ },
+ 30000);
+
+ it.concurrent("should handle 'action write' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ formats: ["html"],
+ actions: [{
+ type: "click",
+ selector: "#input-1"
+ },
+ {
+ type: "write",
+ text: "Hello, world!"
+ }
+ ]} as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ expect(response.statusCode).toBe(200);
+ if (!("data" in response.body)) {
+ throw new Error("Expected response body to have 'data' property");
+ }
+
+ // TODO: fix this test (need to fix fire-engine first)
+ // uncomment the following line:
+ // expect(response.body.data.html).toContain("");
+ },
+ 30000);
+
+ // TODO: fix this test (need to fix fire-engine first)
+ it.concurrent("should handle 'action pressKey' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ formats: ["markdown"],
+ actions: [
+ {
+ type: "press",
+ key: "ArrowDown"
+ }
+ ]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ // // TODO: fix this test (need to fix fire-engine first)
+ // // right now response.body is: { success: false, error: '(Internal server error) - null' }
+ // expect(response.statusCode).toBe(200);
+ // if (!("data" in response.body)) {
+ // throw new Error("Expected response body to have 'data' property");
+ // }
+ // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
+ },
+ 30000);
+
+ // TODO: fix this test (need to fix fire-engine first)
+ it.concurrent("should handle 'action scroll' parameter correctly",
+ async () => {
+ const scrapeRequest = {
+ url: E2E_TEST_SERVER_URL,
+ formats: ["markdown"],
+ actions: [
+ {
+ type: "click",
+ selector: "#scroll-bottom-loader"
+ },
+ {
+ type: "scroll",
+ direction: "down",
+ amount: 2000
+ }
+ ]
+ } as ScrapeRequest;
+
+ const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+ .post("/v1/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send(scrapeRequest);
+
+ // TODO: uncomment these tests
+ // expect(response.statusCode).toBe(200);
+ // if (!("data" in response.body)) {
+ // throw new Error("Expected response body to have 'data' property");
+ // }
+ //
+ // expect(response.body.data.markdown).toContain("You have reached the bottom!")
+ },
+ 30000);
+
+ // TODO: test scrape action
+
+});
\ No newline at end of file
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 26caf63e..90a4587d 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -538,7 +538,7 @@ describe("E2E Tests for v0 API Routes", () => {
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
- expect(urls.length).toBeGreaterThanOrEqual(1);
+ expect(urls.length).toBeGreaterThan(1);
// Check if all URLs have a maximum depth of 1
urls.forEach((url: string) => {
@@ -776,7 +776,8 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .maxResponseSize(4000000000);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts
index 93327e66..de74fed0 100644
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@@ -9,9 +9,8 @@ import {
import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
-import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
-import { Logger } from "../lib/logger";
+import { logger } from "../lib/logger";
import { redlock } from "../services/redlock";
import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis";
@@ -40,8 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
export async function setCachedACUC(
api_key: string,
acuc:
- | AuthCreditUsageChunk
- | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
+ | AuthCreditUsageChunk | null
+ | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
) {
const cacheKeyACUC = `acuc_${api_key}`;
const redLockKey = `lock_${cacheKeyACUC}`;
@@ -49,7 +48,7 @@ export async function setCachedACUC(
try {
await redlock.using([redLockKey], 10000, {}, async (signal) => {
if (typeof acuc === "function") {
- acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));
+ acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));
if (acuc === null) {
if (signal.aborted) {
@@ -69,7 +68,7 @@ export async function setCachedACUC(
await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
});
} catch (error) {
- Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
+ logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
}
}
@@ -103,7 +102,7 @@ export async function getACUC(
break;
}
- Logger.warn(
+ logger.warn(
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
);
retries++;
@@ -146,33 +145,14 @@ export async function authenticateUser(
res,
mode?: RateLimiterMode
): Promise {
- return withAuth(supaAuthenticateUser)(req, res, mode);
-}
-
-function setTrace(team_id: string, api_key: string) {
- try {
- setTraceAttributes({
- team_id,
- api_key,
- });
- } catch (error) {
- Sentry.captureException(error);
- Logger.error(`Error setting trace attributes: ${error.message}`);
- }
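+ // fallback result used when withAuth skips real authentication (note team_id "bypass")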
+ return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
}
export async function supaAuthenticateUser(
req,
res,
mode?: RateLimiterMode
-): Promise<{
- success: boolean;
- team_id?: string;
- error?: string;
- status?: number;
- plan?: PlanType;
- chunk?: AuthCreditUsageChunk;
-}> {
+): Promise {
const authHeader =
req.headers.authorization ??
(req.headers["sec-websocket-protocol"]
@@ -200,7 +180,7 @@ export async function supaAuthenticateUser(
let teamId: string | null = null;
let priceId: string | null = null;
- let chunk: AuthCreditUsageChunk;
+ let chunk: AuthCreditUsageChunk | null = null;
if (token == "this_is_just_a_preview_token") {
if (mode == RateLimiterMode.CrawlStatus) {
@@ -233,8 +213,6 @@ export async function supaAuthenticateUser(
priceId = chunk.price_id;
const plan = getPlanByPriceId(priceId);
- // HyperDX Logging
- setTrace(teamId, normalizedApi);
subscriptionData = {
team_id: teamId,
plan,
@@ -291,7 +269,7 @@ export async function supaAuthenticateUser(
try {
await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) {
- Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
+ logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
@@ -318,7 +296,7 @@ export async function supaAuthenticateUser(
mode === RateLimiterMode.CrawlStatus ||
mode === RateLimiterMode.Search)
) {
- return { success: true, team_id: "preview" };
+ return { success: true, team_id: "preview", chunk: null };
// check the origin of the request and make sure its from firecrawl.dev
// const origin = req.headers.origin;
// if (origin && origin.includes("firecrawl.dev")){
@@ -333,12 +311,12 @@ export async function supaAuthenticateUser(
return {
success: true,
- team_id: subscriptionData.team_id,
- plan: (subscriptionData.plan ?? "") as PlanType,
+ team_id: teamId ?? undefined,
+ plan: (subscriptionData?.plan ?? "") as PlanType,
chunk,
};
}
-function getPlanByPriceId(price_id: string): PlanType {
+function getPlanByPriceId(price_id: string | null): PlanType {
switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER:
return "starter";
@@ -354,9 +332,14 @@ function getPlanByPriceId(price_id: string): PlanType {
return "standardnew";
case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
+ case process.env.STRIPE_PRICE_ID_SCALE_2M:
return "growth";
case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY:
return "growthdouble";
+ case process.env.STRIPE_PRICE_ID_ETIER2C:
+ return "etier2c";
+ case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh
+ return "etier1a";
default:
return "free";
}
diff --git a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts
index 876ca98a..75acd60a 100644
--- a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts
+++ b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts
@@ -1,7 +1,7 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";
export async function acucCacheClearController(req: Request, res: Response) {
try {
@@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) {
.select("*")
.eq("team_id", team_id);
- await Promise.all(keys.data.map((x) => clearACUC(x.key)));
+ await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
res.json({ ok: true });
} catch (error) {
- Logger.error(`Error clearing ACUC cache via API route: ${error}`);
+ logger.error(`Error clearing ACUC cache via API route: ${error}`);
res.status(500).json({ error: "Internal server error" });
}
}
diff --git a/apps/api/src/controllers/v0/admin/queue.ts b/apps/api/src/controllers/v0/admin/queue.ts
index 71748002..6ef8a992 100644
--- a/apps/api/src/controllers/v0/admin/queue.ts
+++ b/apps/api/src/controllers/v0/admin/queue.ts
@@ -1,7 +1,7 @@
import { Request, Response } from "express";
import { Job } from "bullmq";
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";
import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../../services/alerts";
import { sendSlackWebhook } from "../../../services/alerts/slack";
@@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
) {
- Logger.info("🐂 Cleaning jobs older than 24h");
+ logger.info("🐂 Cleaning jobs older than 24h");
try {
const scrapeQueue = getScrapeQueue();
const batchSize = 10;
@@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController(
).flat();
const before24hJobs =
completedJobs.filter(
- (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
+ (job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
) || [];
let count = 0;
@@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController(
await job.remove();
count++;
} catch (jobError) {
- Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
+ logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
- Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
+ logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
@@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
- Logger.debug(`Failed to initialize alerts: ${error}`);
+ logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
@@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) {
noActiveJobs,
});
} catch (error) {
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
@@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) {
}
if (targetMachineCount !== activeMachines) {
- Logger.info(
+ logger.info(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
);
@@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) {
count: activeMachines,
});
} catch (error) {
- Logger.error(error);
+ logger.error(error);
return res.status(500).send("Failed to initialize autoscaler");
}
}
diff --git a/apps/api/src/controllers/v0/admin/redis-health.ts b/apps/api/src/controllers/v0/admin/redis-health.ts
index dc58d745..dc587606 100644
--- a/apps/api/src/controllers/v0/admin/redis-health.ts
+++ b/apps/api/src/controllers/v0/admin/redis-health.ts
@@ -1,6 +1,6 @@
import { Request, Response } from "express";
import Redis from "ioredis";
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";
import { redisRateLimitClient } from "../../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) {
@@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) {
return await operation();
} catch (error) {
if (attempt === retries) throw error;
- Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
+ logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
}
}
};
try {
- const queueRedis = new Redis(process.env.REDIS_URL);
+ const queueRedis = new Redis(process.env.REDIS_URL!);
const testKey = "test";
const testValue = "test";
@@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) {
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
await retryOperation(() => queueRedis.del(testKey));
} catch (error) {
- Logger.error(`queueRedis health check failed: ${error}`);
+ logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null;
}
@@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) {
);
await retryOperation(() => redisRateLimitClient.del(testKey));
} catch (error) {
- Logger.error(`redisRateLimitClient health check failed: ${error}`);
+ logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null;
}
@@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) {
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
- Logger.info("Both Redis instances are healthy");
+ logger.info("Both Redis instances are healthy");
return res.status(200).json({ status: "healthy", details: healthStatus });
} else {
- Logger.info(
+ logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}`
);
// await sendSlackWebhook(
@@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) {
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
- Logger.error(`Redis health check failed: ${error}`);
+ logger.error(`Redis health check failed: ${error}`);
// await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
// true
diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts
index efcd454a..e81064f2 100644
--- a/apps/api/src/controllers/v0/crawl-cancel.ts
+++ b/apps/api/src/controllers/v0/crawl-cancel.ts
@@ -2,7 +2,7 @@ import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
@@ -12,15 +12,17 @@ export async function crawlCancelController(req: Request, res: Response) {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
- const { success, team_id, error, status } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+ const { team_id } = auth;
+
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });
@@ -46,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
sc.cancelled = true;
await saveCrawl(req.params.jobId, sc);
} catch (error) {
- Logger.error(error);
+ logger.error(error);
}
res.json({
@@ -54,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
});
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts
index 66522bcf..7b6e610a 100644
--- a/apps/api/src/controllers/v0/crawl-status.ts
+++ b/apps/api/src/controllers/v0/crawl-status.ts
@@ -2,15 +2,17 @@ import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
+import { Job } from "bullmq";
+import { toLegacyDocument } from "../v1/types";
configDotenv();
export async function getJobs(crawlId: string, ids: string[]) {
- const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
+ const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
@@ -32,15 +34,17 @@ export async function getJobs(crawlId: string, ids: string[]) {
export async function crawlStatusController(req: Request, res: Response) {
try {
- const { success, team_id, error, status } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+ const { team_id } = auth;
+
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });
@@ -90,12 +94,12 @@ export async function crawlStatusController(req: Request, res: Response) {
status: jobStatus,
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
total: jobs.length,
- data: jobStatus === "completed" ? data : null,
- partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
+ data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
+ partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
});
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts
index 3ebee976..cb7a3ccc 100644
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@@ -9,24 +9,28 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
+import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
+import { ZodError } from "zod";
export async function crawlController(req: Request, res: Response) {
try {
- const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+ const { team_id, plan, chunk } = auth;
+
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
@@ -35,7 +39,7 @@ export async function crawlController(req: Request, res: Response) {
try {
createIdempotencyKey(req);
} catch (error) {
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
@@ -77,7 +81,7 @@ export async function crawlController(req: Request, res: Response) {
// TODO: need to do this to v1
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
- let url = req.body.url;
+ let url = urlSchema.parse(req.body.url);
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
@@ -123,7 +127,7 @@ export async function crawlController(req: Request, res: Response) {
// documents: docs,
// });
// } catch (error) {
- // Logger.error(error);
+ // logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
@@ -132,10 +136,13 @@ export async function crawlController(req: Request, res: Response) {
await logCrawl(id, team_id);
+ const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
+
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
- pageOptions,
+ scrapeOptions,
+ internalOptions,
team_id,
plan,
createdAt: Date.now(),
@@ -170,10 +177,11 @@ export async function crawlController(req: Request, res: Response) {
data: {
url,
mode: "single_urls",
- crawlerOptions: crawlerOptions,
+ crawlerOptions,
+ scrapeOptions,
+ internalOptions,
team_id,
plan,
- pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
@@ -187,19 +195,16 @@ export async function crawlController(req: Request, res: Response) {
await lockURLs(
id,
+ sc,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
- if (Sentry.isInitialized()) {
- for (const job of jobs) {
- // add with sentry instrumentation
- await addScrapeJob(job.data as any, {}, job.opts.jobId);
- }
- } else {
- await getScrapeQueue().addBulk(jobs);
+ for (const job of jobs) {
+ // add with sentry instrumentation
+ await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
} else {
await lockURL(id, sc, url);
@@ -207,28 +212,33 @@ export async function crawlController(req: Request, res: Response) {
// Not needed, first one should be 15.
// const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
- const job = await addScrapeJob(
+ const jobId = uuidv4();
+ await addScrapeJob(
{
url,
mode: "single_urls",
- crawlerOptions: crawlerOptions,
+ crawlerOptions,
+ scrapeOptions,
+ internalOptions,
team_id,
- plan,
- pageOptions: pageOptions,
+ plan: plan!,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
},
{
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
- }
+ },
+ jobId,
);
- await addCrawlJob(id, job.id);
+ await addCrawlJob(id, jobId);
}
res.json({ jobId: id });
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
- return res.status(500).json({ error: error.message });
+ logger.error(error);
+ return res.status(500).json({ error: error instanceof ZodError
+ ? "Invalid URL"
+ : error.message });
}
}
diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts
index bceb1df9..8b82bef8 100644
--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@@ -3,15 +3,16 @@ import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
+import { fromLegacyScrapeOptions } from "../v1/types";
export async function crawlPreviewController(req: Request, res: Response) {
try {
- const { success, error, status, team_id:a, plan } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
RateLimiterMode.Preview
@@ -19,10 +20,12 @@ export async function crawlPreviewController(req: Request, res: Response) {
const team_id = "preview";
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+ const { plan } = auth;
+
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
@@ -71,7 +74,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
// documents: docs,
// });
// } catch (error) {
- // Logger.error(error);
+ // logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
@@ -84,10 +87,13 @@ export async function crawlPreviewController(req: Request, res: Response) {
robots = await this.getRobotsTxt();
} catch (_) {}
+ const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
+
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
- pageOptions,
+ scrapeOptions,
+ internalOptions,
team_id,
plan,
robots,
@@ -103,38 +109,42 @@ export async function crawlPreviewController(req: Request, res: Response) {
if (sitemap !== null) {
for (const url of sitemap.map(x => x.url)) {
await lockURL(id, sc, url);
- const job = await addScrapeJob({
+ const jobId = uuidv4();
+ await addScrapeJob({
url,
mode: "single_urls",
- crawlerOptions: crawlerOptions,
team_id,
- plan,
- pageOptions: pageOptions,
+ plan: plan!,
+ crawlerOptions,
+ scrapeOptions,
+ internalOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true,
- });
- await addCrawlJob(id, job.id);
+ }, {}, jobId);
+ await addCrawlJob(id, jobId);
}
} else {
await lockURL(id, sc, url);
- const job = await addScrapeJob({
+ const jobId = uuidv4();
+ await addScrapeJob({
url,
mode: "single_urls",
- crawlerOptions: crawlerOptions,
team_id,
- plan,
- pageOptions: pageOptions,
+ plan: plan!,
+ crawlerOptions,
+ scrapeOptions,
+ internalOptions,
origin: "website-preview",
crawl_id: id,
- });
- await addCrawlJob(id, job.id);
+ }, {}, jobId);
+ await addCrawlJob(id, jobId);
}
res.json({ jobId: id });
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
diff --git a/apps/api/src/controllers/v0/keyAuth.ts b/apps/api/src/controllers/v0/keyAuth.ts
index b70d672a..63915302 100644
--- a/apps/api/src/controllers/v0/keyAuth.ts
+++ b/apps/api/src/controllers/v0/keyAuth.ts
@@ -8,13 +8,14 @@ import { authenticateUser } from "../auth";
export const keyAuthController = async (req: Request, res: Response) => {
try {
// make sure to authenticate user first, Bearer
- const { success, team_id, error, status } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res
);
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+
// if success, return success: true
return res.status(200).json({ success: true });
} catch (error) {
diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts
index f5dbc3d1..9bc41fc1 100644
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@@ -7,7 +7,7 @@ import {
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
-import { Document } from "../../lib/entities";
+import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {
@@ -19,9 +19,11 @@ import {
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
+import { fromLegacyScrapeOptions } from "../v1/types";
+import { ZodError } from "zod";
export async function scrapeHelper(
jobId: string,
@@ -35,10 +37,10 @@ export async function scrapeHelper(
): Promise<{
success: boolean;
error?: string;
- data?: Document;
+ data?: Document | { url: string };
returnCode: number;
}> {
- const url = req.body.url;
+ const url = urlSchema.parse(req.body.url);
if (typeof url !== "string") {
return { success: false, error: "Url is required", returnCode: 400 };
}
@@ -54,15 +56,16 @@ export async function scrapeHelper(
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
- const job = await addScrapeJob(
+ const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);
+
+ await addScrapeJob(
{
url,
mode: "single_urls",
- crawlerOptions,
team_id,
- pageOptions,
- plan,
- extractorOptions,
+ scrapeOptions,
+ internalOptions,
+ plan: plan!,
origin: req.body.origin ?? defaultOrigin,
is_scrape: true,
},
@@ -81,9 +84,9 @@ export async function scrapeHelper(
},
async (span) => {
try {
- doc = (await waitForJob(job.id, timeout))[0];
+ doc = (await waitForJob(jobId, timeout));
} catch (e) {
- if (e instanceof Error && e.message.startsWith("Job wait")) {
+ if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
span.setAttribute("timedOut", true);
return {
success: false,
@@ -116,10 +119,10 @@ export async function scrapeHelper(
return err;
}
- await job.remove();
+ await getScrapeQueue().remove(jobId);
if (!doc) {
- console.error("!!! PANIC DOC IS", doc, job);
+ console.error("!!! PANIC DOC IS", doc);
return {
success: true,
error: "No page found",
@@ -149,7 +152,7 @@ export async function scrapeHelper(
return {
success: true,
- data: doc,
+ data: toLegacyDocument(doc, internalOptions),
returnCode: 200,
};
}
@@ -158,15 +161,17 @@ export async function scrapeController(req: Request, res: Response) {
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer
- const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+ const { team_id, plan, chunk } = auth;
+
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = {
@@ -200,7 +205,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
}
} catch (error) {
- Logger.error(error);
+ logger.error(error);
earlyReturn = true;
return res.status(500).json({
error:
@@ -224,8 +229,8 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
- result.data && result.data.markdown
- ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
+ result.data && (result.data as Document).markdown
+ ? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
: 0;
if (result.success) {
@@ -246,7 +251,7 @@ export async function scrapeController(req: Request, res: Response) {
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
- Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
+ logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
@@ -254,17 +259,19 @@ export async function scrapeController(req: Request, res: Response) {
let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
- if (doc && doc.rawHtml) {
- delete doc.rawHtml;
+ if (doc && (doc as Document).rawHtml) {
+ delete (doc as Document).rawHtml;
}
}
if(pageOptions && pageOptions.includeExtract) {
- if(!pageOptions.includeMarkdown && doc && doc.markdown) {
- delete doc.markdown;
+ if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
+ delete (doc as Document).markdown;
}
}
+ const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
+
logJob({
job_id: jobId,
success: result.success,
@@ -276,21 +283,22 @@ export async function scrapeController(req: Request, res: Response) {
mode: "scrape",
url: req.body.url,
crawlerOptions: crawlerOptions,
- pageOptions: pageOptions,
+ scrapeOptions,
origin: origin,
- extractor_options: extractorOptions,
num_tokens: numTokens,
});
return res.status(result.returnCode).json(result);
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({
error:
- typeof error === "string"
- ? error
- : error?.message ?? "Internal Server Error",
+ error instanceof ZodError
+ ? "Invalid URL"
+ : typeof error === "string"
+ ? error
+ : error?.message ?? "Internal Server Error",
});
}
}
diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts
index 3635a4c4..4dd38afd 100644
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@@ -1,5 +1,4 @@
import { Request, Response } from "express";
-import { WebScraperDataProvider } from "../../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
@@ -8,21 +7,23 @@ import { PageOptions, SearchOptions } from "../../lib/entities";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
+import { Job } from "bullmq";
+import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";
export async function searchHelper(
jobId: string,
req: Request,
team_id: string,
- subscription_id: string,
+ subscription_id: string | null | undefined,
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
- plan: PlanType
+ plan: PlanType | undefined
): Promise<{
success: boolean;
error?: string;
@@ -35,8 +36,8 @@ export async function searchHelper(
return { success: false, error: "Query is required", returnCode: 400 };
}
- const tbs = searchOptions.tbs ?? null;
- const filter = searchOptions.filter ?? null;
+ const tbs = searchOptions.tbs ?? undefined;
+ const filter = searchOptions.filter ?? undefined;
let num_results = Math.min(searchOptions.limit ?? 7, 10);
if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
@@ -57,11 +58,12 @@ export async function searchHelper(
});
let justSearch = pageOptions.fetchPageContent === false;
-
+
+ const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);
if (justSearch) {
billTeam(team_id, subscription_id, res.length).catch(error => {
- Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
+ logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
return { success: true, data: res, returnCode: 200 };
@@ -88,9 +90,9 @@ export async function searchHelper(
data: {
url,
mode: "single_urls",
- crawlerOptions: crawlerOptions,
team_id: team_id,
- pageOptions: pageOptions,
+ scrapeOptions,
+ internalOptions,
},
opts: {
jobId: uuid,
@@ -99,28 +101,23 @@ export async function searchHelper(
};
})
- let jobs = [];
- if (Sentry.isInitialized()) {
- for (const job of jobDatas) {
- // add with sentry instrumentation
- jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority));
- }
- } else {
- jobs = await getScrapeQueue().addBulk(jobDatas);
- await getScrapeQueue().addBulk(jobs);
+ // TODO: addScrapeJobs
+ for (const job of jobDatas) {
+ await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
}
- const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]);
+ const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
}
- await Promise.all(jobs.map(x => x.remove()));
+ const sq = getScrapeQueue();
+ await Promise.all(jobDatas.map(x => sq.remove(x.opts.jobId)));
// make sure doc.content is not empty
const filteredDocs = docs.filter(
- (doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
+ (doc: any) => doc && doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
@@ -137,14 +134,15 @@ export async function searchHelper(
export async function searchController(req: Request, res: Response) {
try {
// make sure to authenticate user first, Bearer
- const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
RateLimiterMode.Search
);
- if (!success) {
- return res.status(status).json({ error });
+ if (!auth.success) {
+ return res.status(auth.status).json({ error: auth.error });
}
+ const { team_id, plan, chunk } = auth;
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: req.body.pageOptions?.includeHtml ?? false,
@@ -167,7 +165,7 @@ export async function searchController(req: Request, res: Response) {
}
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
const startTime = new Date().getTime();
@@ -194,17 +192,16 @@ export async function searchController(req: Request, res: Response) {
mode: "search",
url: req.body.query,
crawlerOptions: crawlerOptions,
- pageOptions: pageOptions,
origin: origin,
});
return res.status(result.returnCode).json(result);
} catch (error) {
- if (error instanceof Error && error.message.startsWith("Job wait")) {
+ if (error instanceof Error && (error.message.startsWith("Job wait") || error.message === "timeout")) {
return res.status(408).json({ error: "Request timed out" });
}
Sentry.captureException(error);
- Logger.error(error);
+ logger.error("Unhandled error occurred in search", { error });
return res.status(500).json({ error: error.message });
}
}
diff --git a/apps/api/src/controllers/v0/status.ts b/apps/api/src/controllers/v0/status.ts
index bf8d2834..c5eafc2d 100644
--- a/apps/api/src/controllers/v0/status.ts
+++ b/apps/api/src/controllers/v0/status.ts
@@ -1,5 +1,5 @@
import { Request, Response } from "express";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
@@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
});
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts
index 7c68341b..b018dc99 100644
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@@ -4,7 +4,6 @@ import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
- legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
@@ -16,6 +15,7 @@ import {
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
+import { addScrapeJobs } from "../../services/queue-jobs";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@@ -27,17 +27,16 @@ export async function batchScrapeController(
await logCrawl(id, req.auth.team_id);
- let { remainingCredits } = req.account;
+ let { remainingCredits } = req.account!;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}
- const pageOptions = legacyScrapeOptions(req.body);
-
const sc: StoredCrawl = {
crawlerOptions: null,
- pageOptions,
+ scrapeOptions: req.body,
+ internalOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@@ -55,23 +54,21 @@ export async function batchScrapeController(
}
const jobs = req.body.urls.map((x) => {
- const uuid = uuidv4();
return {
- name: uuid,
data: {
url: x,
- mode: "single_urls",
+ mode: "single_urls" as const,
team_id: req.auth.team_id,
- plan: req.auth.plan,
+ plan: req.auth.plan!,
crawlerOptions: null,
- pageOptions,
+ scrapeOptions: req.body,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
- jobId: uuid,
+ jobId: uuidv4(),
priority: 20,
},
};
@@ -79,13 +76,14 @@ export async function batchScrapeController(
await lockURLs(
id,
+ sc,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
- await getScrapeQueue().addBulk(jobs);
+ await addScrapeJobs(jobs);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts
index f8fba824..958318b5 100644
--- a/apps/api/src/controllers/v1/crawl-cancel.ts
+++ b/apps/api/src/controllers/v1/crawl-cancel.ts
@@ -1,6 +1,6 @@
import { Response } from "express";
import { supabase_service } from "../../services/supabase";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
@@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
sc.cancelled = true;
await saveCrawl(req.params.jobId, sc);
} catch (error) {
- Logger.error(error);
+ logger.error(error);
}
res.json({
@@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
});
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
}
diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts
index 3738e3a2..18222edc 100644
--- a/apps/api/src/controllers/v1/crawl-status-ws.ts
+++ b/apps/api/src/controllers/v1/crawl-status-ws.ts
@@ -1,14 +1,15 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth";
-import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
+import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
+import { Job, JobState } from "bullmq";
type ErrorMessage = {
type: "error",
@@ -56,7 +57,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth {
@@ -70,15 +71,14 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
        const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
- const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
-
- for (const jobID of newlyDoneJobIDs) {
- const job = await getJob(jobID);
+ const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
+ const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]
+ for (const job of newlyDoneJobs) {
if (job.returnvalue) {
send(ws, {
type: "document",
- data: legacyDocumentConverter(job.returnvalue),
+ data: job.returnvalue,
})
} else {
return close(ws, 3000, { type: "error", error: job.failedReason });
@@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
-        data: data.map(x => legacyDocumentConverter(x)),
+ data: data,
}
});
@@ -139,19 +139,21 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth) {
try {
- const { success, team_id, error, status, plan } = await authenticateUser(
+ const auth = await authenticateUser(
req,
null,
RateLimiterMode.CrawlStatus,
);
- if (!success) {
+ if (!auth.success) {
return close(ws, 3000, {
type: "error",
- error,
+ error: auth.error,
});
}
+ const { team_id, plan } = auth;
+
req.auth = { team_id, plan };
await crawlStatusWS(ws, req);
@@ -170,7 +172,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
}
}
- Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
+ logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
return close(ws, 1011, {
type: "error",
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index a8d78293..f150ddc4 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -1,9 +1,10 @@
import { Response } from "express";
-import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
+import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
+import { Job, JobState } from "bullmq";
configDotenv();
export async function getJob(id: string) {
@@ -24,7 +25,7 @@ export async function getJob(id: string) {
}
export async function getJobs(ids: string[]) {
- const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
+ const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids);
@@ -63,8 +64,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>) {
  if (data.length > 0) {
- if (!doneJobs[0].data.pageOptions.includeRawHtml) {
+ if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
for (let ii = 0; ii < doneJobs.length; ii++) {
if (data[ii]) {
delete data[ii].rawHtml;
@@ -142,7 +143,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>) {
-    data: data.map(x => legacyDocumentConverter(x)),
+ data: data,
});
}
diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index 0000b6fe..aaf33f29 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -4,9 +4,8 @@ import {
CrawlRequest,
crawlRequestSchema,
CrawlResponse,
- legacyCrawlerOptions,
- legacyScrapeOptions,
RequestWithAuth,
+ toLegacyCrawlerOptions,
} from "./types";
import {
addCrawlJob,
@@ -20,9 +19,10 @@ import {
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
+import { scrapeOptions as scrapeOptionsSchema } from "./types";
export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@@ -34,18 +34,22 @@ export async function crawlController(
await logCrawl(id, req.auth.team_id);
- let { remainingCredits } = req.account;
+ let { remainingCredits } = req.account!;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}
- const crawlerOptions = legacyCrawlerOptions(req.body);
- const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+ const crawlerOptions = {
+ ...req.body,
+ url: undefined,
+ scrapeOptions: undefined,
+ };
+ const scrapeOptions = req.body.scrapeOptions;
// TODO: @rafa, is this right? copied from v0
- if (Array.isArray(crawlerOptions.includes)) {
- for (const x of crawlerOptions.includes) {
+ if (Array.isArray(crawlerOptions.includePaths)) {
+ for (const x of crawlerOptions.includePaths) {
try {
new RegExp(x);
} catch (e) {
@@ -54,8 +58,8 @@ export async function crawlController(
}
}
- if (Array.isArray(crawlerOptions.excludes)) {
- for (const x of crawlerOptions.excludes) {
+ if (Array.isArray(crawlerOptions.excludePaths)) {
+ for (const x of crawlerOptions.excludePaths) {
try {
new RegExp(x);
} catch (e) {
@@ -68,8 +72,9 @@ export async function crawlController(
const sc: StoredCrawl = {
originUrl: req.body.url,
- crawlerOptions,
- pageOptions,
+ crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
+ scrapeOptions,
+ internalOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@@ -78,9 +83,9 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc);
try {
- sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
+ sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
} catch (e) {
- Logger.debug(
+ logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`
@@ -112,7 +117,7 @@ export async function crawlController(
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions,
- pageOptions,
+ scrapeOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
@@ -128,6 +133,7 @@ export async function crawlController(
await lockURLs(
id,
+ sc,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
@@ -137,14 +143,15 @@ export async function crawlController(
await getScrapeQueue().addBulk(jobs);
} else {
await lockURL(id, sc, req.body.url);
- const job = await addScrapeJob(
+ const jobId = uuidv4();
+ await addScrapeJob(
{
url: req.body.url,
mode: "single_urls",
- crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
- plan: req.auth.plan,
- pageOptions: pageOptions,
+ crawlerOptions,
+ scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
+ plan: req.auth.plan!,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
@@ -152,9 +159,10 @@ export async function crawlController(
},
{
priority: 15,
- }
+ },
+ jobId,
);
- await addCrawlJob(id, job.id);
+ await addCrawlJob(id, jobId);
}
if(req.body.webhook) {
diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts
index f4e93e7f..f290c5ab 100644
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@@ -1,27 +1,18 @@
import { Request, Response } from "express";
-import { Logger } from "../../lib/logger";
import {
Document,
- legacyDocumentConverter,
- legacyExtractorOptions,
- legacyScrapeOptions,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
ExtractResponse,
- legacyCrawlerOptions,
MapDocument,
} from "./types";
-import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
-import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
-import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
-import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
-import { getMapResults } from "./map";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";
+import { getMapResults } from "./map";
@@ -73,7 +64,7 @@ export async function extractController(
mappedDocuments.push(...(mapResults.links as MapDocument[]));
// transform mappedUrls to just documents
// we quickly rerank
- const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + req.body.prompt.toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
+ const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
console.log(rerank);
} else {
mappedDocuments.push({ url });
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 7e74b43e..91d712de 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -1,9 +1,9 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
- legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
+ scrapeOptions,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
@@ -15,21 +15,33 @@ import {
removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
-import { performCosineSimilarity } from "../../lib/map-cosine";
-import { Logger } from "../../lib/logger";
-import Redis from "ioredis";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
+import { performCosineSimilarity } from "../../lib/map-cosine";
+import { logger } from "../../lib/logger";
+import Redis from "ioredis";
configDotenv();
-const redis = new Redis(process.env.REDIS_URL);
+const redis = new Redis(process.env.REDIS_URL!);
// Max Links that /map can return
const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;
-interface MapOptions {
+export async function getMapResults({
+ url,
+ search,
+ limit = MAX_MAP_LIMIT,
+ ignoreSitemap = false,
+ includeSubdomains = true,
+ crawlerOptions = {},
+ teamId,
+ plan,
+ origin,
+ subId,
+ includeMetadata = false
+}: {
url: string;
search?: string;
limit?: number;
@@ -37,42 +49,34 @@ interface MapOptions {
includeSubdomains?: boolean;
crawlerOptions?: any;
teamId: string;
- plan: string;
+ plan?: string;
origin?: string;
- subId?: string;
+ subId: string | null;
includeMetadata?: boolean;
-}
-
-export async function getMapResults({
- url,
- search,
- limit = MAX_MAP_LIMIT,
- ignoreSitemap = false,
- includeSubdomains = false,
- crawlerOptions = {},
- teamId,
- plan,
- origin,
- subId,
- includeMetadata = false,
-}: MapOptions) {
- const startTime = new Date().getTime();
+}) {
const id = uuidv4();
- let links: { url: string; title?: string; description?: string }[] = [{ url }];
+ let links: string[] = [url];
const sc: StoredCrawl = {
originUrl: url,
- crawlerOptions,
- pageOptions: {},
+ crawlerOptions: {
+ ...crawlerOptions,
+ scrapeOptions: undefined,
+ },
+ scrapeOptions: scrapeOptions.parse({}),
+ internalOptions: {},
team_id: teamId,
createdAt: Date.now(),
- plan,
+ plan: plan,
};
const crawler = crawlToCrawler(id, sc);
let urlWithoutWww = url.replace("www.", "");
- let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;
+
+ let mapUrl = search
+ ? `"${search}" site:${urlWithoutWww}`
+ : `site:${url}`;
const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
@@ -80,8 +84,8 @@ export async function getMapResults({
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
- let allResults: any[];
-  let pagePromises: Promise<any>[];
+ let allResults: any[] = [];
+  let pagePromises: Promise<any>[] = [];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
@@ -96,9 +100,11 @@ export async function getMapResults({
pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
allResults = await Promise.all(pagePromises);
- await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
+ await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
+ console.log("allResults", allResults);
+ // Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
@@ -110,7 +116,7 @@ export async function getMapResults({
if (sitemap !== null) {
sitemap.forEach((x) => {
- links.push({ url: x.url });
+ links.push(x.url);
});
}
@@ -125,62 +131,74 @@ export async function getMapResults({
if (mapResults.length > 0) {
if (search) {
+ // Ensure all map results are first, maintaining their order
links = [
- { url: mapResults[0].url, title: mapResults[0].title, description: mapResults[0].description },
- ...mapResults.slice(1).map((x) => ({
- url: x.url,
- title: x.title,
- description: x.description
- })),
+ mapResults[0].url,
+ ...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
- mapResults.forEach((x) => {
- links.push({
- url: x.url,
- title: x.title,
- description: x.description
- });
+ mapResults.map((x) => {
+ links.push(x.url);
});
}
}
+ // Perform cosine similarity between the search query and the list of links
if (search) {
- const filteredLinks = performCosineSimilarity(links.map(l => l.url), search.toLowerCase());
- links = links.filter(l => filteredLinks.includes(l.url));
+ const searchQuery = search.toLowerCase();
+ links = performCosineSimilarity(links, searchQuery);
}
links = links
.map((x) => {
try {
- return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
+ return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
}
})
- .filter((x) => x !== null);
+ .filter((x) => x !== null) as string[];
- links = links.filter((x) => isSameDomain(x.url, url));
+ // allows for subdomains to be included
+ links = links.filter((x) => isSameDomain(x, url));
+ // if includeSubdomains is false, filter out subdomains
if (!includeSubdomains) {
- links = links.filter((x) => isSameSubdomain(x.url, url));
+ links = links.filter((x) => isSameSubdomain(x, url));
}
- links = removeDuplicateUrls(links.map(l => l.url)).map(url => links.find(l => l.url === url));
+ // remove duplicates that could be due to http/https or www
+ links = removeDuplicateUrls(links);
- const endTime = new Date().getTime();
- const timeTakenInSeconds = (endTime - startTime) / 1000;
+ billTeam(teamId, subId, 1).catch((error) => {
+ logger.error(
+ `Failed to bill team ${teamId} for 1 credit: ${error}`
+ );
+ });
const linksToReturn = links.slice(0, limit);
+ logJob({
+ job_id: id,
+ success: links.length > 0,
+ message: "Map completed",
+ num_docs: linksToReturn.length,
+ docs: linksToReturn,
+ time_taken: (new Date().getTime() - Date.now()) / 1000,
+ team_id: teamId,
+ mode: "map",
+ url: url,
+ crawlerOptions: {},
+ scrapeOptions: {},
+ origin: origin ?? "api",
+ num_tokens: 0,
+ });
+
return {
- links: includeMetadata ? linksToReturn : linksToReturn.map(l => l.url),
- scrapeId: origin?.includes("website") ? id : undefined,
- timeTakenInSeconds,
- id,
- linksLength: links.length,
- linksToReturnLength: linksToReturn.length,
- docs: linksToReturn.map(l => l.url),
+ success: true,
+ links: includeMetadata ? mapResults : linksToReturn,
+ scrape_id: origin?.includes("website") ? id : undefined,
};
}
@@ -190,43 +208,73 @@ export async function mapController(
) {
req.body = mapRequestSchema.parse(req.body);
- const results = await getMapResults({
+ console.log("req.body", req.body);
+ const result = await getMapResults({
url: req.body.url,
search: req.body.search,
limit: req.body.limit,
ignoreSitemap: req.body.ignoreSitemap,
includeSubdomains: req.body.includeSubdomains,
- crawlerOptions: legacyCrawlerOptions(req.body),
+ crawlerOptions: req.body,
teamId: req.auth.team_id,
plan: req.auth.plan,
origin: req.body.origin,
- subId: req.acuc?.sub_id,
+ subId: req.acuc?.sub_id
});
- await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
- Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
- });
+ const response = {
+ success: true as const,
+ links: result.links,
+ scrape_id: result.scrape_id
+ };
- await logJob({
- job_id: results.id,
- success: results.linksLength > 0,
- message: "Map completed",
- num_docs: results.linksToReturnLength,
- docs: results.docs,
- time_taken: results.timeTakenInSeconds,
- team_id: req.auth.team_id,
- mode: "map",
- url: req.body.url,
- crawlerOptions: {},
- pageOptions: {},
- origin: req.body.origin,
- extractor_options: { mode: "markdown" },
- num_tokens: 0,
- });
-
- return res.status(200).json({
- success: true,
- links: results.links.map(l => l.url),
- scrape_id: results.scrapeId,
- });
+ return res.status(200).json(response);
}
+
+// Subdomain sitemap url checking
+
+// // For each result, check for subdomains, get their sitemaps and add them to the links
+// const processedUrls = new Set();
+// const processedSubdomains = new Set();
+
+// for (const result of links) {
+// let url;
+// let hostParts;
+// try {
+// url = new URL(result);
+// hostParts = url.hostname.split('.');
+// } catch (e) {
+// continue;
+// }
+
+// console.log("hostParts", hostParts);
+// // Check if it's a subdomain (more than 2 parts, and not 'www')
+// if (hostParts.length > 2 && hostParts[0] !== 'www') {
+// const subdomain = hostParts[0];
+// console.log("subdomain", subdomain);
+// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
+// console.log("subdomainUrl", subdomainUrl);
+
+// if (!processedSubdomains.has(subdomainUrl)) {
+// processedSubdomains.add(subdomainUrl);
+
+// const subdomainCrawl = crawlToCrawler(id, {
+// originUrl: subdomainUrl,
+// crawlerOptions: legacyCrawlerOptions(req.body),
+// pageOptions: {},
+// team_id: req.auth.team_id,
+// createdAt: Date.now(),
+// plan: req.auth.plan,
+// });
+// const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
+// if (subdomainSitemap) {
+// subdomainSitemap.forEach((x) => {
+// if (!processedUrls.has(x.url)) {
+// processedUrls.add(x.url);
+// links.push(x.url);
+// }
+// });
+// }
+// }
+// }
+// }
\ No newline at end of file
diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts
index 5e0aecb6..db50f7d3 100644
--- a/apps/api/src/controllers/v1/scrape-status.ts
+++ b/apps/api/src/controllers/v1/scrape-status.ts
@@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) {
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
- if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+ if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
return res.status(403).json({
success: false,
error: "You are not allowed to access this resource.",
diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index d0d4c5fc..9c85c91e 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -1,10 +1,7 @@
-import { Request, Response } from "express";
-import { Logger } from "../../lib/logger";
+import { Response } from "express";
+import { logger } from "../../lib/logger";
import {
Document,
- legacyDocumentConverter,
- legacyExtractorOptions,
- legacyScrapeOptions,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
@@ -12,11 +9,11 @@ import {
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
-import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
+import { getScrapeQueue } from "../../services/queue-service";
export async function scrapeController(
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
@@ -27,8 +24,6 @@ export async function scrapeController(
const origin = req.body.origin;
const timeout = req.body.timeout;
- const pageOptions = legacyScrapeOptions(req.body);
- const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const jobId = uuidv4();
const startTime = new Date().getTime();
@@ -38,15 +33,14 @@ export async function scrapeController(
basePriority: 10,
});
- const job = await addScrapeJob(
+ await addScrapeJob(
{
url: req.body.url,
mode: "single_urls",
- crawlerOptions: {},
team_id: req.auth.team_id,
- plan: req.auth.plan,
- pageOptions,
- extractorOptions,
+ scrapeOptions: req.body,
+ internalOptions: {},
+ plan: req.auth.plan!,
origin: req.body.origin,
is_scrape: true,
},
@@ -55,14 +49,14 @@ export async function scrapeController(
jobPriority
);
- const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);
+ const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
- let doc: any | undefined;
+ let doc: Document;
try {
- doc = (await waitForJob(job.id, timeout + totalWait))[0];
+ doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) {
- Logger.error(`Error in scrapeController: ${e}`);
- if (e instanceof Error && e.message.startsWith("Job wait")) {
+ logger.error(`Error in scrapeController: ${e}`);
+ if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
return res.status(408).json({
success: false,
error: "Request timed out",
@@ -70,34 +64,19 @@ export async function scrapeController(
} else {
return res.status(500).json({
success: false,
- error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
- extractorOptions && extractorOptions.mode !== "markdown"
- ? " - Could be due to LLM parsing issues"
- : ""
- }`,
+ error: `(Internal server error) - ${(e && e.message) ? e.message : e}`,
});
}
}
- await job.remove();
-
- if (!doc) {
- console.error("!!! PANIC DOC IS", doc, job);
- return res.status(200).json({
- success: true,
- warning: "No page found",
- data: doc,
- });
- }
-
- delete doc.index;
- delete doc.provider;
+ await getScrapeQueue().remove(jobId);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
- doc && doc.markdown
- ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+ doc && doc.extract
+ // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+ ? 0 // TODO: fix
: 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document
@@ -110,22 +89,16 @@ export async function scrapeController(
}
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
- Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
+ logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
- if (!pageOptions || !pageOptions.includeRawHtml) {
+ if (!req.body.formats.includes("rawHtml")) {
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}
- if(pageOptions && pageOptions.includeExtract) {
- if(!pageOptions.includeMarkdown && doc && doc.markdown) {
- delete doc.markdown;
- }
- }
-
logJob({
job_id: jobId,
success: true,
@@ -136,16 +109,14 @@ export async function scrapeController(
team_id: req.auth.team_id,
mode: "scrape",
url: req.body.url,
- crawlerOptions: {},
- pageOptions: pageOptions,
+ scrapeOptions: req.body,
origin: origin,
- extractor_options: extractorOptions,
num_tokens: numTokens,
});
return res.status(200).json({
success: true,
- data: legacyDocumentConverter(doc),
+ data: doc,
scrape_id: origin?.includes("website") ? jobId : undefined,
});
}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 4e31dd86..530ca765 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -1,10 +1,11 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
-import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
import { countries } from "../../lib/validate-country";
+import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
+import { InternalOptions } from "../../scraper/scrapeURL";
export type Format =
| "markdown"
@@ -52,7 +53,7 @@ const strictMessage = "Unrecognized key in body -- please review the v1 API docu
export const extractOptions = z.object({
mode: z.enum(["llm"]).default("llm"),
schema: z.any().optional(),
- systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
+ systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."),
prompt: z.string().optional()
}).strict(strictMessage);
@@ -61,8 +62,14 @@ export type ExtractOptions = z.infer<typeof extractOptions>;
export const actionsSchema = z.array(z.union([
z.object({
type: z.literal("wait"),
- milliseconds: z.number().int().positive().finite(),
- }),
+ milliseconds: z.number().int().positive().finite().optional(),
+ selector: z.string().optional(),
+ }).refine(
+ (data) => (data.milliseconds !== undefined || data.selector !== undefined) && !(data.milliseconds !== undefined && data.selector !== undefined),
+ {
+ message: "Either 'milliseconds' or 'selector' must be provided, but not both.",
+ }
+ ),
z.object({
type: z.literal("click"),
selector: z.string(),
@@ -81,7 +88,15 @@ export const actionsSchema = z.array(z.union([
}),
z.object({
type: z.literal("scroll"),
- direction: z.enum(["up", "down"]),
+ direction: z.enum(["up", "down"]).optional().default("down"),
+ selector: z.string().optional(),
+ }),
+ z.object({
+ type: z.literal("scrape"),
+ }),
+ z.object({
+ type: z.literal("executeJavascript"),
+ script: z.string()
}),
]));
@@ -107,17 +122,32 @@ export const scrapeOptions = z.object({
timeout: z.number().int().positive().finite().safe().default(30000),
waitFor: z.number().int().nonnegative().finite().safe().default(0),
extract: extractOptions.optional(),
+ mobile: z.boolean().default(false),
parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(),
+ // New
+ location: z.object({
+ country: z.string().optional().refine(
+ (val) => !val || Object.keys(countries).includes(val.toUpperCase()),
+ {
+ message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
+ }
+ ).transform(val => val ? val.toUpperCase() : 'US'),
+ languages: z.string().array().optional(),
+ }).optional(),
+
+ // Deprecated
geolocation: z.object({
country: z.string().optional().refine(
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
{
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
}
- ).transform(val => val ? val.toUpperCase() : 'US')
+ ).transform(val => val ? val.toUpperCase() : 'US'),
+ languages: z.string().array().optional(),
}).optional(),
skipTlsVerification: z.boolean().default(false),
+ removeBase64Images: z.boolean().default(true),
}).strict(strictMessage)
@@ -158,6 +188,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
+export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
@@ -188,6 +219,8 @@ const crawlerOptions = z.object({
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
+ deduplicateSimilarURLs: z.boolean().default(true),
+ ignoreQueryParameters: z.boolean().default(false),
}).strict(strictMessage);
// export type CrawlerOptions = {
@@ -231,7 +264,7 @@ export const mapRequestSchema = crawlerOptions.extend({
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
- limit: z.number().min(1).max(5000).default(5000).optional(),
+ limit: z.number().min(1).max(5000).default(5000),
}).strict(strictMessage);
// export type MapRequest = {
@@ -243,13 +276,14 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
markdown?: string;
- extract?: string;
+ extract?: any;
html?: string;
rawHtml?: string;
links?: string[];
screenshot?: string;
actions?: {
- screenshots: string[];
+ screenshots?: string[];
+ scrapes?: ScrapeActionContent[];
};
warning?: string;
metadata: {
@@ -282,11 +316,11 @@ export type Document = {
publishedTime?: string;
articleTag?: string;
articleSection?: string;
+ url?: string;
sourceURL?: string;
statusCode?: number;
error?: string;
[key: string]: string | string[] | number | undefined;
-
};
};
@@ -372,7 +406,7 @@ export type CrawlStatusResponse =
type AuthObject = {
team_id: string;
- plan: PlanType;
+ plan: PlanType | undefined;
};
type Account = {
@@ -445,7 +479,7 @@ export interface ResponseWithSentry<
sentry?: string,
}
-export function legacyCrawlerOptions(x: CrawlerOptions) {
+export function toLegacyCrawlerOptions(x: CrawlerOptions) {
return {
includes: x.includePaths,
excludes: x.excludePaths,
@@ -456,69 +490,26 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks,
ignoreSitemap: x.ignoreSitemap,
+ deduplicateSimilarURLs: x.deduplicateSimilarURLs,
+ ignoreQueryParameters: x.ignoreQueryParameters,
};
}
-export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
+export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
return {
- includeMarkdown: x.formats.includes("markdown"),
- includeHtml: x.formats.includes("html"),
- includeRawHtml: x.formats.includes("rawHtml"),
- includeExtract: x.formats.includes("extract"),
- onlyIncludeTags: x.includeTags,
- removeTags: x.excludeTags,
- onlyMainContent: x.onlyMainContent,
- waitFor: x.waitFor,
- headers: x.headers,
- includeLinks: x.formats.includes("links"),
- screenshot: x.formats.includes("screenshot"),
- fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
- parsePDF: x.parsePDF,
- actions: x.actions as Action[], // no strict null checking grrrr - mogery
- geolocation: x.geolocation,
- skipTlsVerification: x.skipTlsVerification
- };
-}
-
-export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
- return {
- mode: x.mode ? "llm-extraction" : "markdown",
- extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
- extractionSchema: x.schema,
- userPrompt: x.prompt ?? "",
- };
-}
-
-export function legacyDocumentConverter(doc: any): Document {
- if (doc === null || doc === undefined) return null;
-
- if (doc.metadata) {
- if (doc.metadata.screenshot) {
- doc.screenshot = doc.metadata.screenshot;
- delete doc.metadata.screenshot;
- }
-
- if (doc.metadata.fullPageScreenshot) {
- doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
- delete doc.metadata.fullPageScreenshot;
- }
- }
-
- return {
- markdown: doc.markdown,
- links: doc.linksOnPage,
- rawHtml: doc.rawHtml,
- html: doc.html,
- extract: doc.llm_extraction,
- screenshot: doc.screenshot ?? doc.fullPageScreenshot,
- actions: doc.actions ?? undefined,
- warning: doc.warning ?? undefined,
- metadata: {
- ...doc.metadata,
- pageError: undefined,
- pageStatusCode: undefined,
- error: doc.metadata?.pageError,
- statusCode: doc.metadata?.pageStatusCode,
+ crawlOptions: crawlerOptions.parse({
+ includePaths: x.includes,
+ excludePaths: x.excludes,
+ limit: x.maxCrawledLinks ?? x.limit,
+ maxDepth: x.maxDepth,
+ allowBackwardLinks: x.allowBackwardCrawling,
+ allowExternalLinks: x.allowExternalContentLinks,
+ ignoreSitemap: x.ignoreSitemap,
+ deduplicateSimilarURLs: x.deduplicateSimilarURLs,
+ ignoreQueryParameters: x.ignoreQueryParameters,
+ }),
+ internalOptions: {
+ v0CrawlOnlyUrls: x.returnOnlyUrls,
},
};
}
@@ -529,4 +520,73 @@ export interface MapDocument {
url: string;
title?: string;
description?: string;
-}
\ No newline at end of file
+}
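+// Maps v0 PageOptions/ExtractorOptions onto the v1 scrapeOptions schema: the old include* booleans become entries in "formats", and scraper-internal flags (atsv, disableJsDom, useFastMode) move into internalOptions.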
+export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
+ return {
+ scrapeOptions: scrapeOptions.parse({
+ formats: [
+ (pageOptions.includeMarkdown ?? true) ? "markdown" as const : null,
+ (pageOptions.includeHtml ?? false) ? "html" as const : null,
+ (pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null,
+ (pageOptions.screenshot ?? false) ? "screenshot" as const : null,
+ (pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null,
+ (extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null,
+ "links"
+ ].filter(x => x !== null),
+ waitFor: pageOptions.waitFor,
+ headers: pageOptions.headers,
+ includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags),
+ excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags),
+ onlyMainContent: pageOptions.onlyMainContent ?? false,
+ timeout: timeout,
+ parsePDF: pageOptions.parsePDF,
+ actions: pageOptions.actions,
+ location: pageOptions.geolocation,
+ skipTlsVerification: pageOptions.skipTlsVerification,
+ removeBase64Images: pageOptions.removeBase64Images,
+ extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
+ systemPrompt: extractorOptions.extractionPrompt,
+ prompt: extractorOptions.userPrompt,
+ schema: extractorOptions.extractionSchema,
+ } : undefined,
+ mobile: pageOptions.mobile,
+ }),
+ internalOptions: {
+ atsv: pageOptions.atsv,
+ v0DisableJsDom: pageOptions.disableJsDom,
+ v0UseFastMode: pageOptions.useFastMode,
+ },
+ // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
+ }
+}
+
+export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
+ const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
+ const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
+ return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
+}
+
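+// Inverse direction: convert a v1 Document back into the v0 response shape (or a bare { url } when v0CrawlOnlyUrls is set).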
+export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
+ if (internalOptions.v0CrawlOnlyUrls) {
+ return { url: document.metadata.sourceURL! };
+ }
+
+ return {
+ content: document.markdown!,
+ markdown: document.markdown!,
+ html: document.html,
+ rawHtml: document.rawHtml,
+ linksOnPage: document.links,
+ llm_extraction: document.extract,
+ metadata: {
+ ...document.metadata,
+ error: undefined,
+ statusCode: undefined,
+ pageError: document.metadata.error,
+ pageStatusCode: document.metadata.statusCode,
+ screenshot: document.screenshot,
+ },
+ actions: document.actions ,
+ warning: document.warning,
+ }
+}
diff --git a/apps/api/src/example.ts b/apps/api/src/example.ts
deleted file mode 100644
index edf0faef..00000000
--- a/apps/api/src/example.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-import { WebScraperDataProvider } from "./scraper/WebScraper";
-
-async function example() {
- const example = new WebScraperDataProvider();
-
- await example.setOptions({
- jobId: "TEST",
- mode: "crawl",
- urls: ["https://mendable.ai"],
- crawlerOptions: {},
- });
- const docs = await example.getDocuments(false);
- docs.map((doc) => {
- console.log(doc.metadata.sourceURL);
- });
- console.log(docs.length);
-}
-
-// example();
diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index 5ccbb9cc..7f7ec036 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -6,28 +6,24 @@ import bodyParser from "body-parser";
import cors from "cors";
import { getScrapeQueue } from "./services/queue-service";
import { v0Router } from "./routes/v0";
-import { initSDK } from "@hyperdx/node-opentelemetry";
import os from "os";
-import { Logger } from "./lib/logger";
+import { logger } from "./lib/logger";
import { adminRouter } from "./routes/admin";
-import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http';
import https from 'node:https';
import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1";
import expressWs from "express-ws";
-import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid";
-import dns from 'node:dns';
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
-Logger.info(`Number of CPUs: ${numCPUs} available`);
+logger.info(`Number of CPUs: ${numCPUs} available`);
const cacheable = new CacheableLookup()
@@ -55,7 +51,6 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
serverAdapter: serverAdapter,
});
-
app.use(
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
serverAdapter.getRouter()
@@ -78,15 +73,10 @@ app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost";
-// HyperDX OpenTelemetry
-if (process.env.ENV === "production") {
- initSDK({ consoleCapture: true, additionalInstrumentations: [] });
-}
-
function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
- Logger.info(`Worker ${process.pid} listening on port ${port}`);
- Logger.info(
+ logger.info(`Worker ${process.pid} listening on port ${port}`);
+ logger.info(
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
});
@@ -103,7 +93,6 @@ app.get(`/serverHealthCheck`, async (req, res) => {
const [waitingJobs] = await Promise.all([
scrapeQueue.getWaitingCount(),
]);
-
const noWaitingJobs = waitingJobs === 0;
// 200 if no active jobs, 503 if there are active jobs
return res.status(noWaitingJobs ? 200 : 500).json({
@@ -111,7 +100,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
});
} catch (error) {
Sentry.captureException(error);
- Logger.error(error);
+ logger.error(error);
return res.status(500).json({ error: error.message });
}
});
@@ -140,7 +129,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
// Re-check the waiting jobs count after the timeout
waitingJobsCount = await getWaitingJobsCount();
if (waitingJobsCount >= treshold) {
- const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
+ const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!;
const message = {
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
timeout / 60000
@@ -156,14 +145,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
});
if (!response.ok) {
- Logger.error("Failed to send Slack notification");
+ logger.error("Failed to send Slack notification");
}
}
}, timeout);
}
} catch (error) {
Sentry.captureException(error);
- Logger.debug(error);
+ logger.debug(error);
}
};
@@ -178,7 +167,7 @@ app.get("/is-production", (req, res) => {
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => {
if (err instanceof ZodError) {
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
- Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
+ logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
}
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
@@ -206,11 +195,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
}
}
- Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
+ logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
});
-Logger.info(`Worker ${process.pid} started`);
+logger.info(`Worker ${process.pid} started`);
// const sq = getScrapeQueue();
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index d05f9bd7..430dc1d4 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";
-import { Logger } from "../logger";
+import { logger } from "../logger";
// Generate completion using OpenAI
export async function generateCompletions(
documents: Document[],
- extractionOptions: ExtractorOptions,
+ extractionOptions: ExtractorOptions | undefined,
mode: "markdown" | "raw-html"
): Promise<Document[]> {
// const schema = zodToJsonSchema(options.schema)
- const schema = extractionOptions.extractionSchema;
- const systemPrompt = extractionOptions.extractionPrompt;
- const prompt = extractionOptions.userPrompt;
+ const schema = extractionOptions?.extractionSchema;
+ const systemPrompt = extractionOptions?.extractionPrompt;
+ const prompt = extractionOptions?.userPrompt;
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
@@ -51,7 +51,7 @@ export async function generateCompletions(
return completionResult;
} catch (error) {
- Logger.error(`Error generating completions: ${error}`);
+ logger.error(`Error generating completions: ${error}`);
throw error;
}
default:
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 23147b12..f777dce9 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -95,7 +95,7 @@ export async function generateOpenAICompletions({
try {
llmExtraction = JSON.parse(
- jsonCompletion.choices[0].message.content.trim()
+ (jsonCompletion.choices[0].message.content ?? "").trim()
);
} catch (e) {
throw new Error("Invalid JSON");
diff --git a/apps/api/src/lib/batch-process.ts b/apps/api/src/lib/batch-process.ts
index 30289fd0..802d1eb1 100644
--- a/apps/api/src/lib/batch-process.ts
+++ b/apps/api/src/lib/batch-process.ts
@@ -3,7 +3,7 @@ export async function batchProcess(
batchSize: number,
asyncFunction: (item: T, index: number) => Promise<void>
): Promise<void> {
- const batches = [];
+ const batches: T[][] = [];
for (let i = 0; i < array.length; i += batchSize) {
const batch = array.slice(i, i + batchSize);
batches.push(batch);
diff --git a/apps/api/src/lib/concurrency-limit.ts b/apps/api/src/lib/concurrency-limit.ts
new file mode 100644
index 00000000..72dc1e45
--- /dev/null
+++ b/apps/api/src/lib/concurrency-limit.ts
@@ -0,0 +1,48 @@
+import { getRateLimiterPoints } from "../services/rate-limiter";
+import { redisConnection } from "../services/queue-service";
+import { RateLimiterMode } from "../types";
+import { JobsOptions } from "bullmq";
+
+const constructKey = (team_id: string) => "concurrency-limiter:" + team_id;
+const constructQueueKey = (team_id: string) => "concurrency-limit-queue:" + team_id;
+const stalledJobTimeoutMs = 2 * 60 * 1000;
+
+export function getConcurrencyLimitMax(plan: string): number {
+ return getRateLimiterPoints(RateLimiterMode.Scrape, undefined, plan);
+}
+
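+// Active jobs are tracked in a per-team sorted set: each member's score is the timestamp at
+// which the job is considered stalled, so expired entries can be evicted purely by score.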
+export async function cleanOldConcurrencyLimitEntries(team_id: string, now: number = Date.now()) {
+ await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
+}
+
+export async function getConcurrencyLimitActiveJobs(team_id: string, now: number = Date.now()): Promise<string[]> {
+ return await redisConnection.zrangebyscore(constructKey(team_id), now, Infinity);
+}
+
+export async function pushConcurrencyLimitActiveJob(team_id: string, id: string, now: number = Date.now()) {
+ await redisConnection.zadd(constructKey(team_id), now + stalledJobTimeoutMs, id);
+}
+
+export async function removeConcurrencyLimitActiveJob(team_id: string, id: string) {
+ await redisConnection.zrem(constructKey(team_id), id);
+}
+
+export type ConcurrencyLimitedJob = {
+ id: string;
+ data: any;
+ opts: JobsOptions;
+ priority?: number;
+}
+
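+// Jobs waiting on the concurrency limit live in a separate sorted set scored by priority,
+// so zmpop MIN pops the highest-priority (lowest-score) queued job first.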
+export async function takeConcurrencyLimitedJob(team_id: string): Promise<ConcurrencyLimitedJob | null> {
+ const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
+ if (res === null || res === undefined) {
+ return null;
+ }
+
+ return JSON.parse(res[1][0][0]);
+}
+
+export async function pushConcurrencyLimitedJob(team_id: string, job: ConcurrencyLimitedJob) {
+ await redisConnection.zadd(constructQueueKey(team_id), job.priority ?? 1, JSON.stringify(job));
+}
diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts
new file mode 100644
index 00000000..eb9c81f1
--- /dev/null
+++ b/apps/api/src/lib/crawl-redis.test.ts
@@ -0,0 +1,33 @@
+import { generateURLPermutations } from "./crawl-redis";
+
+describe("generateURLPermutations", () => {
+ it("generates permutations correctly", () => {
+ const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
+ expect(bareHttps.length).toBe(4);
+ expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
+ expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+ expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
+ expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+ const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
+ expect(bareHttp.length).toBe(4);
+ expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
+ expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+ expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
+ expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+
+ const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
+ expect(wwwHttps.length).toBe(4);
+ expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
+ expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+ expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
+ expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+ const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
+ expect(wwwHttp.length).toBe(4);
+ expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
+ expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+ expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
+ expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+ })
+});
\ No newline at end of file
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 379bc179..b5936ad6 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -1,13 +1,17 @@
+import { InternalOptions } from "../scraper/scrapeURL";
+import { ScrapeOptions } from "../controllers/v1/types";
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";
-import { Logger } from "./logger";
+import { logger } from "./logger";
+import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
export type StoredCrawl = {
originUrl?: string;
crawlerOptions: any;
- pageOptions: any;
+ scrapeOptions: Omit<ScrapeOptions, "timeout">;
+ internalOptions: InternalOptions;
team_id: string;
- plan: string;
+ plan?: string;
robots?: string;
cancelled?: boolean;
createdAt: number;
@@ -87,40 +91,74 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
}
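+// Always strips the fragment, and strips the query string when ignoreQueryParameters is set
+// (or no crawler options are stored), keeping visited-set membership stable across variants.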
+export function normalizeURL(url: string, sc: StoredCrawl): string {
+ const urlO = new URL(url);
+ if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
+ urlO.search = "";
+ }
+ urlO.hash = "";
+ return urlO.href;
+}
+
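+// Used by lockURL when deduplicateSimilarURLs is enabled: a URL counts as already visited
+// if any of its www/non-www and http/https variants has been seen.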
+export function generateURLPermutations(url: string | URL): URL[] {
+ const urlO = new URL(url);
+
+ // Construct two versions, one with www., one without
+ const urlWithWWW = new URL(urlO);
+ const urlWithoutWWW = new URL(urlO);
+ if (urlO.hostname.startsWith("www.")) {
+ urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+ } else {
+ urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+ }
+
+ let permutations = [urlWithWWW, urlWithoutWWW];
+
+ // Construct more versions for http/https
+ permutations = permutations.flatMap(urlO => {
+ if (!["http:", "https:"].includes(urlO.protocol)) {
+ return [urlO];
+ }
+
+ const urlWithHTTP = new URL(urlO);
+ const urlWithHTTPS = new URL(urlO);
+ urlWithHTTP.protocol = "http:";
+ urlWithHTTPS.protocol = "https:";
+
+ return [urlWithHTTP, urlWithHTTPS];
+ });
+
+ return permutations;
+}
+
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") {
- if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
+ if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
return false;
}
}
- try {
- const urlO = new URL(url);
- urlO.search = "";
- urlO.hash = "";
- url = urlO.href;
- } catch (error) {
- Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
+ url = normalizeURL(url, sc);
+
+ await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
+ await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
+
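+ // With deduplication on, the lock only succeeds if every permutation was newly added,
+ // i.e. none of the similar URLs has been visited before.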
+ let res: boolean;
+ if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
+ res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+ } else {
+ const permutations = generateURLPermutations(url);
+ res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
}
- const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
return res;
}
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
-export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
urls = urls.map(url => {
- try {
- const urlO = new URL(url);
- urlO.search = "";
- urlO.hash = "";
- return urlO.href;
- } catch (error) {
- Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
- }
-
- return url;
+ return normalizeURL(url, sc);
});
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
@@ -131,11 +169,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
- initialUrl: sc.originUrl,
+ initialUrl: sc.originUrl!,
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
- maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
+ maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
limit: sc.crawlerOptions?.limit ?? 10000,
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 8aa1d004..9fa39cff 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -1,3 +1,5 @@
+import type { Document as V1Document } from "../controllers/v1/types";
+
export interface Progress {
current: number;
total: number;
@@ -12,7 +14,8 @@ export interface Progress {
export type Action = {
type: "wait",
- milliseconds: number,
+ milliseconds?: number,
+ selector?: string,
} | {
type: "click",
selector: string,
@@ -27,8 +30,14 @@ export type Action = {
key: string,
} | {
type: "scroll",
- direction: "up" | "down"
-};
+ direction?: "up" | "down",
+ selector?: string,
+} | {
+ type: "scrape",
+} | {
+ type: "executeJavascript",
+ script: string,
+}
export type PageOptions = {
includeMarkdown?: boolean;
@@ -55,6 +64,8 @@ export type PageOptions = {
country?: string;
};
skipTlsVerification?: boolean;
+ removeBase64Images?: boolean;
+ mobile?: boolean;
};
export type ExtractorOptions = {
@@ -124,7 +135,8 @@ export class Document {
provider?: string;
warning?: string;
actions?: {
- screenshots: string[];
+ screenshots?: string[];
+ scrapes?: ScrapeActionContent[];
}
index?: number;
@@ -163,11 +175,17 @@ export class SearchResult {
}
}
+export interface ScrapeActionContent {
+ url: string;
+ html: string;
+}
+
export interface FireEngineResponse {
html: string;
screenshots?: string[];
pageStatusCode?: number;
pageError?: string;
+ scrapeActionContent?: ScrapeActionContent[];
}
diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts
index a542a434..8800d916 100644
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@@ -5,7 +5,7 @@ import "../services/sentry"
import * as Sentry from "@sentry/node";
import dotenv from 'dotenv';
-import { Logger } from './logger';
+import { logger } from './logger';
dotenv.config();
// TODO: add a timeout to the Go parser
@@ -15,7 +15,7 @@ class GoMarkdownConverter {
private convert: any;
private constructor() {
- const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so');
+ const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
const lib = koffi.load(goExecutablePath);
this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
}
@@ -40,7 +40,7 @@ class GoMarkdownConverter {
}
}
-export async function parseMarkdown(html: string): Promise<string> {
+export async function parseMarkdown(html: string | null | undefined): Promise<string> {
if (!html) {
return '';
}
@@ -52,12 +52,12 @@ export async function parseMarkdown(html: string): Promise {
markdownContent = processMultiLineLinks(markdownContent);
markdownContent = removeSkipToContentLinks(markdownContent);
- Logger.info(`HTML to Markdown conversion using Go parser successful`);
+ logger.info(`HTML to Markdown conversion using Go parser successful`);
return markdownContent;
}
} catch (error) {
Sentry.captureException(error);
- Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
+ logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
}
// Fallback to TurndownService if Go parser fails or is not enabled
diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts
index 83fefcec..27e45230 100644
--- a/apps/api/src/lib/job-priority.ts
+++ b/apps/api/src/lib/job-priority.ts
@@ -1,6 +1,6 @@
import { redisConnection } from "../../src/services/queue-service";
import { PlanType } from "../../src/types";
-import { Logger } from "./logger";
+import { logger } from "./logger";
const SET_KEY_PREFIX = "limit_team_id:";
export async function addJobPriority(team_id, job_id) {
@@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) {
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
await redisConnection.expire(setKey, 60);
} catch (e) {
- Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
+ logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
}
}
@@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) {
// remove job_id from the set
await redisConnection.srem(setKey, job_id);
} catch (e) {
- Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
+ logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
}
}
@@ -33,7 +33,7 @@ export async function getJobPriority({
team_id,
basePriority = 10,
}: {
- plan: PlanType;
+ plan: PlanType | undefined;
team_id: string;
basePriority?: number;
}): Promise<number> {
@@ -70,6 +70,14 @@ export async function getJobPriority({
bucketLimit = 400;
planModifier = 0.1;
break;
+ case "etier2c":
+ bucketLimit = 1000;
+ planModifier = 0.05;
+ break;
+ case "etier1a":
+ bucketLimit = 1000;
+ planModifier = 0.05;
+ break;
default:
bucketLimit = 25;
@@ -87,7 +95,7 @@ export async function getJobPriority({
);
}
} catch (e) {
- Logger.error(
+ logger.error(
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
);
return basePriority;
diff --git a/apps/api/src/lib/load-testing-example.ts b/apps/api/src/lib/load-testing-example.ts
deleted file mode 100644
index 01b61db9..00000000
--- a/apps/api/src/lib/load-testing-example.ts
+++ /dev/null
@@ -1,42 +0,0 @@
-// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
-
-// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
-
-// const scrapInBatches = async (
-// urls: string[],
-// batchSize: number,
-// delayMs: number
-// ) => {
-// let successCount = 0;
-// let errorCount = 0;
-
-// for (let i = 0; i < urls.length; i += batchSize) {
-// const batch = urls
-// .slice(i, i + batchSize)
-// .map((url) => scrapWithFireEngine(url));
-// try {
-// const results = await Promise.all(batch);
-// results.forEach((data, index) => {
-// if (data.trim() === "") {
-// errorCount++;
-// } else {
-// successCount++;
-// console.log(
-// `Scraping result ${i + index + 1}:`,
-// data.trim().substring(0, 20) + "..."
-// );
-// }
-// });
-// } catch (error) {
-// console.error("Error during scraping:", error);
-// }
-// await delay(delayMs);
-// }
-
-// console.log(`Total successful scrapes: ${successCount}`);
-// console.log(`Total errored scrapes: ${errorCount}`);
-// };
-// function run() {
-// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
-// scrapInBatches(urls, 10, 1000);
-// }
diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts
index 7eca1ef0..eb4f8aeb 100644
--- a/apps/api/src/lib/logger.ts
+++ b/apps/api/src/lib/logger.ts
@@ -1,57 +1,51 @@
+import * as winston from "winston";
+
import { configDotenv } from "dotenv";
configDotenv();
-enum LogLevel {
- NONE = 'NONE', // No logs will be output.
- ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
- WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
- INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
- DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
- TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
-}
-export class Logger {
- static colors = {
- ERROR: '\x1b[31m%s\x1b[0m', // Red
- WARN: '\x1b[33m%s\x1b[0m', // Yellow
- INFO: '\x1b[34m%s\x1b[0m', // Blue
- DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
- TRACE: '\x1b[35m%s\x1b[0m' // Magenta
- };
-
- static log (message: string, level: LogLevel) {
- const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
- const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
- const currentLevelIndex = levels.indexOf(logLevel);
- const messageLevelIndex = levels.indexOf(level);
-
- if (currentLevelIndex >= messageLevelIndex) {
- const color = Logger.colors[level];
- console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
-
- // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
- // if (useDbAuthentication) {
- // save to supabase? another place?
- // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
- // }
+const logFormat = winston.format.printf(info =>
+ `${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
+ info.metadata,
+ (_, value) => {
+ if (value instanceof Error) {
+ return {
+ ...value,
+ name: value.name,
+ message: value.message,
+ stack: value.stack,
+ cause: value.cause,
+ }
+ } else {
+ return value;
+ }
}
- }
- static error(message: string | any) {
- Logger.log(message, LogLevel.ERROR);
- }
+ ) : ""}`
+)
- static warn(message: string) {
- Logger.log(message, LogLevel.WARN);
- }
-
- static info(message: string) {
- Logger.log(message, LogLevel.INFO);
- }
-
- static debug(message: string) {
- Logger.log(message, LogLevel.DEBUG);
- }
-
- static trace(message: string) {
- Logger.log(message, LogLevel.TRACE);
- }
-}
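+// The Winston logger replaces the old console-based Logger class. Errors are expanded by the
+// JSON replacer so name/message/stack/cause survive serialization; the console transport only
+// colorizes and pretty-prints outside production (or when SENTRY_ENVIRONMENT is "dev").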
+export const logger = winston.createLogger({
+ level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
+ format: winston.format.json({
+ replacer(key, value) {
+ if (value instanceof Error) {
+ return {
+ ...value,
+ name: value.name,
+ message: value.message,
+ stack: value.stack,
+ cause: value.cause,
+ }
+ } else {
+ return value;
+ }
+ }
+ }),
+ transports: [
+ new winston.transports.Console({
+ format: winston.format.combine(
+ winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
+ winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
+ ...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
+ ),
+ }),
+ ],
+});
diff --git a/apps/api/src/lib/map-cosine.ts b/apps/api/src/lib/map-cosine.ts
index db2491a9..2a089548 100644
--- a/apps/api/src/lib/map-cosine.ts
+++ b/apps/api/src/lib/map-cosine.ts
@@ -1,4 +1,4 @@
-import { Logger } from "./logger";
+import { logger } from "./logger";
export function performCosineSimilarity(links: string[], searchQuery: string) {
try {
@@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
links = a.map((item) => item.link);
return links;
} catch (error) {
- Logger.error(`Error performing cosine similarity: ${error}`);
+ logger.error(`Error performing cosine similarity: ${error}`);
return links;
}
}
diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts
index ad70dfef..83873a58 100644
--- a/apps/api/src/lib/scrape-events.ts
+++ b/apps/api/src/lib/scrape-events.ts
@@ -1,8 +1,8 @@
import { Job } from "bullmq";
-import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
-import { Logger } from "./logger";
+import { logger } from "./logger";
import { configDotenv } from "dotenv";
+import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();
export type ScrapeErrorEvent = {
@@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
type: "scrape",
url: string,
worker?: string,
- method: (typeof baseScrapers)[number],
+ method: Engine,
result: null | {
success: boolean,
response_code?: number,
@@ -49,7 +49,7 @@ export class ScrapeEvents {
}).select().single();
return (result.data as any).id;
} catch (error) {
- // Logger.error(`Error inserting scrape event: ${error}`);
+ // logger.error(`Error inserting scrape event: ${error}`);
return null;
}
}
@@ -69,7 +69,7 @@ export class ScrapeEvents {
}
}).eq("id", logId);
} catch (error) {
- Logger.error(`Error updating scrape result: ${error}`);
+ logger.error(`Error updating scrape result: ${error}`);
}
}
@@ -81,7 +81,7 @@ export class ScrapeEvents {
worker: process.env.FLY_MACHINE_ID,
});
} catch (error) {
- Logger.error(`Error logging job event: ${error}`);
+ logger.error(`Error logging job event: ${error}`);
}
}
}
diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts
index c418a6e0..c9be72a3 100644
--- a/apps/api/src/lib/supabase-jobs.ts
+++ b/apps/api/src/lib/supabase-jobs.ts
@@ -1,5 +1,5 @@
import { supabase_service } from "../services/supabase";
-import { Logger } from "./logger";
+import { logger } from "./logger";
import * as Sentry from "@sentry/node";
/**
@@ -37,7 +37,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
.in("job_id", jobIds);
if (error) {
- Logger.error(`Error in supabaseGetJobsById: ${error}`);
+ logger.error(`Error in supabaseGetJobsById: ${error}`);
Sentry.captureException(error);
return [];
}
@@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
.eq("crawl_id", crawlId)
if (error) {
- Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
+ logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
Sentry.captureException(error);
return [];
}
diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts
index b45b8973..a6cd539d 100644
--- a/apps/api/src/lib/withAuth.ts
+++ b/apps/api/src/lib/withAuth.ts
@@ -1,30 +1,25 @@
import { AuthResponse } from "../../src/types";
-import { Logger } from "./logger";
+import { logger } from "./logger";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
let warningCount = 0;
-export function withAuth<T extends AuthResponse, U extends any[]>(
-  originalFunction: (...args: U) => Promise<T>
+export function withAuth<T, U extends any[]>(
+  originalFunction: (...args: U) => Promise<T>,
+  mockSuccess: T,
) {
return async function (...args: U): Promise<T> {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
if (warningCount < 5) {
- Logger.warn("You're bypassing authentication");
+ logger.warn("You're bypassing authentication");
warningCount++;
}
return { success: true } as T;
} else {
- try {
- return await originalFunction(...args);
- } catch (error) {
- Sentry.captureException(error);
- Logger.error(`Error in withAuth function: ${error}`);
- return { success: false, error: error.message } as T;
- }
+ return await originalFunction(...args);
}
};
}
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 8eb679e7..90d4a47f 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -1,146 +1,127 @@
import { Job } from "bullmq";
import {
- CrawlResult,
WebScraperOptions,
RunWebScraperParams,
RunWebScraperResult,
} from "../types";
-import { WebScraperDataProvider } from "../scraper/WebScraper";
-import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing";
-import { Document } from "../lib/entities";
+import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
-import { Logger } from "../lib/logger";
+import { logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
+import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
+import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();
export async function startWebScraperPipeline({
job,
token,
}: {
- job: Job;
+ job: Job & { id: string };
token: string;
}) {
- let partialDocs: Document[] = [];
return (await runWebScraper({
url: job.data.url,
mode: job.data.mode,
- crawlerOptions: job.data.crawlerOptions,
- extractorOptions: job.data.extractorOptions,
- pageOptions: {
- ...job.data.pageOptions,
+ scrapeOptions: {
+ ...job.data.scrapeOptions,
...(job.data.crawl_id ? ({
- includeRawHtml: true,
+ formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
}): {}),
},
- inProgress: (progress) => {
- Logger.debug(`🐂 Job in progress ${job.id}`);
- if (progress.currentDocument) {
- partialDocs.push(progress.currentDocument);
- if (partialDocs.length > 50) {
- partialDocs = partialDocs.slice(-50);
- }
- // job.updateProgress({ ...progress, partialDocs: partialDocs });
- }
- },
- onSuccess: (result, mode) => {
- Logger.debug(`🐂 Job completed ${job.id}`);
- saveJob(job, result, token, mode);
- },
- onError: (error) => {
- Logger.error(`🐂 Job failed ${job.id}`);
- ScrapeEvents.logJobEvent(job, "failed");
- job.moveToFailed(error, token, false);
- },
+ internalOptions: job.data.internalOptions,
+ // onSuccess: (result, mode) => {
+ // logger.debug(`🐂 Job completed ${job.id}`);
+ // saveJob(job, result, token, mode);
+ // },
+ // onError: (error) => {
+ // logger.error(`🐂 Job failed ${job.id}`);
+ // ScrapeEvents.logJobEvent(job, "failed");
+ // },
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
- })) as { success: boolean; message: string; docs: Document[] };
+ }));
}
export async function runWebScraper({
url,
mode,
- crawlerOptions,
- pageOptions,
- extractorOptions,
- inProgress,
- onSuccess,
- onError,
+ scrapeOptions,
+ internalOptions,
+ // onSuccess,
+ // onError,
team_id,
bull_job_id,
priority,
is_scrape=false,
-}: RunWebScraperParams): Promise<RunWebScraperResult> {
+}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
+ let response: ScrapeUrlResponse | undefined = undefined;
+ let engines: EngineResultsTracker = {};
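+ // scrapeURL does the actual scraping; engines is kept outside the try block so the finally
+ // clause can record a ScrapeEvents entry for every engine attempt, even when the scrape fails.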
try {
- const provider = new WebScraperDataProvider();
- if (mode === "crawl") {
- await provider.setOptions({
- jobId: bull_job_id,
- mode: mode,
- urls: [url],
- extractorOptions,
- crawlerOptions: crawlerOptions,
- pageOptions: pageOptions,
- bullJobId: bull_job_id,
- priority,
- });
- } else {
- await provider.setOptions({
- jobId: bull_job_id,
- mode: mode,
- urls: url.split(","),
- extractorOptions,
- crawlerOptions: crawlerOptions,
- pageOptions: pageOptions,
- priority,
- teamId: team_id
- });
+ response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
+ if (!response.success) {
+ if (response.error instanceof Error) {
+ throw response.error;
+ } else {
+ throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
+ }
}
- const docs = (await provider.getDocuments(false, (progress: Progress) => {
- inProgress(progress);
- })) as Document[];
-
- if (docs.length === 0) {
- return {
- success: true,
- message: "No pages found",
- docs: [],
- };
- }
-
- // remove docs with empty content
- const filteredDocs = crawlerOptions?.returnOnlyUrls
- ? docs.map((doc) => {
- if (doc.metadata.sourceURL) {
- return { url: doc.metadata.sourceURL };
- }
- })
- : docs;
if(is_scrape === false) {
- billTeam(team_id, undefined, filteredDocs.length).catch(error => {
- Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
+ let creditsToBeBilled = 1; // Assuming 1 credit per document
+ if (scrapeOptions.extract) {
+ creditsToBeBilled = 5;
+ }
+
+ billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
+ logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
-
-
// This is where the returnvalue from the job is set
- onSuccess(filteredDocs, mode);
+ // onSuccess(response.document, mode);
- // this return doesn't matter too much for the job completion result
- return { success: true, message: "", docs: filteredDocs };
+ engines = response.engines;
+ return response;
} catch (error) {
- onError(error);
- return { success: false, message: error.message, docs: [] };
+ engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
+
+ if (response !== undefined) {
+ return {
+ ...response,
+ success: false,
+ error,
+ }
+ } else {
+ return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
+ }
+ // onError(error);
+ } finally {
+ const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
+
+ for (const engine of engineOrder) {
+ const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
+ ScrapeEvents.insert(bull_job_id, {
+ type: "scrape",
+ url,
+ method: engine,
+ result: {
+ success: result.state === "success",
+ response_code: (result.state === "success" ? result.result.statusCode : undefined),
+ response_size: (result.state === "success" ? result.result.html.length : undefined),
+ error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
+ time_taken: result.finishedAt - result.startedAt,
+ },
+ });
+ }
}
}
-const saveJob = async (job: Job, result: any, token: string, mode: string) => {
+const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
@@ -168,6 +149,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
}
ScrapeEvents.logJobEvent(job, "completed");
} catch (error) {
- Logger.error(`🐂 Failed to update job status: ${error}`);
+ logger.error(`🐂 Failed to update job status: ${error}`);
}
};
diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts
index 88159060..ac61519a 100644
--- a/apps/api/src/routes/admin.ts
+++ b/apps/api/src/routes/admin.ts
@@ -6,8 +6,8 @@ import {
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/v0/admin/queue";
-import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { wrap } from "./v1";
+import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
export const adminRouter = express.Router();
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 4e4b6052..3eaace3b 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -14,7 +14,7 @@ import expressWs from "express-ws";
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
-import { Logger } from "../lib/logger";
+import { logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
@@ -32,10 +32,12 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
if (!minimum && req.body) {
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
}
- const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
- req.acuc = chunk;
+ const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1);
+ if (chunk) {
+ req.acuc = chunk;
+ }
if (!success) {
- Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
+ logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
if (!res.headersSent) {
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
}
@@ -50,20 +52,27 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
- const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+ const auth = await authenticateUser(
req,
res,
rateLimiterMode,
);
- if (!success) {
+ if (!auth.success) {
if (!res.headersSent) {
- return res.status(status).json({ success: false, error });
+ return res.status(auth.status).json({ success: false, error: auth.error });
+ } else {
+ return;
}
}
+ const { team_id, plan, chunk } = auth;
+
req.auth = { team_id, plan };
- req.acuc = chunk;
+ req.acuc = chunk ?? undefined;
+ if (chunk) {
+ req.account = { remainingCredits: chunk.remaining_credits };
+ }
next();
})()
.catch(err => next(err));
diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts
index 20419ffa..eba0ddb4 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts
@@ -2,7 +2,6 @@
import { WebCrawler } from '../crawler';
import axios from 'axios';
import robotsParser from 'robots-parser';
-import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
jest.mock('axios');
jest.mock('robots-parser');
@@ -35,165 +34,6 @@ describe('WebCrawler', () => {
});
});
- it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
- const initialUrl = 'http://example.com'; // Set initial URL for this test
- const enteredMaxCrawledDepth = 2;
- maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-
-
- crawler = new WebCrawler({
- jobId: "TEST",
- initialUrl: initialUrl,
- includes: [],
- excludes: [],
- limit: 100,
- maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
- });
-
- // Mock sitemap fetching function to return controlled links
- crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
- initialUrl, // depth 0
- initialUrl + '/page1', // depth 1
- initialUrl + '/page1/page2', // depth 2
- initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
- ]);
-
- const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
- expect(results).toEqual([
- { url: initialUrl, html: '' },
- { url: initialUrl + '/page1', html: '' },
- { url: initialUrl + '/page1/page2', html: '' }
- ]);
-
-
- // Ensure that the link with depth 3 is not included
- expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
- });
-
- it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
- const initialUrl = 'http://example.com'; // Set initial URL for this test
- const enteredMaxCrawledDepth = 0;
- maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-
-
- crawler = new WebCrawler({
- jobId: "TEST",
- initialUrl: initialUrl,
- includes: [],
- excludes: [],
- limit: 100,
- maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
- });
-
- // Mock sitemap fetching function to return controlled links
- crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
- initialUrl, // depth 0
- initialUrl + '/page1', // depth 1
- initialUrl + '/page1/page2', // depth 2
- initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
- ]);
-
- const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
- expect(results).toEqual([
- { url: initialUrl, html: '' },
- ]);
- });
-
- it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
- const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
- const enteredMaxCrawledDepth = 1;
- maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-
-
- crawler = new WebCrawler({
- jobId: "TEST",
- initialUrl: initialUrl,
- includes: [],
- excludes: [],
- limit: 100,
- maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
- });
-
- // Mock sitemap fetching function to return controlled links
- crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
- initialUrl, // depth 0
- initialUrl + '/page2', // depth 1
- initialUrl + '/page2/page3', // depth 2
- initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
- ]);
-
- const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
- expect(results).toEqual([
- { url: initialUrl, html: '' },
- { url: initialUrl + '/page2', html: '' }
- ]);
- });
-
- it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => {
- const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
- const enteredMaxCrawledDepth = 2;
- maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-
-
- crawler = new WebCrawler({
- jobId: "TEST",
- initialUrl: initialUrl,
- includes: [],
- excludes: [],
- limit: 100,
- maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
- });
-
- // Mock sitemap fetching function to return controlled links
- crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
- initialUrl, // depth 0
- initialUrl + '/page2', // depth 1
- initialUrl + '/page2/page3', // depth 2
- initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
- ]);
-
- const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
- expect(results).toEqual([
- { url: initialUrl, html: '' },
- { url: initialUrl + '/page2', html: '' },
- { url: initialUrl + '/page2/page3', html: '' }
- ]);
- });
-
- it('should handle allowBackwardCrawling option correctly', async () => {
- const initialUrl = 'https://mendable.ai/blog';
-
- // Setup the crawler with the specific test case options
- const crawler = new WebCrawler({
- jobId: "TEST",
- initialUrl: initialUrl,
- includes: [],
- excludes: [],
- limit: 100,
- maxCrawledDepth: 3, // Example depth
- allowBackwardCrawling: true
- });
-
- // Mock the sitemap fetching function to simulate backward crawling
- crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
- initialUrl,
- 'https://mendable.ai', // backward link
- initialUrl + '/page1',
- initialUrl + '/page1/page2'
- ]);
-
- const results = await crawler.start();
- expect(results).toEqual([
- { url: initialUrl, html: '' },
- { url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included
- { url: initialUrl + '/page1', html: '' },
- { url: initialUrl + '/page1/page2', html: '' }
- ]);
-
- // Check that the backward link is included if allowBackwardCrawling is true
- expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true);
- });
-
it('should respect the limit parameter by not returning more links than specified', async () => {
const initialUrl = 'http://example.com';
const limit = 2; // Set a limit for the number of links
diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
deleted file mode 100644
index 02c8a7e0..00000000
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ /dev/null
@@ -1,37 +0,0 @@
-import { scrapSingleUrl } from '../single_url';
-import { PageOptions } from '../../../lib/entities';
-
-
-jest.mock('../single_url', () => {
- const originalModule = jest.requireActual('../single_url');
- originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('TestRoast
');
-
- return originalModule;
-});
-
-describe('scrapSingleUrl', () => {
- it('should handle includeHtml option correctly', async () => {
- const url = 'https://roastmywebsite.ai';
- const pageOptionsWithHtml: PageOptions = { includeHtml: true };
- const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
-
- const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
- const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
-
- expect(resultWithHtml.html).toBeDefined();
- expect(resultWithoutHtml.html).toBeUndefined();
- }, 10000);
-});
-
-it('should return a list of links on the firecrawl.ai page', async () => {
- const url = 'https://flutterbricks.com';
- const pageOptions: PageOptions = { includeHtml: true };
-
- const result = await scrapSingleUrl("TEST", url, pageOptions);
-
- // Check if the result contains a list of links
- expect(result.linksOnPage).toBeDefined();
- expect(Array.isArray(result.linksOnPage)).toBe(true);
- expect(result.linksOnPage.length).toBeGreaterThan(0);
- expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
-}, 15000);
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 72a49fd8..9e3f7cd2 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -2,13 +2,10 @@ import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
-import async from "async";
-import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
-import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler {
private jobId: string;
@@ -73,7 +70,7 @@ export class WebCrawler {
try {
url = new URL(link.trim(), this.baseUrl);
} catch (error) {
- Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
+ logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
return false;
}
const path = url.pathname;
@@ -132,7 +129,7 @@ export class WebCrawler {
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt
if (!isAllowed) {
- Logger.debug(`Link disallowed by robots.txt: ${link}`);
+ logger.debug(`Link disallowed by robots.txt: ${link}`);
return false;
}
@@ -161,7 +158,7 @@ export class WebCrawler {
}
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
- Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
+ logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
@@ -170,115 +167,6 @@ export class WebCrawler {
return null;
}
- public async start(
- inProgress?: (progress: Progress) => void,
- pageOptions?: PageOptions,
- crawlerOptions?: CrawlerOptions,
- concurrencyLimit: number = 5,
- limit: number = 10000,
- maxDepth: number = 10
- ): Promise<{ url: string, html: string }[]> {
-
- Logger.debug(`Crawler starting with ${this.initialUrl}`);
- // Fetch and parse robots.txt
- try {
- const txt = await this.getRobotsTxt();
- this.importRobotsTxt(txt);
- Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
- } catch (error) {
- Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
- }
-
- if (!crawlerOptions?.ignoreSitemap){
- const sm = await this.tryGetSitemap();
- if (sm !== null) {
- return sm;
- }
- }
-
- const urls = await this.crawlUrls(
- [this.initialUrl],
- pageOptions,
- concurrencyLimit,
- inProgress
- );
-
- if (
- urls.length === 0 &&
- this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
- ) {
- return [{ url: this.initialUrl, html: "" }];
- }
-
- // make sure to run include exclude here again
- const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
- return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
- }
-
- private async crawlUrls(
- urls: string[],
- pageOptions: PageOptions,
- concurrencyLimit: number,
- inProgress?: (progress: Progress) => void,
- ): Promise<{ url: string, html: string }[]> {
- const queue = async.queue(async (task: string, callback) => {
- Logger.debug(`Crawling ${task}`);
- if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
- if (callback && typeof callback === "function") {
- callback();
- }
- return;
- }
- const newUrls = await this.crawl(task, pageOptions);
- // add the initial url if not already added
- // if (this.visited.size === 1) {
- // let normalizedInitial = this.initialUrl;
- // if (!normalizedInitial.endsWith("/")) {
- // normalizedInitial = normalizedInitial + "/";
- // }
- // if (!newUrls.some(page => page.url === this.initialUrl)) {
- // newUrls.push({ url: this.initialUrl, html: "" });
- // }
- // }
-
- newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
-
- if (inProgress && newUrls.length > 0) {
- inProgress({
- current: this.crawledUrls.size,
- total: Math.min(this.maxCrawledLinks, this.limit),
- status: "SCRAPING",
- currentDocumentUrl: newUrls[newUrls.length - 1].url,
- });
- } else if (inProgress) {
- inProgress({
- current: this.crawledUrls.size,
- total: Math.min(this.maxCrawledLinks, this.limit),
- status: "SCRAPING",
- currentDocumentUrl: task,
- });
- }
- await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
- if (callback && typeof callback === "function") {
- callback();
- }
- }, concurrencyLimit);
-
- Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
- queue.push(
- urls.filter(
- (url) =>
- !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
- ),
- (err) => {
- if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
- }
- );
- await queue.drain();
- Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
- return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
- }
-
public filterURL(href: string, url: string): string | null {
let fullUrl = href;
if (!href.startsWith("http")) {
@@ -333,82 +221,22 @@ export class WebCrawler {
}
});
+ // Extract links from iframes with inline src
+ $("iframe").each((_, element) => {
+ const src = $(element).attr("src");
+ if (src && src.startsWith("data:text/html")) {
+ const iframeHtml = decodeURIComponent(src.split(",")[1]);
+ const iframeLinks = this.extractLinksFromHTML(iframeHtml, url);
+ links = links.concat(iframeLinks);
+ }
+ });
+
return links;
}
- async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
- if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
- return [];
- }
- this.visited.add(url);
-
- if (!url.startsWith("http")) {
- url = "https://" + url;
- }
- if (url.endsWith("/")) {
- url = url.slice(0, -1);
- }
-
- if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
- return [];
- }
-
- try {
- let content: string = "";
- let pageStatusCode: number;
- let pageError: string | undefined = undefined;
-
- // If it is the first link, fetch with single url
- if (this.visited.size === 1) {
- const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
- content = page.html ?? "";
- pageStatusCode = page.metadata?.pageStatusCode;
- pageError = page.metadata?.pageError || undefined;
- } else {
- const response = await axios.get(url, { timeout: axiosTimeout });
- content = response.data ?? "";
- pageStatusCode = response.status;
- pageError = response.statusText != "OK" ? response.statusText : undefined;
- }
-
- const $ = load(content);
- let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
-
- // Add the initial URL to the list of links
- if (this.visited.size === 1) {
- links.push({ url, html: content, pageStatusCode, pageError });
- }
-
- links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
-
- if (this.visited.size === 1) {
- return links;
- }
-
- // Create a new list to return to avoid modifying the visited list
- return links.filter((link) => !this.visited.has(link.url));
- } catch (error) {
- return [];
- }
- }
-
private isRobotsAllowed(url: string): boolean {
return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
}
- private normalizeCrawlUrl(url: string): string {
- try{
- const urlObj = new URL(url);
- urlObj.searchParams.sort(); // Sort query parameters to normalize
- return urlObj.toString();
- } catch (error) {
- return url;
- }
- }
-
- private matchesIncludes(url: string): boolean {
- if (this.includes.length === 0 || this.includes[0] == "") return true;
- return this.includes.some((pattern) => new RegExp(pattern).test(url));
- }
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
return this.excludes.some((pattern) => {
@@ -493,7 +321,7 @@ export class WebCrawler {
const urlWithoutQuery = url.split('?')[0].toLowerCase();
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
} catch (error) {
- Logger.error(`Error processing URL in isFile: ${error}`);
+ logger.error(`Error processing URL in isFile: ${error}`);
return false;
}
}
@@ -514,7 +342,6 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
- //
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
@@ -536,7 +363,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
}
} catch (error) {
- Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
+ logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
@@ -555,7 +382,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
} catch (error) {
- Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+ logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index e5841978..48aa2ffd 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,4 +1,4 @@
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";
export async function handleCustomScraping(
text: string,
@@ -6,7 +6,7 @@ export async function handleCustomScraping(
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case
if (text.includes(' {
- throw new Error("Method not implemented.");
- }
-
- private async convertUrlsToDocuments(
- urls: string[],
- inProgress?: (progress: Progress) => void,
- allHtmls?: string[]
- ): Promise<Document[]> {
- const totalUrls = urls.length;
- let processedUrls = 0;
-
- const results: (Document | null)[] = new Array(urls.length).fill(null);
- for (let i = 0; i < urls.length; i += this.concurrentRequests) {
- const batchUrls = urls.slice(i, i + this.concurrentRequests);
- await Promise.all(
- batchUrls.map(async (url, index) => {
- const existingHTML = allHtmls ? allHtmls[i + index] : "";
- const result = await scrapSingleUrl(
- this.jobId,
- url,
- this.pageOptions,
- this.extractorOptions,
- existingHTML,
- this.priority,
- this.teamId,
- );
- processedUrls++;
- if (inProgress) {
- inProgress({
- current: processedUrls,
- total: totalUrls,
- status: "SCRAPING",
- currentDocumentUrl: url,
- currentDocument: { ...result, index: processedUrls },
- });
- }
-
- results[i + index] = result;
- })
- );
- }
- return results.filter((result) => result !== null) as Document[];
- }
-
- async getDocuments(
- useCaching: boolean = false,
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- this.validateInitialUrl();
- if (!useCaching) {
- return this.processDocumentsWithoutCache(inProgress);
- }
-
- return this.processDocumentsWithCache(inProgress);
- }
-
- private validateInitialUrl(): void {
- if (this.urls[0].trim() === "") {
- throw new Error("Url is required");
- }
- }
-
- /**
- * Process documents without cache handling each mode
- * @param inProgress inProgress
- * @returns documents
- */
- private async processDocumentsWithoutCache(
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- switch (this.mode) {
- case "crawl":
- return this.handleCrawlMode(inProgress);
- case "single_urls":
- return this.handleSingleUrlsMode(inProgress);
- case "sitemap":
- return this.handleSitemapMode(inProgress);
- default:
- return [];
- }
- }
-
- private async cleanIrrelevantPath(links: string[]) {
- return links.filter((link) => {
- const normalizedInitialUrl = new URL(this.urls[0]);
- const normalizedLink = new URL(link);
-
- // Normalize the hostname to account for www and non-www versions
- const initialHostname = normalizedInitialUrl.hostname.replace(
- /^www\./,
- ""
- );
- const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
-
- // Ensure the protocol and hostname match, and the path starts with the initial URL's path
- return (
- linkHostname === initialHostname &&
- normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
- );
- });
- }
-
- private async handleCrawlMode(
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- let includes: string[];
- if (Array.isArray(this.includes)) {
- if (this.includes[0] != "") {
- includes = this.includes;
- }
- } else {
- includes = this.includes.split(',');
- }
-
- let excludes: string[];
- if (Array.isArray(this.excludes)) {
- if (this.excludes[0] != "") {
- excludes = this.excludes;
- }
- } else {
- excludes = this.excludes.split(',');
- }
-
- const crawler = new WebCrawler({
- jobId: this.jobId,
- initialUrl: this.urls[0],
- includes,
- excludes,
- maxCrawledLinks: this.maxCrawledLinks,
- maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
- limit: this.limit,
- generateImgAltText: this.generateImgAltText,
- allowBackwardCrawling: this.allowBackwardCrawling,
- allowExternalContentLinks: this.allowExternalContentLinks,
- });
-
- let links = await crawler.start(
- inProgress,
- this.pageOptions,
- {
- ignoreSitemap: this.ignoreSitemap,
- },
- 5,
- this.limit,
- this.maxCrawledDepth
- );
-
- let allLinks = links.map((e) => e.url);
- const allHtmls = links.map((e) => e.html);
-
- if (this.returnOnlyUrls) {
- return this.returnOnlyUrlsResponse(allLinks, inProgress);
- }
-
- let documents = [];
- // check if fast mode is enabled and there is html inside the links
- if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
- documents = await this.processLinks(allLinks, inProgress, allHtmls);
- } else {
- documents = await this.processLinks(allLinks, inProgress);
- }
-
- return this.cacheAndFinalizeDocuments(documents, allLinks);
- }
-
- private async handleSingleUrlsMode(
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- const links = this.urls;
-
- let documents = await this.processLinks(links, inProgress);
- return documents;
- }
-
- private async handleSitemapMode(
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
- links = await this.cleanIrrelevantPath(links);
-
- if (this.returnOnlyUrls) {
- return this.returnOnlyUrlsResponse(links, inProgress);
- }
-
- let documents = await this.processLinks(links, inProgress);
- return this.cacheAndFinalizeDocuments(documents, links);
- }
-
- private async returnOnlyUrlsResponse(
- links: string[],
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- inProgress?.({
- current: links.length,
- total: links.length,
- status: "COMPLETED",
- currentDocumentUrl: this.urls[0],
- });
- return links.map((url) => ({
- content: "",
- html: this.pageOptions?.includeHtml ? "" : undefined,
- markdown: "",
- metadata: { sourceURL: url, pageStatusCode: 200 },
- }));
- }
-
- private async processLinks(
- links: string[],
- inProgress?: (progress: Progress) => void,
- allHtmls?: string[]
- ): Promise<Document[]> {
- const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
- const docLinks = links.filter(
- (link) => link.endsWith(".doc") || link.endsWith(".docx")
- );
-
- const [pdfDocuments, docxDocuments] = await Promise.all([
- this.fetchPdfDocuments(pdfLinks),
- this.fetchDocxDocuments(docLinks),
- ]);
-
- links = links.filter(
- (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
- );
-
- let [documents, sitemapData] = await Promise.all([
- this.convertUrlsToDocuments(links, inProgress, allHtmls),
- this.mode === "single_urls" && links.length > 0
- ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
- (error) => {
- Logger.debug(`Failed to fetch sitemap data: ${error}`);
- return null;
- }
- )
- : Promise.resolve(null),
- ]);
-
- if (this.mode === "single_urls" && documents.length > 0) {
- documents[0].metadata.sitemap = sitemapData ?? undefined;
- } else {
- documents = await this.getSitemapData(this.urls[0], documents);
- }
-
- if (this.pageOptions.includeMarkdown) {
- documents = this.applyPathReplacements(documents);
- }
-
- if (!this.pageOptions.includeHtml) {
- for (let document of documents) {
- delete document.html;
- }
- }
-
- // documents = await this.applyImgAltText(documents);
- if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
- const extractionMode = this.extractorOptions?.mode ?? "markdown";
- const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
-
- if (
- extractionMode === "llm-extraction" ||
- extractionMode === "llm-extraction-from-markdown" ||
- extractionMode === "llm-extraction-from-raw-html"
- ) {
- documents = await generateCompletions(
- documents,
- this.extractorOptions,
- completionMode
- );
- }
- }
- return documents.concat(pdfDocuments).concat(docxDocuments);
- }
-
- private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
- return Promise.all(
- pdfLinks.map(async (pdfLink) => {
- const timer = Date.now();
- const logInsertPromise = ScrapeEvents.insert(this.jobId, {
- type: "scrape",
- url: pdfLink,
- worker: process.env.FLY_MACHINE_ID,
- method: "pdf-scrape",
- result: null,
- });
-
- const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
- pdfLink,
- this.pageOptions.parsePDF
- );
-
- const insertedLogId = await logInsertPromise;
- ScrapeEvents.updateScrapeResult(insertedLogId, {
- response_size: content.length,
- success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
- error: pageError,
- response_code: pageStatusCode,
- time_taken: Date.now() - timer,
- });
- return {
- content: content,
- markdown: content,
- metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
- provider: "web-scraper",
- };
- })
- );
- }
- private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
- return Promise.all(
- docxLinks.map(async (docxLink) => {
- const timer = Date.now();
- const logInsertPromise = ScrapeEvents.insert(this.jobId, {
- type: "scrape",
- url: docxLink,
- worker: process.env.FLY_MACHINE_ID,
- method: "docx-scrape",
- result: null,
- });
-
- const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
- docxLink
- );
-
- const insertedLogId = await logInsertPromise;
- ScrapeEvents.updateScrapeResult(insertedLogId, {
- response_size: content.length,
- success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
- error: pageError,
- response_code: pageStatusCode,
- time_taken: Date.now() - timer,
- });
-
- return {
- content,
- metadata: { sourceURL: docxLink, pageStatusCode, pageError },
- provider: "web-scraper",
- };
- })
- );
- }
-
- private applyPathReplacements(documents: Document[]): Document[] {
- if (this.replaceAllPathsWithAbsolutePaths) {
- documents = replacePathsWithAbsolutePaths(documents);
- }
- return replaceImgPathsWithAbsolutePaths(documents);
- }
-
- private async applyImgAltText(documents: Document[]): Promise<Document[]> {
- return this.generateImgAltText
- ? this.generatesImgAltText(documents)
- : documents;
- }
-
- private async cacheAndFinalizeDocuments(
- documents: Document[],
- links: string[]
- ): Promise<Document[]> {
- // await this.setCachedDocuments(documents, links);
- documents = this.removeChildLinks(documents);
- return documents.splice(0, this.limit);
- }
-
- private async processDocumentsWithCache(
- inProgress?: (progress: Progress) => void
- ): Promise<Document[]> {
- let documents = await this.getCachedDocuments(
- this.urls.slice(0, this.limit)
- );
- if (documents.length < this.limit) {
- const newDocuments: Document[] = await this.getDocuments(
- false,
- inProgress
- );
- documents = this.mergeNewDocuments(documents, newDocuments);
- }
- documents = this.filterDocsExcludeInclude(documents);
- documents = this.filterDepth(documents);
- documents = this.removeChildLinks(documents);
- return documents.splice(0, this.limit);
- }
-
- private mergeNewDocuments(
- existingDocuments: Document[],
- newDocuments: Document[]
- ): Document[] {
- newDocuments.forEach((doc) => {
- if (
- !existingDocuments.some(
- (d) =>
- this.normalizeUrl(d.metadata.sourceURL) ===
- this.normalizeUrl(doc.metadata?.sourceURL)
- )
- ) {
- existingDocuments.push(doc);
- }
- });
- return existingDocuments;
- }
-
- private filterDocsExcludeInclude(documents: Document[]): Document[] {
- return documents.filter((document) => {
- const url = new URL(document.metadata.sourceURL);
- const path = url.pathname;
-
- if (!Array.isArray(this.excludes)) {
- this.excludes = this.excludes.split(',');
- }
-
- if (this.excludes.length > 0 && this.excludes[0] !== "") {
- // Check if the link should be excluded
- if (
- this.excludes.some((excludePattern) =>
- new RegExp(excludePattern).test(path)
- )
- ) {
- return false;
- }
- }
-
- if (!Array.isArray(this.includes)) {
- this.includes = this.includes.split(',');
- }
-
- if (this.includes.length > 0 && this.includes[0] !== "") {
- // Check if the link matches the include patterns, if any are specified
- if (this.includes.length > 0) {
- return this.includes.some((includePattern) =>
- new RegExp(includePattern).test(path)
- );
- }
- }
- return true;
- });
- }
-
- private normalizeUrl(url: string): string {
- if (url.includes("//www.")) {
- return url.replace("//www.", "//");
- }
- return url;
- }
-
- private removeChildLinks(documents: Document[]): Document[] {
- for (let document of documents) {
- if (document?.childrenLinks) delete document.childrenLinks;
- }
- return documents;
- }
-
- async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
- for (const document of documents) {
- if (document.content.trim().length === 0) {
- continue;
- }
- const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
- await setValue(
- "web-scraper-cache:" + normalizedUrl,
- JSON.stringify({
- ...document,
- childrenLinks: childrenLinks || [],
- }),
- 60 * 60
- ); // 10 days
- }
- }
-
- async getCachedDocuments(urls: string[]): Promise<Document[]> {
- let documents: Document[] = [];
- for (const url of urls) {
- const normalizedUrl = this.normalizeUrl(url);
- Logger.debug(
- "Getting cached document for web-scraper-cache:" + normalizedUrl
- );
- const cachedDocumentString = await getValue(
- "web-scraper-cache:" + normalizedUrl
- );
- if (cachedDocumentString) {
- const cachedDocument = JSON.parse(cachedDocumentString);
- documents.push(cachedDocument);
-
- // get children documents
- for (const childUrl of cachedDocument.childrenLinks || []) {
- const normalizedChildUrl = this.normalizeUrl(childUrl);
- const childCachedDocumentString = await getValue(
- "web-scraper-cache:" + normalizedChildUrl
- );
- if (childCachedDocumentString) {
- const childCachedDocument = JSON.parse(childCachedDocumentString);
- if (
- !documents.find(
- (doc) =>
- doc.metadata.sourceURL ===
- childCachedDocument.metadata.sourceURL
- )
- ) {
- documents.push(childCachedDocument);
- }
- }
- }
- }
- }
- return documents;
- }
-
- setOptions(options: WebScraperOptions): void {
- if (!options.urls) {
- throw new Error("Urls are required");
- }
-
- this.jobId = options.jobId;
- this.bullJobId = options.bullJobId;
- this.urls = options.urls;
- this.mode = options.mode;
- this.concurrentRequests = options.concurrentRequests ?? 20;
- this.includes = options.crawlerOptions?.includes ?? [];
- this.excludes = options.crawlerOptions?.excludes ?? [];
- this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
- this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
- this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
- this.limit = options.crawlerOptions?.limit ?? 10000;
- this.generateImgAltText =
- options.crawlerOptions?.generateImgAltText ?? false;
- this.pageOptions = {
- onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
- includeHtml: options.pageOptions?.includeHtml ?? false,
- replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
- parsePDF: options.pageOptions?.parsePDF ?? true,
- onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
- removeTags: options.pageOptions?.removeTags ?? [],
- includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
- includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
- includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
- waitFor: options.pageOptions?.waitFor ?? undefined,
- headers: options.pageOptions?.headers ?? undefined,
- includeLinks: options.pageOptions?.includeLinks ?? true,
- fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
- screenshot: options.pageOptions?.screenshot ?? false,
- useFastMode: options.pageOptions?.useFastMode ?? false,
- disableJsDom: options.pageOptions?.disableJsDom ?? false,
- atsv: options.pageOptions?.atsv ?? false,
- actions: options.pageOptions?.actions ?? undefined,
- geolocation: options.pageOptions?.geolocation ?? undefined,
- skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
- };
- this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
- this.replaceAllPathsWithAbsolutePaths =
- options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
- options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
- false;
-
- if (typeof options.crawlerOptions?.excludes === 'string') {
- this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
- }
-
- if (typeof options.crawlerOptions?.includes === 'string') {
- this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
- }
-
- this.crawlerMode = options.crawlerOptions?.mode ?? "default";
- this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
- this.allowBackwardCrawling =
- options.crawlerOptions?.allowBackwardCrawling ?? false;
- this.allowExternalContentLinks =
- options.crawlerOptions?.allowExternalContentLinks ?? false;
- this.priority = options.priority;
- this.teamId = options.teamId ?? null;
-
-
-
- // make sure all urls start with https://
- this.urls = this.urls.map((url) => {
- if (!url.trim().startsWith("http")) {
- return `https://${url}`;
- }
- return url;
- });
- }
-
- private async getSitemapData(baseUrl: string, documents: Document[]) {
- const sitemapData = await fetchSitemapData(baseUrl);
- if (sitemapData) {
- for (let i = 0; i < documents.length; i++) {
- const docInSitemapData = sitemapData.find(
- (data) =>
- this.normalizeUrl(data.loc) ===
- this.normalizeUrl(documents[i].metadata.sourceURL)
- );
- if (docInSitemapData) {
- let sitemapDocData: Partial = {};
- if (docInSitemapData.changefreq) {
- sitemapDocData.changefreq = docInSitemapData.changefreq;
- }
- if (docInSitemapData.priority) {
- sitemapDocData.priority = Number(docInSitemapData.priority);
- }
- if (docInSitemapData.lastmod) {
- sitemapDocData.lastmod = docInSitemapData.lastmod;
- }
- if (Object.keys(sitemapDocData).length !== 0) {
- documents[i].metadata.sitemap = sitemapDocData;
- }
- }
- }
- }
- return documents;
- }
- private async getSitemapDataForSingleUrl(
- baseUrl: string,
- url: string,
- timeout?: number
- ) {
- const sitemapData = await fetchSitemapData(baseUrl, timeout);
- if (sitemapData) {
- const docInSitemapData = sitemapData.find(
- (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
- );
- if (docInSitemapData) {
- let sitemapDocData: Partial = {};
- if (docInSitemapData.changefreq) {
- sitemapDocData.changefreq = docInSitemapData.changefreq;
- }
- if (docInSitemapData.priority) {
- sitemapDocData.priority = Number(docInSitemapData.priority);
- }
- if (docInSitemapData.lastmod) {
- sitemapDocData.lastmod = docInSitemapData.lastmod;
- }
- if (Object.keys(sitemapDocData).length !== 0) {
- return sitemapDocData;
- }
- }
- }
- return null;
- }
- generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
- await Promise.all(
- documents.map(async (document) => {
- const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
-
- await Promise.all(
- images.map(async (image: string) => {
- let imageUrl = image.match(/\(([^)]+)\)/)[1];
- let altText = image.match(/\[(.*?)\]/)[1];
-
- if (
- !altText &&
- !imageUrl.startsWith("data:image") &&
- /\.(png|jpeg|gif|webp)$/.test(imageUrl)
- ) {
- const imageIndex = document.content.indexOf(image);
- const contentLength = document.content.length;
- let backText = document.content.substring(
- imageIndex + image.length,
- Math.min(imageIndex + image.length + 1000, contentLength)
- );
- let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
- let frontText = document.content.substring(
- frontTextStartIndex,
- imageIndex
- );
- altText = await getImageDescription(
- imageUrl,
- backText,
- frontText,
- this.generateImgAltTextModel
- );
- }
-
- document.content = document.content.replace(
- image,
- `![${altText}](${imageUrl})`
- );
- })
- );
- })
- );
-
- return documents;
- };
-
- filterDepth(documents: Document[]): Document[] {
- return documents.filter((document) => {
- const url = new URL(document.metadata.sourceURL);
- return getURLDepth(url.toString()) <= this.maxCrawledDepth;
- });
- }
-}
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
deleted file mode 100644
index 0df3be72..00000000
--- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
+++ /dev/null
@@ -1,89 +0,0 @@
-import axios from "axios";
-import { logScrape } from "../../../services/logging/scrape_log";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { Logger } from "../../../lib/logger";
-
-/**
- * Scrapes a URL with Axios
- * @param url The URL to scrape
- * @param pageOptions The options for the page
- * @returns The scraped content
- */
-export async function scrapWithFetch(
- url: string,
- pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
- const logParams = {
- url,
- scraper: "fetch",
- success: false,
- response_code: null,
- time_taken_seconds: null,
- error_message: null,
- html: "",
- startTime: Date.now(),
- };
-
- try {
- const response = await axios.get(url, {
- headers: {
- "Content-Type": "application/json",
- },
- timeout: universalTimeout,
- transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
- });
-
- if (response.status !== 200) {
- Logger.debug(
- `⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`
- );
- logParams.error_message = response.statusText;
- logParams.response_code = response.status;
- return {
- content: "",
- pageStatusCode: response.status,
- pageError: response.statusText,
- };
- }
-
- const contentType = response.headers["content-type"];
- if (contentType && contentType.includes("application/pdf")) {
- logParams.success = true;
- const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
- url,
- pageOptions?.parsePDF
- );
- logParams.response_code = pageStatusCode;
- logParams.error_message = pageError;
- return { content, pageStatusCode: response.status, pageError };
- } else {
- const text = response.data;
- logParams.success = true;
- logParams.html = text;
- logParams.response_code = response.status;
- return {
- content: text,
- pageStatusCode: response.status,
- pageError: null,
- };
- }
- } catch (error) {
- if (error.code === "ECONNABORTED") {
- logParams.error_message = "Request timed out";
- Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
- } else {
- logParams.error_message = error.message || error;
- Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
- }
- return {
- content: "",
- pageStatusCode: error.response?.status ?? null,
- pageError: logParams.error_message,
- };
- } finally {
- const endTime = Date.now();
- logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
- await logScrape(logParams);
- }
-}
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
deleted file mode 100644
index 3bbd74eb..00000000
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ /dev/null
@@ -1,226 +0,0 @@
-import axios from "axios";
-import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
-import { logScrape } from "../../../services/logging/scrape_log";
-import { generateRequestParams } from "../single_url";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { Logger } from "../../../lib/logger";
-import * as Sentry from "@sentry/node";
-import axiosRetry from 'axios-retry';
-
-axiosRetry(axios, { retries: 3 , onRetry:()=>{
- console.log("Retrying (fire-engine)...");
-}, retryDelay: axiosRetry.exponentialDelay});
-/**
- * Scrapes a URL with Fire-Engine
- * @param url The URL to scrape
- * @param waitFor The time to wait for the page to load
- * @param screenshot Whether to take a screenshot
- * @param fullPageScreenshot Whether to take a full page screenshot
- * @param pageOptions The options for the page
- * @param headers The headers to send with the request
- * @param options The options for the request
- * @returns The scraped content
- */
-export async function scrapWithFireEngine({
- url,
- actions,
- waitFor = 0,
- screenshot = false,
- fullPageScreenshot = false,
- pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
- fireEngineOptions = {},
- headers,
- options,
- priority,
- teamId,
-}: {
- url: string;
- actions?: Action[];
- waitFor?: number;
- screenshot?: boolean;
- fullPageScreenshot?: boolean;
- pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
- fireEngineOptions?: FireEngineOptions;
- headers?: Record<string, string>;
- options?: any;
- priority?: number;
- teamId?: string;
- }): Promise<FireEngineResponse> {
- const logParams = {
- url,
- scraper: "fire-engine",
- success: false,
- response_code: null,
- time_taken_seconds: null,
- error_message: null,
- html: "",
- startTime: Date.now(),
- };
-
- try {
- const reqParams = await generateRequestParams(url);
- let waitParam = reqParams["params"]?.wait ?? waitFor;
- let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
- let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
- let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
- let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
-
-
- let endpoint = "/scrape";
-
- if(options?.endpoint === "request") {
- endpoint = "/request";
- }
-
- let engine = engineParam; // do we want fireEngineOptions as first choice?
-
- if (pageOptions?.useFastMode) {
- fireEngineOptionsParam.engine = "tlsclient";
- engine = "tlsclient";
- }
-
- Logger.info(
- `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
- );
-
- // atsv is only available for beta customers
- const betaCustomersString = process.env.BETA_CUSTOMERS;
- const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
-
- if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
- fireEngineOptionsParam.atsv = true;
- } else {
- pageOptions.atsv = false;
- }
-
- const axiosInstance = axios.create({
- headers: { "Content-Type": "application/json" }
- });
-
- const startTime = Date.now();
- const _response = await Sentry.startSpan({
- name: "Call to fire-engine"
- }, async span => {
-
- return await axiosInstance.post(
- process.env.FIRE_ENGINE_BETA_URL + endpoint,
- {
- url: url,
- headers: headers,
- wait: waitParam,
- screenshot: screenshotParam,
- fullPageScreenshot: fullPageScreenshotParam,
- disableJsDom: pageOptions?.disableJsDom ?? false,
- priority,
- engine,
- instantReturn: true,
- ...fireEngineOptionsParam,
- atsv: pageOptions?.atsv ?? false,
- scrollXPaths: pageOptions?.scrollXPaths ?? [],
- geolocation: pageOptions?.geolocation,
- skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
- actions: actions,
- },
- {
- headers: {
- "Content-Type": "application/json",
- ...(Sentry.isInitialized() ? ({
- "sentry-trace": Sentry.spanToTraceHeader(span),
- "baggage": Sentry.spanToBaggageHeader(span),
- }) : {}),
- }
- }
- );
- });
-
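- // Sum the milliseconds of all "wait" actions so the status-polling timeout below accounts for them.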
- const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
-
- let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
-
- // added 5 seconds to the timeout to account for 'smart wait'
- while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
- await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
- checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
- }
-
- if (checkStatusResponse.data.processing) {
- Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
- axiosInstance.delete(
- process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
- validateStatus: (status) => true
- }
- ).catch((error) => {
- Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
- });
-
- Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
- logParams.error_message = "Request timed out";
- return { html: "", pageStatusCode: null, pageError: "" };
- }
-
- if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
- Logger.debug(
- `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
- );
-
- logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
- logParams.response_code = checkStatusResponse.data?.pageStatusCode;
-
- if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
- Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
- }
-
- const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
-
- return {
- html: "",
- pageStatusCode,
- pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
- };
- }
-
- const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];
-
- if (contentType && contentType.includes("application/pdf")) {
- const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
- url,
- pageOptions?.parsePDF
- );
- logParams.success = true;
- logParams.response_code = pageStatusCode;
- logParams.error_message = pageError;
- return { html: content, pageStatusCode, pageError };
- } else {
- const data = checkStatusResponse.data;
-
- logParams.success =
- (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
- data.pageStatusCode === 404;
- logParams.html = data.content ?? "";
- logParams.response_code = data.pageStatusCode;
- logParams.error_message = data.pageError ?? data.error;
- return {
- html: data.content ?? "",
- screenshots: data.screenshots ?? [data.screenshot] ?? [],
- pageStatusCode: data.pageStatusCode,
- pageError: data.pageError ?? data.error,
- };
- }
- } catch (error) {
- if (error.code === "ECONNABORTED") {
- Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
- logParams.error_message = "Request timed out";
- } else {
- Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
- logParams.error_message = error.message || error;
- }
- return { html: "", pageStatusCode: null, pageError: logParams.error_message };
- } finally {
- const endTime = Date.now();
- logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
- await logScrape(logParams, pageOptions);
- }
-}
-
-
diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
deleted file mode 100644
index 09c7353b..00000000
--- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
+++ /dev/null
@@ -1,111 +0,0 @@
-import axios from "axios";
-import { logScrape } from "../../../services/logging/scrape_log";
-import { generateRequestParams } from "../single_url";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { Logger } from "../../../lib/logger";
-
-/**
- * Scrapes a URL with Playwright
- * @param url The URL to scrape
- * @param waitFor The time to wait for the page to load
- * @param headers The headers to send with the request
- * @param pageOptions The options for the page
- * @returns The scraped content
- */
-export async function scrapWithPlaywright(
- url: string,
- waitFor: number = 0,
- headers?: Record<string, string>,
- pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
- const logParams = {
- url,
- scraper: "playwright",
- success: false,
- response_code: null,
- time_taken_seconds: null,
- error_message: null,
- html: "",
- startTime: Date.now(),
- };
-
- try {
- const reqParams = await generateRequestParams(url);
- // If the user has passed a wait parameter in the request, use that
- const waitParam = reqParams["params"]?.wait ?? waitFor;
-
- const response = await axios.post(
- process.env.PLAYWRIGHT_MICROSERVICE_URL,
- {
- url: url,
- wait_after_load: waitParam,
- timeout: universalTimeout + waitParam,
- headers: headers,
- },
- {
- headers: {
- "Content-Type": "application/json",
- },
- timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
- transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
- }
- );
-
- if (response.status !== 200) {
- Logger.debug(
- `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
- );
- logParams.error_message = response.data?.pageError;
- logParams.response_code = response.data?.pageStatusCode;
- return {
- content: "",
- pageStatusCode: response.data?.pageStatusCode,
- pageError: response.data?.pageError,
- };
- }
-
- const contentType = response.headers["content-type"];
- if (contentType && contentType.includes("application/pdf")) {
- logParams.success = true;
- const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
- logParams.response_code = pageStatusCode;
- logParams.error_message = pageError;
- return { content, pageStatusCode, pageError };
- } else {
- const textData = response.data;
- try {
- const data = JSON.parse(textData);
- const html = data.content;
- logParams.success = true;
- logParams.html = html;
- logParams.response_code = data.pageStatusCode;
- logParams.error_message = data.pageError;
- return {
- content: html ?? "",
- pageStatusCode: data.pageStatusCode,
- pageError: data.pageError,
- };
- } catch (jsonError) {
- logParams.error_message = jsonError.message || jsonError;
- Logger.debug(
- `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
- );
- return { content: "", pageStatusCode: null, pageError: logParams.error_message };
- }
- }
- } catch (error) {
- if (error.code === "ECONNABORTED") {
- logParams.error_message = "Request timed out";
- Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
- } else {
- logParams.error_message = error.message || error;
- Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
- }
- return { content: "", pageStatusCode: null, pageError: logParams.error_message };
- } finally {
- const endTime = Date.now();
- logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
- await logScrape(logParams);
- }
-}
diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
deleted file mode 100644
index b72fa8b2..00000000
--- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
+++ /dev/null
@@ -1,92 +0,0 @@
-import { logScrape } from "../../../services/logging/scrape_log";
-import { generateRequestParams } from "../single_url";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { ScrapingBeeClient } from "scrapingbee";
-import { Logger } from "../../../lib/logger";
-
-/**
- * Scrapes a URL with ScrapingBee
- * @param url The URL to scrape
- * @param wait_browser The browser event to wait for
- * @param timeout The timeout for the scrape
- * @param pageOptions The options for the page
- * @returns The scraped content
- */
-export async function scrapWithScrapingBee(
- url: string,
- wait_browser: string = "domcontentloaded",
- timeout: number = universalTimeout,
- pageOptions: { parsePDF?: boolean } = { parsePDF: true }
- ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
- const logParams = {
- url,
- scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee",
- success: false,
- response_code: null,
- time_taken_seconds: null,
- error_message: null,
- html: "",
- startTime: Date.now(),
- };
- try {
- const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
- const clientParams = await generateRequestParams(
- url,
- wait_browser,
- timeout
- );
- const response = await client.get({
- ...clientParams,
- params: {
- ...clientParams.params,
- transparent_status_code: "True",
- },
- });
- Logger.info(
- `⛏️ ScrapingBee: Scraping ${url}`
- );
- const contentType = response.headers["content-type"];
- if (contentType && contentType.includes("application/pdf")) {
- logParams.success = true;
- const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
- logParams.response_code = pageStatusCode;
- logParams.error_message = pageError;
- return { content, pageStatusCode, pageError };
- } else {
- let text = "";
- try {
- const decoder = new TextDecoder();
- text = decoder.decode(response.data);
- logParams.success = true;
- } catch (decodeError) {
- Logger.debug(
- `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
- );
- logParams.error_message = decodeError.message || decodeError;
- }
- logParams.response_code = response.status;
- logParams.html = text;
- logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
- logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
- return {
- content: text,
- pageStatusCode: response.status,
- pageError: response.statusText !== "OK" ? response.statusText : undefined,
- };
- }
- } catch (error) {
- Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
- logParams.error_message = error.message || error;
- logParams.response_code = error.response?.status;
- return {
- content: "",
- pageStatusCode: error.response?.status,
- pageError: error.response?.statusText,
- };
- } finally {
- const endTime = Date.now();
- logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
- await logScrape(logParams);
- }
- }
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
deleted file mode 100644
index cd76793c..00000000
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ /dev/null
@@ -1,496 +0,0 @@
-import * as cheerio from "cheerio";
-import { extractMetadata } from "./utils/metadata";
-import dotenv from "dotenv";
-import {
- Document,
- PageOptions,
- FireEngineResponse,
- ExtractorOptions,
- Action,
-} from "../../lib/entities";
-import { parseMarkdown } from "../../lib/html-to-markdown";
-import { urlSpecificParams } from "./utils/custom/website_params";
-import { fetchAndProcessPdf } from "./utils/pdfProcessor";
-import { handleCustomScraping } from "./custom/handleCustomScraping";
-import { removeUnwantedElements } from "./utils/removeUnwantedElements";
-import { scrapWithFetch } from "./scrapers/fetch";
-import { scrapWithFireEngine } from "./scrapers/fireEngine";
-import { scrapWithPlaywright } from "./scrapers/playwright";
-import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
-import { extractLinks } from "./utils/utils";
-import { Logger } from "../../lib/logger";
-import { ScrapeEvents } from "../../lib/scrape-events";
-import { clientSideError } from "../../strings";
-
-dotenv.config();
-
-const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
-const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
-
-export const baseScrapers = [
- useFireEngine ? "fire-engine;chrome-cdp" : undefined,
- useFireEngine ? "fire-engine" : undefined,
- useScrapingBee ? "scrapingBee" : undefined,
- useFireEngine ? undefined : "playwright",
- useScrapingBee ? "scrapingBeeLoad" : undefined,
- "fetch",
-].filter(Boolean);
-
-export async function generateRequestParams(
- url: string,
- wait_browser: string = "domcontentloaded",
- timeout: number = 15000
- ): Promise<any> {
- const defaultParams = {
- url: url,
- params: { timeout: timeout, wait_browser: wait_browser },
- headers: { "ScrapingService-Request": "TRUE" },
- };
-
- try {
- const urlKey = new URL(url).hostname.replace(/^www\./, "");
- if (urlSpecificParams.hasOwnProperty(urlKey)) {
- return { ...defaultParams, ...urlSpecificParams[urlKey] };
- } else {
- return defaultParams;
- }
- } catch (error) {
- Logger.error(`Error generating URL key: ${error}`);
- return defaultParams;
- }
-}
-
-/**
- * Get the order of scrapers to be used for scraping a URL
- * If the user doesn't have envs set for a specific scraper, it will be removed from the order.
- * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
- * @returns The order of scrapers to be used for scraping a URL
- */
-function getScrapingFallbackOrder(
- defaultScraper?: string,
- isWaitPresent: boolean = false,
- isScreenshotPresent: boolean = false,
- isHeadersPresent: boolean = false,
- isActionsPresent: boolean = false,
-) {
- if (isActionsPresent) {
- return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
- }
-
- const availableScrapers = baseScrapers.filter((scraper) => {
- switch (scraper) {
- case "scrapingBee":
- case "scrapingBeeLoad":
- return !!process.env.SCRAPING_BEE_API_KEY;
- case "fire-engine":
- return !!process.env.FIRE_ENGINE_BETA_URL;
- case "fire-engine;chrome-cdp":
- return !!process.env.FIRE_ENGINE_BETA_URL;
- case "playwright":
- return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
- default:
- return true;
- }
- });
-
- let defaultOrder = [
- useFireEngine ? "fire-engine;chrome-cdp" : undefined,
- useFireEngine ? "fire-engine" : undefined,
- useScrapingBee ? "scrapingBee" : undefined,
- useScrapingBee ? "scrapingBeeLoad" : undefined,
- useFireEngine ? undefined : "playwright",
- "fetch",
- ].filter(Boolean);
-
- // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
- // defaultOrder = [
- // "fire-engine",
- // useFireEngine ? undefined : "playwright",
- // ...defaultOrder.filter(
- // (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
- // ),
- // ].filter(Boolean);
- // }
-
- const filteredDefaultOrder = defaultOrder.filter(
- (scraper: (typeof baseScrapers)[number]) =>
- availableScrapers.includes(scraper)
- );
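- // Deduplicate while keeping the URL-specific default scraper (if any) at the front of the fallback order.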
- const uniqueScrapers = new Set(
- defaultScraper
- ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
- : [...filteredDefaultOrder, ...availableScrapers]
- );
-
- const scrapersInOrder = Array.from(uniqueScrapers);
- return scrapersInOrder as (typeof baseScrapers)[number][];
-}
-
-
-
-export async function scrapSingleUrl(
- jobId: string,
- urlToScrap: string,
- pageOptions: PageOptions,
- extractorOptions?: ExtractorOptions,
- existingHtml?: string,
- priority?: number,
- teamId?: string
- ): Promise<Document> {
- pageOptions = {
- includeMarkdown: pageOptions.includeMarkdown ?? true,
- includeExtract: pageOptions.includeExtract ?? false,
- onlyMainContent: pageOptions.onlyMainContent ?? false,
- includeHtml: pageOptions.includeHtml ?? false,
- includeRawHtml: pageOptions.includeRawHtml ?? false,
- waitFor: pageOptions.waitFor ?? undefined,
- screenshot: pageOptions.screenshot ?? false,
- fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
- headers: pageOptions.headers ?? undefined,
- includeLinks: pageOptions.includeLinks ?? true,
- replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
- parsePDF: pageOptions.parsePDF ?? true,
- removeTags: pageOptions.removeTags ?? [],
- onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
- useFastMode: pageOptions.useFastMode ?? false,
- disableJsDom: pageOptions.disableJsDom ?? false,
- atsv: pageOptions.atsv ?? false,
- actions: pageOptions.actions ?? undefined,
- geolocation: pageOptions.geolocation ?? undefined,
- skipTlsVerification: pageOptions.skipTlsVerification ?? false,
- }
-
- if (extractorOptions) {
- extractorOptions = {
- mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
- }
- }
-
- if (!existingHtml) {
- existingHtml = "";
- }
-
- urlToScrap = urlToScrap.trim();
-
- const attemptScraping = async (
- url: string,
- method: (typeof baseScrapers)[number]
- ) => {
- let scraperResponse: {
- text: string;
- screenshot: string;
- actions?: {
- screenshots: string[];
- };
- metadata: { pageStatusCode?: number; pageError?: string | null };
- } = { text: "", screenshot: "", metadata: {} };
- let screenshot = "";
-
- const timer = Date.now();
- const logInsertPromise = ScrapeEvents.insert(jobId, {
- type: "scrape",
- url,
- worker: process.env.FLY_MACHINE_ID,
- method,
- result: null,
- });
-
- switch (method) {
- case "fire-engine":
- case "fire-engine;chrome-cdp":
-
- let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
- if (method === "fire-engine;chrome-cdp") {
- engine = "chrome-cdp";
- }
-
- if (process.env.FIRE_ENGINE_BETA_URL) {
- const processedActions: Action[] = pageOptions.actions?.flatMap((action: Action, index: number, array: Action[]) => {
- if (action.type === "click" || action.type === "write" || action.type === "press") {
- const result: Action[] = [];
- // Don't add a wait if the previous action is a wait
- if (index === 0 || array[index - 1].type !== "wait") {
- result.push({ type: "wait", milliseconds: 1200 } as Action);
- }
- result.push(action);
- // Don't add a wait if the next action is a wait
- if (index === array.length - 1 || array[index + 1].type !== "wait") {
- result.push({ type: "wait", milliseconds: 1200 } as Action);
- }
- return result;
- }
- return [action as Action];
- }) ?? [] as Action[];
-
- const response = await scrapWithFireEngine({
- url,
- ...(engine === "chrome-cdp" ? ({
- actions: [
- ...(pageOptions.waitFor ? [{
- type: "wait" as const,
- milliseconds: pageOptions.waitFor,
- }] : []),
- ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
- type: "screenshot" as const,
- fullPage: !!pageOptions.fullPageScreenshot,
- }] : []),
- ...processedActions,
- ],
- }) : ({
- waitFor: pageOptions.waitFor,
- screenshot: pageOptions.screenshot,
- fullPageScreenshot: pageOptions.fullPageScreenshot,
- })),
- pageOptions: pageOptions,
- headers: pageOptions.headers,
- fireEngineOptions: {
- engine: engine,
- atsv: pageOptions.atsv,
- disableJsDom: pageOptions.disableJsDom,
- },
- priority,
- teamId,
- });
- scraperResponse.text = response.html;
- if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
- scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
- }
- if (pageOptions.actions) {
- scraperResponse.actions = {
- screenshots: response.screenshots ?? [],
- };
- }
- scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
- scraperResponse.metadata.pageError = response.pageError;
- }
- break;
- case "scrapingBee":
- if (process.env.SCRAPING_BEE_API_KEY) {
- const response = await scrapWithScrapingBee(
- url,
- "domcontentloaded",
- pageOptions.fallback === false ? 7000 : 15000
- );
- scraperResponse.text = response.content;
- scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
- scraperResponse.metadata.pageError = response.pageError;
- }
- break;
- case "playwright":
- if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
- const response = await scrapWithPlaywright(
- url,
- pageOptions.waitFor,
- pageOptions.headers
- );
- scraperResponse.text = response.content;
- scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
- scraperResponse.metadata.pageError = response.pageError;
- }
- break;
- case "scrapingBeeLoad":
- if (process.env.SCRAPING_BEE_API_KEY) {
- const response = await scrapWithScrapingBee(url, "networkidle2");
- scraperResponse.text = response.content;
- scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
- scraperResponse.metadata.pageError = response.pageError;
- }
- break;
- case "fetch":
- const response = await scrapWithFetch(url);
- scraperResponse.text = response.content;
- scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
- scraperResponse.metadata.pageError = response.pageError;
- break;
- }
-
- let customScrapedContent: FireEngineResponse | null = null;
-
- // Check for custom scraping conditions
- const customScraperResult = await handleCustomScraping(
- scraperResponse.text,
- url
- );
-
- if (customScraperResult) {
- switch (customScraperResult.scraper) {
- case "fire-engine":
- customScrapedContent = await scrapWithFireEngine({
- url: customScraperResult.url,
- actions: customScraperResult.waitAfterLoad ? ([
- {
- type: "wait",
- milliseconds: customScraperResult.waitAfterLoad,
- }
- ]) : ([]),
- pageOptions: customScraperResult.pageOptions,
- });
- break;
- case "pdf":
- const { content, pageStatusCode, pageError } =
- await fetchAndProcessPdf(
- customScraperResult.url,
- pageOptions?.parsePDF
- );
- customScrapedContent = {
- html: content,
- pageStatusCode,
- pageError,
- };
- break;
- }
- }
-
- if (customScrapedContent) {
- scraperResponse.text = customScrapedContent.html;
- }
- //* TODO: add an optional to return markdown or structured/extracted content
- let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
- const text = await parseMarkdown(cleanedHtml);
-
- const insertedLogId = await logInsertPromise;
- ScrapeEvents.updateScrapeResult(insertedLogId, {
- response_size: scraperResponse.text.length,
- success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
- error: scraperResponse.metadata.pageError,
- response_code: scraperResponse.metadata.pageStatusCode,
- time_taken: Date.now() - timer,
- });
-
- return {
- text,
- html: cleanedHtml,
- rawHtml: scraperResponse.text,
- screenshot: scraperResponse.screenshot,
- actions: scraperResponse.actions,
- pageStatusCode: scraperResponse.metadata.pageStatusCode,
- pageError: scraperResponse.metadata.pageError || undefined,
- };
- };
-
- let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
- text: "",
- html: "",
- rawHtml: "",
- screenshot: "",
- actions: undefined,
- pageStatusCode: 200,
- pageError: undefined,
- };
- try {
- let urlKey = urlToScrap;
- try {
- urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
- } catch (error) {
- Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
- }
- const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
- const scrapersInOrder = getScrapingFallbackOrder(
- defaultScraper,
- pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
- pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
- pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
- pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
- );
-
- for (const scraper of scrapersInOrder) {
- // If exists text coming from crawler, use it
- if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
- let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
- text = await parseMarkdown(cleanedHtml);
- html = cleanedHtml;
- break;
- }
-
- const attempt = await attemptScraping(urlToScrap, scraper);
- text = attempt.text ?? "";
- html = attempt.html ?? "";
- rawHtml = attempt.rawHtml ?? "";
- screenshot = attempt.screenshot ?? "";
- actions = attempt.actions ?? undefined;
-
- if (attempt.pageStatusCode) {
- pageStatusCode = attempt.pageStatusCode;
- }
-
- if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too
- pageError = attempt.pageError;
-
- if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) {
- pageStatusCode = 500;
- }
- } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
- pageError = undefined;
- }
-
- if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
- Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
- break;
- }
- if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
- Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
- break;
- }
- // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
- // if (nextScraperIndex < scrapersInOrder.length) {
- // Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
- // }
- }
-
- if (!text) {
- throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
- }
-
- const soup = cheerio.load(rawHtml);
- const metadata = extractMetadata(soup, urlToScrap);
-
- let linksOnPage: string[] | undefined;
-
- if (pageOptions.includeLinks) {
- linksOnPage = extractLinks(rawHtml, urlToScrap);
- }
-
- let document: Document = {
- content: text,
- markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
- html: pageOptions.includeHtml ? html : undefined,
- rawHtml:
- pageOptions.includeRawHtml ||
- (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
- ? rawHtml
- : undefined,
- linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
- actions,
- metadata: {
- ...metadata,
- ...(screenshot && screenshot.length > 0 ? ({
- screenshot,
- }) : {}),
- sourceURL: urlToScrap,
- pageStatusCode: pageStatusCode,
- pageError: pageError,
- },
- };
-
- return document;
- } catch (error) {
- Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
- ScrapeEvents.insert(jobId, {
- type: "error",
- message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
- stack: error.stack,
- });
-
- return {
- content: "",
- markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
- html: "",
- linksOnPage: pageOptions.includeLinks ? [] : undefined,
- metadata: {
- sourceURL: urlToScrap,
- pageStatusCode: pageStatusCode,
- pageError: pageError,
- },
- } as Document;
- }
-}
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 756cd765..05b3d00d 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -1,9 +1,10 @@
import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js";
-import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { WebCrawler } from "./crawler";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
+import { scrapeURL } from "../scrapeURL";
+import { scrapeOptions } from "../../controllers/v1/types";
export async function getLinksFromSitemap(
{
@@ -17,17 +18,20 @@ export async function getLinksFromSitemap(
}
): Promise<string[]> {
try {
- let content: string;
+ let content: string = "";
try {
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === 'fire-engine') {
- const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } });
- content = response.html;
+ const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
+ if (!response.success) {
+ throw response.error;
+ }
+ content = response.document.rawHtml!;
}
} catch (error) {
- Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
+ logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
return allUrls;
}
@@ -47,7 +51,7 @@ export async function getLinksFromSitemap(
allUrls.push(...validUrls);
}
} catch (error) {
- Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
+ logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
}
return allUrls;
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
deleted file mode 100644
index 53237ef8..00000000
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
+++ /dev/null
@@ -1,15 +0,0 @@
-import * as docxProcessor from "../docxProcessor";
-
-describe("DOCX Processing Module - Integration Test", () => {
- it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
- delete process.env.LLAMAPARSE_API_KEY;
- const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
- "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
- );
- expect(content.trim()).toContain(
- "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
- );
- expect(pageStatusCode).toBe(200);
- expect(pageError).toBeUndefined();
- });
-});
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
deleted file mode 100644
index 8d644c7b..00000000
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
+++ /dev/null
@@ -1,128 +0,0 @@
-import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
-import cheerio from 'cheerio';
-
-describe('parseTablesToMarkdown', () => {
- it('converts a simple HTML table to Markdown', async () => {
- const html = `
- <table>
-   <tr><th>Header 1</th><th>Header 2</th></tr>
-   <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-   <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
- </table>
- `;
- const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |`;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with a single row to Markdown', async () => {
- const html = `
- <table>
-   <tr><th>Header 1</th><th>Header 2</th></tr>
-   <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
- </table>
- `;
- const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |`;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with a single column to Markdown', async () => {
- const html = `
- <table>
-   <tr><th>Header 1</th></tr>
-   <tr><td>Row 1 Col 1</td></tr>
-   <tr><td>Row 2 Col 1</td></tr>
- </table>
- `;
- const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |`;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with a single cell to Markdown', async () => {
- const html = `
- <table>
-   <tr><th>Header 1</th></tr>
-   <tr><td>Row 1 Col 1</td></tr>
- </table>
- `;
- const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |`;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with no header to Markdown', async () => {
- const html = `
- <table>
-   <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-   <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
- </table>
- `;
- const expectedMarkdown = `| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |`;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with no rows to Markdown', async () => {
- const html = `
- <table></table>
- `;
- const expectedMarkdown = ``;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with no cells to Markdown', async () => {
- const html = `
-
- `;
- const expectedMarkdown = ``;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with no columns to Markdown', async () => {
- const html = `
-
- `;
- const expectedMarkdown = ``;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
- it('converts a table with no table to Markdown', async () => {
- const html = ``;
- const expectedMarkdown = ``;
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
- });
-
-it('converts a table inside of a bunch of html noise', async () => {
- const html = `
-  <div>
-    <p>Some text before</p>
-    <table>
-      <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-      <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
-    </table>
-    <p>Some text after</p>
-  </div>
- `;
- const expectedMarkdown = `
-  <div>
-    <p>Some text before</p>
-    | Row 1 Col 1 | Row 1 Col 2 |
-| Row 2 Col 1 | Row 2 Col 2 |
-    <p>Some text after</p>
-  </div>
-  `;
-
- const markdown = await parseTablesToMarkdown(html);
- expect(markdown).toBe(expectedMarkdown);
-});
-
-});
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
deleted file mode 100644
index 18302654..00000000
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-import * as pdfProcessor from '../pdfProcessor';
-
-describe('PDF Processing Module - Integration Test', () => {
- it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
- delete process.env.LLAMAPARSE_API_KEY;
- const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
- expect(content.trim()).toEqual("Dummy PDF file");
- expect(pageStatusCode).toEqual(200);
- expect(pageError).toBeUndefined();
- });
-
- it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
- const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false);
- expect(pageStatusCode).toBe(200);
- expect(pageError).toBeUndefined();
- expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
- }, 60000); // 60 seconds
-
-});
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
deleted file mode 100644
index b3d4a244..00000000
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
+++ /dev/null
@@ -1,192 +0,0 @@
-import { removeUnwantedElements } from "../removeUnwantedElements";
-import { PageOptions } from "../../../../lib/entities";
-
-describe('removeUnwantedElements', () => {
- it('should remove script, style, iframe, noscript, meta, and head tags', () => {
- const html = `TestContent
`;
- const options: PageOptions = {};
- const result = removeUnwantedElements(html, options);
- expect(result).not.toContain('