This commit is contained in:
Nicolas 2025-02-20 10:50:32 -03:00
commit 2151ca846c
61 changed files with 2318 additions and 576 deletions

View File

@ -1,20 +0,0 @@
name: Clean Every 30 Minutes Before 24h Completed Jobs
on:
schedule:
- cron: '30 * * * *'
env:
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
jobs:
clean-jobs:
runs-on: ubuntu-latest
steps:
- name: Send GET request to clean jobs
run: |
response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs)
if [ "$response" -ne 200 ]; then
echo "Failed to clean jobs. Response: $response"
exit 1
fi
echo "Successfully cleaned jobs. Response: $response"

View File

@ -4,9 +4,6 @@ env:
DOTNET_VERSION: '6.0.x'
on:
push:
branches:
- mog/webscraper-refactor
workflow_dispatch:
jobs:

View File

@ -2,12 +2,13 @@ name: Deploy Images to GHCR
env:
DOTNET_VERSION: '6.0.x'
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
on:
push:
branches:
- main
paths:
- apps/api/**
workflow_dispatch:
jobs:
@ -29,5 +30,5 @@ jobs:
- name: 'Build Inventory Image'
run: |
docker build . --tag ghcr.io/mendableai/firecrawl:latest --secret id=SENTRY_AUTH_TOKEN
docker build . --tag ghcr.io/mendableai/firecrawl:latest
docker push ghcr.io/mendableai/firecrawl:latest

32
.github/workflows/publish-js-sdk.yml vendored Normal file
View File

@ -0,0 +1,32 @@
name: Publish JS SDK
on:
push:
branches:
- main
paths:
- apps/js-sdk/firecrawl/package.json
env:
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
jobs:
publish:
name: Publish
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: "20"
- name: Authenticate
run: echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" > ~/.npmrc
- name: Publish
run: |
npm publish
sed -i 's/"name": "@mendable\/firecrawl-js"/"name": "@mendable\/firecrawl"/g' package.json
npm publish
sed -i 's/"name": "@mendable\/firecrawl-js"/"name": "firecrawl"/g' package.json
npm publish
working-directory: ./apps/js-sdk/firecrawl
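The sed steps above rename the package between publishes, so a single run ships the same build under three names; any of these should then install the identical SDK:

```bash
npm install @mendable/firecrawl-js   # primary package name
npm install @mendable/firecrawl      # scoped alias
npm install firecrawl                # unscoped alias
```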

30
.github/workflows/test-js-sdk.yml vendored Normal file
View File

@ -0,0 +1,30 @@
name: JS SDK Test Suite
on:
pull_request:
branches:
- main
paths:
- apps/js-sdk/firecrawl/**
env:
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
jobs:
test:
name: Run tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: "20"
cache: "npm"
cache-dependency-path: './apps/js-sdk/firecrawl/package-lock.json'
- name: Install dependencies
run: npm install
working-directory: ./apps/js-sdk/firecrawl
- name: Run tests
run: npm run test
working-directory: ./apps/js-sdk/firecrawl
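The same suite can be run locally before opening a PR, mirroring the workflow steps above (assumes `TEST_API_KEY` is set in your environment):

```bash
cd apps/js-sdk/firecrawl
npm install
npm run test
```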

View File

@ -0,0 +1,138 @@
name: Self-hosted Server Test Suite
on:
pull_request:
branches:
- main
paths:
- apps/api/**
- apps/playwright-service-ts/**
env:
PORT: 3002
REDIS_URL: redis://localhost:6379
HOST: 0.0.0.0
ENV: ${{ secrets.ENV }}
TEST_SUITE_SELF_HOSTED: true
USE_GO_MARKDOWN_PARSER: true
jobs:
test:
name: Run tests
strategy:
matrix:
ai: ["openai", "no-ai"]
search: ["searxng", "google"]
engine: ["playwright", "fetch"]
proxy: ["proxy", "no-proxy"]
fail-fast: false
runs-on: ubuntu-latest
services:
redis:
image: redis
ports:
- 6379:6379
env:
OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }}
SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }}
PROXY_SERVER: ${{ matrix.proxy == 'proxy' && secrets.PROXY_SERVER || '' }}
PROXY_USERNAME: ${{ matrix.proxy == 'proxy' && secrets.PROXY_USERNAME || '' }}
PROXY_PASSWORD: ${{ matrix.proxy == 'proxy' && secrets.PROXY_PASSWORD || '' }}
steps:
- uses: actions/checkout@v3
- name: Install pnpm
uses: pnpm/action-setup@v4
with:
version: 10
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: "20"
cache: "pnpm"
cache-dependency-path: './apps/api/pnpm-lock.yaml'
- name: Install dependencies
run: pnpm install
working-directory: ./apps/api
- name: Install Playwright dependencies
if: matrix.engine == 'playwright'
run: |
pnpm install
pnpm exec playwright install-deps
pnpm exec playwright install
working-directory: ./apps/playwright-service-ts
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.19'
cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum
- name: Build go-html-to-md
run: |
go mod tidy
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
working-directory: ./apps/api/sharedLibs/go-html-to-md
- name: Set up SearXNG
if: matrix.search == 'searxng'
run: |
mkdir searxng
echo "use_default_settings: true
search:
formats: [html, json, csv]
server:
secret_key: 'fcsecret'" > searxng/settings.yml
docker run -d -p 3434:8080 -v "${PWD}/searxng:/etc/searxng" --name searxng searxng/searxng
pnpx wait-on tcp:3434 -t 30s
working-directory: ./
- name: Start server
run: npm start > api.log 2>&1 &
working-directory: ./apps/api
- name: Start worker
run: npm run workers > worker.log 2>&1 &
working-directory: ./apps/api
- name: Start playwright
if: matrix.engine == 'playwright'
run: npm run dev > playwright.log 2>&1 &
working-directory: ./apps/playwright-service-ts
env:
PORT: 3003
- name: Wait for server
run: pnpx wait-on tcp:3002 -t 15s
- name: Wait for playwright
if: matrix.engine == 'playwright'
run: pnpx wait-on tcp:3003 -t 15s
- name: Run snippet tests
run: |
npm run test:snips
working-directory: ./apps/api
- name: Kill instances
if: always()
run: pkill -9 node
- name: Kill SearXNG
if: always() && matrix.search == 'searxng'
run: |
docker logs searxng > searxng/searxng.log 2>&1
docker kill searxng
working-directory: ./
- uses: actions/upload-artifact@v4
if: always()
with:
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
path: |
./apps/api/api.log
./apps/api/worker.log
- uses: actions/upload-artifact@v4
if: always() && matrix.engine == 'playwright'
with:
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.proxy }})
path: |
./apps/playwright-service-ts/playwright.log
- uses: actions/upload-artifact@v4
if: always() && matrix.search == 'searxng'
with:
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
path: |
./searxng/searxng.log
./searxng/settings.yml
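As a side note, the SearXNG container configured in the workflow above can be sanity-checked by hand. Since `settings.yml` enables the `json` format, a query like this should return JSON once the container is listening on port 3434 (a sketch, not part of the workflow):

```bash
curl 'http://localhost:3434/search?q=firecrawl&format=json'
```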

View File

@ -1,8 +1,11 @@
name: CI/CD
name: Server Test Suite
on:
pull_request:
branches:
- main
paths:
- apps/api/**
# schedule:
# - cron: '0 */4 * * *'
@ -29,10 +32,11 @@ env:
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
ENV: ${{ secrets.ENV }}
USE_GO_MARKDOWN_PARSER: true
jobs:
pre-deploy:
name: Pre-deploy checks
test:
name: Run tests
runs-on: ubuntu-latest
services:
redis:
@ -47,15 +51,30 @@ jobs:
oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
tags: tag:ci
- name: Install pnpm
uses: pnpm/action-setup@v4
with:
version: 10
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: "20"
- name: Install pnpm
run: npm install -g pnpm
cache: "pnpm"
cache-dependency-path: './apps/api/pnpm-lock.yaml'
- name: Install dependencies
run: pnpm install
working-directory: ./apps/api
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.19'
cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum
- name: Build go-html-to-md
run: |
go mod tidy
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
working-directory: ./apps/api/sharedLibs/go-html-to-md
- name: Start the application
run: npm start &
working-directory: ./apps/api

View File

@ -95,7 +95,7 @@ curl -X POST https://api.firecrawl.dev/v1/crawl \
-H 'Authorization: Bearer fc-YOUR_API_KEY' \
-d '{
"url": "https://docs.firecrawl.dev",
"limit": 100,
"limit": 10,
"scrapeOptions": {
"formats": ["markdown", "html"]
}

View File

@ -34,62 +34,72 @@ Self-hosting Firecrawl is ideal for those who need full control over their scrap
2. Set environment variables
Create an `.env` in the root directory; you can copy over the template in `apps/api/.env.example`.
To start, we won't set up authentication or any optional subservices (PDF parsing, JS blocking support, AI features).
Create an `.env` in the root directory using the template below.
`.env:`
```
# ===== Required ENVS ======
NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://redis:6379
REDIS_RATE_LIMIT_URL=redis://redis:6379
## To turn on DB authentication, you need to set up Supabase.
# To turn on DB authentication, you need to set up Supabase.
USE_DB_AUTHENTICATION=false
# ===== Optional ENVS ======
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
## === AI features (JSON format on scrape, /extract API) ===
# Provide your OpenAI API key here to enable AI features
# OPENAI_API_KEY=
# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
## === Proxy ===
# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP and port combo (e.g. 0.1.2.3:1234)
# Do not uncomment PROXY_USERNAME and PROXY_PASSWORD if your proxy is unauthenticated
# PROXY_SERVER=
# PROXY_USERNAME=
# PROXY_PASSWORD=
## === /search API ===
# By default, the /search API will use Google search.
# You can specify a SearXNG server with the JSON format enabled, if you'd like to use that instead of direct Google.
# You can also customize the engines and categories parameters, but the defaults should also work just fine.
# SEARXNG_ENDPOINT=http://your.searxng.server
# SEARXNG_ENGINES=
# SEARXNG_CATEGORIES=
## === Other ===
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
# SUPABASE_ANON_TOKEN=
# SUPABASE_URL=
# SUPABASE_SERVICE_TOKEN=
# Use if you've set up authentication and want to test with a real API key
# TEST_API_KEY=
# You can add this to enable ScrapingBee as a fallback scraping engine.
# SCRAPING_BEE_API_KEY=
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
BULL_AUTH_KEY=CHANGEME
# This is now autoconfigured by the docker-compose.yaml. You shouldn't need to set it.
# PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/scrape
# REDIS_URL=redis://redis:6379
# REDIS_RATE_LIMIT_URL=redis://redis:6379
# Set if you have a llamaparse key you'd like to use to parse pdfs
# LLAMAPARSE_API_KEY=
# Set if you'd like to send server health status messages to Slack
# SLACK_WEBHOOK_URL=
# Set if you'd like to send posthog events like job logs
# POSTHOG_API_KEY=
# POSTHOG_HOST=
```
3. *(Optional) Running with TypeScript Playwright Service*
* Update the `docker-compose.yml` file to change the Playwright service:
```plaintext
build: apps/playwright-service
```
TO
```plaintext
build: apps/playwright-service-ts
```
* Set the `PLAYWRIGHT_MICROSERVICE_URL` in your `.env` file:
```plaintext
PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
```
* Don't forget to set the proxy server in your `.env` file as needed.
4. Build and run the Docker containers:
3. Build and run the Docker containers:
```bash
docker compose build
@ -98,9 +108,9 @@ POSTHOG_HOST= # set if you'd like to send posthog events like job logs
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
You should be able to see the Bull Queue Manager UI on `http://localhost:3002/admin/@/queues`.
You should be able to see the Bull Queue Manager UI on `http://localhost:3002/admin/CHANGEME/queues`.
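If you'd rather verify from the command line than a browser, a quick sanity check (assuming the default `BULL_AUTH_KEY=CHANGEME` from the template above):

```bash
# Expect a 200 once the containers are up
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:3002/admin/CHANGEME/queues
```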
5. *(Optional)* Test the API
4. *(Optional)* Test the API
If you'd like to test the crawl endpoint, you can run this:
@ -108,7 +118,7 @@ If you'd like to test the crawl endpoint, you can run this:
curl -X POST http://localhost:3002/v1/crawl \
-H 'Content-Type: application/json' \
-d '{
"url": "https://mendable.ai"
"url": "https://firecrawl.dev"
}'
```
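The POST response includes a job `id`; progress can then be polled with the status route (a sketch, assuming the standard v1 endpoint):

```bash
# Substitute the id returned by the POST above
curl http://localhost:3002/v1/crawl/<job-id>
```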

View File

@ -19,8 +19,7 @@ RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile
RUN apt-get clean && apt-get update -qq && apt-get install -y ca-certificates && update-ca-certificates
RUN pnpm install
RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi'
RUN pnpm run build
# Install Go
FROM golang:1.19 AS go-base

View File

@ -9,7 +9,7 @@
"format": "prettier --write \"src/**/*.(js|ts)\"",
"flyio": "node dist/src/index.js",
"start:dev": "nodemon --exec ts-node src/index.ts",
"build": "tsc && pnpm sentry:sourcemaps",
"build": "tsc",
"build:nosentry": "tsc",
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
@ -56,6 +56,7 @@
"typescript": "^5.4.2"
},
"dependencies": {
"jsdom": "^26.0.0",
"@anthropic-ai/sdk": "^0.24.3",
"@apidevtools/json-schema-ref-parser": "^11.7.3",
"@brillout/import": "^0.2.2",

286
apps/api/pnpm-lock.yaml generated
View File

@ -125,6 +125,9 @@ importers:
joplin-turndown-plugin-gfm:
specifier: ^1.0.12
version: 1.0.12
jsdom:
specifier: ^26.0.0
version: 26.0.0
json-schema-to-zod:
specifier: ^2.3.0
version: 2.3.0
@ -136,7 +139,7 @@ importers:
version: 2.9.0
langchain:
specifier: ^0.2.8
version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
languagedetect:
specifier: ^2.0.0
version: 2.0.0
@ -332,6 +335,9 @@ packages:
resolution: {integrity: sha512-WApSdLdXEBb/1FUPca2lteASewEfpjEYJ8oXZP+0gExK5qSfsEKBKcA+WjY6Q4wvXwyv0+W6Kvc372pSceib9w==}
engines: {node: '>= 16'}
'@asamuzakjp/css-color@2.8.3':
resolution: {integrity: sha512-GIc76d9UI1hCvOATjZPyHFmE5qhRccp3/zGfMPapK3jBi+yocEzp6BBB0UnfRYP9NP4FANqUZYb0hnfs3TM3hw==}
'@aws-crypto/crc32@3.0.0':
resolution: {integrity: sha512-IzSgsrxUcsrejQbPVilIKy16kAT52EwB6zSaI+M3xxIhKh5+aldEyvI+z6erM7TCLB2BJsFrtHjp6/4/sr+3dA==}
@ -685,6 +691,34 @@ packages:
resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==}
engines: {node: '>=12'}
'@csstools/color-helpers@5.0.1':
resolution: {integrity: sha512-MKtmkA0BX87PKaO1NFRTFH+UnkgnmySQOvNxJubsadusqPEC2aJ9MOQiMceZJJ6oitUl/i0L6u0M1IrmAOmgBA==}
engines: {node: '>=18'}
'@csstools/css-calc@2.1.1':
resolution: {integrity: sha512-rL7kaUnTkL9K+Cvo2pnCieqNpTKgQzy5f+N+5Iuko9HAoasP+xgprVh7KN/MaJVvVL1l0EzQq2MoqBHKSrDrag==}
engines: {node: '>=18'}
peerDependencies:
'@csstools/css-parser-algorithms': ^3.0.4
'@csstools/css-tokenizer': ^3.0.3
'@csstools/css-color-parser@3.0.7':
resolution: {integrity: sha512-nkMp2mTICw32uE5NN+EsJ4f5N+IGFeCFu4bGpiKgb2Pq/7J/MpyLBeQ5ry4KKtRFZaYs6sTmcMYrSRIyj5DFKA==}
engines: {node: '>=18'}
peerDependencies:
'@csstools/css-parser-algorithms': ^3.0.4
'@csstools/css-tokenizer': ^3.0.3
'@csstools/css-parser-algorithms@3.0.4':
resolution: {integrity: sha512-Up7rBoV77rv29d3uKHUIVubz1BTcgyUK72IvCQAbfbMv584xHcGKCKbWh7i8hPrRJ7qU4Y8IO3IY9m+iTB7P3A==}
engines: {node: '>=18'}
peerDependencies:
'@csstools/css-tokenizer': ^3.0.3
'@csstools/css-tokenizer@3.0.3':
resolution: {integrity: sha512-UJnjoFsmxfKUdNYdWgOB0mWUypuLvAfQPH1+pyvRJs6euowbFkFC6P13w1l8mJyi3vxYMxc9kld5jZEGRQs6bw==}
engines: {node: '>=18'}
'@dabh/diagnostics@2.0.3':
resolution: {integrity: sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==}
@ -1715,6 +1749,10 @@ packages:
resolution: {integrity: sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==}
engines: {node: '>= 14'}
agent-base@7.1.3:
resolution: {integrity: sha512-jRR5wdylq8CkOe6hei19GGZnxM6rBGwFl3Bg0YItGDimvjGtAvdZk4Pu6Cl4u4Igsws4a1fd1Vq3ezrhn4KmFw==}
engines: {node: '>= 14'}
agentkeepalive@4.5.0:
resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==}
engines: {node: '>= 8.0.0'}
@ -2141,6 +2179,10 @@ packages:
resolution: {integrity: sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==}
engines: {node: '>= 6'}
cssstyle@4.2.1:
resolution: {integrity: sha512-9+vem03dMXG7gDmZ62uqmRiMRNtinIZ9ZyuF6BdxzfOD+FdN5hretzynkn0ReS2DO2GSw76RWHs0UmJPI2zUjw==}
engines: {node: '>=18'}
csv-parse@5.5.6:
resolution: {integrity: sha512-uNpm30m/AGSkLxxy7d9yRXpJQFrZzVWLFBkS+6ngPcZkw/5k3L/jjFuj7tVnEpRn+QgmiXr21nDlhCiUK4ij2A==}
@ -2152,6 +2194,10 @@ packages:
resolution: {integrity: sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==}
engines: {node: '>= 14'}
data-urls@5.0.0:
resolution: {integrity: sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==}
engines: {node: '>=18'}
date-fns@3.6.0:
resolution: {integrity: sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww==}
@ -2197,6 +2243,9 @@ packages:
resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==}
engines: {node: '>=10'}
decimal.js@10.5.0:
resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==}
dedent@1.5.3:
resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==}
peerDependencies:
@ -2510,6 +2559,10 @@ packages:
resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==}
engines: {node: '>= 6'}
form-data@4.0.1:
resolution: {integrity: sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==}
engines: {node: '>= 6'}
formdata-node@4.4.1:
resolution: {integrity: sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==}
engines: {node: '>= 12.20'}
@ -2647,6 +2700,10 @@ packages:
resolution: {integrity: sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==}
engines: {node: '>=12'}
html-encoding-sniffer@4.0.0:
resolution: {integrity: sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==}
engines: {node: '>=18'}
html-escaper@2.0.2:
resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==}
@ -2686,6 +2743,10 @@ packages:
resolution: {integrity: sha512-1e4Wqeblerz+tMKPIq2EMGiiWW1dIjZOksyHWSUm1rmuvw/how9hBHZ38lAGj5ID4Ik6EdkOw7NmWPy6LAwalw==}
engines: {node: '>= 14'}
https-proxy-agent@7.0.6:
resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==}
engines: {node: '>= 14'}
human-signals@2.1.0:
resolution: {integrity: sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==}
engines: {node: '>=10.17.0'}
@ -2798,6 +2859,9 @@ packages:
resolution: {integrity: sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==}
engines: {node: '>=8'}
is-potential-custom-element-name@1.0.1:
resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==}
is-retry-allowed@2.2.0:
resolution: {integrity: sha512-XVm7LOeLpTW4jV19QSH38vkswxoLud8sQ57YwJVTPWdiaI9I8keEhGFpBlslyVsgdQy4Opg8QOLb8YRgsyZiQg==}
engines: {node: '>=10'}
@ -3012,6 +3076,15 @@ packages:
jsbn@1.1.0:
resolution: {integrity: sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==}
jsdom@26.0.0:
resolution: {integrity: sha512-BZYDGVAIriBWTpIxYzrXjv3E/4u8+/pSG5bQdIYCbNCGOvsPkDQfTVLAIXAf9ETdCpduCVTkDe2NNZ8NIwUVzw==}
engines: {node: '>=18'}
peerDependencies:
canvas: ^3.0.0
peerDependenciesMeta:
canvas:
optional: true
jsesc@2.5.2:
resolution: {integrity: sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==}
engines: {node: '>=4'}
@ -3298,6 +3371,9 @@ packages:
resolution: {integrity: sha512-CQl19J/g+Hbjbv4Y3mFNNXFEL/5t/KCg8POCuUqd4rMKjGG+j1ybER83hxV58zL+dFI1PTkt3GNFSHRt+d8qEQ==}
engines: {node: 14 || >=16.14}
lru-cache@10.4.3:
resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==}
lru-cache@5.1.1:
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
@ -3588,6 +3664,9 @@ packages:
resolution: {integrity: sha512-1MQz1Ed8z2yckoBeSfkQHHO9K1yDRxxtotKSJ9yvcTUUxSvfvzEq5GwBrjjHEpMlq/k5gvXdmJ1SbYxWtpNoVg==}
engines: {node: '>=8'}
nwsapi@2.2.16:
resolution: {integrity: sha512-F1I/bimDpj3ncaNDhfyMWuFqmQDBwDB0Fogc2qpL3BWvkQteFD/8BzWuIRl83rq0DXfm8SGt/HFhLXZyljTXcQ==}
object-assign@4.1.1:
resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==}
engines: {node: '>=0.10.0'}
@ -3697,6 +3776,9 @@ packages:
parse5@7.1.2:
resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==}
parse5@7.2.1:
resolution: {integrity: sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==}
parseley@0.12.1:
resolution: {integrity: sha512-e6qHKe3a9HWr0oMRVDTRhKce+bRO8VGQR3NyVwcjwrbhMmFCX9KszEV35+rn4AdilFAq9VPxP/Fe1wC9Qjd2lw==}
@ -4015,6 +4097,9 @@ packages:
resolution: {integrity: sha512-s+pyvQeIKIZ0dx5iJiQk1tPLJAWln39+MI5jtM8wnyws+G5azk+dMnMX0qfbqNetKKNgcWWOdi0sfm+FbQbgdQ==}
engines: {node: '>=10.0.0'}
rrweb-cssom@0.8.0:
resolution: {integrity: sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==}
rusha@0.8.14:
resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==}
@ -4034,6 +4119,10 @@ packages:
sax@1.4.1:
resolution: {integrity: sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==}
saxes@6.0.0:
resolution: {integrity: sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==}
engines: {node: '>=v12.22.7'}
scheduler@0.23.2:
resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==}
@ -4260,6 +4349,9 @@ packages:
resolution: {integrity: sha512-SzRP5LQ6Ts2G5NyAa/jg16s8e3R7rfdFjizy1zeoecYWw+nGL+YA1xZvW/+iJmidBGSdLkuvdwTYEyJEb+EiUw==}
engines: {node: '>=0.2.6'}
symbol-tree@3.2.4:
resolution: {integrity: sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==}
systeminformation@5.22.11:
resolution: {integrity: sha512-aLws5yi4KCHTb0BVvbodQY5bY8eW4asMRDTxTW46hqw9lGjACX6TlLdJrkdoHYRB0qs+MekqEq1zG7WDnWE8Ug==}
engines: {node: '>=8.0.0'}
@ -4315,6 +4407,10 @@ packages:
resolution: {integrity: sha512-r0eojU4bI8MnHr8c5bNo7lJDdI2qXlWWJk6a9EAFG7vbhTjElYhBVS3/miuE0uOuoLdb8Mc/rVfsmm6eo5o9GA==}
hasBin: true
tough-cookie@5.1.1:
resolution: {integrity: sha512-Ek7HndSVkp10hmHP9V4qZO1u+pn1RU5sI0Fw+jCU3lyvuMZcgqsNgc6CmJJZyByK4Vm/qotGRJlfgAX8q+4JiA==}
engines: {node: '>=16'}
tr46@0.0.3:
resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==}
@ -4322,6 +4418,10 @@ packages:
resolution: {integrity: sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==}
engines: {node: '>=14'}
tr46@5.0.0:
resolution: {integrity: sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==}
engines: {node: '>=18'}
triple-beam@1.4.1:
resolution: {integrity: sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==}
engines: {node: '>= 14.0.0'}
@ -4483,6 +4583,10 @@ packages:
resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==}
engines: {node: '>= 0.8'}
w3c-xmlserializer@5.0.0:
resolution: {integrity: sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==}
engines: {node: '>=18'}
walker@1.0.8:
resolution: {integrity: sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==}
@ -4505,13 +4609,25 @@ packages:
resolution: {integrity: sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==}
engines: {node: '>=12'}
whatwg-encoding@3.1.1:
resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==}
engines: {node: '>=18'}
whatwg-fetch@3.6.20:
resolution: {integrity: sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==}
whatwg-mimetype@4.0.0:
resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==}
engines: {node: '>=18'}
whatwg-url@13.0.0:
resolution: {integrity: sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==}
engines: {node: '>=16'}
whatwg-url@14.1.1:
resolution: {integrity: sha512-mDGf9diDad/giZ/Sm9Xi2YcyzaFpbdLpJPr+E9fSkyQ7KpQD4SdFcugkRQYzhmfI4KeV4Qpnn2sKPdo+kmsgRQ==}
engines: {node: '>=18'}
whatwg-url@5.0.0:
resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==}
@ -4583,6 +4699,10 @@ packages:
utf-8-validate:
optional: true
xml-name-validator@5.0.0:
resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==}
engines: {node: '>=18'}
xml2js@0.6.2:
resolution: {integrity: sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==}
engines: {node: '>=4.0.0'}
@ -4595,6 +4715,9 @@ packages:
resolution: {integrity: sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==}
engines: {node: '>=4.0'}
xmlchars@2.2.0:
resolution: {integrity: sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==}
xtend@4.0.2:
resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==}
engines: {node: '>=0.4'}
@ -4675,6 +4798,14 @@ snapshots:
'@types/json-schema': 7.0.15
js-yaml: 4.1.0
'@asamuzakjp/css-color@2.8.3':
dependencies:
'@csstools/css-calc': 2.1.1(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)
'@csstools/css-color-parser': 3.0.7(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)
'@csstools/css-parser-algorithms': 3.0.4(@csstools/css-tokenizer@3.0.3)
'@csstools/css-tokenizer': 3.0.3
lru-cache: 10.4.3
'@aws-crypto/crc32@3.0.0':
dependencies:
'@aws-crypto/util': 3.0.0
@ -5413,6 +5544,26 @@ snapshots:
dependencies:
'@jridgewell/trace-mapping': 0.3.9
'@csstools/color-helpers@5.0.1': {}
'@csstools/css-calc@2.1.1(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)':
dependencies:
'@csstools/css-parser-algorithms': 3.0.4(@csstools/css-tokenizer@3.0.3)
'@csstools/css-tokenizer': 3.0.3
'@csstools/css-color-parser@3.0.7(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)':
dependencies:
'@csstools/color-helpers': 5.0.1
'@csstools/css-calc': 2.1.1(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)
'@csstools/css-parser-algorithms': 3.0.4(@csstools/css-tokenizer@3.0.3)
'@csstools/css-tokenizer': 3.0.3
'@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3)':
dependencies:
'@csstools/css-tokenizer': 3.0.3
'@csstools/css-tokenizer@3.0.3': {}
'@dabh/diagnostics@2.0.3':
dependencies:
colorspace: 1.1.4
@ -5642,13 +5793,13 @@ snapshots:
'@jsdevtools/ono@7.1.3': {}
'@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
'@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
dependencies:
ansi-styles: 5.2.0
camelcase: 6.3.0
decamelize: 1.2.0
js-tiktoken: 1.0.12
langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4)
langsmith: 0.1.34(7lljbsleilzgkaubvlq4ipicvq)
ml-distance: 4.0.1
mustache: 4.2.0
p-queue: 6.6.2
@ -5660,9 +5811,9 @@ snapshots:
- langchain
- openai
'@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
'@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
js-tiktoken: 1.0.12
openai: 4.57.0(encoding@0.1.13)(zod@3.23.8)
zod: 3.23.8
@ -5671,9 +5822,9 @@ snapshots:
- encoding
- langchain
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
js-tiktoken: 1.0.12
transitivePeerDependencies:
- langchain
@ -6811,6 +6962,8 @@ snapshots:
transitivePeerDependencies:
- supports-color
agent-base@7.1.3: {}
agentkeepalive@4.5.0:
dependencies:
humanize-ms: 1.2.1
@ -7321,12 +7474,22 @@ snapshots:
css-what@6.1.0: {}
cssstyle@4.2.1:
dependencies:
'@asamuzakjp/css-color': 2.8.3
rrweb-cssom: 0.8.0
csv-parse@5.5.6: {}
data-uri-to-buffer@4.0.1: {}
data-uri-to-buffer@6.0.2: {}
data-urls@5.0.0:
dependencies:
whatwg-mimetype: 4.0.0
whatwg-url: 14.1.1
date-fns@3.6.0: {}
debug@2.6.9:
@ -7351,6 +7514,8 @@ snapshots:
decamelize@4.0.0: {}
decimal.js@10.5.0: {}
dedent@1.5.3: {}
deepmerge@4.3.1: {}
@ -7661,6 +7826,12 @@ snapshots:
combined-stream: 1.0.8
mime-types: 2.1.35
form-data@4.0.1:
dependencies:
asynckit: 0.4.0
combined-stream: 1.0.8
mime-types: 2.1.35
formdata-node@4.4.1:
dependencies:
node-domexception: 1.0.0
@ -7795,6 +7966,10 @@ snapshots:
dependencies:
whatwg-encoding: 2.0.0
html-encoding-sniffer@4.0.0:
dependencies:
whatwg-encoding: 3.1.1
html-escaper@2.0.2: {}
html-to-text@9.0.5:
@ -7875,6 +8050,13 @@ snapshots:
transitivePeerDependencies:
- supports-color
https-proxy-agent@7.0.6:
dependencies:
agent-base: 7.1.3
debug: 4.3.5
transitivePeerDependencies:
- supports-color
human-signals@2.1.0: {}
humanize-ms@1.2.1:
@ -7984,6 +8166,8 @@ snapshots:
is-plain-obj@2.1.0: {}
is-potential-custom-element-name@1.0.1: {}
is-retry-allowed@2.2.0: {}
is-stream@2.0.1: {}
@ -8400,6 +8584,34 @@ snapshots:
jsbn@1.1.0: {}
jsdom@26.0.0:
dependencies:
cssstyle: 4.2.1
data-urls: 5.0.0
decimal.js: 10.5.0
form-data: 4.0.1
html-encoding-sniffer: 4.0.0
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.6
is-potential-custom-element-name: 1.0.1
nwsapi: 2.2.16
parse5: 7.2.1
rrweb-cssom: 0.8.0
saxes: 6.0.0
symbol-tree: 3.2.4
tough-cookie: 5.1.1
w3c-xmlserializer: 5.0.0
webidl-conversions: 7.0.0
whatwg-encoding: 3.1.1
whatwg-mimetype: 4.0.0
whatwg-url: 14.1.1
ws: 8.18.0
xml-name-validator: 5.0.0
transitivePeerDependencies:
- bufferutil
- supports-color
- utf-8-validate
jsesc@2.5.2: {}
json-parse-even-better-errors@2.3.1: {}
@ -8435,17 +8647,17 @@ snapshots:
kuler@2.0.0: {}
langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
'@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
'@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
binary-extensions: 2.3.0
js-tiktoken: 1.0.12
js-yaml: 4.1.0
jsonpointer: 5.0.1
langchainhub: 0.0.11
langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4)
langsmith: 0.1.34(7lljbsleilzgkaubvlq4ipicvq)
ml-distance: 4.0.1
openapi-types: 12.1.3
p-retry: 4.6.2
@ -8463,6 +8675,7 @@ snapshots:
handlebars: 4.7.8
html-to-text: 9.0.5
ioredis: 5.4.1
jsdom: 26.0.0
mammoth: 1.7.2
mongodb: 6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3)
pdf-parse: 1.1.1
@ -8475,7 +8688,7 @@ snapshots:
langchainhub@0.0.11: {}
langsmith@0.1.34(npkyd6f7wyl3urgrzoxaktl5a4):
langsmith@0.1.34(7lljbsleilzgkaubvlq4ipicvq):
dependencies:
'@types/uuid': 9.0.8
commander: 10.0.1
@ -8484,8 +8697,8 @@ snapshots:
p-retry: 4.6.2
uuid: 9.0.1
optionalDependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
openai: 4.57.0(encoding@0.1.13)(zod@3.23.8)
languagedetect@2.0.0: {}
@ -8554,6 +8767,8 @@ snapshots:
lru-cache@10.3.0: {}
lru-cache@10.4.3: {}
lru-cache@5.1.1:
dependencies:
yallist: 3.1.1
@ -8849,6 +9064,8 @@ snapshots:
num-sort@2.1.0: {}
nwsapi@2.2.16: {}
object-assign@4.1.1: {}
object-inspect@1.13.1: {}
@ -8979,6 +9196,10 @@ snapshots:
dependencies:
entities: 4.5.0
parse5@7.2.1:
dependencies:
entities: 4.5.0
parseley@0.12.1:
dependencies:
leac: 0.6.0
@ -9321,6 +9542,8 @@ snapshots:
robots-parser@3.0.1: {}
rrweb-cssom@0.8.0: {}
rusha@0.8.14: {}
safe-buffer@5.1.2: {}
@ -9333,6 +9556,10 @@ snapshots:
sax@1.4.1: {}
saxes@6.0.0:
dependencies:
xmlchars: 2.2.0
scheduler@0.23.2:
dependencies:
loose-envify: 1.4.0
@ -9583,6 +9810,8 @@ snapshots:
sylvester@0.0.12: {}
symbol-tree@3.2.4: {}
systeminformation@5.22.11: {}
tar-fs@3.0.5:
@ -9640,12 +9869,20 @@ snapshots:
touch@3.1.1: {}
tough-cookie@5.1.1:
dependencies:
tldts: 6.1.75
tr46@0.0.3: {}
tr46@4.1.1:
dependencies:
punycode: 2.3.1
tr46@5.0.0:
dependencies:
punycode: 2.3.1
triple-beam@1.4.1: {}
ts-jest@29.1.4(@babel/core@7.24.6)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.6))(jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)))(typescript@5.4.5):
@ -9777,6 +10014,10 @@ snapshots:
vary@1.1.2: {}
w3c-xmlserializer@5.0.0:
dependencies:
xml-name-validator: 5.0.0
walker@1.0.8:
dependencies:
makeerror: 1.0.12
@ -9793,13 +10034,24 @@ snapshots:
dependencies:
iconv-lite: 0.6.3
whatwg-encoding@3.1.1:
dependencies:
iconv-lite: 0.6.3
whatwg-fetch@3.6.20: {}
whatwg-mimetype@4.0.0: {}
whatwg-url@13.0.0:
dependencies:
tr46: 4.1.1
webidl-conversions: 7.0.0
whatwg-url@14.1.1:
dependencies:
tr46: 5.0.0
webidl-conversions: 7.0.0
whatwg-url@5.0.0:
dependencies:
tr46: 0.0.3
@ -9868,6 +10120,8 @@ snapshots:
ws@8.18.0: {}
xml-name-validator@5.0.0: {}
xml2js@0.6.2:
dependencies:
sax: 1.4.1
@ -9877,6 +10131,8 @@ snapshots:
xmlbuilder@11.0.1: {}
xmlchars@2.2.0: {}
xtend@4.0.2: {}
y18n@5.0.8: {}

View File

@ -30,7 +30,7 @@ async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof bat
x = await batchScrapeStatus(bss.body.id);
expect(x.statusCode).toBe(200);
expect(typeof x.body.status).toBe("string");
} while (x.body.status !== "completed")
} while (x.body.status === "scraping");
expectBatchScrapeToSucceed(x);
return x;
@ -53,40 +53,51 @@ function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScr
}
describe("Batch scrape tests", () => {
describe("JSON format", () => {
it.concurrent("works", async () => {
const response = await batchScrape({
urls: ["http://firecrawl.dev"],
formats: ["json"],
jsonOptions: {
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
schema: {
type: "object",
properties: {
company_mission: {
type: "string",
},
supports_sso: {
type: "boolean",
},
is_open_source: {
type: "boolean",
},
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
},
urls: ["http://firecrawl.dev"]
});
expect(response.body.data[0]).toHaveProperty("json");
expect(response.body.data[0].json).toHaveProperty("company_mission");
expect(typeof response.body.data[0].json.company_mission).toBe("string");
expect(response.body.data[0].json).toHaveProperty("supports_sso");
expect(response.body.data[0].json.supports_sso).toBe(false);
expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
expect(response.body.data[0].json).toHaveProperty("is_open_source");
expect(response.body.data[0].json.is_open_source).toBe(true);
expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
expect(response.body.data[0]).toHaveProperty("markdown");
expect(response.body.data[0].markdown).toContain("Firecrawl");
}, 30000);
});
if (!process.env.TEST_SUITE_SELF_HOSTED) {
describe("JSON format", () => {
it.concurrent("works", async () => {
const response = await batchScrape({
urls: ["http://firecrawl.dev"],
formats: ["json"],
jsonOptions: {
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
schema: {
type: "object",
properties: {
company_mission: {
type: "string",
},
supports_sso: {
type: "boolean",
},
is_open_source: {
type: "boolean",
},
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
},
});
expect(response.body.data[0]).toHaveProperty("json");
expect(response.body.data[0].json).toHaveProperty("company_mission");
expect(typeof response.body.data[0].json.company_mission).toBe("string");
expect(response.body.data[0].json).toHaveProperty("supports_sso");
expect(response.body.data[0].json.supports_sso).toBe(false);
expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
expect(response.body.data[0].json).toHaveProperty("is_open_source");
expect(response.body.data[0].json.is_open_source).toBe(true);
expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
}, 30000);
});
}
});

View File

@ -30,7 +30,7 @@ async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
x = await crawlStatus(cs.body.id);
expect(x.statusCode).toBe(200);
expect(typeof x.body.status).toBe("string");
} while (x.body.status !== "completed")
} while (x.body.status === "scraping");
expectCrawlToSucceed(x);
return x;

View File

@ -30,7 +30,7 @@ async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
x = await extractStatus(es.body.id);
expect(x.statusCode).toBe(200);
expect(typeof x.body.status).toBe("string");
} while (x.body.status !== "completed");
} while (x.body.status === "processing");
expectExtractToSucceed(x);
return x.body;
@ -51,31 +51,37 @@ function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatu
}
describe("Extract tests", () => {
it.concurrent("works", async () => {
const res = await extract({
urls: ["https://firecrawl.dev"],
schema: {
"type": "object",
"properties": {
"company_mission": {
"type": "string"
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
it.concurrent("works", async () => {
const res = await extract({
urls: ["https://firecrawl.dev"],
schema: {
"type": "object",
"properties": {
"company_mission": {
"type": "string"
},
"is_open_source": {
"type": "boolean"
}
},
"is_open_source": {
"type": "boolean"
}
"required": [
"company_mission",
"is_open_source"
]
},
"required": [
"company_mission",
"is_open_source"
]
},
origin: "api-sdk",
});
origin: "api-sdk",
});
expect(res.data).toHaveProperty("company_mission");
expect(typeof res.data.company_mission).toBe("string")
expect(res.data).toHaveProperty("is_open_source");
expect(typeof res.data.is_open_source).toBe("boolean");
expect(res.data.is_open_source).toBe(true);
}, 60000);
expect(res.data).toHaveProperty("company_mission");
expect(typeof res.data.company_mission).toBe("string")
expect(res.data).toHaveProperty("is_open_source");
expect(typeof res.data.is_open_source).toBe("boolean");
expect(res.data.is_open_source).toBe(true);
}, 60000);
} else {
it.concurrent("dummy test", () => {
expect(true).toBe(true);
});
}
});

View File

@ -21,7 +21,7 @@ function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
}
describe("Map tests", () => {
it("basic map succeeds", async () => {
it.concurrent("basic map succeeds", async () => {
const response = await map({
url: "http://firecrawl.dev",
});
@ -29,7 +29,7 @@ describe("Map tests", () => {
expectMapToSucceed(response);
}, 10000);
it("times out properly", async () => {
it.concurrent("times out properly", async () => {
const response = await map({
url: "http://firecrawl.dev",
timeout: 1
@ -40,14 +40,15 @@ describe("Map tests", () => {
expect(response.body.error).toBe("Request timed out");
}, 10000);
it("handles query parameters correctly", async () => {
it.concurrent("handles query parameters correctly", async () => {
let response = await map({
url: "https://www.hfea.gov.uk",
sitemapOnly: true,
useMock: "map-query-params",
});
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
}, 300000);
}, 60000);
});

File diff suppressed because one or more lines are too long

View File

@ -26,7 +26,7 @@ async function scrape(body: ScrapeRequestInput): Promise<Document> {
}
describe("Scrape tests", () => {
it("mocking works properly", async () => {
it.concurrent("mocking works properly", async () => {
// depends on falsified mock mocking-works-properly
// this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
// that as its actual markdown output
@ -41,41 +41,34 @@ describe("Scrape tests", () => {
);
}, 10000);
describe("Ad blocking (f-e dependant)", () => {
it.concurrent("blocks ads by default", async () => {
it.concurrent("works", async () => {
const response = await scrape({
url: "http://firecrawl.dev"
});
expect(response.markdown).toContain("Firecrawl");
}, 10000);
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
it.concurrent("self-hosted proxy works", async () => {
const response = await scrape({
url: "https://canyoublockit.com/testing/",
url: "https://icanhazip.com"
});
expect(response.markdown).not.toContain(".g.doubleclick.net/");
}, 10000);
expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
});
}
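The assertion above derives the expected IP from PROXY_SERVER by dropping the scheme and port. A sketch of that parsing, assuming the "scheme://host:port" or "host:port" forms:

function proxyHost(proxyServer: string): string {
  // "http://203.0.113.7:8080" -> "203.0.113.7"; "203.0.113.7:8080" -> same
  return proxyServer.split("://").slice(-1)[0].split(":")[0];
}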
it.concurrent("doesn't block ads if explicitly disabled", async () => {
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
it.concurrent("waitFor works", async () => {
const response = await scrape({
url: "https://canyoublockit.com/testing/",
blockAds: false,
url: "http://firecrawl.dev",
waitFor: 2000,
});
expect(response.markdown).toContain(".g.doubleclick.net/");
}, 10000);
});
describe("Location API (f-e dependant)", () => {
it.concurrent("works without specifying an explicit location", async () => {
const response = await scrape({
url: "https://iplocation.com",
});
}, 10000);
it.concurrent("works with country US", async () => {
const response = await scrape({
url: "https://iplocation.com",
location: { country: "US" },
});
expect(response.markdown).toContain("| Country | United States |");
}, 10000);
});
expect(response.markdown).toContain("Firecrawl");
}, 15000);
}
describe("JSON scrape support", () => {
it.concurrent("returns parseable JSON", async () => {
@ -89,82 +82,132 @@ describe("Scrape tests", () => {
}, 25000); // TODO: mock and shorten
});
describe("Screenshot", () => {
it.concurrent("screenshot format works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["screenshot"]
});
if (!process.env.TEST_SUITE_SELF_HOSTED) {
describe("Ad blocking (f-e dependant)", () => {
it.concurrent("blocks ads by default", async () => {
const response = await scrape({
url: "https://www.allrecipes.com/recipe/18185/yum/",
});
expect(typeof response.screenshot).toBe("string");
}, 15000);
expect(response.markdown).not.toContain(".g.doubleclick.net/");
}, 10000);
it.concurrent("screenshot@fullPage format works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["screenshot@fullPage"]
});
it.concurrent("doesn't block ads if explicitly disabled", async () => {
const response = await scrape({
url: "https://www.allrecipes.com/recipe/18185/yum/",
blockAds: false,
});
expect(typeof response.screenshot).toBe("string");
}, 15000);
});
expect(response.markdown).toContain(".g.doubleclick.net/");
}, 10000);
});
describe("JSON format", () => {
it.concurrent("works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["json"],
jsonOptions: {
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
schema: {
type: "object",
properties: {
company_mission: {
type: "string",
},
supports_sso: {
type: "boolean",
},
is_open_source: {
type: "boolean",
describe("Location API (f-e dependant)", () => {
it.concurrent("works without specifying an explicit location", async () => {
const response = await scrape({
url: "https://iplocation.com",
});
}, 10000);
it.concurrent("works with country US", async () => {
const response = await scrape({
url: "https://iplocation.com",
location: { country: "US" },
});
expect(response.markdown).toContain("| Country | United States |");
}, 10000);
});
describe("Screenshot (f-e/sb dependant)", () => {
it.concurrent("screenshot format works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["screenshot"]
});
expect(typeof response.screenshot).toBe("string");
}, 30000);
it.concurrent("screenshot@fullPage format works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["screenshot@fullPage"]
});
expect(typeof response.screenshot).toBe("string");
}, 30000);
});
describe("Proxy API (f-e dependant)", () => {
it.concurrent("undefined works", async () => {
await scrape({
url: "http://firecrawl.dev",
});
}, 15000);
it.concurrent("basic works", async () => {
await scrape({
url: "http://firecrawl.dev",
proxy: "basic",
});
}, 15000);
it.concurrent("stealth works", async () => {
await scrape({
url: "http://firecrawl.dev",
proxy: "stealth",
});
}, 15000);
});
describe("PDF (f-e dependant)", () => {
it.concurrent("works for PDFs behind anti-bot", async () => {
const response = await scrape({
url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
});
expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
}, 60000);
});
}
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
describe("JSON format", () => {
it.concurrent("works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["json"],
jsonOptions: {
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
schema: {
type: "object",
properties: {
company_mission: {
type: "string",
},
supports_sso: {
type: "boolean",
},
is_open_source: {
type: "boolean",
},
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
},
});
});
expect(response).toHaveProperty("json");
expect(response.json).toHaveProperty("company_mission");
expect(typeof response.json.company_mission).toBe("string");
expect(response.json).toHaveProperty("supports_sso");
expect(response.json.supports_sso).toBe(false);
expect(typeof response.json.supports_sso).toBe("boolean");
expect(response.json).toHaveProperty("is_open_source");
expect(response.json.is_open_source).toBe(true);
expect(typeof response.json.is_open_source).toBe("boolean");
}, 30000);
});
describe("Proxy API (f-e dependant)", () => {
it.concurrent("undefined works", async () => {
await scrape({
url: "http://firecrawl.dev",
});
}, 15000);
it.concurrent("basic works", async () => {
await scrape({
url: "http://firecrawl.dev",
proxy: "basic",
});
}, 15000);
it.concurrent("stealth works", async () => {
await scrape({
url: "http://firecrawl.dev",
proxy: "stealth",
});
}, 15000);
});
expect(response).toHaveProperty("json");
expect(response.json).toHaveProperty("company_mission");
expect(typeof response.json.company_mission).toBe("string");
expect(response.json).toHaveProperty("supports_sso");
expect(response.json.supports_sso).toBe(false);
expect(typeof response.json.supports_sso).toBe("boolean");
expect(response.json).toHaveProperty("is_open_source");
expect(response.json.is_open_source).toBe(true);
expect(typeof response.json.is_open_source).toBe("boolean");
}, 30000);
});
}
});

View File

@ -27,10 +27,10 @@ async function search(body: SearchRequestInput): Promise<Document> {
return raw.body.data;
}
describe("Scrape tests", () => {
it("works", async () => {
describe("Search tests", () => {
it.concurrent("works", async () => {
await search({
query: "firecrawl"
});
}, 15000);
}, 60000);
});

View File

@ -13,13 +13,13 @@ import {
getDoneJobsOrderedLength,
isCrawlKickoffFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { getScrapeQueue, QueueFunction } from "../../services/queue-service";
import {
supabaseGetJobById,
supabaseGetJobsById,
} from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import type { Job, JobState } from "bullmq";
import type { Job, JobState, Queue } from "bullmq";
import { logger } from "../../lib/logger";
import { supabase_service } from "../../services/supabase";
import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
@ -245,7 +245,7 @@ export async function crawlStatusController(
let totalCount = jobIDs.length;
if (totalCount === 0) {
if (totalCount === 0 && process.env.USE_DB_AUTHENTICATION === "true") {
const x = await supabase_service
.from('firecrawl_jobs')
.select('*', { count: 'exact', head: true })

View File

@ -1,7 +1,34 @@
import { Response } from "express";
import { supabaseGetJobsById } from "../../lib/supabase-jobs";
import { RequestWithAuth } from "./types";
import { getExtract, getExtractExpiry } from "../../lib/extract/extract-redis";
import { DBJob, PseudoJob } from "./crawl-status";
import { getExtractQueue } from "../../services/queue-service";
import { ExtractResult } from "../../lib/extract/extraction-service";
import { supabaseGetJobById } from "../../lib/supabase-jobs";
export async function getExtractJob(id: string): Promise<PseudoJob<ExtractResult> | null> {
const [bullJob, dbJob] = await Promise.all([
getExtractQueue().getJob(id),
(process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobById(id) : null) as Promise<DBJob | null>,
]);
if (!bullJob && !dbJob) return null;
const data = dbJob?.docs ?? bullJob?.returnvalue?.data;
const job: PseudoJob<any> = {
id,
getState: bullJob ? bullJob.getState : (() => dbJob!.success ? "completed" : "failed"),
returnvalue: data,
data: {
scrapeOptions: bullJob ? bullJob.data.scrapeOptions : dbJob!.page_options,
},
timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(),
failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined,
}
return job;
}
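getExtractJob folds a live BullMQ job and the persisted Supabase row into one PseudoJob, preferring the DB's docs when both exist. A hedged usage sketch (the job id is hypothetical, error handling elided):

const job = await getExtractJob("11111111-2222-3333-4444-555555555555");
if (job) {
  const state = await job.getState(); // BullMQ state, or completed/failed derived from the DB row
  console.log(state, job.returnvalue);
}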
export async function extractStatusController(
req: RequestWithAuth<{ jobId: string }, any, any>,
@ -16,24 +43,29 @@ export async function extractStatusController(
});
}
let data: any[] = [];
let data: ExtractResult | [] = [];
if (extract.status === "completed") {
const jobData = await supabaseGetJobsById([req.params.jobId]);
if (!jobData || jobData.length === 0) {
const jobData = await getExtractJob(req.params.jobId);
if (!jobData) {
return res.status(404).json({
success: false,
error: "Job not found",
});
}
data = jobData[0].docs;
if (!jobData.returnvalue) {
// if we hit the split-second window where Redis is already updated but Bull isn't,
// just pretend it's still processing - MG
extract.status = "processing";
} else {
data = jobData.returnvalue ?? [];
}
}
// console.log(extract.sources);
return res.status(200).json({
success: extract.status === "failed" ? false : true,
data: data,
data,
status: extract.status,
error: extract?.error ?? undefined,
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),

View File

@ -5,6 +5,7 @@ import {
mapRequestSchema,
RequestWithAuth,
scrapeOptions,
TimeoutSignal,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
@ -53,6 +54,8 @@ export async function getMapResults({
origin,
includeMetadata = false,
allowExternalLinks,
abort = new AbortController().signal, // noop
mock,
}: {
url: string;
search?: string;
@ -65,6 +68,8 @@ export async function getMapResults({
origin?: string;
includeMetadata?: boolean;
allowExternalLinks?: boolean;
abort?: AbortSignal;
mock?: string;
}): Promise<MapResult> {
const id = uuidv4();
let links: string[] = [url];
@ -87,8 +92,8 @@ export async function getMapResults({
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
await crawler.importRobotsTxt(sc.robots);
sc.robots = await crawler.getRobotsTxt(false, abort);
crawler.importRobotsTxt(sc.robots);
} catch (_) {}
// If sitemapOnly is true, only get links from sitemap
@ -102,6 +107,8 @@ export async function getMapResults({
true,
true,
30000,
abort,
mock,
);
if (sitemap > 0) {
links = links
@ -144,7 +151,7 @@ export async function getMapResults({
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
});
}, abort);
};
pagePromises = Array.from({ length: maxPages }, (_, i) =>
@ -157,7 +164,7 @@ export async function getMapResults({
// Parallelize sitemap index query with search results
const [sitemapIndexResult, ...searchResults] = await Promise.all([
querySitemapIndex(url),
querySitemapIndex(url, abort),
...(cachedResult ? [] : pagePromises),
]);
@ -178,6 +185,7 @@ export async function getMapResults({
true,
false,
30000,
abort,
);
} catch (e) {
logger.warn("tryGetSitemap threw an error", { error: e });
@ -277,6 +285,7 @@ export async function mapController(
req.body = mapRequestSchema.parse(req.body);
let result: Awaited<ReturnType<typeof getMapResults>>;
const abort = new AbortController();
try {
result = await Promise.race([
getMapResults({
@ -289,13 +298,18 @@ export async function mapController(
origin: req.body.origin,
teamId: req.auth.team_id,
plan: req.auth.plan,
abort: abort.signal,
mock: req.body.useMock,
}),
...(req.body.timeout !== undefined ? [
new Promise((resolve, reject) => setTimeout(() => reject("timeout"), req.body.timeout))
new Promise((resolve, reject) => setTimeout(() => {
abort.abort(new TimeoutSignal());
reject(new TimeoutSignal());
}, req.body.timeout))
] : []),
]) as any;
} catch (error) {
if (error === "timeout") {
if (error instanceof TimeoutSignal || error === "timeout") {
return res.status(408).json({
success: false,
error: "Request timed out",

View File

@ -501,6 +501,7 @@ export const mapRequestSchema = crawlerOptions
sitemapOnly: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000),
timeout: z.number().positive().finite().optional(),
useMock: z.string().optional(),
})
.strict(strictMessage);
@ -1004,3 +1005,9 @@ export const generateLLMsTextRequestSchema = z.object({
export type GenerateLLMsTextRequest = z.infer<
typeof generateLLMsTextRequestSchema
>;
export class TimeoutSignal extends Error {
constructor() {
super("Operation timed out")
}
}
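TimeoutSignal lets the map controller both abort in-flight work and identify the rejection by type, replacing the stringly-typed "timeout" sentinel. A minimal sketch of that pattern, generalized over any abortable task (TimeoutSignal as defined above):

async function withTimeout<T>(
  run: (signal: AbortSignal) => Promise<T>,
  timeoutMs: number,
): Promise<T> {
  const abort = new AbortController();
  return await Promise.race([
    run(abort.signal),
    new Promise<never>((_, reject) =>
      setTimeout(() => {
        abort.abort(new TimeoutSignal()); // tear down downstream work
        reject(new TimeoutSignal());      // and fail the race with a typed error
      }, timeoutMs),
    ),
  ]);
}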

View File

@ -1,38 +1,10 @@
import { CONCURRENCY_LIMIT } from "../services/rate-limiter";
import { redisConnection } from "../services/queue-service";
import { PlanType } from "../types";
import type { Job, JobsOptions } from "bullmq";
import type { JobsOptions } from "bullmq";
const constructKey = (team_id: string) => "concurrency-limiter:" + team_id;
const constructQueueKey = (team_id: string) =>
"concurrency-limit-queue:" + team_id;
export function calculateJobTimeToRun(
job: ConcurrencyLimitedJob
): number {
let jobTimeToRun = 86400000; // 24h (crawl)
if (job.data.scrapeOptions) {
if (job.data.scrapeOptions.timeout) {
jobTimeToRun = job.data.scrapeOptions.timeout;
}
if (job.data.scrapeOptions.waitFor) {
jobTimeToRun += job.data.scrapeOptions.waitFor;
}
(job.data.scrapeOptions.actions ?? []).forEach(x => {
if (x.type === "wait" && x.milliseconds) {
jobTimeToRun += x.milliseconds;
} else {
jobTimeToRun += 1000;
}
})
}
return jobTimeToRun;
}
export async function cleanOldConcurrencyLimitEntries(
team_id: string,
now: number = Date.now(),

View File

@ -7,7 +7,6 @@ import {
} from "../build-prompts";
import OpenAI from "openai";
import { logger } from "../../../lib/logger";
const openai = new OpenAI();
export async function analyzeSchemaAndPrompt(
urls: string[],
@ -40,6 +39,7 @@ export async function analyzeSchemaAndPrompt(
const model = "gpt-4o";
const openai = new OpenAI();
const result = await openai.beta.chat.completions.parse({
model: model,
messages: [

View File

@ -48,7 +48,7 @@ interface ExtractServiceOptions {
cacheKey?: string;
}
interface ExtractResult {
export interface ExtractResult {
success: boolean;
data?: any;
extractId: string;

View File

@ -3,10 +3,6 @@ import { Document } from "../../../controllers/v1/types";
import { logger } from "../../logger";
import OpenAI from "openai";
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const pinecone = new Pinecone({
apiKey: process.env.PINECONE_API_KEY!,
});
@ -27,6 +23,10 @@ export interface PageMetadata {
}
async function getEmbedding(text: string) {
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const embedding = await openai.embeddings.create({
model: "text-embedding-3-small",
input: text,

View File

@ -1,9 +1,5 @@
import OpenAI from "openai";
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
interface Message {
role: "system" | "user" | "assistant";
content: string;
@ -19,6 +15,10 @@ interface GenerateTextOptions {
export async function generateText(options: GenerateTextOptions) {
const { model, messages, temperature = 0.7, maxTokens } = options;
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const completion = await openai.chat.completions.create({
model,
messages,

View File

@ -1,14 +1,13 @@
import axios from "axios";
import { configDotenv } from "dotenv";
import OpenAI from "openai";
configDotenv();
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
async function getEmbedding(text: string) {
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const embedding = await openai.embeddings.create({
model: "text-embedding-3-small",
input: text,

View File

@ -9,6 +9,7 @@ import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
import { extractLinks } from "../../lib/html-transformer";
import { TimeoutSignal } from "../../controllers/v1/types";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -182,7 +183,7 @@ export class WebCrawler {
.slice(0, limit);
}
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
public async getRobotsTxt(skipTlsVerification = false, abort?: AbortSignal): Promise<string> {
let extraArgs = {};
if (skipTlsVerification) {
extraArgs["httpsAgent"] = new https.Agent({
@ -191,6 +192,7 @@ export class WebCrawler {
}
const response = await axios.get(this.robotsTxtUrl, {
timeout: axiosTimeout,
signal: abort,
...extraArgs,
});
return response.data;
@ -205,6 +207,8 @@ export class WebCrawler {
fromMap: boolean = false,
onlySitemap: boolean = false,
timeout: number = 120000,
abort?: AbortSignal,
mock?: string,
): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap",
@ -260,10 +264,10 @@ export class WebCrawler {
try {
let count = (await Promise.race([
Promise.all([
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
...this.robots
.getSitemaps()
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
]).then((results) => results.reduce((a, x) => a + x, 0)),
timeoutPromise,
])) as number;
@ -555,6 +559,8 @@ export class WebCrawler {
private async tryFetchSitemapLinks(
url: string,
urlsHandler: (urls: string[]) => unknown,
abort?: AbortSignal,
mock?: string,
): Promise<number> {
const sitemapUrl = url.endsWith(".xml")
? url
@ -569,13 +575,19 @@ export class WebCrawler {
this.logger,
this.jobId,
this.sitemapsHit,
abort,
mock,
);
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
method: "tryFetchSitemapLinks",
sitemapUrl,
error,
});
if (error instanceof TimeoutSignal) {
throw error;
} else {
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
method: "tryFetchSitemapLinks",
sitemapUrl,
error,
});
}
}
// If this is a subdomain, also try to get sitemap from the main domain
@ -611,20 +623,30 @@ export class WebCrawler {
this.logger,
this.jobId,
this.sitemapsHit,
abort,
mock,
);
} catch (error) {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
if (error instanceof TimeoutSignal) {
throw error;
} else {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
}
}
}
} catch (error) {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
if (error instanceof TimeoutSignal) {
throw error;
} else {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
}
}
// If no sitemap found yet, try the baseUrl as a last resort
@ -636,22 +658,30 @@ export class WebCrawler {
this.logger,
this.jobId,
this.sitemapsHit,
abort,
mock,
);
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",
sitemapUrl: baseUrlSitemap,
error,
});
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
if (error instanceof TimeoutSignal) {
throw error;
} else {
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
this.jobId,
this.sitemapsHit,
);
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",
sitemapUrl: baseUrlSitemap,
error,
});
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
this.jobId,
this.sitemapsHit,
abort,
mock,
);
}
}
}
}

View File

@ -12,10 +12,11 @@ import { supabase_service } from "../../services/supabase";
*/
import { withAuth } from "../../lib/withAuth";
async function querySitemapIndexFunction(url: string) {
async function querySitemapIndexFunction(url: string, abort?: AbortSignal) {
const originUrl = normalizeUrlOnlyHostname(url);
for (let attempt = 1; attempt <= 3; attempt++) {
abort?.throwIfAborted();
try {
const { data, error } = await supabase_service
.from("crawl_maps")

View File

@ -1,8 +1,7 @@
import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js";
import { WebCrawler } from "./crawler";
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
import type { Logger } from "winston";
const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
@ -20,6 +19,8 @@ export async function getLinksFromSitemap(
logger: Logger,
crawlId: string,
sitemapsHit: Set<string>,
abort?: AbortSignal,
mock?: string,
): Promise<number> {
if (sitemapsHit.size >= 20) {
return 0;
@ -38,13 +39,14 @@ export async function getLinksFromSitemap(
const response = await scrapeURL(
"sitemap;" + crawlId,
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
scrapeOptions.parse({ formats: ["rawHtml"], useMock: mock }),
{
forceEngine: [
"fetch",
...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
],
v0DisableJsDom: true
v0DisableJsDom: true,
abort,
},
);
@ -69,14 +71,18 @@ export async function getLinksFromSitemap(
return 0;
}
} catch (error) {
logger.error(`Request failed for sitemap fetch`, {
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error,
});
if (error instanceof TimeoutSignal) {
throw error;
} else {
logger.error(`Request failed for sitemap fetch`, {
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error,
});
return 0;
return 0;
}
}
const parsed = await parseStringPromise(content);
@ -90,7 +96,7 @@ export async function getLinksFromSitemap(
.map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit),
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort, mock),
);
const results = await Promise.all(sitemapPromises);
@ -114,6 +120,8 @@ export async function getLinksFromSitemap(
logger,
crawlId,
sitemapsHit,
abort,
mock,
),
);
count += (await Promise.all(sitemapPromises)).reduce(
@ -151,56 +159,3 @@ export async function getLinksFromSitemap(
return 0;
}
export const fetchSitemapData = async (
url: string,
timeout?: number,
): Promise<SitemapEntry[] | null> => {
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
try {
const fetchResponse = await scrapeURL(
"sitemap",
sitemapUrl,
scrapeOptions.parse({
formats: ["rawHtml"],
timeout: timeout || axiosTimeout,
}),
{ forceEngine: "fetch" },
);
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
const xml = fetchResponse.document.rawHtml!;
const parsedXml = await parseStringPromise(xml);
const sitemapData: SitemapEntry[] = [];
if (parsedXml.urlset && parsedXml.urlset.url) {
for (const urlElement of parsedXml.urlset.url) {
const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
if (urlElement.changefreq)
sitemapEntry.changefreq = urlElement.changefreq[0];
if (urlElement.priority)
sitemapEntry.priority = Number(urlElement.priority[0]);
sitemapData.push(sitemapEntry);
}
}
return sitemapData;
}
return null;
} catch (error) {
// Error handling for failed sitemap fetch
}
return [];
};
export interface SitemapEntry {
loc: string;
lastmod?: string;
changefreq?: string;
priority?: number;
}

View File

@ -7,6 +7,7 @@ import {
InsecureConnectionError,
makeSecureDispatcher,
} from "../utils/safeFetch";
import { MockState, saveMock } from "../../lib/mock";
export async function scrapeURLWithFetch(
meta: Meta,
@ -14,44 +15,95 @@ export async function scrapeURLWithFetch(
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000;
let response: undici.Response;
try {
response = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
redirect: "follow",
headers: meta.options.headers,
}),
(async () => {
await new Promise((resolve) =>
setTimeout(() => resolve(null), timeout),
const mockOptions = {
url: meta.url,
// fields below don't affect the fetch itself; they mirror the recorded mock shape
method: "GET",
ignoreResponse: false,
ignoreFailure: false,
tryCount: 1,
};
let response: {
url: string;
body: string,
status: number;
headers: any;
};
if (meta.mock !== null) {
const makeRequestTypeId = (
request: MockState["requests"][number]["options"],
) => request.url + ";" + request.method;
const thisId = makeRequestTypeId(mockOptions);
const matchingMocks = meta.mock.requests
.filter((x) => makeRequestTypeId(x.options) === thisId)
.sort((a, b) => a.time - b.time);
const nextI = meta.mock.tracker[thisId] ?? 0;
meta.mock.tracker[thisId] = nextI + 1;
if (!matchingMocks[nextI]) {
throw new Error("Failed to mock request -- no mock targets found.");
}
response = {
...matchingMocks[nextI].result,
};
} else {
try {
const x = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
redirect: "follow",
headers: meta.options.headers,
signal: meta.internalOptions.abort,
}),
(async () => {
await new Promise((resolve) =>
setTimeout(() => resolve(null), timeout),
);
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
response = {
url: x.url,
body: await x.text(),
status: x.status,
headers: [...x.headers],
};
if (meta.mock === null) {
await saveMock(
mockOptions,
response,
);
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
} catch (error) {
if (
error instanceof TypeError &&
error.cause instanceof InsecureConnectionError
) {
throw error.cause;
} else {
throw error;
}
} catch (error) {
if (
error instanceof TypeError &&
error.cause instanceof InsecureConnectionError
) {
throw error.cause;
} else {
throw error;
}
}
}
specialtyScrapeCheck(
await specialtyScrapeCheck(
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
Object.fromEntries(response.headers as any),
);
return {
url: response.url,
html: await response.text(),
html: response.body,
statusCode: response.status,
// TODO: error?
};
}
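The mock branch above replays recorded responses keyed by "url;method", advancing a per-key cursor so repeated identical requests consume recordings in time order. A sketch of the cursor logic in isolation (types simplified):

type Recorded = {
  time: number;
  options: { url: string; method: string };
  result: unknown;
};

function nextMock(
  requests: Recorded[],
  tracker: Record<string, number>,
  url: string,
  method: string,
): unknown {
  const key = url + ";" + method;
  const matches = requests
    .filter((r) => r.options.url + ";" + r.options.method === key)
    .sort((a, b) => a.time - b.time);
  const i = tracker[key] ?? 0;
  tracker[key] = i + 1; // the next identical request gets the next recording
  if (!matches[i]) throw new Error("no mock target for " + key);
  return matches[i].result;
}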

View File

@ -10,6 +10,7 @@ import {
UnsupportedFileError,
} from "../../error";
import { MockState } from "../../lib/mock";
import { fireEngineURL } from "./scrape";
const successSchema = z.object({
jobId: z.string(),
@ -84,9 +85,8 @@ export async function fireEngineCheckStatus(
logger: Logger,
jobId: string,
mock: MockState | null,
abort?: AbortSignal,
): Promise<FireEngineCheckStatusSuccess> {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
const status = await Sentry.startSpan(
{
name: "fire-engine: Check status",

View File

@ -3,14 +3,13 @@ import * as Sentry from "@sentry/node";
import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock";
import { fireEngineURL } from "./scrape";
export async function fireEngineDelete(
logger: Logger,
jobId: string,
mock: MockState | null,
) {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
await Sentry.startSpan(
{
name: "fire-engine: Delete scrape",

View File

@ -24,8 +24,9 @@ import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { fireEngineDelete } from "./delete";
import { MockState, saveMock } from "../../lib/mock";
import { MockState } from "../../lib/mock";
import { getInnerJSON } from "../../../../lib/html-transformer";
import { TimeoutSignal } from "../../../../controllers/v1/types";
// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
@ -40,6 +41,7 @@ async function performFireEngineScrape<
request: FireEngineScrapeRequestCommon & Engine,
timeout: number,
mock: MockState | null,
abort?: AbortSignal,
): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }),
@ -84,6 +86,7 @@ async function performFireEngineScrape<
logger.child({ method: "fireEngineCheckStatus" }),
scrape.jobId,
mock,
abort,
);
} catch (error) {
if (error instanceof StillProcessingError) {
@ -107,6 +110,16 @@ async function performFireEngineScrape<
jobId: scrape.jobId,
});
throw error;
} else if (error instanceof TimeoutSignal) {
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterError: error,
}),
scrape.jobId,
mock,
);
throw error;
} else {
Sentry.captureException(error);
errors.push(error);
@ -120,11 +133,12 @@ async function performFireEngineScrape<
await new Promise((resolve) => setTimeout(resolve, 250));
}
specialtyScrapeCheck(
await specialtyScrapeCheck(
logger.child({
method: "performFireEngineScrape/specialtyScrapeCheck",
}),
status.responseHeaders,
status,
);
const contentType = (Object.entries(status.responseHeaders ?? {}).find(
@ -219,6 +233,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
request,
timeout,
meta.mock,
meta.internalOptions.abort,
);
if (
@ -298,6 +313,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request,
timeout,
meta.mock,
meta.internalOptions.abort,
);
if (!response.url) {
@ -353,6 +369,7 @@ export async function scrapeURLWithFireEngineTLSClient(
request,
timeout,
meta.mock,
meta.internalOptions.abort,
);
if (!response.url) {

View File

@ -65,6 +65,8 @@ const schema = z.object({
processing: z.boolean(),
});
export const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL ?? "<mock-fire-engine-url>";
export async function fireEngineScrape<
Engine extends
| FireEngineScrapeRequestChromeCDP
@ -74,11 +76,8 @@ export async function fireEngineScrape<
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine,
mock: MockState | null,
abort?: AbortSignal,
): Promise<z.infer<typeof schema>> {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
// TODO: retries
const scrapeRequest = await Sentry.startSpan(
{
name: "fire-engine: Scrape",
@ -103,6 +102,7 @@ export async function fireEngineScrape<
schema,
tryCount: 3,
mock,
abort,
});
},
);

View File

@ -310,7 +310,12 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
const _engines = [...engines];
const _engines: Engine[] = [
...engines,
// enable fire-engine in self-hosted testing environment when mocks are supplied
...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"] as Engine[] : [])
];
if (meta.internalOptions.useCache !== true) {
const cacheIndex = _engines.indexOf("cache");

View File

@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
import { readFile, unlink } from "node:fs/promises";
import path from "node:path";
import type { Response } from "undici";
type PDFProcessorResult = { html: string; markdown?: string };
@ -75,22 +76,49 @@ export async function scrapePDF(
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url, {
headers: meta.options.headers,
});
const content = file.buffer.toString("base64");
return {
url: file.response.url,
statusCode: file.response.status,
if (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null) {
const content = (await readFile(meta.pdfPrefetch.filePath)).toString("base64");
return {
url: meta.pdfPrefetch.url ?? meta.url,
statusCode: meta.pdfPrefetch.status,
html: content,
markdown: content,
};
html: content,
markdown: content,
};
} else {
const file = await fetchFileToBuffer(meta.url, {
headers: meta.options.headers,
});
const ct = file.response.headers.get("Content-Type");
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
throw new PDFAntibotError();
}
const content = file.buffer.toString("base64");
return {
url: file.response.url,
statusCode: file.response.status,
html: content,
markdown: content,
};
}
}
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
: await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
if ((response as any).headers) { // if downloadFile was used
const r: Response = response as any;
const ct = r.headers.get("Content-Type");
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
throw new PDFAntibotError();
}
}
let result: PDFProcessorResult | null = null;
@ -142,7 +170,7 @@ export async function scrapePDF(
await unlink(tempFilePath);
return {
url: response.url,
url: response.url ?? meta.url,
statusCode: response.status,
html: result?.html ?? "",
markdown: result?.markdown ?? "",
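The PDFAntibotError paths above boil down to a Content-Type heuristic: a URL that should serve a PDF but answers with a non-PDF type (typically an HTML challenge page) is treated as blocked. The check sketched on its own:

function looksAntibotBlocked(headers: Headers): boolean {
  const ct = headers.get("Content-Type");
  return ct !== null && !ct.includes("application/pdf");
}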

View File

@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
});
}
specialtyScrapeCheck(
await specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
}),

View File

@ -43,14 +43,24 @@ export function makeSecureDispatcher(
url: string,
options?: undici.Agent.Options,
) {
const agent = new undici.Agent({
const agentOpts: undici.Agent.Options = {
connect: {
rejectUnauthorized: false, // bypass SSL failures -- this is fine
// lookup: secureLookup,
},
maxRedirections: 5000,
...options,
});
};
const agent = process.env.PROXY_SERVER
? new undici.ProxyAgent({
uri: process.env.PROXY_SERVER.includes("://") ? process.env.PROXY_SERVER : ("http://" + process.env.PROXY_SERVER),
token: process.env.PROXY_USERNAME
? `Basic ${Buffer.from(process.env.PROXY_USERNAME + ":" + (process.env.PROXY_PASSWORD ?? "")).toString("base64")}`
: undefined,
...agentOpts,
})
: new undici.Agent(agentOpts);
agent.on("connect", (_, targets) => {
const client: undici.Client = targets.slice(-1)[0] as undici.Client;
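With PROXY_SERVER set, every dispatcher built here becomes an undici ProxyAgent, with optional Basic credentials taken from PROXY_USERNAME/PROXY_PASSWORD. A hedged usage sketch (env values are examples only):

// process.env.PROXY_SERVER   = "203.0.113.7:8080"; // scheme optional, http:// is assumed
// process.env.PROXY_USERNAME = "user";
// process.env.PROXY_PASSWORD = "pass";
const dispatcher = await makeSecureDispatcher("https://example.com");
const res = await undici.fetch("https://example.com", { dispatcher });
console.log(res.status);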

View File

@ -1,9 +1,30 @@
import { Logger } from "winston";
import { AddFeatureError } from "../../error";
import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
import path from "path";
import os from "os";
import { writeFile } from "fs/promises";
import { Meta } from "../..";
export function specialtyScrapeCheck(
async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
if (!feRes?.file) {
return null;
}
const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
return {
status: feRes.pageStatusCode,
url: feRes.url,
filePath,
};
}
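feResToPdfPrefetch parks the PDF bytes fire-engine already downloaded in a temp file so the PDF engine can reuse them instead of re-fetching the document. The tmp-file step distilled into a standalone sketch (the function name is hypothetical):

import { writeFile } from "fs/promises";
import os from "os";
import path from "path";

async function parkBase64Pdf(base64: string): Promise<string> {
  const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
  await writeFile(filePath, Buffer.from(base64, "base64"));
  return filePath; // handed to the PDF engine via AddFeatureError.pdfPrefetch
}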
export async function specialtyScrapeCheck(
logger: Logger,
headers: Record<string, string> | undefined,
feRes?: FireEngineCheckStatusSuccess,
) {
const contentType = (Object.entries(headers ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
contentType.startsWith("application/pdf;")
) {
// .pdf
throw new AddFeatureError(["pdf"]);
throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
} else if (
contentType ===
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||

View File

@ -1,4 +1,4 @@
import { EngineResultsTracker } from ".";
import { EngineResultsTracker, Meta } from ".";
import { Engine, FeatureFlag } from "./engines";
export class EngineError extends Error {
@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {
export class AddFeatureError extends Error {
public featureFlags: FeatureFlag[];
public pdfPrefetch: Meta["pdfPrefetch"];
constructor(featureFlags: FeatureFlag[]) {
constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
super("New feature flags have been discovered: " + featureFlags.join(", "));
this.featureFlags = featureFlags;
this.pdfPrefetch = pdfPrefetch;
}
}
@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
this.reason = reason;
}
}
export class PDFAntibotError extends Error {
constructor() {
super("PDF scrape was prevented by anti-bot")
}
}

View File

@ -1,7 +1,7 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
import { logger as _logger } from "../../lib/logger";
import {
buildFallbackList,
@ -16,6 +16,7 @@ import {
AddFeatureError,
EngineError,
NoEnginesLeftError,
PDFAntibotError,
RemoveFeatureError,
SiteError,
TimeoutError,
@ -49,6 +50,11 @@ export type Meta = {
logs: any[];
featureFlags: Set<FeatureFlag>;
mock: MockState | null;
pdfPrefetch: {
filePath: string;
url?: string;
status: number;
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};
function buildFeatureFlags(
@ -151,6 +157,7 @@ async function buildMetaObject(
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
pdfPrefetch: undefined,
};
}
@ -165,6 +172,7 @@ export type InternalOptions = {
disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
fromCache?: boolean; // Indicates if the document was retrieved from cache
abort?: AbortSignal;
};
export type EngineResultsTracker = {
@ -222,6 +230,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
: undefined;
for (const { engine, unsupportedFeatures } of fallbackList) {
meta.internalOptions.abort?.throwIfAborted();
const startedAt = Date.now();
try {
meta.logger.info("Scraping via " + engine + "...");
@ -307,6 +316,10 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof UnsupportedFileError) {
throw error;
} else if (error instanceof PDFAntibotError) {
throw error;
} else if (error instanceof TimeoutSignal) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.warn(
@ -390,6 +403,9 @@ export async function scrapeURL(
meta.featureFlags = new Set(
[...meta.featureFlags].concat(error.featureFlags),
);
if (error.pdfPrefetch) {
meta.pdfPrefetch = error.pdfPrefetch;
}
} else if (
error instanceof RemoveFeatureError &&
meta.internalOptions.forceEngine === undefined
@ -404,6 +420,21 @@ export async function scrapeURL(
(x) => !error.featureFlags.includes(x),
),
);
} else if (
error instanceof PDFAntibotError &&
meta.internalOptions.forceEngine === undefined
) {
if (meta.pdfPrefetch !== undefined) {
meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
throw error;
} else {
meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
meta.featureFlags = new Set(
[...meta.featureFlags].filter(
(x) => x !== "pdf",
),
);
}
} else {
throw error;
}
@ -433,6 +464,8 @@ export async function scrapeURL(
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
error,
});
} else if (error instanceof TimeoutSignal) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error });

View File

@ -2,6 +2,8 @@ import { Logger } from "winston";
import { z, ZodError } from "zod";
import * as Sentry from "@sentry/node";
import { MockState, saveMock } from "./mock";
import { TimeoutSignal } from "../../../controllers/v1/types";
import { fireEngineURL } from "../engines/fire-engine/scrape";
export type RobustFetchParams<Schema extends z.Schema<any>> = {
url: string;
@ -17,6 +19,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
tryCount?: number;
tryCooldown?: number;
mock: MockState | null;
abort?: AbortSignal;
};
export async function robustFetch<
@ -35,7 +38,10 @@ export async function robustFetch<
tryCount = 1,
tryCooldown,
mock,
abort,
}: RobustFetchParams<Schema>): Promise<Output> {
abort?.throwIfAborted();
const params = {
url,
logger,
@ -47,6 +53,7 @@ export async function robustFetch<
ignoreFailure,
tryCount,
tryCooldown,
abort,
};
let response: {
@ -70,6 +77,7 @@ export async function robustFetch<
: {}),
...(headers !== undefined ? headers : {}),
},
signal: abort,
...(body instanceof FormData
? {
body,
@ -81,7 +89,9 @@ export async function robustFetch<
: {}),
});
} catch (error) {
if (!ignoreFailure) {
if (error instanceof TimeoutSignal) {
throw error;
} else if (!ignoreFailure) {
Sentry.captureException(error);
if (tryCount > 1) {
logger.debug(
@ -126,14 +136,13 @@ export async function robustFetch<
const makeRequestTypeId = (
request: (typeof mock)["requests"][number]["options"],
) => {
let trueUrl = (process.env.FIRE_ENGINE_BETA_URL && request.url.startsWith(process.env.FIRE_ENGINE_BETA_URL))
? request.url.replace(process.env.FIRE_ENGINE_BETA_URL, "<fire-engine>")
let trueUrl = request.url.startsWith(fireEngineURL)
? request.url.replace(fireEngineURL, "<fire-engine>")
: request.url;
let out = trueUrl + ";" + request.method;
if (
process.env.FIRE_ENGINE_BETA_URL &&
(trueUrl.startsWith("<fire-engine>")) &&
trueUrl.startsWith("<fire-engine>") &&
request.method === "POST"
) {
out += "f-e;" + request.body?.engine + ";" + request.body?.url;
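The abort plumbing above follows one pattern throughout: check the signal before doing any work, then hand the same signal to fetch so an in-flight request is torn down as well. Distilled:

async function fetchWithAbort(url: string, abort?: AbortSignal): Promise<Response> {
  abort?.throwIfAborted();                    // fail fast if already aborted
  return await fetch(url, { signal: abort }); // later aborts reject the fetch
}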

View File

@ -305,6 +305,7 @@ export async function performLLMExtract(
document: Document,
): Promise<Document> {
if (meta.options.formats.includes("extract")) {
meta.internalOptions.abort?.throwIfAborted();
const { extract, warning } = await generateOpenAICompletions(
meta.logger.child({
method: "performLLMExtract/generateOpenAICompletions",

View File

@ -16,6 +16,7 @@ export async function fireEngineMap(
numResults: number;
page?: number;
},
abort?: AbortSignal,
): Promise<SearchResult[]> {
try {
let data = JSON.stringify({
@ -29,9 +30,7 @@ export async function fireEngineMap(
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
console.warn(
"(v1/map Beta) Results might differ from cloud offering currently.",
);
logger.warn("(v1/map Beta) Results might differ from cloud offering currently.");
return [];
}
@ -42,6 +41,7 @@ export async function fireEngineMap(
"X-Disable-Cache": "true",
},
body: data,
signal: abort,
});
if (response.ok) {

View File

@ -1,21 +1,18 @@
import axios from "axios";
import * as cheerio from "cheerio"; // TODO: rustify
import { JSDOM } from 'jsdom';
import * as querystring from "querystring";
import { SearchResult } from "../../src/lib/entities";
import { logger } from "../../src/lib/logger";
import https from 'https';
const _useragent_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
];
const getRandomInt = (min: number, max: number): number => Math.floor(Math.random() * (max - min + 1)) + min;
function get_useragent(): string {
return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
export function get_useragent(): string {
const lynx_version = `Lynx/${getRandomInt(2, 3)}.${getRandomInt(8, 9)}.${getRandomInt(0, 2)}`;
const libwww_version = `libwww-FM/${getRandomInt(2, 3)}.${getRandomInt(13, 15)}`;
const ssl_mm_version = `SSL-MM/${getRandomInt(1, 2)}.${getRandomInt(3, 5)}`;
const openssl_version = `OpenSSL/${getRandomInt(1, 3)}.${getRandomInt(0, 4)}.${getRandomInt(0, 9)}`;
return `${lynx_version} ${libwww_version} ${ssl_mm_version} ${openssl_version}`;
}
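The rewritten get_useragent() emits text-browser strings rather than desktop-browser ones; presumably a Lynx-style UA steers Google toward the plain HTML layout that the div.ezO2md / span.CVA68e selectors further down rely on.

// Example outputs (digits are randomized per call):
//   Lynx/2.9.1 libwww-FM/2.14 SSL-MM/1.4 OpenSSL/1.2.7
//   Lynx/3.8.0 libwww-FM/3.15 SSL-MM/2.3 OpenSSL/2.0.4
console.log(get_useragent());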
async function _req(
@ -31,9 +28,10 @@ async function _req(
) {
const params = {
q: term,
num: results, // Number of results to return
num: results+2, // Number of results to return
hl: lang,
gl: country,
safe: "active",
start: start,
};
if (tbs) {
@ -42,18 +40,25 @@ async function _req(
if (filter) {
params["filter"] = filter;
}
var agent = get_useragent();
try {
const resp = await axios.get("https://www.google.com/search", {
headers: {
"User-Agent": get_useragent(),
"User-Agent": agent,
"Accept": "*/*"
},
params: params,
proxy: proxies,
timeout: timeout,
httpsAgent: new https.Agent({
rejectUnauthorized: true
}),
withCredentials: true
});
return resp;
} catch (error) {
if (error.response && error.response.status === 429) {
logger.warn("Google Search: Too many requests, try again later.", error.response);
throw new Error("Google Search: Too many requests, try again later.");
}
throw error;
@ -100,34 +105,42 @@ export async function googleSearch(
tbs,
filter,
);
const $ = cheerio.load(resp.data);
const result_block = $("div.g");
const dom = new JSDOM(resp.data);
const document = dom.window.document;
const result_block = document.querySelectorAll("div.ezO2md");
let new_results = 0;
let unique = true;
let fetched_results = 0;
const fetched_links = new Set<string>();
if (result_block.length === 0) {
start += 1;
attempts += 1;
} else {
attempts = 0; // Reset attempts if we have results
attempts = 0;
}
result_block.each((index, element) => {
const linkElement = $(element).find("a");
const link =
linkElement && linkElement.attr("href")
? linkElement.attr("href")
: null;
const title = $(element).find("h3");
const ogImage = $(element).find("img").eq(1).attr("src");
const description_box = $(element).find(
"div[style='-webkit-line-clamp:2']",
);
const answerBox = $(element).find(".mod").text();
if (description_box) {
const description = description_box.text();
if (link && title && description) {
start += 1;
results.push(new SearchResult(link, title.text(), description));
for (const result of result_block) {
const link_tag = result.querySelector("a[href]") as HTMLAnchorElement;
const title_tag = link_tag ? link_tag.querySelector("span.CVA68e") : null;
const description_tag = result.querySelector("span.FrIlee");
if (link_tag && title_tag && description_tag) {
const link = decodeURIComponent(link_tag.href.split("&")[0].replace("/url?q=", ""));
if (fetched_links.has(link) && unique) continue;
fetched_links.add(link);
const title = title_tag.textContent || "";
const description = description_tag.textContent || "";
fetched_results++;
new_results++;
if (link && title && description) {
start += 1
results.push(new SearchResult(link, title, description));
}
if (fetched_results >= num_results) break;
}
}
});
}
await new Promise((resolve) =>
setTimeout(resolve, sleep_interval * 1000),
);

View File

@ -4,6 +4,7 @@ import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";
import { searxng_search } from "./searxng";
export async function search({
query,
@ -51,6 +52,16 @@ export async function search({
location,
});
}
if (process.env.SEARXNG_ENDPOINT) {
return await searxng_search(query, {
num_results,
tbs,
filter,
lang,
country,
location,
});
}
return await googleSearch(
query,
advanced,
@ -64,7 +75,7 @@ export async function search({
timeout,
);
} catch (error) {
logger.error(`Error in search function: ${error}`);
logger.error(`Error in search function`, { error });
return [];
}
}

View File

@ -0,0 +1,64 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
import { logger } from "../lib/logger"
dotenv.config();
interface SearchOptions {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
num_results: number;
page?: number;
}
export async function searxng_search(
q: string,
options: SearchOptions,
): Promise<SearchResult[]> {
const params = {
q: q,
language: options.lang,
// gl: options.country, //not possible with SearXNG
// location: options.location, //not possible with SearXNG
// num: options.num_results, //not possible with SearXNG
engines: process.env.SEARXNG_ENGINES || "",
categories: process.env.SEARXNG_CATEGORIES || "general",
pageno: options.page ?? 1,
format: "json"
};
const url = process.env.SEARXNG_ENDPOINT!;
// Remove trailing slash if it exists
const cleanedUrl = url.endsWith('/') ? url.slice(0, -1) : url;
// Concatenate "/search" to the cleaned URL
const finalUrl = cleanedUrl + "/search";
try {
const response = await axios.get(finalUrl, {
headers: {
"Content-Type": "application/json",
},
params: params,
});
const data = response.data;
if (data && Array.isArray(data.results)) {
return data.results.map((a: any) => ({
url: a.url,
title: a.title,
description: a.content,
}));
} else {
return [];
}
} catch (error) {
logger.error(`There was an error searching for content`, { error });
return [];
}
}
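A hedged usage sketch for the new SearXNG backend; the endpoint and engine list are example values, and the SearXNG instance presumably needs its JSON output format enabled in settings for format: "json" to be honored:

// process.env.SEARXNG_ENDPOINT = "http://localhost:3434";
// process.env.SEARXNG_ENGINES  = "google,duckduckgo";
async function demo() {
  const results = await searxng_search("firecrawl", { num_results: 5 });
  for (const r of results) console.log(r.url, "-", r.title);
}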

View File

@ -1,10 +1,8 @@
import { Job, JobsOptions } from "bullmq";
import { getScrapeQueue } from "./queue-service";
import { v4 as uuidv4 } from "uuid";
import { NotificationType, PlanType, WebScraperOptions } from "../types";
import { PlanType, WebScraperOptions } from "../types";
import * as Sentry from "@sentry/node";
import {
calculateJobTimeToRun,
cleanOldConcurrencyLimitEntries,
getConcurrencyLimitActiveJobs,
getConcurrencyQueueJobsCount,
@ -13,7 +11,6 @@ import {
} from "../lib/concurrency-limit";
import { logger } from "../lib/logger";
import { getConcurrencyLimitMax } from "./rate-limiter";
import { sendNotificationWithCustomDays } from "./notification/email_notification";
async function _addScrapeJobToConcurrencyQueue(
webScraperOptions: any,
@ -44,15 +41,7 @@ export async function _addScrapeJobToBullMQ(
webScraperOptions.team_id &&
webScraperOptions.plan
) {
await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId, calculateJobTimeToRun({
id: jobId,
opts: {
...options,
priority: jobPriority,
jobId,
},
data: webScraperOptions,
}));
await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId, 60 * 1000); // 60s default timeout
}
await getScrapeQueue().add(jobId, webScraperOptions, {

View File

@ -2,6 +2,8 @@ import { Queue } from "bullmq";
import { logger } from "../lib/logger";
import IORedis from "ioredis";
export type QueueFunction = () => Queue<any, any, string, any, any, string>;
let scrapeQueue: Queue;
let extractQueue: Queue;
let loggingQueue: Queue;

View File

@ -52,7 +52,6 @@ import { configDotenv } from "dotenv";
import { scrapeOptions } from "../controllers/v1/types";
import { getRateLimiterPoints } from "./rate-limiter";
import {
calculateJobTimeToRun,
cleanOldConcurrencyLimitEntries,
pushConcurrencyLimitActiveJob,
removeConcurrencyLimitActiveJob,
@ -247,6 +246,11 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
extendInterval: jobLockExtendInterval,
extensionTime: jobLockExtensionTime,
});
if (job.data?.mode !== "kickoff" && job.data?.team_id) {
await pushConcurrencyLimitActiveJob(job.data.team_id, job.id, 60 * 1000); // 60s lock renew, just like in the queue
}
await job.extendLock(token, jobLockExtensionTime);
}, jobLockExtendInterval);
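Together with the 60s default in queue-jobs, this turns the concurrency slot into a short lease: every lock extension re-registers the job with a fresh 60s TTL, so a crashed worker's jobs drop out of the active set within about a minute instead of holding a slot for a whole estimated runtime. A distilled sketch (pushConcurrencyLimitActiveJob is the helper from the diff; the cadence value is an assumption):

const LEASE_MS = 60 * 1000;
const jobLockExtendInterval = 15_000; // assumed renewal cadence
function renewLease(teamId: string, jobId: string): NodeJS.Timeout {
  return setInterval(() => {
    void pushConcurrencyLimitActiveJob(teamId, jobId, LEASE_MS); // refresh the 60s lease
  }, jobLockExtendInterval);
}
// clearInterval(handle) once the job settles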
@ -597,7 +601,7 @@ const workerFun = async (
// we are 1 under the limit, assuming the job insertion logic never over-inserts. - MG
const nextJob = await takeConcurrencyLimitedJob(job.data.team_id);
if (nextJob !== null) {
await pushConcurrencyLimitActiveJob(job.data.team_id, nextJob.id, calculateJobTimeToRun(nextJob));
await pushConcurrencyLimitActiveJob(job.data.team_id, nextJob.id, 60 * 1000); // 60s initial timeout
await queue.add(
nextJob.id,

View File

@ -535,7 +535,7 @@ export default class FirecrawlApp {
const response: AxiosResponse = await axios.post(
this.apiUrl + `/v1/scrape`,
jsonData,
{ headers }
{ headers, timeout: params?.timeout !== undefined ? (params.timeout + 5000) : undefined },
);
if (response.status === 200) {
const responseData = response.data;
@ -1262,7 +1262,7 @@ export default class FirecrawlApp {
data: any,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.post(url, data, { headers });
return axios.post(url, data, { headers, timeout: (data?.timeout ? (data.timeout + 5000) : undefined) });
}
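Both SDK call sites pad the HTTP timeout to the server-side budget plus five seconds, so the client never gives up before the API has a chance to return its own structured timeout error. The rule in isolation:

const serverTimeoutMs: number | undefined = 30000; // example caller value
const clientTimeoutMs =
  serverTimeoutMs !== undefined ? serverTimeoutMs + 5000 : undefined;
// axios.post(url, data, { headers, timeout: clientTimeoutMs });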
/**

View File

@ -0,0 +1,3 @@
/node_modules/
/dist/
.env

View File

@ -1,6 +1,6 @@
import express, { Request, Response } from 'express';
import bodyParser from 'body-parser';
import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest, Page } from 'playwright';
import dotenv from 'dotenv';
import UserAgent from 'user-agents';
import { getError } from './helpers/get_error';
@ -119,7 +119,7 @@ const isValidUrl = (urlString: string): boolean => {
}
};
const scrapePage = async (page: any, url: string, waitUntil: 'load' | 'networkidle', waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => {
const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networkidle', waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => {
console.log(`Navigating to ${url} with waitUntil: ${waitUntil} and timeout: ${timeout}ms`);
const response = await page.goto(url, { waitUntil, timeout });
@ -135,9 +135,19 @@ const scrapePage = async (page: any, url: string, waitUntil: 'load' | 'networkid
}
}
let headers = null, content = await page.content();
if (response) {
headers = await response.allHeaders();
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) {
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
}
}
return {
content: await page.content(),
content,
status: response ? response.status() : null,
headers,
};
};
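The new branch exists because page.content() for a JSON or plain-text response returns Chromium's HTML viewer wrapper rather than the body itself, so the raw response bytes are used for those content types. Sketched as a predicate:

function wantsRawBody(headers: Record<string, string>): boolean {
  const ct = Object.entries(headers).find(([k]) => k.toLowerCase() === "content-type");
  return !!ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"));
}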
@ -175,40 +185,35 @@ app.post('/scrape', async (req: Request, res: Response) => {
await page.setExtraHTTPHeaders(headers);
}
let pageContent;
let pageStatusCode: number | null = null;
let result: Awaited<ReturnType<typeof scrapePage>>;
try {
// Strategy 1: Normal
console.log('Attempting strategy 1: Normal load');
const result = await scrapePage(page, url, 'load', wait_after_load, timeout, check_selector);
pageContent = result.content;
pageStatusCode = result.status;
result = await scrapePage(page, url, 'load', wait_after_load, timeout, check_selector);
} catch (error) {
console.log('Strategy 1 failed, attempting strategy 2: Wait until networkidle');
try {
// Strategy 2: Wait until networkidle
const result = await scrapePage(page, url, 'networkidle', wait_after_load, timeout, check_selector);
pageContent = result.content;
pageStatusCode = result.status;
result = await scrapePage(page, url, 'networkidle', wait_after_load, timeout, check_selector);
} catch (finalError) {
await page.close();
return res.status(500).json({ error: 'An error occurred while fetching the page.' });
}
}
const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : undefined;
const pageError = result.status !== 200 ? getError(result.status) : undefined;
if (!pageError) {
console.log(`✅ Scrape successful!`);
} else {
console.log(`🚨 Scrape failed with status code: ${pageStatusCode} ${pageError}`);
console.log(`🚨 Scrape failed with status code: ${result.status} ${pageError}`);
}
await page.close();
res.json({
content: pageContent,
pageStatusCode,
content: result.content,
pageStatusCode: result.status,
...(pageError && { pageError })
});
});
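The scrapePage change above handles non-HTML responses: for application/json or text/plain, page.content() would typically return the body wrapped in browser-generated HTML, so the raw response body is used instead. A self-contained sketch of just that branch, assuming Playwright's Response API and the same UTF-8 shortcut flagged by the TODO in the diff:

```typescript
import { Page, Response } from "playwright";

// Sketch of the non-HTML branch above. For JSON or plain-text responses,
// page.content() would wrap the body in browser-generated markup, so the
// raw response body is returned instead.
async function extractContent(page: Page, response: Response | null): Promise<string> {
  if (response) {
    const headers = await response.allHeaders();
    const contentType = Object.entries(headers).find(
      ([name]) => name.toLowerCase() === "content-type",
    );
    if (
      contentType &&
      (contentType[1].includes("application/json") ||
        contentType[1].includes("text/plain"))
    ) {
      // Mirrors the TODO in the diff: assumes UTF-8 rather than detecting
      // the real encoding.
      return (await response.body()).toString("utf8");
    }
  }
  return page.content();
}
```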

View File

@ -19,6 +19,7 @@
"user-agents": "^1.1.410"
},
"devDependencies": {
"@types/body-parser": "^1.19.5",
"@types/express": "^4.17.21",
"@types/node": "^20.14.9",
"@types/user-agents": "^1.0.4",

873 apps/playwright-service-ts/pnpm-lock.yaml generated Normal file
View File

@ -0,0 +1,873 @@
lockfileVersion: '9.0'
settings:
autoInstallPeers: true
excludeLinksFromLockfile: false
importers:
.:
dependencies:
body-parser:
specifier: ^1.20.2
version: 1.20.3
dotenv:
specifier: ^16.4.5
version: 16.4.7
express:
specifier: ^4.19.2
version: 4.21.2
playwright:
specifier: ^1.45.0
version: 1.49.1
user-agents:
specifier: ^1.1.410
version: 1.1.455
devDependencies:
'@types/body-parser':
specifier: ^1.19.5
version: 1.19.5
'@types/express':
specifier: ^4.17.21
version: 4.17.21
'@types/node':
specifier: ^20.14.9
version: 20.17.10
'@types/user-agents':
specifier: ^1.0.4
version: 1.0.4
ts-node:
specifier: ^10.9.2
version: 10.9.2(@types/node@20.17.10)(typescript@5.7.2)
typescript:
specifier: ^5.5.2
version: 5.7.2
packages:
'@cspotcode/source-map-support@0.8.1':
resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==}
engines: {node: '>=12'}
'@jridgewell/resolve-uri@3.1.2':
resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==}
engines: {node: '>=6.0.0'}
'@jridgewell/sourcemap-codec@1.5.0':
resolution: {integrity: sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==}
'@jridgewell/trace-mapping@0.3.9':
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==}
'@tsconfig/node10@1.0.11':
resolution: {integrity: sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==}
'@tsconfig/node12@1.0.11':
resolution: {integrity: sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==}
'@tsconfig/node14@1.0.3':
resolution: {integrity: sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==}
'@tsconfig/node16@1.0.4':
resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==}
'@types/body-parser@1.19.5':
resolution: {integrity: sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==}
'@types/connect@3.4.38':
resolution: {integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==}
'@types/express-serve-static-core@4.19.6':
resolution: {integrity: sha512-N4LZ2xG7DatVqhCZzOGb1Yi5lMbXSZcmdLDe9EzSndPV2HpWYWzRbaerl2n27irrm94EPpprqa8KpskPT085+A==}
'@types/express@4.17.21':
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
'@types/http-errors@2.0.4':
resolution: {integrity: sha512-D0CFMMtydbJAegzOyHjtiKPLlvnm3iTZyZRSZoLq2mRhDdmLfIWOCYPfQJ4cu2erKghU++QvjcUjp/5h7hESpA==}
'@types/mime@1.3.5':
resolution: {integrity: sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==}
'@types/node@20.17.10':
resolution: {integrity: sha512-/jrvh5h6NXhEauFFexRin69nA0uHJ5gwk4iDivp/DeoEua3uwCUto6PC86IpRITBOs4+6i2I56K5x5b6WYGXHA==}
'@types/qs@6.9.17':
resolution: {integrity: sha512-rX4/bPcfmvxHDv0XjfJELTTr+iB+tn032nPILqHm5wbthUUUuVtNGGqzhya9XUxjTP8Fpr0qYgSZZKxGY++svQ==}
'@types/range-parser@1.2.7':
resolution: {integrity: sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==}
'@types/send@0.17.4':
resolution: {integrity: sha512-x2EM6TJOybec7c52BX0ZspPodMsQUd5L6PRwOunVyVUhXiBSKf3AezDL8Dgvgt5o0UfKNfuA0eMLr2wLT4AiBA==}
'@types/serve-static@1.15.7':
resolution: {integrity: sha512-W8Ym+h8nhuRwaKPaDw34QUkwsGi6Rc4yYqvKFo5rm2FUEhCFbzVWrxXUxuKK8TASjWsysJY0nsmNCGhCOIsrOw==}
'@types/user-agents@1.0.4':
resolution: {integrity: sha512-AjeFc4oX5WPPflgKfRWWJfkEk7Wu82fnj1rROPsiqFt6yElpdGFg8Srtm/4PU4rA9UiDUZlruGPgcwTMQlwq4w==}
accepts@1.3.8:
resolution: {integrity: sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==}
engines: {node: '>= 0.6'}
acorn-walk@8.3.4:
resolution: {integrity: sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==}
engines: {node: '>=0.4.0'}
acorn@8.14.0:
resolution: {integrity: sha512-cl669nCJTZBsL97OF4kUQm5g5hC2uihk0NxY3WENAC0TYdILVkAyHymAntgxGkl7K+t0cXIrH5siy5S4XkFycA==}
engines: {node: '>=0.4.0'}
hasBin: true
arg@4.1.3:
resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==}
array-flatten@1.1.1:
resolution: {integrity: sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==}
body-parser@1.20.3:
resolution: {integrity: sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==}
engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16}
bytes@3.1.2:
resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
engines: {node: '>= 0.8'}
call-bind-apply-helpers@1.0.1:
resolution: {integrity: sha512-BhYE+WDaywFg2TBWYNXAE+8B1ATnThNBqXHP5nQu0jWJdVvY2hvkpyB3qOmtmDePiS5/BDQ8wASEWGMWRG148g==}
engines: {node: '>= 0.4'}
call-bound@1.0.3:
resolution: {integrity: sha512-YTd+6wGlNlPxSuri7Y6X8tY2dmm12UMH66RpKMhiX6rsk5wXXnYgbUcOt8kiS31/AjfoTOvCsE+w8nZQLQnzHA==}
engines: {node: '>= 0.4'}
content-disposition@0.5.4:
resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==}
engines: {node: '>= 0.6'}
content-type@1.0.5:
resolution: {integrity: sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==}
engines: {node: '>= 0.6'}
cookie-signature@1.0.6:
resolution: {integrity: sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==}
cookie@0.7.1:
resolution: {integrity: sha512-6DnInpx7SJ2AK3+CTUE/ZM0vWTUboZCegxhC2xiIydHR9jNuTAASBrfEpHhiGOZw/nX51bHt6YQl8jsGo4y/0w==}
engines: {node: '>= 0.6'}
create-require@1.1.1:
resolution: {integrity: sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==}
debug@2.6.9:
resolution: {integrity: sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==}
peerDependencies:
supports-color: '*'
peerDependenciesMeta:
supports-color:
optional: true
depd@2.0.0:
resolution: {integrity: sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==}
engines: {node: '>= 0.8'}
destroy@1.2.0:
resolution: {integrity: sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==}
engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16}
diff@4.0.2:
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==}
engines: {node: '>=0.3.1'}
dotenv@16.4.7:
resolution: {integrity: sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==}
engines: {node: '>=12'}
dunder-proto@1.0.1:
resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==}
engines: {node: '>= 0.4'}
ee-first@1.1.1:
resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==}
encodeurl@1.0.2:
resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==}
engines: {node: '>= 0.8'}
encodeurl@2.0.0:
resolution: {integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==}
engines: {node: '>= 0.8'}
es-define-property@1.0.1:
resolution: {integrity: sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==}
engines: {node: '>= 0.4'}
es-errors@1.3.0:
resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==}
engines: {node: '>= 0.4'}
es-object-atoms@1.0.0:
resolution: {integrity: sha512-MZ4iQ6JwHOBQjahnjwaC1ZtIBH+2ohjamzAO3oaHcXYup7qxjF2fixyH+Q71voWHeOkI2q/TnJao/KfXYIZWbw==}
engines: {node: '>= 0.4'}
escape-html@1.0.3:
resolution: {integrity: sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==}
etag@1.8.1:
resolution: {integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==}
engines: {node: '>= 0.6'}
express@4.21.2:
resolution: {integrity: sha512-28HqgMZAmih1Czt9ny7qr6ek2qddF4FclbMzwhCREB6OFfH+rXAnuNCwo1/wFvrtbgsQDb4kSbX9de9lFbrXnA==}
engines: {node: '>= 0.10.0'}
finalhandler@1.3.1:
resolution: {integrity: sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==}
engines: {node: '>= 0.8'}
forwarded@0.2.0:
resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==}
engines: {node: '>= 0.6'}
fresh@0.5.2:
resolution: {integrity: sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==}
engines: {node: '>= 0.6'}
fsevents@2.3.2:
resolution: {integrity: sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==}
engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
os: [darwin]
function-bind@1.1.2:
resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
get-intrinsic@1.2.6:
resolution: {integrity: sha512-qxsEs+9A+u85HhllWJJFicJfPDhRmjzoYdl64aMWW9yRIJmSyxdn8IEkuIM530/7T+lv0TIHd8L6Q/ra0tEoeA==}
engines: {node: '>= 0.4'}
gopd@1.2.0:
resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==}
engines: {node: '>= 0.4'}
has-symbols@1.1.0:
resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==}
engines: {node: '>= 0.4'}
hasown@2.0.2:
resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
engines: {node: '>= 0.4'}
http-errors@2.0.0:
resolution: {integrity: sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==}
engines: {node: '>= 0.8'}
iconv-lite@0.4.24:
resolution: {integrity: sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==}
engines: {node: '>=0.10.0'}
inherits@2.0.4:
resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==}
ipaddr.js@1.9.1:
resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==}
engines: {node: '>= 0.10'}
lodash.clonedeep@4.5.0:
resolution: {integrity: sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ==}
make-error@1.3.6:
resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==}
math-intrinsics@1.1.0:
resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
engines: {node: '>= 0.4'}
media-typer@0.3.0:
resolution: {integrity: sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==}
engines: {node: '>= 0.6'}
merge-descriptors@1.0.3:
resolution: {integrity: sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==}
methods@1.1.2:
resolution: {integrity: sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==}
engines: {node: '>= 0.6'}
mime-db@1.52.0:
resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==}
engines: {node: '>= 0.6'}
mime-types@2.1.35:
resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==}
engines: {node: '>= 0.6'}
mime@1.6.0:
resolution: {integrity: sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==}
engines: {node: '>=4'}
hasBin: true
ms@2.0.0:
resolution: {integrity: sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==}
ms@2.1.3:
resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
negotiator@0.6.3:
resolution: {integrity: sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==}
engines: {node: '>= 0.6'}
object-inspect@1.13.3:
resolution: {integrity: sha512-kDCGIbxkDSXE3euJZZXzc6to7fCrKHNI/hSRQnRuQ+BWjFNzZwiFF8fj/6o2t2G9/jTj8PSIYTfCLelLZEeRpA==}
engines: {node: '>= 0.4'}
on-finished@2.4.1:
resolution: {integrity: sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==}
engines: {node: '>= 0.8'}
parseurl@1.3.3:
resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==}
engines: {node: '>= 0.8'}
path-to-regexp@0.1.12:
resolution: {integrity: sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==}
playwright-core@1.49.1:
resolution: {integrity: sha512-BzmpVcs4kE2CH15rWfzpjzVGhWERJfmnXmniSyKeRZUs9Ws65m+RGIi7mjJK/euCegfn3i7jvqWeWyHe9y3Vgg==}
engines: {node: '>=18'}
hasBin: true
playwright@1.49.1:
resolution: {integrity: sha512-VYL8zLoNTBxVOrJBbDuRgDWa3i+mfQgDTrL8Ah9QXZ7ax4Dsj0MSq5bYgytRnDVVe+njoKnfsYkH3HzqVj5UZA==}
engines: {node: '>=18'}
hasBin: true
proxy-addr@2.0.7:
resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==}
engines: {node: '>= 0.10'}
qs@6.13.0:
resolution: {integrity: sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==}
engines: {node: '>=0.6'}
range-parser@1.2.1:
resolution: {integrity: sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==}
engines: {node: '>= 0.6'}
raw-body@2.5.2:
resolution: {integrity: sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==}
engines: {node: '>= 0.8'}
safe-buffer@5.2.1:
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
safer-buffer@2.1.2:
resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==}
send@0.19.0:
resolution: {integrity: sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==}
engines: {node: '>= 0.8.0'}
serve-static@1.16.2:
resolution: {integrity: sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==}
engines: {node: '>= 0.8.0'}
setprototypeof@1.2.0:
resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==}
side-channel-list@1.0.0:
resolution: {integrity: sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==}
engines: {node: '>= 0.4'}
side-channel-map@1.0.1:
resolution: {integrity: sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==}
engines: {node: '>= 0.4'}
side-channel-weakmap@1.0.2:
resolution: {integrity: sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==}
engines: {node: '>= 0.4'}
side-channel@1.1.0:
resolution: {integrity: sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==}
engines: {node: '>= 0.4'}
statuses@2.0.1:
resolution: {integrity: sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==}
engines: {node: '>= 0.8'}
toidentifier@1.0.1:
resolution: {integrity: sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==}
engines: {node: '>=0.6'}
ts-node@10.9.2:
resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==}
hasBin: true
peerDependencies:
'@swc/core': '>=1.2.50'
'@swc/wasm': '>=1.2.50'
'@types/node': '*'
typescript: '>=2.7'
peerDependenciesMeta:
'@swc/core':
optional: true
'@swc/wasm':
optional: true
type-is@1.6.18:
resolution: {integrity: sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==}
engines: {node: '>= 0.6'}
typescript@5.7.2:
resolution: {integrity: sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==}
engines: {node: '>=14.17'}
hasBin: true
undici-types@6.19.8:
resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==}
unpipe@1.0.0:
resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==}
engines: {node: '>= 0.8'}
user-agents@1.1.455:
resolution: {integrity: sha512-C5FfBiUlxZAYI+nsxg2iUcVrC0CxjawRZMxoUA9Z5MUm1mC0phPvs7iPe9ksKVaZrsyNLivDeIUxJvHFuCXyLw==}
utils-merge@1.0.1:
resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==}
engines: {node: '>= 0.4.0'}
v8-compile-cache-lib@3.0.1:
resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==}
vary@1.1.2:
resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==}
engines: {node: '>= 0.8'}
yn@3.1.1:
resolution: {integrity: sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==}
engines: {node: '>=6'}
snapshots:
'@cspotcode/source-map-support@0.8.1':
dependencies:
'@jridgewell/trace-mapping': 0.3.9
'@jridgewell/resolve-uri@3.1.2': {}
'@jridgewell/sourcemap-codec@1.5.0': {}
'@jridgewell/trace-mapping@0.3.9':
dependencies:
'@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.5.0
'@tsconfig/node10@1.0.11': {}
'@tsconfig/node12@1.0.11': {}
'@tsconfig/node14@1.0.3': {}
'@tsconfig/node16@1.0.4': {}
'@types/body-parser@1.19.5':
dependencies:
'@types/connect': 3.4.38
'@types/node': 20.17.10
'@types/connect@3.4.38':
dependencies:
'@types/node': 20.17.10
'@types/express-serve-static-core@4.19.6':
dependencies:
'@types/node': 20.17.10
'@types/qs': 6.9.17
'@types/range-parser': 1.2.7
'@types/send': 0.17.4
'@types/express@4.17.21':
dependencies:
'@types/body-parser': 1.19.5
'@types/express-serve-static-core': 4.19.6
'@types/qs': 6.9.17
'@types/serve-static': 1.15.7
'@types/http-errors@2.0.4': {}
'@types/mime@1.3.5': {}
'@types/node@20.17.10':
dependencies:
undici-types: 6.19.8
'@types/qs@6.9.17': {}
'@types/range-parser@1.2.7': {}
'@types/send@0.17.4':
dependencies:
'@types/mime': 1.3.5
'@types/node': 20.17.10
'@types/serve-static@1.15.7':
dependencies:
'@types/http-errors': 2.0.4
'@types/node': 20.17.10
'@types/send': 0.17.4
'@types/user-agents@1.0.4': {}
accepts@1.3.8:
dependencies:
mime-types: 2.1.35
negotiator: 0.6.3
acorn-walk@8.3.4:
dependencies:
acorn: 8.14.0
acorn@8.14.0: {}
arg@4.1.3: {}
array-flatten@1.1.1: {}
body-parser@1.20.3:
dependencies:
bytes: 3.1.2
content-type: 1.0.5
debug: 2.6.9
depd: 2.0.0
destroy: 1.2.0
http-errors: 2.0.0
iconv-lite: 0.4.24
on-finished: 2.4.1
qs: 6.13.0
raw-body: 2.5.2
type-is: 1.6.18
unpipe: 1.0.0
transitivePeerDependencies:
- supports-color
bytes@3.1.2: {}
call-bind-apply-helpers@1.0.1:
dependencies:
es-errors: 1.3.0
function-bind: 1.1.2
call-bound@1.0.3:
dependencies:
call-bind-apply-helpers: 1.0.1
get-intrinsic: 1.2.6
content-disposition@0.5.4:
dependencies:
safe-buffer: 5.2.1
content-type@1.0.5: {}
cookie-signature@1.0.6: {}
cookie@0.7.1: {}
create-require@1.1.1: {}
debug@2.6.9:
dependencies:
ms: 2.0.0
depd@2.0.0: {}
destroy@1.2.0: {}
diff@4.0.2: {}
dotenv@16.4.7: {}
dunder-proto@1.0.1:
dependencies:
call-bind-apply-helpers: 1.0.1
es-errors: 1.3.0
gopd: 1.2.0
ee-first@1.1.1: {}
encodeurl@1.0.2: {}
encodeurl@2.0.0: {}
es-define-property@1.0.1: {}
es-errors@1.3.0: {}
es-object-atoms@1.0.0:
dependencies:
es-errors: 1.3.0
escape-html@1.0.3: {}
etag@1.8.1: {}
express@4.21.2:
dependencies:
accepts: 1.3.8
array-flatten: 1.1.1
body-parser: 1.20.3
content-disposition: 0.5.4
content-type: 1.0.5
cookie: 0.7.1
cookie-signature: 1.0.6
debug: 2.6.9
depd: 2.0.0
encodeurl: 2.0.0
escape-html: 1.0.3
etag: 1.8.1
finalhandler: 1.3.1
fresh: 0.5.2
http-errors: 2.0.0
merge-descriptors: 1.0.3
methods: 1.1.2
on-finished: 2.4.1
parseurl: 1.3.3
path-to-regexp: 0.1.12
proxy-addr: 2.0.7
qs: 6.13.0
range-parser: 1.2.1
safe-buffer: 5.2.1
send: 0.19.0
serve-static: 1.16.2
setprototypeof: 1.2.0
statuses: 2.0.1
type-is: 1.6.18
utils-merge: 1.0.1
vary: 1.1.2
transitivePeerDependencies:
- supports-color
finalhandler@1.3.1:
dependencies:
debug: 2.6.9
encodeurl: 2.0.0
escape-html: 1.0.3
on-finished: 2.4.1
parseurl: 1.3.3
statuses: 2.0.1
unpipe: 1.0.0
transitivePeerDependencies:
- supports-color
forwarded@0.2.0: {}
fresh@0.5.2: {}
fsevents@2.3.2:
optional: true
function-bind@1.1.2: {}
get-intrinsic@1.2.6:
dependencies:
call-bind-apply-helpers: 1.0.1
dunder-proto: 1.0.1
es-define-property: 1.0.1
es-errors: 1.3.0
es-object-atoms: 1.0.0
function-bind: 1.1.2
gopd: 1.2.0
has-symbols: 1.1.0
hasown: 2.0.2
math-intrinsics: 1.1.0
gopd@1.2.0: {}
has-symbols@1.1.0: {}
hasown@2.0.2:
dependencies:
function-bind: 1.1.2
http-errors@2.0.0:
dependencies:
depd: 2.0.0
inherits: 2.0.4
setprototypeof: 1.2.0
statuses: 2.0.1
toidentifier: 1.0.1
iconv-lite@0.4.24:
dependencies:
safer-buffer: 2.1.2
inherits@2.0.4: {}
ipaddr.js@1.9.1: {}
lodash.clonedeep@4.5.0: {}
make-error@1.3.6: {}
math-intrinsics@1.1.0: {}
media-typer@0.3.0: {}
merge-descriptors@1.0.3: {}
methods@1.1.2: {}
mime-db@1.52.0: {}
mime-types@2.1.35:
dependencies:
mime-db: 1.52.0
mime@1.6.0: {}
ms@2.0.0: {}
ms@2.1.3: {}
negotiator@0.6.3: {}
object-inspect@1.13.3: {}
on-finished@2.4.1:
dependencies:
ee-first: 1.1.1
parseurl@1.3.3: {}
path-to-regexp@0.1.12: {}
playwright-core@1.49.1: {}
playwright@1.49.1:
dependencies:
playwright-core: 1.49.1
optionalDependencies:
fsevents: 2.3.2
proxy-addr@2.0.7:
dependencies:
forwarded: 0.2.0
ipaddr.js: 1.9.1
qs@6.13.0:
dependencies:
side-channel: 1.1.0
range-parser@1.2.1: {}
raw-body@2.5.2:
dependencies:
bytes: 3.1.2
http-errors: 2.0.0
iconv-lite: 0.4.24
unpipe: 1.0.0
safe-buffer@5.2.1: {}
safer-buffer@2.1.2: {}
send@0.19.0:
dependencies:
debug: 2.6.9
depd: 2.0.0
destroy: 1.2.0
encodeurl: 1.0.2
escape-html: 1.0.3
etag: 1.8.1
fresh: 0.5.2
http-errors: 2.0.0
mime: 1.6.0
ms: 2.1.3
on-finished: 2.4.1
range-parser: 1.2.1
statuses: 2.0.1
transitivePeerDependencies:
- supports-color
serve-static@1.16.2:
dependencies:
encodeurl: 2.0.0
escape-html: 1.0.3
parseurl: 1.3.3
send: 0.19.0
transitivePeerDependencies:
- supports-color
setprototypeof@1.2.0: {}
side-channel-list@1.0.0:
dependencies:
es-errors: 1.3.0
object-inspect: 1.13.3
side-channel-map@1.0.1:
dependencies:
call-bound: 1.0.3
es-errors: 1.3.0
get-intrinsic: 1.2.6
object-inspect: 1.13.3
side-channel-weakmap@1.0.2:
dependencies:
call-bound: 1.0.3
es-errors: 1.3.0
get-intrinsic: 1.2.6
object-inspect: 1.13.3
side-channel-map: 1.0.1
side-channel@1.1.0:
dependencies:
es-errors: 1.3.0
object-inspect: 1.13.3
side-channel-list: 1.0.0
side-channel-map: 1.0.1
side-channel-weakmap: 1.0.2
statuses@2.0.1: {}
toidentifier@1.0.1: {}
ts-node@10.9.2(@types/node@20.17.10)(typescript@5.7.2):
dependencies:
'@cspotcode/source-map-support': 0.8.1
'@tsconfig/node10': 1.0.11
'@tsconfig/node12': 1.0.11
'@tsconfig/node14': 1.0.3
'@tsconfig/node16': 1.0.4
'@types/node': 20.17.10
acorn: 8.14.0
acorn-walk: 8.3.4
arg: 4.1.3
create-require: 1.1.1
diff: 4.0.2
make-error: 1.3.6
typescript: 5.7.2
v8-compile-cache-lib: 3.0.1
yn: 3.1.1
type-is@1.6.18:
dependencies:
media-typer: 0.3.0
mime-types: 2.1.35
typescript@5.7.2: {}
undici-types@6.19.8: {}
unpipe@1.0.0: {}
user-agents@1.1.455:
dependencies:
lodash.clonedeep: 4.5.0
utils-merge@1.0.1: {}
v8-compile-cache-lib@3.0.1: {}
vary@1.1.2: {}
yn@3.1.1: {}

View File

@ -104,6 +104,8 @@ async def root(body: UrlModel):
json_compatible_item_data = {
"content": page_content,
"pageStatusCode": page_status_code,
"pageError": page_error
}
}
if page_error is not None:
json_compatible_item_data["pageError"] = page_error
return JSONResponse(content=json_compatible_item_data)

View File

@ -145,6 +145,7 @@ class FirecrawlApp:
f'{self.api_url}{endpoint}',
headers=headers,
json=scrape_params,
timeout=(scrape_params["timeout"] + 5000 if "timeout" in scrape_params else None),
)
if response.status_code == 200:
try:
@ -433,7 +434,7 @@ class FirecrawlApp:
else:
self._handle_error(response, 'map')
def batch_scrape_urls(self, urls: list[str],
def batch_scrape_urls(self, urls: List[str],
params: Optional[Dict[str, Any]] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None) -> Any:
@ -441,7 +442,7 @@ class FirecrawlApp:
Initiate a batch scrape job for the specified URLs using the Firecrawl API.
Args:
urls (list[str]): The URLs to scrape.
urls (List[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
@ -476,12 +477,12 @@ class FirecrawlApp:
self._handle_error(response, 'start batch scrape job')
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
def async_batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
"""
Initiate a crawl job asynchronously.
Args:
urls (list[str]): The URLs to scrape.
urls (List[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
@ -505,12 +506,12 @@ class FirecrawlApp:
else:
self._handle_error(response, 'start batch scrape job')
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
def batch_scrape_urls_and_watch(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
"""
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
Args:
urls (list[str]): The URLs to scrape.
urls (List[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
@ -925,7 +926,7 @@ class FirecrawlApp:
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries):
response = requests.post(url, headers=headers, json=data)
response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:

View File

@ -13,13 +13,13 @@ x-common-service: &common-service
services:
playwright-service:
build: apps/playwright-service
build: apps/playwright-service-ts
environment:
- PORT=3000
- PROXY_SERVER=${PROXY_SERVER}
- PROXY_USERNAME=${PROXY_USERNAME}
- PROXY_PASSWORD=${PROXY_PASSWORD}
- BLOCK_MEDIA=${BLOCK_MEDIA}
PORT: 3000
PROXY_SERVER: ${PROXY_SERVER}
PROXY_USERNAME: ${PROXY_USERNAME}
PROXY_PASSWORD: ${PROXY_PASSWORD}
BLOCK_MEDIA: ${BLOCK_MEDIA}
networks:
- backend
@ -28,7 +28,7 @@ services:
environment:
REDIS_URL: ${REDIS_URL:-redis://redis:6379}
REDIS_RATE_LIMIT_URL: ${REDIS_URL:-redis://redis:6379}
PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000/scrape}
USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION}
PORT: ${PORT:-3002}
NUM_WORKERS_PER_QUEUE: ${NUM_WORKERS_PER_QUEUE}
@ -51,6 +51,9 @@ services:
SERPER_API_KEY: ${SERPER_API_KEY}
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
LOGGING_LEVEL: ${LOGGING_LEVEL}
PROXY_SERVER: ${PROXY_SERVER}
PROXY_USERNAME: ${PROXY_USERNAME}
PROXY_PASSWORD: ${PROXY_PASSWORD}
FLY_PROCESS_GROUP: app
depends_on:
- redis
@ -64,7 +67,7 @@ services:
environment:
REDIS_URL: ${REDIS_URL:-redis://redis:6379}
REDIS_RATE_LIMIT_URL: ${REDIS_URL:-redis://redis:6379}
PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000/scrape}
USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION}
PORT: ${PORT:-3002}
NUM_WORKERS_PER_QUEUE: ${NUM_WORKERS_PER_QUEUE}
@ -85,6 +88,9 @@ services:
HOST: ${HOST:-0.0.0.0}
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
LOGGING_LEVEL: ${LOGGING_LEVEL}
PROXY_SERVER: ${PROXY_SERVER}
PROXY_USERNAME: ${PROXY_USERNAME}
PROXY_PASSWORD: ${PROXY_PASSWORD}
FLY_PROCESS_GROUP: worker
depends_on:
- redis
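Note the default PLAYWRIGHT_MICROSERVICE_URL now carries the full /scrape path rather than the service root, so anything reading the variable must treat it as a complete endpoint URL. A minimal sketch of a caller under that assumption (hypothetical fetch-based client; the body fields mirror the handler's parameters shown earlier in this commit):

```typescript
// Hypothetical caller sketch; the real client lives in apps/api.
// The env var is assumed to carry the full /scrape path, per the compose change.
const PLAYWRIGHT_MICROSERVICE_URL =
  process.env.PLAYWRIGHT_MICROSERVICE_URL ?? "http://playwright-service:3000/scrape";

async function scrapeViaPlaywright(url: string): Promise<{
  content: string;
  pageStatusCode: number | null;
  pageError?: string;
}> {
  const res = await fetch(PLAYWRIGHT_MICROSERVICE_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    // Field names mirror the service handler shown earlier in this commit.
    body: JSON.stringify({ url, wait_after_load: 0, timeout: 15000 }),
  });
  if (!res.ok) throw new Error(`Playwright service returned ${res.status}`);
  return res.json();
}
```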