From 2200f084f35f304e3af6a108b225a3d1597dc3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 20 Feb 2025 00:41:22 +0100 Subject: [PATCH] SELFHOST FIXES (#1207) * fix(extract): construct OpenAI on demand Fixes hard-crash if api key not specified in a self-hosting environment. * fix(ci): try sleeping * fix(ci): override host * fix(ci): wait for server to start * Support /extract and /crawl for self-hosted (FIR-1097) (#1137) * Support /extract for self-hosted This returns the job response from redis rather than supabase when db auth is disabled (self hosted mode) * Use getJob for extract and use correct types * fix(v1/crawl-status): only poll DB for total count if DB is enabled * feat(snips): TEST_SUITE_SELF_HOSTED * fix(ci/test-server-self-host): use pr trigger * fix(scrapeURL): f-e mocking in selfhosted env * fix(snips): do not try to eval json format on selfhost * fix(scrapeURL): further f-e mocking * fix(snips): don't timeout on hard fail polling * fix(v1/extract-status): fix-up the db-agnostic impl unfortunately had to separate the functions since the schema was too divergent :( * fix(snips): boost screenshot delay * feat(ci): test with openai * feat(ci): extract, search testing * fix(ci): matrix * fix(ci): bleh * Update: fix default google search (#1174) * fix log title * search should always work * asd * fix ci --------- Co-authored-by: Nick Roth Co-authored-by: William --- .github/workflows/test-server-self-host.yml | 19 +- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 286 +++++++++++++++++- .../src/__tests__/snips/batch-scrape.test.ts | 75 +++-- apps/api/src/__tests__/snips/crawl.test.ts | 2 +- apps/api/src/__tests__/snips/extract.test.ts | 56 ++-- apps/api/src/__tests__/snips/scrape.test.ts | 226 +++++++------- apps/api/src/__tests__/snips/search.test.ts | 4 +- apps/api/src/controllers/v1/crawl-status.ts | 6 +- apps/api/src/controllers/v1/extract-status.ts | 46 ++- .../completions/analyzeSchemaAndPrompt.ts | 2 +- .../api/src/lib/extract/extraction-service.ts | 2 +- apps/api/src/lib/extract/index/pinecone.ts | 8 +- apps/api/src/lib/llm/generate.ts | 8 +- apps/api/src/lib/ranker.ts | 9 +- .../engines/fire-engine/checkStatus.ts | 3 +- .../scrapeURL/engines/fire-engine/delete.ts | 3 +- .../scrapeURL/engines/fire-engine/scrape.ts | 6 +- .../src/scraper/scrapeURL/engines/index.ts | 7 +- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 8 +- apps/api/src/search/fireEngine.ts | 4 +- apps/api/src/search/googlesearch.ts | 85 +++--- apps/api/src/search/index.ts | 2 +- apps/api/src/services/queue-service.ts | 2 + 24 files changed, 608 insertions(+), 262 deletions(-) diff --git a/.github/workflows/test-server-self-host.yml b/.github/workflows/test-server-self-host.yml index 3e13d3d1..037e9684 100644 --- a/.github/workflows/test-server-self-host.yml +++ b/.github/workflows/test-server-self-host.yml @@ -1,22 +1,35 @@ name: Self-hosted Server Test Suite on: - workflow_dispatch: + pull_request: + branches: + - main + paths: + - apps/api/** env: PORT: 3002 REDIS_URL: redis://localhost:6379 + HOST: 0.0.0.0 ENV: ${{ secrets.ENV }} + TEST_SUITE_SELF_HOSTED: true jobs: test: name: Run tests + strategy: + matrix: + openai: [true, false] + serper: [true, false] runs-on: ubuntu-latest services: redis: image: redis ports: - 6379:6379 + env: + OPENAI_API_KEY: ${{ matrix.openai == true && secrets.OPENAI_API_KEY || '' }} + SERPER_API_KEY: ${{ matrix.serper == true && secrets.SERPER_API_KEY || '' }} steps: - uses: actions/checkout@v3 - name: Install pnpm @@ -40,6 +53,8 @@ jobs: run: npm run workers > worker.log 2>&1 & working-directory: ./apps/api id: start_workers + - name: Wait for server + run: pnpx wait-on tcp:3002 -t 15s - name: Run snippet tests run: | npm run test:snips @@ -50,7 +65,7 @@ jobs: - uses: actions/upload-artifact@v4 if: always() with: - name: Logs + name: Logs (openai ${{ matrix.openai }}, serper ${{ matrix.serper }}) path: | ./apps/api/api.log ./apps/api/worker.log diff --git a/apps/api/package.json b/apps/api/package.json index 1c554728..31519998 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -56,6 +56,7 @@ "typescript": "^5.4.2" }, "dependencies": { + "jsdom": "^26.0.0", "@anthropic-ai/sdk": "^0.24.3", "@apidevtools/json-schema-ref-parser": "^11.7.3", "@brillout/import": "^0.2.2", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index f66dfe72..85c02140 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -125,6 +125,9 @@ importers: joplin-turndown-plugin-gfm: specifier: ^1.0.12 version: 1.0.12 + jsdom: + specifier: ^26.0.0 + version: 26.0.0 json-schema-to-zod: specifier: ^2.3.0 version: 2.3.0 @@ -136,7 +139,7 @@ importers: version: 2.9.0 langchain: specifier: ^0.2.8 - version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -332,6 +335,9 @@ packages: resolution: {integrity: sha512-WApSdLdXEBb/1FUPca2lteASewEfpjEYJ8oXZP+0gExK5qSfsEKBKcA+WjY6Q4wvXwyv0+W6Kvc372pSceib9w==} engines: {node: '>= 16'} + '@asamuzakjp/css-color@2.8.3': + resolution: {integrity: sha512-GIc76d9UI1hCvOATjZPyHFmE5qhRccp3/zGfMPapK3jBi+yocEzp6BBB0UnfRYP9NP4FANqUZYb0hnfs3TM3hw==} + '@aws-crypto/crc32@3.0.0': resolution: {integrity: sha512-IzSgsrxUcsrejQbPVilIKy16kAT52EwB6zSaI+M3xxIhKh5+aldEyvI+z6erM7TCLB2BJsFrtHjp6/4/sr+3dA==} @@ -685,6 +691,34 @@ packages: resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} engines: {node: '>=12'} + '@csstools/color-helpers@5.0.1': + resolution: {integrity: sha512-MKtmkA0BX87PKaO1NFRTFH+UnkgnmySQOvNxJubsadusqPEC2aJ9MOQiMceZJJ6oitUl/i0L6u0M1IrmAOmgBA==} + engines: {node: '>=18'} + + '@csstools/css-calc@2.1.1': + resolution: {integrity: sha512-rL7kaUnTkL9K+Cvo2pnCieqNpTKgQzy5f+N+5Iuko9HAoasP+xgprVh7KN/MaJVvVL1l0EzQq2MoqBHKSrDrag==} + engines: {node: '>=18'} + peerDependencies: + '@csstools/css-parser-algorithms': ^3.0.4 + '@csstools/css-tokenizer': ^3.0.3 + + '@csstools/css-color-parser@3.0.7': + resolution: {integrity: sha512-nkMp2mTICw32uE5NN+EsJ4f5N+IGFeCFu4bGpiKgb2Pq/7J/MpyLBeQ5ry4KKtRFZaYs6sTmcMYrSRIyj5DFKA==} + engines: {node: '>=18'} + peerDependencies: + '@csstools/css-parser-algorithms': ^3.0.4 + '@csstools/css-tokenizer': ^3.0.3 + + '@csstools/css-parser-algorithms@3.0.4': + resolution: {integrity: sha512-Up7rBoV77rv29d3uKHUIVubz1BTcgyUK72IvCQAbfbMv584xHcGKCKbWh7i8hPrRJ7qU4Y8IO3IY9m+iTB7P3A==} + engines: {node: '>=18'} + peerDependencies: + '@csstools/css-tokenizer': ^3.0.3 + + '@csstools/css-tokenizer@3.0.3': + resolution: {integrity: sha512-UJnjoFsmxfKUdNYdWgOB0mWUypuLvAfQPH1+pyvRJs6euowbFkFC6P13w1l8mJyi3vxYMxc9kld5jZEGRQs6bw==} + engines: {node: '>=18'} + '@dabh/diagnostics@2.0.3': resolution: {integrity: sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==} @@ -1715,6 +1749,10 @@ packages: resolution: {integrity: sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==} engines: {node: '>= 14'} + agent-base@7.1.3: + resolution: {integrity: sha512-jRR5wdylq8CkOe6hei19GGZnxM6rBGwFl3Bg0YItGDimvjGtAvdZk4Pu6Cl4u4Igsws4a1fd1Vq3ezrhn4KmFw==} + engines: {node: '>= 14'} + agentkeepalive@4.5.0: resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==} engines: {node: '>= 8.0.0'} @@ -2141,6 +2179,10 @@ packages: resolution: {integrity: sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==} engines: {node: '>= 6'} + cssstyle@4.2.1: + resolution: {integrity: sha512-9+vem03dMXG7gDmZ62uqmRiMRNtinIZ9ZyuF6BdxzfOD+FdN5hretzynkn0ReS2DO2GSw76RWHs0UmJPI2zUjw==} + engines: {node: '>=18'} + csv-parse@5.5.6: resolution: {integrity: sha512-uNpm30m/AGSkLxxy7d9yRXpJQFrZzVWLFBkS+6ngPcZkw/5k3L/jjFuj7tVnEpRn+QgmiXr21nDlhCiUK4ij2A==} @@ -2152,6 +2194,10 @@ packages: resolution: {integrity: sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==} engines: {node: '>= 14'} + data-urls@5.0.0: + resolution: {integrity: sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==} + engines: {node: '>=18'} + date-fns@3.6.0: resolution: {integrity: sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww==} @@ -2197,6 +2243,9 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} + decimal.js@10.5.0: + resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==} + dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2510,6 +2559,10 @@ packages: resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==} engines: {node: '>= 6'} + form-data@4.0.1: + resolution: {integrity: sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==} + engines: {node: '>= 6'} + formdata-node@4.4.1: resolution: {integrity: sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==} engines: {node: '>= 12.20'} @@ -2647,6 +2700,10 @@ packages: resolution: {integrity: sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==} engines: {node: '>=12'} + html-encoding-sniffer@4.0.0: + resolution: {integrity: sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==} + engines: {node: '>=18'} + html-escaper@2.0.2: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} @@ -2686,6 +2743,10 @@ packages: resolution: {integrity: sha512-1e4Wqeblerz+tMKPIq2EMGiiWW1dIjZOksyHWSUm1rmuvw/how9hBHZ38lAGj5ID4Ik6EdkOw7NmWPy6LAwalw==} engines: {node: '>= 14'} + https-proxy-agent@7.0.6: + resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==} + engines: {node: '>= 14'} + human-signals@2.1.0: resolution: {integrity: sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==} engines: {node: '>=10.17.0'} @@ -2798,6 +2859,9 @@ packages: resolution: {integrity: sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==} engines: {node: '>=8'} + is-potential-custom-element-name@1.0.1: + resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==} + is-retry-allowed@2.2.0: resolution: {integrity: sha512-XVm7LOeLpTW4jV19QSH38vkswxoLud8sQ57YwJVTPWdiaI9I8keEhGFpBlslyVsgdQy4Opg8QOLb8YRgsyZiQg==} engines: {node: '>=10'} @@ -3012,6 +3076,15 @@ packages: jsbn@1.1.0: resolution: {integrity: sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==} + jsdom@26.0.0: + resolution: {integrity: sha512-BZYDGVAIriBWTpIxYzrXjv3E/4u8+/pSG5bQdIYCbNCGOvsPkDQfTVLAIXAf9ETdCpduCVTkDe2NNZ8NIwUVzw==} + engines: {node: '>=18'} + peerDependencies: + canvas: ^3.0.0 + peerDependenciesMeta: + canvas: + optional: true + jsesc@2.5.2: resolution: {integrity: sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==} engines: {node: '>=4'} @@ -3298,6 +3371,9 @@ packages: resolution: {integrity: sha512-CQl19J/g+Hbjbv4Y3mFNNXFEL/5t/KCg8POCuUqd4rMKjGG+j1ybER83hxV58zL+dFI1PTkt3GNFSHRt+d8qEQ==} engines: {node: 14 || >=16.14} + lru-cache@10.4.3: + resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} + lru-cache@5.1.1: resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} @@ -3588,6 +3664,9 @@ packages: resolution: {integrity: sha512-1MQz1Ed8z2yckoBeSfkQHHO9K1yDRxxtotKSJ9yvcTUUxSvfvzEq5GwBrjjHEpMlq/k5gvXdmJ1SbYxWtpNoVg==} engines: {node: '>=8'} + nwsapi@2.2.16: + resolution: {integrity: sha512-F1I/bimDpj3ncaNDhfyMWuFqmQDBwDB0Fogc2qpL3BWvkQteFD/8BzWuIRl83rq0DXfm8SGt/HFhLXZyljTXcQ==} + object-assign@4.1.1: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -3697,6 +3776,9 @@ packages: parse5@7.1.2: resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==} + parse5@7.2.1: + resolution: {integrity: sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==} + parseley@0.12.1: resolution: {integrity: sha512-e6qHKe3a9HWr0oMRVDTRhKce+bRO8VGQR3NyVwcjwrbhMmFCX9KszEV35+rn4AdilFAq9VPxP/Fe1wC9Qjd2lw==} @@ -4015,6 +4097,9 @@ packages: resolution: {integrity: sha512-s+pyvQeIKIZ0dx5iJiQk1tPLJAWln39+MI5jtM8wnyws+G5azk+dMnMX0qfbqNetKKNgcWWOdi0sfm+FbQbgdQ==} engines: {node: '>=10.0.0'} + rrweb-cssom@0.8.0: + resolution: {integrity: sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==} + rusha@0.8.14: resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==} @@ -4034,6 +4119,10 @@ packages: sax@1.4.1: resolution: {integrity: sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==} + saxes@6.0.0: + resolution: {integrity: sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==} + engines: {node: '>=v12.22.7'} + scheduler@0.23.2: resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==} @@ -4260,6 +4349,9 @@ packages: resolution: {integrity: sha512-SzRP5LQ6Ts2G5NyAa/jg16s8e3R7rfdFjizy1zeoecYWw+nGL+YA1xZvW/+iJmidBGSdLkuvdwTYEyJEb+EiUw==} engines: {node: '>=0.2.6'} + symbol-tree@3.2.4: + resolution: {integrity: sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==} + systeminformation@5.22.11: resolution: {integrity: sha512-aLws5yi4KCHTb0BVvbodQY5bY8eW4asMRDTxTW46hqw9lGjACX6TlLdJrkdoHYRB0qs+MekqEq1zG7WDnWE8Ug==} engines: {node: '>=8.0.0'} @@ -4315,6 +4407,10 @@ packages: resolution: {integrity: sha512-r0eojU4bI8MnHr8c5bNo7lJDdI2qXlWWJk6a9EAFG7vbhTjElYhBVS3/miuE0uOuoLdb8Mc/rVfsmm6eo5o9GA==} hasBin: true + tough-cookie@5.1.1: + resolution: {integrity: sha512-Ek7HndSVkp10hmHP9V4qZO1u+pn1RU5sI0Fw+jCU3lyvuMZcgqsNgc6CmJJZyByK4Vm/qotGRJlfgAX8q+4JiA==} + engines: {node: '>=16'} + tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} @@ -4322,6 +4418,10 @@ packages: resolution: {integrity: sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==} engines: {node: '>=14'} + tr46@5.0.0: + resolution: {integrity: sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==} + engines: {node: '>=18'} + triple-beam@1.4.1: resolution: {integrity: sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==} engines: {node: '>= 14.0.0'} @@ -4483,6 +4583,10 @@ packages: resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==} engines: {node: '>= 0.8'} + w3c-xmlserializer@5.0.0: + resolution: {integrity: sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==} + engines: {node: '>=18'} + walker@1.0.8: resolution: {integrity: sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==} @@ -4505,13 +4609,25 @@ packages: resolution: {integrity: sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==} engines: {node: '>=12'} + whatwg-encoding@3.1.1: + resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} + engines: {node: '>=18'} + whatwg-fetch@3.6.20: resolution: {integrity: sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==} + whatwg-mimetype@4.0.0: + resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} + engines: {node: '>=18'} + whatwg-url@13.0.0: resolution: {integrity: sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==} engines: {node: '>=16'} + whatwg-url@14.1.1: + resolution: {integrity: sha512-mDGf9diDad/giZ/Sm9Xi2YcyzaFpbdLpJPr+E9fSkyQ7KpQD4SdFcugkRQYzhmfI4KeV4Qpnn2sKPdo+kmsgRQ==} + engines: {node: '>=18'} + whatwg-url@5.0.0: resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} @@ -4583,6 +4699,10 @@ packages: utf-8-validate: optional: true + xml-name-validator@5.0.0: + resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==} + engines: {node: '>=18'} + xml2js@0.6.2: resolution: {integrity: sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==} engines: {node: '>=4.0.0'} @@ -4595,6 +4715,9 @@ packages: resolution: {integrity: sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==} engines: {node: '>=4.0'} + xmlchars@2.2.0: + resolution: {integrity: sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==} + xtend@4.0.2: resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==} engines: {node: '>=0.4'} @@ -4675,6 +4798,14 @@ snapshots: '@types/json-schema': 7.0.15 js-yaml: 4.1.0 + '@asamuzakjp/css-color@2.8.3': + dependencies: + '@csstools/css-calc': 2.1.1(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3) + '@csstools/css-color-parser': 3.0.7(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3) + '@csstools/css-parser-algorithms': 3.0.4(@csstools/css-tokenizer@3.0.3) + '@csstools/css-tokenizer': 3.0.3 + lru-cache: 10.4.3 + '@aws-crypto/crc32@3.0.0': dependencies: '@aws-crypto/util': 3.0.0 @@ -5413,6 +5544,26 @@ snapshots: dependencies: '@jridgewell/trace-mapping': 0.3.9 + '@csstools/color-helpers@5.0.1': {} + + '@csstools/css-calc@2.1.1(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)': + dependencies: + '@csstools/css-parser-algorithms': 3.0.4(@csstools/css-tokenizer@3.0.3) + '@csstools/css-tokenizer': 3.0.3 + + '@csstools/css-color-parser@3.0.7(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3)': + dependencies: + '@csstools/color-helpers': 5.0.1 + '@csstools/css-calc': 2.1.1(@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3))(@csstools/css-tokenizer@3.0.3) + '@csstools/css-parser-algorithms': 3.0.4(@csstools/css-tokenizer@3.0.3) + '@csstools/css-tokenizer': 3.0.3 + + '@csstools/css-parser-algorithms@3.0.4(@csstools/css-tokenizer@3.0.3)': + dependencies: + '@csstools/css-tokenizer': 3.0.3 + + '@csstools/css-tokenizer@3.0.3': {} + '@dabh/diagnostics@2.0.3': dependencies: colorspace: 1.1.4 @@ -5642,13 +5793,13 @@ snapshots: '@jsdevtools/ono@7.1.3': {} - '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': + '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': dependencies: ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.12 - langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4) + langsmith: 0.1.34(7lljbsleilzgkaubvlq4ipicvq) ml-distance: 4.0.1 mustache: 4.2.0 p-queue: 6.6.2 @@ -5660,9 +5811,9 @@ snapshots: - langchain - openai - '@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': + '@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) js-tiktoken: 1.0.12 openai: 4.57.0(encoding@0.1.13)(zod@3.23.8) zod: 3.23.8 @@ -5671,9 +5822,9 @@ snapshots: - encoding - langchain - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': + '@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) js-tiktoken: 1.0.12 transitivePeerDependencies: - langchain @@ -6811,6 +6962,8 @@ snapshots: transitivePeerDependencies: - supports-color + agent-base@7.1.3: {} + agentkeepalive@4.5.0: dependencies: humanize-ms: 1.2.1 @@ -7321,12 +7474,22 @@ snapshots: css-what@6.1.0: {} + cssstyle@4.2.1: + dependencies: + '@asamuzakjp/css-color': 2.8.3 + rrweb-cssom: 0.8.0 + csv-parse@5.5.6: {} data-uri-to-buffer@4.0.1: {} data-uri-to-buffer@6.0.2: {} + data-urls@5.0.0: + dependencies: + whatwg-mimetype: 4.0.0 + whatwg-url: 14.1.1 + date-fns@3.6.0: {} debug@2.6.9: @@ -7351,6 +7514,8 @@ snapshots: decamelize@4.0.0: {} + decimal.js@10.5.0: {} + dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7661,6 +7826,12 @@ snapshots: combined-stream: 1.0.8 mime-types: 2.1.35 + form-data@4.0.1: + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + mime-types: 2.1.35 + formdata-node@4.4.1: dependencies: node-domexception: 1.0.0 @@ -7795,6 +7966,10 @@ snapshots: dependencies: whatwg-encoding: 2.0.0 + html-encoding-sniffer@4.0.0: + dependencies: + whatwg-encoding: 3.1.1 + html-escaper@2.0.2: {} html-to-text@9.0.5: @@ -7875,6 +8050,13 @@ snapshots: transitivePeerDependencies: - supports-color + https-proxy-agent@7.0.6: + dependencies: + agent-base: 7.1.3 + debug: 4.3.5 + transitivePeerDependencies: + - supports-color + human-signals@2.1.0: {} humanize-ms@1.2.1: @@ -7984,6 +8166,8 @@ snapshots: is-plain-obj@2.1.0: {} + is-potential-custom-element-name@1.0.1: {} + is-retry-allowed@2.2.0: {} is-stream@2.0.1: {} @@ -8400,6 +8584,34 @@ snapshots: jsbn@1.1.0: {} + jsdom@26.0.0: + dependencies: + cssstyle: 4.2.1 + data-urls: 5.0.0 + decimal.js: 10.5.0 + form-data: 4.0.1 + html-encoding-sniffer: 4.0.0 + http-proxy-agent: 7.0.2 + https-proxy-agent: 7.0.6 + is-potential-custom-element-name: 1.0.1 + nwsapi: 2.2.16 + parse5: 7.2.1 + rrweb-cssom: 0.8.0 + saxes: 6.0.0 + symbol-tree: 3.2.4 + tough-cookie: 5.1.1 + w3c-xmlserializer: 5.0.0 + webidl-conversions: 7.0.0 + whatwg-encoding: 3.1.1 + whatwg-mimetype: 4.0.0 + whatwg-url: 14.1.1 + ws: 8.18.0 + xml-name-validator: 5.0.0 + transitivePeerDependencies: + - bufferutil + - supports-color + - utf-8-validate + jsesc@2.5.2: {} json-parse-even-better-errors@2.3.1: {} @@ -8435,17 +8647,17 @@ snapshots: kuler@2.0.0: {} - langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): + langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) - '@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + '@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) + '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) binary-extensions: 2.3.0 js-tiktoken: 1.0.12 js-yaml: 4.1.0 jsonpointer: 5.0.1 langchainhub: 0.0.11 - langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4) + langsmith: 0.1.34(7lljbsleilzgkaubvlq4ipicvq) ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -8463,6 +8675,7 @@ snapshots: handlebars: 4.7.8 html-to-text: 9.0.5 ioredis: 5.4.1 + jsdom: 26.0.0 mammoth: 1.7.2 mongodb: 6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3) pdf-parse: 1.1.1 @@ -8475,7 +8688,7 @@ snapshots: langchainhub@0.0.11: {} - langsmith@0.1.34(npkyd6f7wyl3urgrzoxaktl5a4): + langsmith@0.1.34(7lljbsleilzgkaubvlq4ipicvq): dependencies: '@types/uuid': 9.0.8 commander: 10.0.1 @@ -8484,8 +8697,8 @@ snapshots: p-retry: 4.6.2 uuid: 9.0.1 optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) - langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(jsdom@26.0.0)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) openai: 4.57.0(encoding@0.1.13)(zod@3.23.8) languagedetect@2.0.0: {} @@ -8554,6 +8767,8 @@ snapshots: lru-cache@10.3.0: {} + lru-cache@10.4.3: {} + lru-cache@5.1.1: dependencies: yallist: 3.1.1 @@ -8849,6 +9064,8 @@ snapshots: num-sort@2.1.0: {} + nwsapi@2.2.16: {} + object-assign@4.1.1: {} object-inspect@1.13.1: {} @@ -8979,6 +9196,10 @@ snapshots: dependencies: entities: 4.5.0 + parse5@7.2.1: + dependencies: + entities: 4.5.0 + parseley@0.12.1: dependencies: leac: 0.6.0 @@ -9321,6 +9542,8 @@ snapshots: robots-parser@3.0.1: {} + rrweb-cssom@0.8.0: {} + rusha@0.8.14: {} safe-buffer@5.1.2: {} @@ -9333,6 +9556,10 @@ snapshots: sax@1.4.1: {} + saxes@6.0.0: + dependencies: + xmlchars: 2.2.0 + scheduler@0.23.2: dependencies: loose-envify: 1.4.0 @@ -9583,6 +9810,8 @@ snapshots: sylvester@0.0.12: {} + symbol-tree@3.2.4: {} + systeminformation@5.22.11: {} tar-fs@3.0.5: @@ -9640,12 +9869,20 @@ snapshots: touch@3.1.1: {} + tough-cookie@5.1.1: + dependencies: + tldts: 6.1.75 + tr46@0.0.3: {} tr46@4.1.1: dependencies: punycode: 2.3.1 + tr46@5.0.0: + dependencies: + punycode: 2.3.1 + triple-beam@1.4.1: {} ts-jest@29.1.4(@babel/core@7.24.6)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.6))(jest@29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)))(typescript@5.4.5): @@ -9777,6 +10014,10 @@ snapshots: vary@1.1.2: {} + w3c-xmlserializer@5.0.0: + dependencies: + xml-name-validator: 5.0.0 + walker@1.0.8: dependencies: makeerror: 1.0.12 @@ -9793,13 +10034,24 @@ snapshots: dependencies: iconv-lite: 0.6.3 + whatwg-encoding@3.1.1: + dependencies: + iconv-lite: 0.6.3 + whatwg-fetch@3.6.20: {} + whatwg-mimetype@4.0.0: {} + whatwg-url@13.0.0: dependencies: tr46: 4.1.1 webidl-conversions: 7.0.0 + whatwg-url@14.1.1: + dependencies: + tr46: 5.0.0 + webidl-conversions: 7.0.0 + whatwg-url@5.0.0: dependencies: tr46: 0.0.3 @@ -9868,6 +10120,8 @@ snapshots: ws@8.18.0: {} + xml-name-validator@5.0.0: {} + xml2js@0.6.2: dependencies: sax: 1.4.1 @@ -9877,6 +10131,8 @@ snapshots: xmlbuilder@11.0.1: {} + xmlchars@2.2.0: {} + xtend@4.0.2: {} y18n@5.0.8: {} diff --git a/apps/api/src/__tests__/snips/batch-scrape.test.ts b/apps/api/src/__tests__/snips/batch-scrape.test.ts index 59c9da2e..6dcadae4 100644 --- a/apps/api/src/__tests__/snips/batch-scrape.test.ts +++ b/apps/api/src/__tests__/snips/batch-scrape.test.ts @@ -30,7 +30,7 @@ async function batchScrape(body: BatchScrapeRequestInput): ReturnType { - describe("JSON format", () => { it.concurrent("works", async () => { const response = await batchScrape({ - urls: ["http://firecrawl.dev"], - formats: ["json"], - jsonOptions: { - prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.", - schema: { - type: "object", - properties: { - company_mission: { - type: "string", - }, - supports_sso: { - type: "boolean", - }, - is_open_source: { - type: "boolean", + urls: ["http://firecrawl.dev"] + }); + + expect(response.body.data[0]).toHaveProperty("markdown"); + expect(response.body.data[0].markdown).toContain("Firecrawl"); + }, 30000); + + if (!process.env.TEST_SUITE_SELF_HOSTED) { + describe("JSON format", () => { + it.concurrent("works", async () => { + const response = await batchScrape({ + urls: ["http://firecrawl.dev"], + formats: ["json"], + jsonOptions: { + prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.", + schema: { + type: "object", + properties: { + company_mission: { + type: "string", + }, + supports_sso: { + type: "boolean", + }, + is_open_source: { + type: "boolean", + }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], }, }, - required: ["company_mission", "supports_sso", "is_open_source"], - }, - }, + }); + + expect(response.body.data[0]).toHaveProperty("json"); + expect(response.body.data[0].json).toHaveProperty("company_mission"); + expect(typeof response.body.data[0].json.company_mission).toBe("string"); + expect(response.body.data[0].json).toHaveProperty("supports_sso"); + expect(response.body.data[0].json.supports_sso).toBe(false); + expect(typeof response.body.data[0].json.supports_sso).toBe("boolean"); + expect(response.body.data[0].json).toHaveProperty("is_open_source"); + expect(response.body.data[0].json.is_open_source).toBe(true); + expect(typeof response.body.data[0].json.is_open_source).toBe("boolean"); + }, 30000); }); - - expect(response.body.data[0]).toHaveProperty("json"); - expect(response.body.data[0].json).toHaveProperty("company_mission"); - expect(typeof response.body.data[0].json.company_mission).toBe("string"); - expect(response.body.data[0].json).toHaveProperty("supports_sso"); - expect(response.body.data[0].json.supports_sso).toBe(false); - expect(typeof response.body.data[0].json.supports_sso).toBe("boolean"); - expect(response.body.data[0].json).toHaveProperty("is_open_source"); - expect(response.body.data[0].json.is_open_source).toBe(true); - expect(typeof response.body.data[0].json.is_open_source).toBe("boolean"); - }, 30000); - }); + } }); diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts index 6aa47732..1fb572ef 100644 --- a/apps/api/src/__tests__/snips/crawl.test.ts +++ b/apps/api/src/__tests__/snips/crawl.test.ts @@ -30,7 +30,7 @@ async function crawl(body: CrawlRequestInput): ReturnType { x = await crawlStatus(cs.body.id); expect(x.statusCode).toBe(200); expect(typeof x.body.status).toBe("string"); - } while (x.body.status !== "completed") + } while (x.body.status === "scraping"); expectCrawlToSucceed(x); return x; diff --git a/apps/api/src/__tests__/snips/extract.test.ts b/apps/api/src/__tests__/snips/extract.test.ts index 7375d85a..993a17a6 100644 --- a/apps/api/src/__tests__/snips/extract.test.ts +++ b/apps/api/src/__tests__/snips/extract.test.ts @@ -30,7 +30,7 @@ async function extract(body: ExtractRequestInput): Promise { x = await extractStatus(es.body.id); expect(x.statusCode).toBe(200); expect(typeof x.body.status).toBe("string"); - } while (x.body.status !== "completed"); + } while (x.body.status === "processing"); expectExtractToSucceed(x); return x.body; @@ -51,31 +51,37 @@ function expectExtractToSucceed(response: Awaited { - it.concurrent("works", async () => { - const res = await extract({ - urls: ["https://firecrawl.dev"], - schema: { - "type": "object", - "properties": { - "company_mission": { - "type": "string" + if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) { + it.concurrent("works", async () => { + const res = await extract({ + urls: ["https://firecrawl.dev"], + schema: { + "type": "object", + "properties": { + "company_mission": { + "type": "string" + }, + "is_open_source": { + "type": "boolean" + } }, - "is_open_source": { - "type": "boolean" - } + "required": [ + "company_mission", + "is_open_source" + ] }, - "required": [ - "company_mission", - "is_open_source" - ] - }, - origin: "api-sdk", - }); + origin: "api-sdk", + }); - expect(res.data).toHaveProperty("company_mission"); - expect(typeof res.data.company_mission).toBe("string") - expect(res.data).toHaveProperty("is_open_source"); - expect(typeof res.data.is_open_source).toBe("boolean"); - expect(res.data.is_open_source).toBe(true); - }, 60000); + expect(res.data).toHaveProperty("company_mission"); + expect(typeof res.data.company_mission).toBe("string") + expect(res.data).toHaveProperty("is_open_source"); + expect(typeof res.data.is_open_source).toBe("boolean"); + expect(res.data.is_open_source).toBe(true); + }, 60000); + } else { + it.concurrent("dummy test", () => { + expect(true).toBe(true); + }); + } }); diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 2ab5df9e..31ce019e 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -41,41 +41,13 @@ describe("Scrape tests", () => { ); }, 10000); - describe("Ad blocking (f-e dependant)", () => { - it.concurrent("blocks ads by default", async () => { - const response = await scrape({ - url: "https://canyoublockit.com/testing/", - }); + it("works", async () => { + const response = await scrape({ + url: "http://firecrawl.dev" + }); - expect(response.markdown).not.toContain(".g.doubleclick.net/"); - }, 10000); - - it.concurrent("doesn't block ads if explicitly disabled", async () => { - const response = await scrape({ - url: "https://canyoublockit.com/testing/", - blockAds: false, - }); - - expect(response.markdown).toContain(".g.doubleclick.net/"); - }, 10000); - }); - - describe("Location API (f-e dependant)", () => { - it.concurrent("works without specifying an explicit location", async () => { - const response = await scrape({ - url: "https://iplocation.com", - }); - }, 10000); - - it.concurrent("works with country US", async () => { - const response = await scrape({ - url: "https://iplocation.com", - location: { country: "US" }, - }); - - expect(response.markdown).toContain("| Country | United States |"); - }, 10000); - }); + expect(response.markdown).toContain("Firecrawl"); + }, 10000); describe("JSON scrape support", () => { it.concurrent("returns parseable JSON", async () => { @@ -89,82 +61,122 @@ describe("Scrape tests", () => { }, 25000); // TODO: mock and shorten }); - describe("Screenshot", () => { - it.concurrent("screenshot format works", async () => { - const response = await scrape({ - url: "http://firecrawl.dev", - formats: ["screenshot"] - }); - - expect(typeof response.screenshot).toBe("string"); - }, 15000); + if (!process.env.TEST_SUITE_SELF_HOSTED) { + describe("Ad blocking (f-e dependant)", () => { + it.concurrent("blocks ads by default", async () => { + const response = await scrape({ + url: "https://canyoublockit.com/testing/", + }); - it.concurrent("screenshot@fullPage format works", async () => { - const response = await scrape({ - url: "http://firecrawl.dev", - formats: ["screenshot@fullPage"] - }); - - expect(typeof response.screenshot).toBe("string"); - }, 15000); - }); + expect(response.markdown).not.toContain(".g.doubleclick.net/"); + }, 10000); - describe("JSON format", () => { - it.concurrent("works", async () => { - const response = await scrape({ - url: "http://firecrawl.dev", - formats: ["json"], - jsonOptions: { - prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.", - schema: { - type: "object", - properties: { - company_mission: { - type: "string", - }, - supports_sso: { - type: "boolean", - }, - is_open_source: { - type: "boolean", + it.concurrent("doesn't block ads if explicitly disabled", async () => { + const response = await scrape({ + url: "https://canyoublockit.com/testing/", + blockAds: false, + }); + + expect(response.markdown).toContain(".g.doubleclick.net/"); + }, 10000); + }); + + describe("Location API (f-e dependant)", () => { + it.concurrent("works without specifying an explicit location", async () => { + const response = await scrape({ + url: "https://iplocation.com", + }); + }, 10000); + + it.concurrent("works with country US", async () => { + const response = await scrape({ + url: "https://iplocation.com", + location: { country: "US" }, + }); + + expect(response.markdown).toContain("| Country | United States |"); + }, 10000); + }); + + describe("Screenshot (f-e/sb dependant)", () => { + it.concurrent("screenshot format works", async () => { + const response = await scrape({ + url: "http://firecrawl.dev", + formats: ["screenshot"] + }); + + expect(typeof response.screenshot).toBe("string"); + }, 30000); + + it.concurrent("screenshot@fullPage format works", async () => { + const response = await scrape({ + url: "http://firecrawl.dev", + formats: ["screenshot@fullPage"] + }); + + expect(typeof response.screenshot).toBe("string"); + }, 30000); + }); + + describe("Proxy API (f-e dependant)", () => { + it.concurrent("undefined works", async () => { + await scrape({ + url: "http://firecrawl.dev", + }); + }, 15000); + + it.concurrent("basic works", async () => { + await scrape({ + url: "http://firecrawl.dev", + proxy: "basic", + }); + }, 15000); + + it.concurrent("stealth works", async () => { + await scrape({ + url: "http://firecrawl.dev", + proxy: "stealth", + }); + }, 15000); + }); + } + + if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) { + describe("JSON format", () => { + it.concurrent("works", async () => { + const response = await scrape({ + url: "http://firecrawl.dev", + formats: ["json"], + jsonOptions: { + prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.", + schema: { + type: "object", + properties: { + company_mission: { + type: "string", + }, + supports_sso: { + type: "boolean", + }, + is_open_source: { + type: "boolean", + }, }, + required: ["company_mission", "supports_sso", "is_open_source"], }, - required: ["company_mission", "supports_sso", "is_open_source"], }, - }, - }); - - expect(response).toHaveProperty("json"); - expect(response.json).toHaveProperty("company_mission"); - expect(typeof response.json.company_mission).toBe("string"); - expect(response.json).toHaveProperty("supports_sso"); - expect(response.json.supports_sso).toBe(false); - expect(typeof response.json.supports_sso).toBe("boolean"); - expect(response.json).toHaveProperty("is_open_source"); - expect(response.json.is_open_source).toBe(true); - expect(typeof response.json.is_open_source).toBe("boolean"); - }, 30000); - }); - - describe("Proxy API (f-e dependant)", () => { - it.concurrent("undefined works", async () => { - await scrape({ - url: "http://firecrawl.dev", - }); - }, 15000); - - it.concurrent("basic works", async () => { - await scrape({ - url: "http://firecrawl.dev", - proxy: "basic", - }); - }, 15000); - - it.concurrent("stealth works", async () => { - await scrape({ - url: "http://firecrawl.dev", - proxy: "stealth", - }); - }, 15000); - }); + }); + + expect(response).toHaveProperty("json"); + expect(response.json).toHaveProperty("company_mission"); + expect(typeof response.json.company_mission).toBe("string"); + expect(response.json).toHaveProperty("supports_sso"); + expect(response.json.supports_sso).toBe(false); + expect(typeof response.json.supports_sso).toBe("boolean"); + expect(response.json).toHaveProperty("is_open_source"); + expect(response.json.is_open_source).toBe(true); + expect(typeof response.json.is_open_source).toBe("boolean"); + }, 30000); + }); + } }); diff --git a/apps/api/src/__tests__/snips/search.test.ts b/apps/api/src/__tests__/snips/search.test.ts index 5cb5323d..1e191c57 100644 --- a/apps/api/src/__tests__/snips/search.test.ts +++ b/apps/api/src/__tests__/snips/search.test.ts @@ -27,8 +27,8 @@ async function search(body: SearchRequestInput): Promise { return raw.body.data; } -describe("Scrape tests", () => { - it("works", async () => { +describe("Search tests", () => { + it.concurrent("works", async () => { await search({ query: "firecrawl" }); diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 8d0ea1b7..9a5ecaad 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -13,13 +13,13 @@ import { getDoneJobsOrderedLength, isCrawlKickoffFinished, } from "../../lib/crawl-redis"; -import { getScrapeQueue } from "../../services/queue-service"; +import { getScrapeQueue, QueueFunction } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById, } from "../../lib/supabase-jobs"; import { configDotenv } from "dotenv"; -import type { Job, JobState } from "bullmq"; +import type { Job, JobState, Queue } from "bullmq"; import { logger } from "../../lib/logger"; import { supabase_service } from "../../services/supabase"; import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit"; @@ -245,7 +245,7 @@ export async function crawlStatusController( let totalCount = jobIDs.length; - if (totalCount === 0) { + if (totalCount === 0 && process.env.USE_DB_AUTHENTICATION === "true") { const x = await supabase_service .from('firecrawl_jobs') .select('*', { count: 'exact', head: true }) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index 81477e7c..c83ab506 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -1,7 +1,34 @@ import { Response } from "express"; -import { supabaseGetJobsById } from "../../lib/supabase-jobs"; import { RequestWithAuth } from "./types"; import { getExtract, getExtractExpiry } from "../../lib/extract/extract-redis"; +import { DBJob, PseudoJob } from "./crawl-status"; +import { getExtractQueue } from "../../services/queue-service"; +import { ExtractResult } from "../../lib/extract/extraction-service"; +import { supabaseGetJobById } from "../../lib/supabase-jobs"; + +export async function getExtractJob(id: string): Promise | null> { + const [bullJob, dbJob] = await Promise.all([ + getExtractQueue().getJob(id), + (process.env.USE_DB_AUTHENTICATION === "true" ? supabaseGetJobById(id) : null) as Promise, + ]); + + if (!bullJob && !dbJob) return null; + + const data = dbJob?.docs ?? bullJob?.returnvalue?.data; + + const job: PseudoJob = { + id, + getState: bullJob ? bullJob.getState : (() => dbJob!.success ? "completed" : "failed"), + returnvalue: data, + data: { + scrapeOptions: bullJob ? bullJob.data.scrapeOptions : dbJob!.page_options, + }, + timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(), + failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined, + } + + return job; +} export async function extractStatusController( req: RequestWithAuth<{ jobId: string }, any, any>, @@ -16,24 +43,29 @@ export async function extractStatusController( }); } - let data: any[] = []; + let data: ExtractResult | [] = []; if (extract.status === "completed") { - const jobData = await supabaseGetJobsById([req.params.jobId]); - if (!jobData || jobData.length === 0) { + const jobData = await getExtractJob(req.params.jobId); + if (!jobData) { return res.status(404).json({ success: false, error: "Job not found", }); } - data = jobData[0].docs; + if (!jobData.returnvalue) { + // if we got in the split-second where the redis is updated but the bull isn't + // just pretend it's still processing - MG + extract.status = "processing"; + } else { + data = jobData.returnvalue ?? []; + } } - // console.log(extract.sources); return res.status(200).json({ success: extract.status === "failed" ? false : true, - data: data, + data, status: extract.status, error: extract?.error ?? undefined, expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(), diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts index 1ed7decf..5b54f3f8 100644 --- a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -7,7 +7,6 @@ import { } from "../build-prompts"; import OpenAI from "openai"; import { logger } from "../../../lib/logger"; -const openai = new OpenAI(); export async function analyzeSchemaAndPrompt( urls: string[], @@ -40,6 +39,7 @@ export async function analyzeSchemaAndPrompt( const model = "gpt-4o"; + const openai = new OpenAI(); const result = await openai.beta.chat.completions.parse({ model: model, messages: [ diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index ab4b1aeb..ff43dedd 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -48,7 +48,7 @@ interface ExtractServiceOptions { cacheKey?: string; } -interface ExtractResult { +export interface ExtractResult { success: boolean; data?: any; extractId: string; diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts index df4c7686..7e5ddf41 100644 --- a/apps/api/src/lib/extract/index/pinecone.ts +++ b/apps/api/src/lib/extract/index/pinecone.ts @@ -3,10 +3,6 @@ import { Document } from "../../../controllers/v1/types"; import { logger } from "../../logger"; import OpenAI from "openai"; -const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, -}); - const pinecone = new Pinecone({ apiKey: process.env.PINECONE_API_KEY!, }); @@ -27,6 +23,10 @@ export interface PageMetadata { } async function getEmbedding(text: string) { + const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }); + const embedding = await openai.embeddings.create({ model: "text-embedding-3-small", input: text, diff --git a/apps/api/src/lib/llm/generate.ts b/apps/api/src/lib/llm/generate.ts index 5249dadd..6edbda9a 100644 --- a/apps/api/src/lib/llm/generate.ts +++ b/apps/api/src/lib/llm/generate.ts @@ -1,9 +1,5 @@ import OpenAI from "openai"; -const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, -}); - interface Message { role: "system" | "user" | "assistant"; content: string; @@ -19,6 +15,10 @@ interface GenerateTextOptions { export async function generateText(options: GenerateTextOptions) { const { model, messages, temperature = 0.7, maxTokens } = options; + const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }); + const completion = await openai.chat.completions.create({ model, messages, diff --git a/apps/api/src/lib/ranker.ts b/apps/api/src/lib/ranker.ts index 16e49764..63cd6c20 100644 --- a/apps/api/src/lib/ranker.ts +++ b/apps/api/src/lib/ranker.ts @@ -1,14 +1,13 @@ -import axios from "axios"; import { configDotenv } from "dotenv"; import OpenAI from "openai"; configDotenv(); -const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, -}); - async function getEmbedding(text: string) { + const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }); + const embedding = await openai.embeddings.create({ model: "text-embedding-3-small", input: text, diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 58fc5b3e..2bf9c5c5 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -10,6 +10,7 @@ import { UnsupportedFileError, } from "../../error"; import { MockState } from "../../lib/mock"; +import { fireEngineURL } from "./scrape"; const successSchema = z.object({ jobId: z.string(), @@ -85,8 +86,6 @@ export async function fireEngineCheckStatus( jobId: string, mock: MockState | null, ): Promise { - const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; - const status = await Sentry.startSpan( { name: "fire-engine: Check status", diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts index d20df42e..d046b738 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts @@ -3,14 +3,13 @@ import * as Sentry from "@sentry/node"; import { robustFetch } from "../../lib/fetch"; import { MockState } from "../../lib/mock"; +import { fireEngineURL } from "./scrape"; export async function fireEngineDelete( logger: Logger, jobId: string, mock: MockState | null, ) { - const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; - await Sentry.startSpan( { name: "fire-engine: Delete scrape", diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index 607c0c8d..1b63dc75 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -65,6 +65,8 @@ const schema = z.object({ processing: z.boolean(), }); +export const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL ?? ""; + export async function fireEngineScrape< Engine extends | FireEngineScrapeRequestChromeCDP @@ -75,10 +77,6 @@ export async function fireEngineScrape< request: FireEngineScrapeRequestCommon & Engine, mock: MockState | null, ): Promise> { - const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; - - // TODO: retries - const scrapeRequest = await Sentry.startSpan( { name: "fire-engine: Scrape", diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index f32708c0..0a688fc6 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -310,7 +310,12 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { - const _engines = [...engines]; + const _engines: Engine[] = [ + ...engines, + + // enable fire-engine in self-hosted testing environment when mocks are supplied + ...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"] as Engine[] : []) + ]; if (meta.internalOptions.useCache !== true) { const cacheIndex = _engines.indexOf("cache"); diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index c9513bfe..c0fc1702 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -2,6 +2,7 @@ import { Logger } from "winston"; import { z, ZodError } from "zod"; import * as Sentry from "@sentry/node"; import { MockState, saveMock } from "./mock"; +import { fireEngineURL } from "../engines/fire-engine/scrape"; export type RobustFetchParams> = { url: string; @@ -126,14 +127,13 @@ export async function robustFetch< const makeRequestTypeId = ( request: (typeof mock)["requests"][number]["options"], ) => { - let trueUrl = (process.env.FIRE_ENGINE_BETA_URL && request.url.startsWith(process.env.FIRE_ENGINE_BETA_URL)) - ? request.url.replace(process.env.FIRE_ENGINE_BETA_URL, "") + let trueUrl = request.url.startsWith(fireEngineURL) + ? request.url.replace(fireEngineURL, "") : request.url; let out = trueUrl + ";" + request.method; if ( - process.env.FIRE_ENGINE_BETA_URL && - (trueUrl.startsWith("")) && + trueUrl.startsWith("") && request.method === "POST" ) { out += "f-e;" + request.body?.engine + ";" + request.body?.url; diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index 26277523..8596666c 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -29,9 +29,7 @@ export async function fireEngineMap( }); if (!process.env.FIRE_ENGINE_BETA_URL) { - console.warn( - "(v1/map Beta) Results might differ from cloud offering currently.", - ); + logger.warn("(v1/map Beta) Results might differ from cloud offering currently."); return []; } diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 07719ae5..72ab97d7 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -1,21 +1,18 @@ import axios from "axios"; -import * as cheerio from "cheerio"; // TODO: rustify +import { JSDOM } from 'jsdom'; import * as querystring from "querystring"; import { SearchResult } from "../../src/lib/entities"; import { logger } from "../../src/lib/logger"; +import https from 'https'; -const _useragent_list = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", -]; +const getRandomInt = (min: number, max: number): number => Math.floor(Math.random() * (max - min + 1)) + min; -function get_useragent(): string { - return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; +export function get_useragent(): string { + const lynx_version = `Lynx/${getRandomInt(2, 3)}.${getRandomInt(8, 9)}.${getRandomInt(0, 2)}`; + const libwww_version = `libwww-FM/${getRandomInt(2, 3)}.${getRandomInt(13, 15)}`; + const ssl_mm_version = `SSL-MM/${getRandomInt(1, 2)}.${getRandomInt(3, 5)}`; + const openssl_version = `OpenSSL/${getRandomInt(1, 3)}.${getRandomInt(0, 4)}.${getRandomInt(0, 9)}`; + return `${lynx_version} ${libwww_version} ${ssl_mm_version} ${openssl_version}`; } async function _req( @@ -31,9 +28,10 @@ async function _req( ) { const params = { q: term, - num: results, // Number of results to return + num: results+2, // Number of results to return hl: lang, gl: country, + safe: "active", start: start, }; if (tbs) { @@ -42,18 +40,25 @@ async function _req( if (filter) { params["filter"] = filter; } + var agent = get_useragent(); try { const resp = await axios.get("https://www.google.com/search", { headers: { - "User-Agent": get_useragent(), + "User-Agent": agent, + "Accept": "*/*" }, params: params, proxy: proxies, timeout: timeout, + httpsAgent: new https.Agent({ + rejectUnauthorized: true + }), + withCredentials: true }); return resp; } catch (error) { if (error.response && error.response.status === 429) { + logger.warn("Google Search: Too many requests, try again later.", error.response); throw new Error("Google Search: Too many requests, try again later."); } throw error; @@ -100,34 +105,42 @@ export async function googleSearch( tbs, filter, ); - const $ = cheerio.load(resp.data); - const result_block = $("div.g"); + const dom = new JSDOM(resp.data); + const document = dom.window.document; + const result_block = document.querySelectorAll("div.ezO2md"); + let new_results = 0; + let unique = true; + let fetched_results = 0; + + const fetched_links = new Set(); if (result_block.length === 0) { start += 1; attempts += 1; } else { - attempts = 0; // Reset attempts if we have results + attempts = 0; } - result_block.each((index, element) => { - const linkElement = $(element).find("a"); - const link = - linkElement && linkElement.attr("href") - ? linkElement.attr("href") - : null; - const title = $(element).find("h3"); - const ogImage = $(element).find("img").eq(1).attr("src"); - const description_box = $(element).find( - "div[style='-webkit-line-clamp:2']", - ); - const answerBox = $(element).find(".mod").text(); - if (description_box) { - const description = description_box.text(); - if (link && title && description) { - start += 1; - results.push(new SearchResult(link, title.text(), description)); + + for (const result of result_block) { + const link_tag = result.querySelector("a[href]") as HTMLAnchorElement; + const title_tag = link_tag ? link_tag.querySelector("span.CVA68e") : null; + const description_tag = result.querySelector("span.FrIlee"); + + if (link_tag && title_tag && description_tag) { + const link = decodeURIComponent(link_tag.href.split("&")[0].replace("/url?q=", "")); + if (fetched_links.has(link) && unique) continue; + fetched_links.add(link); + const title = title_tag.textContent || ""; + const description = description_tag.textContent || ""; + fetched_results++; + new_results++; + if (link && title && description) { + start += 1 + results.push(new SearchResult(link, title, description)); + } + if (fetched_results >= num_results) break; } - } - }); + } + await new Promise((resolve) => setTimeout(resolve, sleep_interval * 1000), ); diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index d4e6ce9d..e85ee384 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -64,7 +64,7 @@ export async function search({ timeout, ); } catch (error) { - logger.error(`Error in search function: ${error}`); + logger.error(`Error in search function`, { error }); return []; } } diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 9969c4b2..b910fa4b 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -2,6 +2,8 @@ import { Queue } from "bullmq"; import { logger } from "../lib/logger"; import IORedis from "ioredis"; +export type QueueFunction = () => Queue; + let scrapeQueue: Queue; let extractQueue: Queue; let loggingQueue: Queue;