diff --git a/.gitignore b/.gitignore index 537c4cad..478ca945 100644 --- a/.gitignore +++ b/.gitignore @@ -34,5 +34,7 @@ apps/js-sdk/firecrawl/dist /examples/sonnet_web_crawler/firecrawl_env /examples/internal_link_assitant/firecrawl_env +/apps/api/logs/* +/apps/api/debug/* -.vscode \ No newline at end of file +.vscode diff --git a/apps/api/package.json b/apps/api/package.json index 00c1bc0e..670dfc7a 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,6 +58,7 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", + "@pinecone-database/pinecone": "^4.0.0", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 17532d25..082a200f 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -10,7 +10,7 @@ importers: dependencies: '@anthropic-ai/sdk': specifier: ^0.24.3 - version: 0.24.3 + version: 0.24.3(encoding@0.1.13) '@brillout/import': specifier: ^0.2.2 version: 0.2.3 @@ -29,9 +29,12 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 + '@pinecone-database/pinecone': + specifier: ^4.0.0 + version: 4.0.0 '@sentry/cli': specifier: ^2.33.1 - version: 2.33.1 + version: 2.33.1(encoding@0.1.13) '@sentry/node': specifier: ^8.26.0 version: 8.26.0 @@ -79,7 +82,7 @@ importers: version: 1.1.1 cohere-ai: specifier: ^7.14.0 - version: 7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)) + version: 7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(encoding@0.1.13) cors: specifier: ^2.8.5 version: 2.8.5 @@ -130,13 +133,13 @@ importers: version: 2.9.0 langchain: specifier: ^0.2.8 - version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 logsnag: specifier: ^1.0.0 - version: 1.0.0 + version: 1.0.0(encoding@0.1.13) luxon: specifier: ^3.4.3 version: 3.4.4 @@ -157,7 +160,7 @@ importers: version: 7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3) openai: specifier: ^4.57.0 - version: 4.57.0(zod@3.23.8) + version: 4.57.0(encoding@0.1.13)(zod@3.23.8) pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -275,7 +278,7 @@ importers: version: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) jest-fetch-mock: specifier: ^3.0.3 - version: 3.0.3 + version: 3.0.3(encoding@0.1.13) 
mammoth: specifier: ^1.7.2 version: 1.7.2 @@ -1006,6 +1009,10 @@ packages: '@pdf-lib/upng@1.0.1': resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} + '@pinecone-database/pinecone@4.0.0': + resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==} + engines: {node: '>=18.0.0'} + '@pkgjs/parseargs@0.11.0': resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -2279,6 +2286,9 @@ packages: resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==} engines: {node: '>= 0.8'} + encoding@0.1.13: + resolution: {integrity: sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==} + end-of-stream@1.4.4: resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} @@ -4599,7 +4609,7 @@ snapshots: '@jridgewell/gen-mapping': 0.3.5 '@jridgewell/trace-mapping': 0.3.25 - '@anthropic-ai/sdk@0.24.3': + '@anthropic-ai/sdk@0.24.3(encoding@0.1.13)': dependencies: '@types/node': 18.19.39 '@types/node-fetch': 2.6.11 @@ -4607,7 +4617,7 @@ snapshots: agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) web-streams-polyfill: 3.3.3 transitivePeerDependencies: - encoding @@ -5577,13 +5587,13 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.4.15 - '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': + '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': dependencies: ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.12 - langsmith: 0.1.34(zyeavx4tfqw3smbbpiinhfxxeu) + langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4) ml-distance: 4.0.1 mustache: 4.2.0 p-queue: 6.6.2 @@ -5595,20 +5605,20 @@ snapshots: - langchain - openai - 
'@langchain/openai@0.2.1(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': + '@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) js-tiktoken: 1.0.12 - openai: 4.57.0(zod@3.23.8) + openai: 4.57.0(encoding@0.1.13)(zod@3.23.8) zod: 3.23.8 zod-to-json-schema: 3.23.1(zod@3.23.8) transitivePeerDependencies: - encoding - langchain - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': + 
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) js-tiktoken: 1.0.12 transitivePeerDependencies: - langchain @@ -5866,6 +5876,10 @@ snapshots: dependencies: pako: 1.0.11 + '@pinecone-database/pinecone@4.0.0': + dependencies: + encoding: 0.1.13 + '@pkgjs/parseargs@0.11.0': optional: true @@ -5950,10 +5964,10 @@ snapshots: '@sentry/cli-win32-x64@2.33.1': optional: true - '@sentry/cli@2.33.1': + '@sentry/cli@2.33.1(encoding@0.1.13)': dependencies: https-proxy-agent: 5.0.1 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) progress: 2.0.3 proxy-from-env: 1.1.0 which: 2.0.2 @@ -7088,7 +7102,7 @@ snapshots: co@4.6.0: {} - cohere-ai@7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)): + cohere-ai@7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(encoding@0.1.13): dependencies: '@aws-sdk/client-sagemaker': 3.679.0 '@aws-sdk/credential-providers': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)) @@ -7098,7 +7112,7 @@ snapshots: form-data-encoder: 4.0.2 formdata-node: 6.0.3 js-base64: 3.7.2 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) qs: 6.11.2 readable-stream: 4.5.2 url-join: 4.0.1 @@ -7208,9 +7222,9 @@ snapshots: dependencies: luxon: 3.4.4 - cross-fetch@3.1.8: + cross-fetch@3.1.8(encoding@0.1.13): dependencies: - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) transitivePeerDependencies: - encoding @@ -7365,6 +7379,10 @@ snapshots: encodeurl@1.0.2: {} + encoding@0.1.13: + dependencies: + iconv-lite: 0.6.3 + end-of-stream@1.4.4: dependencies: once: 1.4.0 @@ -7899,9 +7917,9 @@ snapshots: isexe@2.0.0: {} - 
isomorphic-fetch@3.0.0: + isomorphic-fetch@3.0.0(encoding@0.1.13): dependencies: - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) whatwg-fetch: 3.6.20 transitivePeerDependencies: - encoding @@ -8070,9 +8088,9 @@ snapshots: jest-mock: 29.7.0 jest-util: 29.7.0 - jest-fetch-mock@3.0.3: + jest-fetch-mock@3.0.3(encoding@0.1.13): dependencies: - cross-fetch: 3.1.8 + cross-fetch: 3.1.8(encoding@0.1.13) promise-polyfill: 8.3.0 transitivePeerDependencies: - encoding @@ -8342,17 +8360,17 @@ snapshots: kuler@2.0.0: {} - langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): + langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) - '@langchain/openai': 0.2.1(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/core': 
0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + '@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) + '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) binary-extensions: 2.3.0 js-tiktoken: 1.0.12 js-yaml: 4.1.0 jsonpointer: 5.0.1 langchainhub: 0.0.11 - langsmith: 0.1.34(zyeavx4tfqw3smbbpiinhfxxeu) + langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4) ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -8362,6 +8380,7 @@ snapshots: zod-to-json-schema: 3.23.1(zod@3.23.8) optionalDependencies: '@aws-sdk/credential-provider-node': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0) + '@pinecone-database/pinecone': 4.0.0 '@supabase/supabase-js': 2.44.2 axios: 1.7.2 cheerio: 1.0.0-rc.12 @@ -8381,7 +8400,7 @@ snapshots: langchainhub@0.0.11: {} - langsmith@0.1.34(zyeavx4tfqw3smbbpiinhfxxeu): + langsmith@0.1.34(npkyd6f7wyl3urgrzoxaktl5a4): dependencies: '@types/uuid': 9.0.8 commander: 10.0.1 @@ -8390,9 +8409,9 @@ snapshots: p-retry: 4.6.2 uuid: 9.0.1 optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) - langchain: 
0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) - openai: 4.57.0(zod@3.23.8) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + openai: 4.57.0(encoding@0.1.13)(zod@3.23.8) languagedetect@2.0.0: {} @@ -8442,9 +8461,9 @@ snapshots: loglevel@1.9.1: {} - logsnag@1.0.0: + logsnag@1.0.0(encoding@0.1.13): dependencies: - isomorphic-fetch: 3.0.0 + isomorphic-fetch: 3.0.0(encoding@0.1.13) transitivePeerDependencies: - encoding @@ -8703,9 +8722,11 @@ snapshots: node-ensure@0.0.0: {} - node-fetch@2.7.0: + node-fetch@2.7.0(encoding@0.1.13): dependencies: whatwg-url: 5.0.0 + optionalDependencies: + encoding: 0.1.13 node-fetch@3.3.2: dependencies: @@ -8780,7 +8801,7 @@ snapshots: transitivePeerDependencies: - debug - openai@4.57.0(zod@3.23.8): + openai@4.57.0(encoding@0.1.13)(zod@3.23.8): dependencies: '@types/node': 18.19.39 '@types/node-fetch': 2.6.11 @@ -8789,7 +8810,7 @@ snapshots: agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) qs: 6.12.2 optionalDependencies: zod: 3.23.8 diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index adc080f2..20214d72 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -254,3 +254,4 @@ logger.info(`Worker ${process.pid} started`); // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); +// diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts index cbab4e05..ff91fa88 100644 --- a/apps/api/src/lib/cache.ts +++ b/apps/api/src/lib/cache.ts @@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) { if (!cacheRedis) return; try { - await cacheRedis.set(key, JSON.stringify(entry)); + await 
cacheRedis.set(key, JSON.stringify(entry), "EX", 3600); // 1 hour in seconds } catch (error) { logger.warn("Failed to save to cache", { key, error }); } diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts new file mode 100644 index 00000000..cbb33f8b --- /dev/null +++ b/apps/api/src/lib/canonical-url.ts @@ -0,0 +1,7 @@ +export function normalizeUrl(url: string) { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; +} \ No newline at end of file diff --git a/apps/api/src/lib/extract/config.ts b/apps/api/src/lib/extract/config.ts new file mode 100644 index 00000000..f8333b3c --- /dev/null +++ b/apps/api/src/lib/extract/config.ts @@ -0,0 +1,7 @@ +export const extractConfig = { + MAX_INITIAL_RANKING_LIMIT: 1000, + MAX_RANKING_LIMIT: 20, + INITIAL_SCORE_THRESHOLD: 0.75, + FALLBACK_SCORE_THRESHOLD: 0.5, + MIN_REQUIRED_LINKS: 1, +}; \ No newline at end of file diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 04194b0b..91d515df 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -14,10 +14,13 @@ interface ScrapeDocumentOptions { timeout: number; } -export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> { +export async function scrapeDocument( + options: ScrapeDocumentOptions, + urlTraces: URLTrace[], +): Promise<Document | null> { const trace = urlTraces.find((t) => t.url === options.url); if (trace) { - trace.status = 'scraped'; + trace.status = "scraped"; trace.timing.scrapedAt = new Date().toISOString(); } @@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: mode: "single_urls", team_id: options.teamId, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { + useCache: true, + }, plan: options.plan, origin: options.origin, is_scrape: true, @@ -61,9 +66,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: } catch (error) { logger.error(`Error in scrapeDocument: ${error}`); if (trace) { - trace.status = 'error'; + trace.status = "error"; trace.error = error.message; } return null; } -} \ No newline at end of file +}
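Context for the two cache changes above: saveEntryToCache now passes ioredis's "EX", 3600 arguments so entries expire after an hour instead of living forever, and extract scrapes opt into cache reads via internalOptions.useCache, which scrapeDocument sets. A minimal read-through sketch of how the two halves fit together; the CacheEntry shape and the readThroughCache helper are illustrative assumptions, not code from this diff:

import Redis from "ioredis";

// Hypothetical entry shape, for illustration only.
type CacheEntry = { url: string; html: string; statusCode: number };

const cacheRedis = process.env.CACHE_REDIS_URL
  ? new Redis(process.env.CACHE_REDIS_URL)
  : null;

// Assumed helper: return a cached scrape if present, otherwise scrape and
// store the result with the same one-hour TTL that saveEntryToCache uses.
export async function readThroughCache(
  key: string,
  scrape: () => Promise<CacheEntry>,
): Promise<CacheEntry> {
  if (cacheRedis) {
    const hit = await cacheRedis.get(key); // string | null
    if (hit !== null) return JSON.parse(hit) as CacheEntry;
  }
  const entry = await scrape();
  // "EX", 3600 sets a one-hour expiry so stale pages age out of Redis.
  await cacheRedis?.set(key, JSON.stringify(entry), "EX", 3600);
  return entry;
}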
diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index f84a1f34..2791df3c 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/ import { buildDocument } from "./build-document"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; +import { _addScrapeJobToBullMQ } from "../../services/queue-jobs"; +import { saveCrawl, StoredCrawl } from "../crawl-redis"; interface ExtractServiceOptions { request: ExtractRequest; @@ -24,6 +26,18 @@ interface ExtractResult { error?: string; } +function getRootDomain(url: string): string { + try { + if(url.endsWith("/*")) { + url = url.slice(0, -2); + } + const urlObj = new URL(url); + return `${urlObj.protocol}//${urlObj.hostname}`; + } catch (e) { + return url; + } +} + export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> { const { request, teamId, plan, subId } = options; const scrapeId = crypto.randomUUID(); @@ -87,7 +101,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise mode: "llm", systemPrompt: (request.systemPrompt ? `${request.systemPrompt}\n` : "") + - "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: request.prompt, schema: request.schema, @@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise }); } + // Kickoff background crawl for indexing root domains + // const rootDomains = new Set(request.urls.map(getRootDomain)); + // rootDomains.forEach(async url => { + // const crawlId = crypto.randomUUID(); + + // // Create and save crawl configuration first + // const sc: StoredCrawl = { + // originUrl: url, + // crawlerOptions: { + // maxDepth: 15, + // limit: 5000, + // includePaths: [], + // excludePaths: [], + // ignoreSitemap: false, + // includeSubdomains: true, + // allowExternalLinks: false, + // allowBackwardLinks: true + // }, + // scrapeOptions: { + // formats: ["markdown"], + // onlyMainContent: true, + // waitFor: 0, + // mobile: false, + // removeBase64Images: true, + // fastMode: false, + // parsePDF: true, + // skipTlsVerification: false, + // }, + // internalOptions: { + // disableSmartWaitCache: true, + // isBackgroundIndex: true + // }, + // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + // createdAt: Date.now(), + // plan: "hobby", // make it a low concurrency + // }; + + // // Save the crawl configuration + // await saveCrawl(crawlId, sc); + + // // Then kick off the job + // await _addScrapeJobToBullMQ({ + // url, + // mode: "kickoff" as const, + // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + // plan: "hobby", // make it a low concurrency + // crawlerOptions: sc.crawlerOptions, + // scrapeOptions: sc.scrapeOptions, + // internalOptions: sc.internalOptions, + // origin: "index", + // crawl_id: crawlId, + // webhook: null, + // v1: true, + // }, {}, crypto.randomUUID(), 50); + // }); + // Bill team for usage billTeam(teamId, subId, links.length * 5).catch((error) => { logger.error( diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts new file mode 100644 index 00000000..14c3bea4 --- /dev/null +++ b/apps/api/src/lib/extract/index/pinecone.ts @@ -0,0 +1,158 @@ +import { Pinecone } from '@pinecone-database/pinecone'; +import { Document } from '../../../controllers/v1/types'; +import { logger } from '../../logger'; +import OpenAI from "openai"; + +const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, +}); + +const pinecone = new Pinecone({ + apiKey: process.env.PINECONE_API_KEY!, +}); + +const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? 
""; + +const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes + +export interface PageMetadata { + url: string; + originUrl: string; + title?: string; + description?: string; + crawlId?: string; + teamId?: string; + timestamp: number; + markdown?: string; +} + +async function getEmbedding(text: string) { + const embedding = await openai.embeddings.create({ + model: "text-embedding-3-small", + input: text, + encoding_format: "float", + }); + + return embedding.data[0].embedding; +} + +function normalizeUrl(url: string) { + const urlO = new URL(url); + if (!urlO.hostname.startsWith("www.")) { + urlO.hostname = "www." + urlO.hostname; + } + return urlO.href; +} + +export async function indexPage({ + document, + originUrl, + crawlId, + teamId +}: { + document: Document; + originUrl: string; + crawlId?: string; + teamId?: string; +} +) { + try { + const index = pinecone.index(INDEX_NAME); + + // Trim markdown if it's too long + let trimmedMarkdown = document.markdown; + if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) { + trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding + } + + // Create text to embed + const textToEmbed = [ + document.metadata.title, + document.metadata.description, + trimmedMarkdown + ].filter(Boolean).join('\n\n'); + + // Get embedding from OpenAI + const embedding = await getEmbedding(textToEmbed); + + const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!); + + // Prepare metadata + const metadata: PageMetadata = { + url: normalizedUrl, + originUrl: normalizeUrl(originUrl), + title: document.metadata.title, + description: document.metadata.description, + crawlId, + teamId, + markdown: trimmedMarkdown, + timestamp: Date.now() + }; + + // Upsert to Pinecone + await index.upsert([{ + id: normalizedUrl, + values: embedding, + metadata: { + ...metadata, + [document.metadata.sourceURL || document.metadata.url!]: true + } + }]); + + logger.debug('Successfully indexed page in Pinecone', { + url: metadata.url, + crawlId + }); + + } catch (error) { + logger.error('Failed to index page in Pinecone', { + error, + url: document.metadata.sourceURL || document.metadata.url, + crawlId + }); + } +} + +export async function searchSimilarPages( + query: string, + originUrl?: string, + limit: number = 10 +) { + try { + const index = pinecone.index(INDEX_NAME); + + // Get query embedding from OpenAI + const queryEmbedding = await getEmbedding(query); + + const queryParams: any = { + vector: queryEmbedding, + topK: limit, + includeMetadata: true + }; + + const normalizedOriginUrl = originUrl ? 
normalizeUrl(originUrl) : undefined; + // Add filter if originUrl is provided + if (normalizedOriginUrl) { + queryParams.filter = { + originUrl: { $eq: normalizedOriginUrl } + }; + } + + const results = await index.query(queryParams); + return results.matches.map(match => ({ + url: match.metadata?.url, + title: match.metadata?.title, + description: match.metadata?.description, + score: match.score, + markdown: match.metadata?.markdown + })); + + } catch (error) { + logger.error('Failed to search similar pages in Pinecone', { + error, + query, + originUrl + }); + return []; + } +} diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index 2a4e2f62..e5b61741 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -3,15 +3,13 @@ import { performRanking } from "../ranker"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { logger } from "../logger"; import { CohereClient } from "cohere-ai"; +import { extractConfig } from "./config"; const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, }); -const MAX_RANKING_LIMIT = 10; -const INITIAL_SCORE_THRESHOLD = 0.75; -const FALLBACK_SCORE_THRESHOLD = 0.5; -const MIN_REQUIRED_LINKS = 1; + interface RankingResult { mappedLinks: MapDocument[]; @@ -61,32 +59,35 @@ export async function rerankLinks( searchQuery, ); + // First try with high threshold let filteredLinks = filterAndProcessLinks( mappedLinks, linksAndScores, - INITIAL_SCORE_THRESHOLD, + extractConfig.INITIAL_SCORE_THRESHOLD, ); + + // If we don't have enough high-quality links, try with lower threshold - if (filteredLinks.length < MIN_REQUIRED_LINKS) { + if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) { logger.info( - `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`, + `Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`, ); filteredLinks = filterAndProcessLinks( mappedLinks, linksAndScores, - FALLBACK_SCORE_THRESHOLD, + extractConfig.FALLBACK_SCORE_THRESHOLD, ); if (filteredLinks.length === 0) { // If still no results, take top N results regardless of score logger.warn( - `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`, + `No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. 
Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`, ); filteredLinks = linksAndScores .sort((a, b) => b.score - a.score) - .slice(0, MIN_REQUIRED_LINKS) + .slice(0, extractConfig.MIN_REQUIRED_LINKS) .map((x) => mappedLinks.find((link) => link.url === x.link)) .filter( (x): x is MapDocument => @@ -108,7 +109,7 @@ export async function rerankLinks( } }); - const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT); + const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT); // Mark URLs that will be used in completion rankedLinks.forEach(link => { @@ -119,7 +120,7 @@ export async function rerankLinks( }); // Mark URLs that were dropped due to ranking limit - filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => { + filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => { const trace = urlTraces.find(t => t.url === link.url); if (trace) { trace.warning = 'Excluded due to ranking limit'; diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 4d61a8d3..a5027fa9 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction"; import { buildRefrasedPrompt } from "./build-prompts"; import { logger } from "../logger"; import { rerankLinks } from "./reranker"; - -const MAX_EXTRACT_LIMIT = 100; +import { extractConfig } from "./config"; interface ProcessUrlOptions { url: string; @@ -67,8 +66,56 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); let mappedLinks = mapResults.mapResults as MapDocument[]; - const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; - const uniqueUrls = removeDuplicateUrls(allUrls); + let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + let uniqueUrls = removeDuplicateUrls(allUrls); + + // Track all discovered URLs + uniqueUrls.forEach(discoveredUrl => { + if (!urlTraces.some(t => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: 'mapped', + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + + // retry if only one url is returned + if (uniqueUrls.length <= 1) { + const retryMapResults = await getMapResults({ + url: baseUrl, + teamId: options.teamId, + plan: options.plan, + allowExternalLinks: options.allowExternalLinks, + origin: options.origin, + limit: options.limit, + ignoreSitemap: false, + includeMetadata: true, + includeSubdomains: options.includeSubdomains, + }); + + mappedLinks = retryMapResults.mapResults as MapDocument[]; + allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + uniqueUrls = removeDuplicateUrls(allUrls); + + // Track all discovered URLs + uniqueUrls.forEach(discoveredUrl => { + if (!urlTraces.some(t => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: 'mapped', + warning: 'Broader search. 
Not limiting map results to prompt.', + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + } // Track all discovered URLs uniqueUrls.forEach(discoveredUrl => { @@ -96,8 +143,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } - // Limit initial set of links - mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); + // Limit initial set of links (1000) + mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT); // Perform reranking if prompt is provided if (options.prompt) { diff --git a/apps/api/src/lib/ranker.ts b/apps/api/src/lib/ranker.ts index bffbc9c2..02d59457 100644 --- a/apps/api/src/lib/ranker.ts +++ b/apps/api/src/lib/ranker.ts @@ -10,7 +10,7 @@ const openai = new OpenAI({ async function getEmbedding(text: string) { const embedding = await openai.embeddings.create({ - model: "text-embedding-ada-002", + model: "text-embedding-3-small", input: text, encoding_format: "float", }); diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 0f3b8524..6bb8b04e 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -16,6 +16,7 @@ import { ScrapeUrlResponse, } from "../scraper/scrapeURL"; import { Engine } from "../scraper/scrapeURL/engines"; +import { indexPage } from "../lib/extract/index/pinecone"; configDotenv(); export async function startWebScraperPipeline({ @@ -173,6 +174,12 @@ export async function runWebScraper({ creditsToBeBilled = 5; } + // If the team is the background index team, return the response + if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) { + return response; + } + + billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { logger.error( `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index bb0c485c..e452f7fa 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -38,7 +38,7 @@ const useCache = process.env.CACHE_REDIS_URL !== undefined; export const engines: Engine[] = [ - // ...(useCache ? [ "cache" as const ] : []), + ...(useCache ? [ "cache" as const ] : []), ...(useFireEngine ? 
[ "fire-engine;chrome-cdp" as const, @@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { + + if (meta.internalOptions.useCache !== true) { + const cacheIndex = engines.indexOf("cache"); + if (cacheIndex !== -1) { + engines.splice(cacheIndex, 1); + } + } else { + meta.logger.debug("Cache engine enabled by useCache option"); + } const prioritySum = [...meta.featureFlags].reduce( (a, x) => a + featureFlagOptions[x].priority, 0, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 130ef9ee..b13f7d9a 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -151,8 +151,9 @@ export type InternalOptions = { v0CrawlOnlyUrls?: boolean; v0DisableJsDom?: boolean; - + useCache?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine + isBackgroundIndex?: boolean; }; export type EngineResultsTracker = { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 48e6f7fd..4fb08337 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -48,6 +48,11 @@ import { } from "../lib/concurrency-limit"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; +import { indexPage } from "../lib/extract/index/pinecone"; +import { Document } from "../controllers/v1/types"; +import { supabase_service } from "../services/supabase"; +import { normalizeUrl } from "../lib/canonical-url"; + configDotenv(); class RacedRedirectError extends Error { @@ -74,6 +79,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { + (async () => { + const originUrl = sc.originUrl ? 
normalizeUrl(sc.originUrl) : undefined; + // Get all visited URLs from Redis + const visitedUrls = await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited", + ); + // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) + if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) { + // Fire and forget the upload to Supabase + try { + // Standardize URLs to canonical form (https, no www) + const standardizedUrls = [ + ...new Set( + visitedUrls.map((url) => { + return normalizeUrl(url); + }), + ), + ]; + // First check if entry exists for this origin URL + const { data: existingMap } = await supabase_service + .from("crawl_maps") + .select("urls") + .eq("origin_url", originUrl) + .single(); + + if (existingMap) { + // Merge URLs, removing duplicates + const mergedUrls = [ + ...new Set([...existingMap.urls, ...standardizedUrls]), + ]; + + const { error } = await supabase_service + .from("crawl_maps") + .update({ + urls: mergedUrls, + num_urls: mergedUrls.length, + updated_at: new Date().toISOString(), + }) + .eq("origin_url", originUrl); + + if (error) { + _logger.error("Failed to update crawl map", { error }); + } + } else { + // Insert new entry if none exists + const { error } = await supabase_service.from("crawl_maps").insert({ + origin_url: originUrl, + urls: standardizedUrls, + num_urls: standardizedUrls.length, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + }); + + if (error) { + _logger.error("Failed to save crawl map", { error }); + } + } + } catch (error) { + _logger.error("Error saving crawl map", { error }); + } + } + })(); + if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); @@ -209,7 +277,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { const result = await processJob(job, token); if (result.success) { try { - if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { + if ( + job.data.crawl_id && + process.env.USE_DB_AUTHENTICATION === "true" + ) { logger.debug( "Job succeeded -- has crawl associated, putting null in Redis", ); @@ -410,66 +481,66 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { const crawler = crawlToCrawler(job.data.crawl_id, sc); const sitemap = sc.crawlerOptions.ignoreSitemap - ? 
0 - : await crawler.tryGetSitemap(async urls => { - if (urls.length === 0) return; - - logger.debug("Using sitemap chunk of length " + urls.length, { - sitemapLength: urls.length, - }); - - let jobPriority = await getJobPriority({ - plan: job.data.plan, - team_id: job.data.team_id, - basePriority: 21, - }); - logger.debug("Using job priority " + jobPriority, { jobPriority }); - - const jobs = urls.map(url => { - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls" as const, - team_id: job.data.team_id, - plan: job.data.plan!, - crawlerOptions: job.data.crawlerOptions, - scrapeOptions: job.data.scrapeOptions, - internalOptions: sc.internalOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, - sitemapped: true, - webhook: job.data.webhook, - v1: job.data.v1, - }, - opts: { - jobId: uuid, - priority: 20, - }, - }; - }); - - logger.debug("Locking URLs..."); - await lockURLs( - job.data.crawl_id, - sc, - jobs.map((x) => x.data.url), - ); - logger.debug("Adding scrape jobs to Redis..."); - await addCrawlJobs( - job.data.crawl_id, - jobs.map((x) => x.opts.jobId), - ); - logger.debug("Adding scrape jobs to BullMQ..."); - await addScrapeJobs(jobs); + ? 0 + : await crawler.tryGetSitemap(async (urls) => { + if (urls.length === 0) return; + + logger.debug("Using sitemap chunk of length " + urls.length, { + sitemapLength: urls.length, }); + let jobPriority = await getJobPriority({ + plan: job.data.plan, + team_id: job.data.team_id, + basePriority: 21, + }); + logger.debug("Using job priority " + jobPriority, { jobPriority }); + + const jobs = urls.map((url) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls" as const, + team_id: job.data.team_id, + plan: job.data.plan!, + crawlerOptions: job.data.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + internalOptions: sc.internalOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + sitemapped: true, + webhook: job.data.webhook, + v1: job.data.v1, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + logger.debug("Locking URLs..."); + await lockURLs( + job.data.crawl_id, + sc, + jobs.map((x) => x.data.url), + ); + logger.debug("Adding scrape jobs to Redis..."); + await addCrawlJobs( + job.data.crawl_id, + jobs.map((x) => x.opts.jobId), + ); + logger.debug("Adding scrape jobs to BullMQ..."); + await addScrapeJobs(jobs); + }); + if (sitemap === 0) { logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap, }); - + logger.debug("Locking URL..."); await lockURL(job.data.crawl_id, sc, job.data.url); const jobId = uuidv4(); @@ -511,14 +582,33 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { "crawl.started", ); } - - return { success: true } + + return { success: true }; } catch (error) { - logger.error("An error occurred!", { error }) + logger.error("An error occurred!", { error }); return { success: false, error }; } } +async function indexJob(job: Job & { id: string }, document: Document) { + if ( + document && + document.markdown && + job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! + ) { + // indexPage({ + // document: document, + // originUrl: job.data.crawl_id + // ? (await getCrawl(job.data.crawl_id))?.originUrl! 
+ // : document.metadata.sourceURL!, + // crawlId: job.data.crawl_id, + // teamId: job.data.team_id, + // }).catch((error) => { + // _logger.error("Error indexing page", { error }); + // }); + } +} + async function processJob(job: Job & { id: string }, token: string) { const logger = _logger.child({ module: "queue-worker", @@ -623,14 +713,18 @@ async function processJob(job: Job & { id: string }, token: string) { normalizeURL(doc.metadata.sourceURL, sc) ) { const crawler = crawlToCrawler(job.data.crawl_id, sc); - if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) { - throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking + if ( + crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null + ) { + throw new Error( + "Redirected target URL is not allowed by crawlOptions", + ); // TODO: make this its own error type that is ignored by error tracking } if (isUrlBlocked(doc.metadata.url)) { throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking } - + const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p2 = generateURLPermutations( normalizeURL(doc.metadata.sourceURL, sc), @@ -675,6 +769,8 @@ async function processJob(job: Job & { id: string }, token: string) { true, ); + indexJob(job, doc); + logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, true); @@ -746,15 +842,18 @@ async function processJob(job: Job & { id: string }, token: string) { newJobId: jobId, }); } else { - logger.debug("Could not lock URL " + JSON.stringify(link), { - url: link, - }); + // TODO: removed this, ok? too many 'not useful' logs (?) Mogery! + // logger.debug("Could not lock URL " + JSON.stringify(link), { + // url: link, + // }); } } } } await finishCrawlIfNeeded(job, sc); + } else { + indexJob(job, doc); } logger.info(`🐂 Job done ${job.id}`); diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 21025589..304a9fc4 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -166,6 +166,7 @@ const testSuiteTokens = [ "4c2638d", "cbb3462", // don't remove (s-ai) "824abcd", // don't remove (s-ai) + "0966288", ]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];
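A note on buildFallbackList in engines/index.ts above: engines.splice(cacheIndex, 1) mutates the module-level engines array, so the first scrape without useCache removes "cache" for every later scrape in the same process, including ones that set useCache: true. A non-mutating sketch of the same gate, assuming callers can iterate a returned list instead of the shared array; the helper name and import paths are illustrative, not from this diff:

// Hypothetical refactor sketch; Engine, engines, and Meta come from
// apps/api/src/scraper/scrapeURL (paths shortened for illustration).
import { Engine, engines } from "./engines";
import { Meta } from "./index";

export function candidateEngines(meta: Meta): Engine[] {
  if (meta.internalOptions.useCache === true) {
    meta.logger.debug("Cache engine enabled by useCache option");
    return [...engines]; // copy, so later mutation cannot leak across scrapes
  }
  // Drop "cache" for this call only instead of splicing the shared array.
  return engines.filter((engine) => engine !== "cache");
}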