From bf9d41d0b238a848d1edf07a882fd42e6c8937c2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 30 Dec 2024 19:37:48 -0300 Subject: [PATCH 01/13] Nick: index exploration --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 101 +++++++++------ apps/api/src/index.ts | 1 + apps/api/src/lib/extract/index/pinecone.ts | 141 +++++++++++++++++++++ apps/api/src/lib/ranker.ts | 2 +- 5 files changed, 205 insertions(+), 41 deletions(-) create mode 100644 apps/api/src/lib/extract/index/pinecone.ts diff --git a/apps/api/package.json b/apps/api/package.json index 00c1bc0e..670dfc7a 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,6 +58,7 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", + "@pinecone-database/pinecone": "^4.0.0", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 17532d25..082a200f 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -10,7 +10,7 @@ importers: dependencies: '@anthropic-ai/sdk': specifier: ^0.24.3 - version: 0.24.3 + version: 0.24.3(encoding@0.1.13) '@brillout/import': specifier: ^0.2.2 version: 0.2.3 @@ -29,9 +29,12 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 + '@pinecone-database/pinecone': + specifier: ^4.0.0 + version: 4.0.0 '@sentry/cli': specifier: ^2.33.1 - version: 2.33.1 + version: 2.33.1(encoding@0.1.13) '@sentry/node': specifier: ^8.26.0 version: 8.26.0 @@ -79,7 +82,7 @@ importers: version: 1.1.1 cohere-ai: specifier: ^7.14.0 - version: 7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)) + version: 7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(encoding@0.1.13) cors: specifier: ^2.8.5 version: 2.8.5 @@ -130,13 +133,13 @@ importers: version: 2.9.0 langchain: specifier: ^0.2.8 - version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 logsnag: specifier: ^1.0.0 - version: 1.0.0 + version: 1.0.0(encoding@0.1.13) luxon: specifier: ^3.4.3 version: 3.4.4 @@ -157,7 +160,7 @@ importers: version: 7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3) openai: specifier: ^4.57.0 - version: 4.57.0(zod@3.23.8) + version: 4.57.0(encoding@0.1.13)(zod@3.23.8) pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -275,7 +278,7 @@ importers: 
version: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5)) jest-fetch-mock: specifier: ^3.0.3 - version: 3.0.3 + version: 3.0.3(encoding@0.1.13) mammoth: specifier: ^1.7.2 version: 1.7.2 @@ -1006,6 +1009,10 @@ packages: '@pdf-lib/upng@1.0.1': resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} + '@pinecone-database/pinecone@4.0.0': + resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==} + engines: {node: '>=18.0.0'} + '@pkgjs/parseargs@0.11.0': resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -2279,6 +2286,9 @@ packages: resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==} engines: {node: '>= 0.8'} + encoding@0.1.13: + resolution: {integrity: sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==} + end-of-stream@1.4.4: resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} @@ -4599,7 +4609,7 @@ snapshots: '@jridgewell/gen-mapping': 0.3.5 '@jridgewell/trace-mapping': 0.3.25 - '@anthropic-ai/sdk@0.24.3': + '@anthropic-ai/sdk@0.24.3(encoding@0.1.13)': dependencies: '@types/node': 18.19.39 '@types/node-fetch': 2.6.11 @@ -4607,7 +4617,7 @@ snapshots: agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) web-streams-polyfill: 3.3.3 transitivePeerDependencies: - encoding @@ -5577,13 +5587,13 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.4.15 - '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': + '@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': dependencies: ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.12 - langsmith: 0.1.34(zyeavx4tfqw3smbbpiinhfxxeu) + langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4) ml-distance: 4.0.1 mustache: 4.2.0 p-queue: 6.6.2 @@ -5595,20 +5605,20 @@ snapshots: - langchain - openai - 
'@langchain/openai@0.2.1(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': + '@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) js-tiktoken: 1.0.12 - openai: 4.57.0(zod@3.23.8) + openai: 4.57.0(encoding@0.1.13)(zod@3.23.8) zod: 3.23.8 zod-to-json-schema: 3.23.1(zod@3.23.8) transitivePeerDependencies: - encoding - langchain - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': + 
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) js-tiktoken: 1.0.12 transitivePeerDependencies: - langchain @@ -5866,6 +5876,10 @@ snapshots: dependencies: pako: 1.0.11 + '@pinecone-database/pinecone@4.0.0': + dependencies: + encoding: 0.1.13 + '@pkgjs/parseargs@0.11.0': optional: true @@ -5950,10 +5964,10 @@ snapshots: '@sentry/cli-win32-x64@2.33.1': optional: true - '@sentry/cli@2.33.1': + '@sentry/cli@2.33.1(encoding@0.1.13)': dependencies: https-proxy-agent: 5.0.1 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) progress: 2.0.3 proxy-from-env: 1.1.0 which: 2.0.2 @@ -7088,7 +7102,7 @@ snapshots: co@4.6.0: {} - cohere-ai@7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)): + cohere-ai@7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(encoding@0.1.13): dependencies: '@aws-sdk/client-sagemaker': 3.679.0 '@aws-sdk/credential-providers': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)) @@ -7098,7 +7112,7 @@ snapshots: form-data-encoder: 4.0.2 formdata-node: 6.0.3 js-base64: 3.7.2 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) qs: 6.11.2 readable-stream: 4.5.2 url-join: 4.0.1 @@ -7208,9 +7222,9 @@ snapshots: dependencies: luxon: 3.4.4 - cross-fetch@3.1.8: + cross-fetch@3.1.8(encoding@0.1.13): dependencies: - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) transitivePeerDependencies: - encoding @@ -7365,6 +7379,10 @@ snapshots: encodeurl@1.0.2: {} + encoding@0.1.13: + dependencies: + iconv-lite: 0.6.3 + end-of-stream@1.4.4: dependencies: once: 1.4.0 @@ -7899,9 +7917,9 @@ snapshots: isexe@2.0.0: {} - 
isomorphic-fetch@3.0.0: + isomorphic-fetch@3.0.0(encoding@0.1.13): dependencies: - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) whatwg-fetch: 3.6.20 transitivePeerDependencies: - encoding @@ -8070,9 +8088,9 @@ snapshots: jest-mock: 29.7.0 jest-util: 29.7.0 - jest-fetch-mock@3.0.3: + jest-fetch-mock@3.0.3(encoding@0.1.13): dependencies: - cross-fetch: 3.1.8 + cross-fetch: 3.1.8(encoding@0.1.13) promise-polyfill: 8.3.0 transitivePeerDependencies: - encoding @@ -8342,17 +8360,17 @@ snapshots: kuler@2.0.0: {} - langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): + langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) - '@langchain/openai': 0.2.1(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/core': 
0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + '@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) + '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) binary-extensions: 2.3.0 js-tiktoken: 1.0.12 js-yaml: 4.1.0 jsonpointer: 5.0.1 langchainhub: 0.0.11 - langsmith: 0.1.34(zyeavx4tfqw3smbbpiinhfxxeu) + langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4) ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -8362,6 +8380,7 @@ snapshots: zod-to-json-schema: 3.23.1(zod@3.23.8) optionalDependencies: '@aws-sdk/credential-provider-node': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0) + '@pinecone-database/pinecone': 4.0.0 '@supabase/supabase-js': 2.44.2 axios: 1.7.2 cheerio: 1.0.0-rc.12 @@ -8381,7 +8400,7 @@ snapshots: langchainhub@0.0.11: {} - langsmith@0.1.34(zyeavx4tfqw3smbbpiinhfxxeu): + langsmith@0.1.34(npkyd6f7wyl3urgrzoxaktl5a4): dependencies: '@types/uuid': 9.0.8 commander: 10.0.1 @@ -8390,9 +8409,9 @@ snapshots: p-retry: 4.6.2 uuid: 9.0.1 optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) - langchain: 
0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) - openai: 4.57.0(zod@3.23.8) + '@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8)) + langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + openai: 4.57.0(encoding@0.1.13)(zod@3.23.8) languagedetect@2.0.0: {} @@ -8442,9 +8461,9 @@ snapshots: loglevel@1.9.1: {} - logsnag@1.0.0: + logsnag@1.0.0(encoding@0.1.13): dependencies: - isomorphic-fetch: 3.0.0 + isomorphic-fetch: 3.0.0(encoding@0.1.13) transitivePeerDependencies: - encoding @@ -8703,9 +8722,11 @@ snapshots: node-ensure@0.0.0: {} - node-fetch@2.7.0: + node-fetch@2.7.0(encoding@0.1.13): dependencies: whatwg-url: 5.0.0 + optionalDependencies: + encoding: 0.1.13 node-fetch@3.3.2: dependencies: @@ -8780,7 +8801,7 @@ snapshots: transitivePeerDependencies: - debug - openai@4.57.0(zod@3.23.8): + openai@4.57.0(encoding@0.1.13)(zod@3.23.8): dependencies: '@types/node': 18.19.39 '@types/node-fetch': 2.6.11 @@ -8789,7 +8810,7 @@ snapshots: agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 - node-fetch: 2.7.0 + node-fetch: 2.7.0(encoding@0.1.13) qs: 6.12.2 optionalDependencies: zod: 3.23.8 diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index adc080f2..d4769283 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -17,6 +17,7 @@ import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; +import { searchSimilarPages } from "./lib/extract/index/pinecone"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts new file mode 100644 index 00000000..603cd38f --- /dev/null +++ b/apps/api/src/lib/extract/index/pinecone.ts @@ -0,0 +1,141 @@ 
+import { Pinecone } from '@pinecone-database/pinecone'; +import { Document } from '../../../controllers/v1/types'; +import { logger } from '../../logger'; +import OpenAI from "openai"; + +const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, +}); + +const pinecone = new Pinecone({ + apiKey: process.env.PINECONE_API_KEY!, +}); + +const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? ""; + +export interface PageMetadata { + url: string; + originUrl: string; + title?: string; + description?: string; + crawlId?: string; + teamId?: string; + timestamp: number; + markdown?: string; +} + +async function getEmbedding(text: string) { + const embedding = await openai.embeddings.create({ + model: "text-embedding-3-small", + input: text, + encoding_format: "float", + }); + + return embedding.data[0].embedding; +} + +function normalizeUrl(url: string) { + const urlO = new URL(url); + if (!urlO.hostname.startsWith("www.")) { + urlO.hostname = "www." + urlO.hostname; + } + return urlO.href; +} + +export async function indexPage( + document: Document, + originUrl: string, + crawlId?: string, + teamId?: string +) { + try { + const index = pinecone.index(INDEX_NAME); + + // Create text to embed + const textToEmbed = [ + document.metadata.title, + document.metadata.description, + document.markdown + ].filter(Boolean).join('\n\n'); + + // Get embedding from OpenAI + const embedding = await getEmbedding(textToEmbed); + + // Prepare metadata + const metadata: PageMetadata = { + url: normalizeUrl(document.metadata.sourceURL || document.metadata.url!), + originUrl: normalizeUrl(originUrl), + title: document.metadata.title, + description: document.metadata.description, + crawlId, + teamId, + markdown: document.markdown, + timestamp: Date.now() + }; + + // Upsert to Pinecone + await index.upsert([{ + id: document.metadata.sourceURL || document.metadata.url!, + values: embedding, + metadata: { + ...metadata, + [document.metadata.sourceURL || document.metadata.url!]: true + } + }]); + + logger.debug('Successfully indexed page in Pinecone', { + url: metadata.url, + crawlId + }); + + } catch (error) { + logger.error('Failed to index page in Pinecone', { + error, + url: document.metadata.sourceURL || document.metadata.url, + crawlId + }); + } +} + +export async function searchSimilarPages( + query: string, + originUrl?: string, + limit: number = 10 +) { + try { + const index = pinecone.index(INDEX_NAME); + + // Get query embedding from OpenAI + const queryEmbedding = await getEmbedding(query); + + const queryParams: any = { + vector: queryEmbedding, + topK: limit, + includeMetadata: true + }; + + // Add filter if originUrl is provided + if (originUrl) { + queryParams.filter = { + [originUrl]: { $contains: normalizeUrl(originUrl) } + }; + } + + const results = await index.query(queryParams); + return results.matches.map(match => ({ + url: match.metadata?.url, + title: match.metadata?.title, + description: match.metadata?.description, + score: match.score, + markdown: match.metadata?.markdown + })); + + } catch (error) { + logger.error('Failed to search similar pages in Pinecone', { + error, + query, + originUrl + }); + return []; + } +} diff --git a/apps/api/src/lib/ranker.ts b/apps/api/src/lib/ranker.ts index bffbc9c2..02d59457 100644 --- a/apps/api/src/lib/ranker.ts +++ b/apps/api/src/lib/ranker.ts @@ -10,7 +10,7 @@ const openai = new OpenAI({ async function getEmbedding(text: string) { const embedding = await openai.embeddings.create({ - model: "text-embedding-ada-002", + model: "text-embedding-3-small", 
input: text, encoding_format: "float", }); From 7a31306be591f4d27cad23e58e3f6d3e1c57b60e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 30 Dec 2024 20:04:22 -0300 Subject: [PATCH 02/13] Nick: url normalization + max metadata size --- apps/api/src/index.ts | 2 +- apps/api/src/lib/extract/index/pinecone.ts | 39 ++++++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index d4769283..20214d72 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -17,7 +17,6 @@ import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; -import { searchSimilarPages } from "./lib/extract/index/pinecone"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -255,3 +254,4 @@ logger.info(`Worker ${process.pid} started`); // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); +// diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts index 603cd38f..14c3bea4 100644 --- a/apps/api/src/lib/extract/index/pinecone.ts +++ b/apps/api/src/lib/extract/index/pinecone.ts @@ -13,6 +13,8 @@ const pinecone = new Pinecone({ const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? ""; +const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes + export interface PageMetadata { url: string; originUrl: string; @@ -42,40 +44,54 @@ function normalizeUrl(url: string) { return urlO.href; } -export async function indexPage( - document: Document, - originUrl: string, - crawlId?: string, - teamId?: string +export async function indexPage({ + document, + originUrl, + crawlId, + teamId +}: { + document: Document; + originUrl: string; + crawlId?: string; + teamId?: string; +} ) { try { const index = pinecone.index(INDEX_NAME); + // Trim markdown if it's too long + let trimmedMarkdown = document.markdown; + if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) { + trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding + } + // Create text to embed const textToEmbed = [ document.metadata.title, document.metadata.description, - document.markdown + trimmedMarkdown ].filter(Boolean).join('\n\n'); // Get embedding from OpenAI const embedding = await getEmbedding(textToEmbed); + const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!); + // Prepare metadata const metadata: PageMetadata = { - url: normalizeUrl(document.metadata.sourceURL || document.metadata.url!), + url: normalizedUrl, originUrl: normalizeUrl(originUrl), title: document.metadata.title, description: document.metadata.description, crawlId, teamId, - markdown: document.markdown, + markdown: trimmedMarkdown, timestamp: Date.now() }; // Upsert to Pinecone await index.upsert([{ - id: document.metadata.sourceURL || document.metadata.url!, + id: normalizedUrl, values: embedding, metadata: { ...metadata, @@ -114,10 +130,11 @@ export async function searchSimilarPages( includeMetadata: true }; + const normalizedOriginUrl = originUrl ? 
normalizeUrl(originUrl) : undefined; // Add filter if originUrl is provided - if (originUrl) { + if (normalizedOriginUrl) { queryParams.filter = { - [originUrl]: { $contains: normalizeUrl(originUrl) } + originUrl: { $eq: normalizedOriginUrl } }; } From e6da214aebcdb01484cb11357d913554fb035072 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 30 Dec 2024 21:42:01 -0300 Subject: [PATCH 03/13] Nick: async background index --- .../api/src/lib/extract/extraction-service.ts | 70 +++++++++++++++++++ apps/api/src/main/runWebScraper.ts | 7 ++ apps/api/src/scraper/scrapeURL/index.ts | 1 + apps/api/src/services/rate-limiter.ts | 1 + 4 files changed, 79 insertions(+) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index f84a1f34..e266c2e9 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/ import { buildDocument } from "./build-document"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; +import { _addScrapeJobToBullMQ } from "../../services/queue-jobs"; +import { saveCrawl, StoredCrawl } from "../crawl-redis"; interface ExtractServiceOptions { request: ExtractRequest; @@ -24,6 +26,18 @@ interface ExtractResult { error?: string; } +function getRootDomain(url: string): string { + try { + if(url.endsWith("/*")) { + url = url.slice(0, -2); + } + const urlObj = new URL(url); + return `${urlObj.protocol}//${urlObj.hostname}`; + } catch (e) { + return url; + } +} + export async function performExtraction(options: ExtractServiceOptions): Promise { const { request, teamId, plan, subId } = options; const scrapeId = crypto.randomUUID(); @@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise }); } + // Kickoff background crawl for indexing root domains + const rootDomains = new Set(request.urls.map(getRootDomain)); + rootDomains.forEach(async url => { + const crawlId = crypto.randomUUID(); + + // Create and save crawl configuration first + const sc: StoredCrawl = { + originUrl: url, + crawlerOptions: { + maxDepth: 15, + limit: 5000, + includePaths: [], + excludePaths: [], + ignoreSitemap: false, + includeSubdomains: true, + allowExternalLinks: false, + allowBackwardLinks: true + }, + scrapeOptions: { + formats: ["markdown"], + onlyMainContent: true, + waitFor: 0, + mobile: false, + removeBase64Images: true, + fastMode: false, + parsePDF: true, + skipTlsVerification: false, + }, + internalOptions: { + disableSmartWaitCache: true, + isBackgroundIndex: true + }, + team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + createdAt: Date.now(), + plan: "hobby", // make it a low concurrency + }; + + // Save the crawl configuration + await saveCrawl(crawlId, sc); + + // Then kick off the job + await _addScrapeJobToBullMQ({ + url, + mode: "kickoff" as const, + team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + plan: "hobby", // make it a low concurrency + crawlerOptions: sc.crawlerOptions, + scrapeOptions: sc.scrapeOptions, + internalOptions: sc.internalOptions, + origin: "index", + crawl_id: crawlId, + webhook: null, + v1: true, + }, {}, crypto.randomUUID(), 50); + }); + // Bill team for usage billTeam(teamId, subId, links.length * 5).catch((error) => { logger.error( diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 0f3b8524..6bb8b04e 100644 --- 
a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -16,6 +16,7 @@ import { ScrapeUrlResponse, } from "../scraper/scrapeURL"; import { Engine } from "../scraper/scrapeURL/engines"; +import { indexPage } from "../lib/extract/index/pinecone"; configDotenv(); export async function startWebScraperPipeline({ @@ -173,6 +174,12 @@ export async function runWebScraper({ creditsToBeBilled = 5; } + // If the team is the background index team, return the response + if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) { + return response; + } + + billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { logger.error( `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 130ef9ee..549ce9d1 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -153,6 +153,7 @@ export type InternalOptions = { v0DisableJsDom?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine + isBackgroundIndex?: boolean; }; export type EngineResultsTracker = { diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 21025589..304a9fc4 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -166,6 +166,7 @@ const testSuiteTokens = [ "4c2638d", "cbb3462", // don't remove (s-ai) "824abcd", // don't remove (s-ai) + "0966288", ]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; From bd81b41d5f8ba49b618e4ed7a9c87788e42aa16c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 30 Dec 2024 21:43:59 -0300 Subject: [PATCH 04/13] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 155 ++++++++++++++++---------- 1 file changed, 94 insertions(+), 61 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 48e6f7fd..f6a033cb 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -48,6 +48,9 @@ import { } from "../lib/concurrency-limit"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; +import { indexPage } from "../lib/extract/index/pinecone"; +import { Document } from "../controllers/v1/types"; + configDotenv(); class RacedRedirectError extends Error { @@ -209,7 +212,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { const result = await processJob(job, token); if (result.success) { try { - if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { + if ( + job.data.crawl_id && + process.env.USE_DB_AUTHENTICATION === "true" + ) { logger.debug( "Job succeeded -- has crawl associated, putting null in Redis", ); @@ -410,66 +416,66 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { const crawler = crawlToCrawler(job.data.crawl_id, sc); const sitemap = sc.crawlerOptions.ignoreSitemap - ? 
0 - : await crawler.tryGetSitemap(async urls => { - if (urls.length === 0) return; - - logger.debug("Using sitemap chunk of length " + urls.length, { - sitemapLength: urls.length, - }); - - let jobPriority = await getJobPriority({ - plan: job.data.plan, - team_id: job.data.team_id, - basePriority: 21, - }); - logger.debug("Using job priority " + jobPriority, { jobPriority }); - - const jobs = urls.map(url => { - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls" as const, - team_id: job.data.team_id, - plan: job.data.plan!, - crawlerOptions: job.data.crawlerOptions, - scrapeOptions: job.data.scrapeOptions, - internalOptions: sc.internalOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, - sitemapped: true, - webhook: job.data.webhook, - v1: job.data.v1, - }, - opts: { - jobId: uuid, - priority: 20, - }, - }; - }); - - logger.debug("Locking URLs..."); - await lockURLs( - job.data.crawl_id, - sc, - jobs.map((x) => x.data.url), - ); - logger.debug("Adding scrape jobs to Redis..."); - await addCrawlJobs( - job.data.crawl_id, - jobs.map((x) => x.opts.jobId), - ); - logger.debug("Adding scrape jobs to BullMQ..."); - await addScrapeJobs(jobs); + ? 0 + : await crawler.tryGetSitemap(async (urls) => { + if (urls.length === 0) return; + + logger.debug("Using sitemap chunk of length " + urls.length, { + sitemapLength: urls.length, }); + let jobPriority = await getJobPriority({ + plan: job.data.plan, + team_id: job.data.team_id, + basePriority: 21, + }); + logger.debug("Using job priority " + jobPriority, { jobPriority }); + + const jobs = urls.map((url) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls" as const, + team_id: job.data.team_id, + plan: job.data.plan!, + crawlerOptions: job.data.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + internalOptions: sc.internalOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + sitemapped: true, + webhook: job.data.webhook, + v1: job.data.v1, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + logger.debug("Locking URLs..."); + await lockURLs( + job.data.crawl_id, + sc, + jobs.map((x) => x.data.url), + ); + logger.debug("Adding scrape jobs to Redis..."); + await addCrawlJobs( + job.data.crawl_id, + jobs.map((x) => x.opts.jobId), + ); + logger.debug("Adding scrape jobs to BullMQ..."); + await addScrapeJobs(jobs); + }); + if (sitemap === 0) { logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap, }); - + logger.debug("Locking URL..."); await lockURL(job.data.crawl_id, sc, job.data.url); const jobId = uuidv4(); @@ -511,14 +517,33 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { "crawl.started", ); } - - return { success: true } + + return { success: true }; } catch (error) { - logger.error("An error occurred!", { error }) + logger.error("An error occurred!", { error }); return { success: false, error }; } } +async function indexJob(job: Job & { id: string }, document: Document) { + if ( + document && + document.markdown && + job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! + ) { + indexPage({ + document: document, + originUrl: job.data.crawl_id + ? (await getCrawl(job.data.crawl_id))?.originUrl! 
+ : document.metadata.sourceURL!, + crawlId: job.data.crawl_id, + teamId: job.data.team_id, + }).catch((error) => { + _logger.error("Error indexing page", { error }); + }); + } +} + async function processJob(job: Job & { id: string }, token: string) { const logger = _logger.child({ module: "queue-worker", @@ -623,14 +648,18 @@ async function processJob(job: Job & { id: string }, token: string) { normalizeURL(doc.metadata.sourceURL, sc) ) { const crawler = crawlToCrawler(job.data.crawl_id, sc); - if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) { - throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking + if ( + crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null + ) { + throw new Error( + "Redirected target URL is not allowed by crawlOptions", + ); // TODO: make this its own error type that is ignored by error tracking } if (isUrlBlocked(doc.metadata.url)) { throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking } - + const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p2 = generateURLPermutations( normalizeURL(doc.metadata.sourceURL, sc), @@ -675,6 +704,8 @@ async function processJob(job: Job & { id: string }, token: string) { true, ); + indexJob(job, doc); + logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, true); @@ -755,6 +786,8 @@ async function processJob(job: Job & { id: string }, token: string) { } await finishCrawlIfNeeded(job, sc); + } else { + indexJob(job, doc); } logger.info(`🐂 Job done ${job.id}`); From 33632d2fe309f9557065ece1e3d731a0dcc2b5eb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 31 Dec 2024 15:22:50 -0300 Subject: [PATCH 05/13] Update extraction-service.ts --- .../api/src/lib/extract/extraction-service.ts | 102 +++++++++--------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index e266c2e9..0ca6a3de 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise } // Kickoff background crawl for indexing root domains - const rootDomains = new Set(request.urls.map(getRootDomain)); - rootDomains.forEach(async url => { - const crawlId = crypto.randomUUID(); + // const rootDomains = new Set(request.urls.map(getRootDomain)); + // rootDomains.forEach(async url => { + // const crawlId = crypto.randomUUID(); - // Create and save crawl configuration first - const sc: StoredCrawl = { - originUrl: url, - crawlerOptions: { - maxDepth: 15, - limit: 5000, - includePaths: [], - excludePaths: [], - ignoreSitemap: false, - includeSubdomains: true, - allowExternalLinks: false, - allowBackwardLinks: true - }, - scrapeOptions: { - formats: ["markdown"], - onlyMainContent: true, - waitFor: 0, - mobile: false, - removeBase64Images: true, - fastMode: false, - parsePDF: true, - skipTlsVerification: false, - }, - internalOptions: { - disableSmartWaitCache: true, - isBackgroundIndex: true - }, - team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, - createdAt: Date.now(), - plan: "hobby", // make it a low concurrency - }; + // // Create and save crawl configuration first + // const sc: StoredCrawl = { + // originUrl: url, + // crawlerOptions: { + // maxDepth: 15, + // limit: 5000, 
+ // includePaths: [], + // excludePaths: [], + // ignoreSitemap: false, + // includeSubdomains: true, + // allowExternalLinks: false, + // allowBackwardLinks: true + // }, + // scrapeOptions: { + // formats: ["markdown"], + // onlyMainContent: true, + // waitFor: 0, + // mobile: false, + // removeBase64Images: true, + // fastMode: false, + // parsePDF: true, + // skipTlsVerification: false, + // }, + // internalOptions: { + // disableSmartWaitCache: true, + // isBackgroundIndex: true + // }, + // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + // createdAt: Date.now(), + // plan: "hobby", // make it a low concurrency + // }; - // Save the crawl configuration - await saveCrawl(crawlId, sc); + // // Save the crawl configuration + // await saveCrawl(crawlId, sc); - // Then kick off the job - await _addScrapeJobToBullMQ({ - url, - mode: "kickoff" as const, - team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, - plan: "hobby", // make it a low concurrency - crawlerOptions: sc.crawlerOptions, - scrapeOptions: sc.scrapeOptions, - internalOptions: sc.internalOptions, - origin: "index", - crawl_id: crawlId, - webhook: null, - v1: true, - }, {}, crypto.randomUUID(), 50); - }); + // // Then kick off the job + // await _addScrapeJobToBullMQ({ + // url, + // mode: "kickoff" as const, + // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + // plan: "hobby", // make it a low concurrency + // crawlerOptions: sc.crawlerOptions, + // scrapeOptions: sc.scrapeOptions, + // internalOptions: sc.internalOptions, + // origin: "index", + // crawl_id: crawlId, + // webhook: null, + // v1: true, + // }, {}, crypto.randomUUID(), 50); + // }); // Bill team for usage billTeam(teamId, subId, links.length * 5).catch((error) => { From c3fd13a82ba16e6c24f8bb69a8d232556dde761e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 31 Dec 2024 18:06:07 -0300 Subject: [PATCH 06/13] Nick: fixed re-ranker and enabled url cache of 2hrs --- .gitignore | 3 +++ apps/api/src/lib/cache.ts | 2 +- apps/api/src/lib/extract/config.ts | 7 ++++++ apps/api/src/lib/extract/reranker.ts | 25 ++++++++++--------- apps/api/src/lib/extract/url-processor.ts | 7 +++--- .../src/scraper/scrapeURL/engines/index.ts | 2 +- 6 files changed, 28 insertions(+), 18 deletions(-) create mode 100644 apps/api/src/lib/extract/config.ts diff --git a/.gitignore b/.gitignore index fc527490..311ee4df 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,6 @@ apps/js-sdk/firecrawl/dist /examples/haiku_web_crawler/firecrawl_env /examples/sonnet_web_crawler/firecrawl_env /examples/internal_link_assitant/firecrawl_env + +/apps/api/logs/* +/apps/api/debug/* \ No newline at end of file diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts index cbab4e05..ff91fa88 100644 --- a/apps/api/src/lib/cache.ts +++ b/apps/api/src/lib/cache.ts @@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) { if (!cacheRedis) return; try { - await cacheRedis.set(key, JSON.stringify(entry)); + await cacheRedis.set(key, JSON.stringify(entry), "EX", 3600); // 1 hour in seconds } catch (error) { logger.warn("Failed to save to cache", { key, error }); } diff --git a/apps/api/src/lib/extract/config.ts b/apps/api/src/lib/extract/config.ts new file mode 100644 index 00000000..f8333b3c --- /dev/null +++ b/apps/api/src/lib/extract/config.ts @@ -0,0 +1,7 @@ +export const extractConfig = { + MAX_INITIAL_RANKING_LIMIT: 1000, + MAX_RANKING_LIMIT: 20, + INITIAL_SCORE_THRESHOLD: 0.75, + FALLBACK_SCORE_THRESHOLD: 0.5, + MIN_REQUIRED_LINKS: 1, +}; \ No newline at end of 
file diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index 2a4e2f62..e5b61741 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -3,15 +3,13 @@ import { performRanking } from "../ranker"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { logger } from "../logger"; import { CohereClient } from "cohere-ai"; +import { extractConfig } from "./config"; const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, }); -const MAX_RANKING_LIMIT = 10; -const INITIAL_SCORE_THRESHOLD = 0.75; -const FALLBACK_SCORE_THRESHOLD = 0.5; -const MIN_REQUIRED_LINKS = 1; + interface RankingResult { mappedLinks: MapDocument[]; @@ -61,32 +59,35 @@ export async function rerankLinks( searchQuery, ); + // First try with high threshold let filteredLinks = filterAndProcessLinks( mappedLinks, linksAndScores, - INITIAL_SCORE_THRESHOLD, + extractConfig.INITIAL_SCORE_THRESHOLD, ); + + // If we don't have enough high-quality links, try with lower threshold - if (filteredLinks.length < MIN_REQUIRED_LINKS) { + if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) { logger.info( - `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`, + `Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`, ); filteredLinks = filterAndProcessLinks( mappedLinks, linksAndScores, - FALLBACK_SCORE_THRESHOLD, + extractConfig.FALLBACK_SCORE_THRESHOLD, ); if (filteredLinks.length === 0) { // If still no results, take top N results regardless of score logger.warn( - `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`, + `No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. 
Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`, ); filteredLinks = linksAndScores .sort((a, b) => b.score - a.score) - .slice(0, MIN_REQUIRED_LINKS) + .slice(0, extractConfig.MIN_REQUIRED_LINKS) .map((x) => mappedLinks.find((link) => link.url === x.link)) .filter( (x): x is MapDocument => @@ -108,7 +109,7 @@ export async function rerankLinks( } }); - const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT); + const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT); // Mark URLs that will be used in completion rankedLinks.forEach(link => { @@ -119,7 +120,7 @@ export async function rerankLinks( }); // Mark URLs that were dropped due to ranking limit - filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => { + filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => { const trace = urlTraces.find(t => t.url === link.url); if (trace) { trace.warning = 'Excluded due to ranking limit'; diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 4d61a8d3..9f255ad7 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction"; import { buildRefrasedPrompt } from "./build-prompts"; import { logger } from "../logger"; import { rerankLinks } from "./reranker"; - -const MAX_EXTRACT_LIMIT = 100; +import { extractConfig } from "./config"; interface ProcessUrlOptions { url: string; @@ -96,8 +95,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } - // Limit initial set of links - mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); + // Limit initial set of links (1000) + mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT); // Perform reranking if prompt is provided if (options.prompt) { diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index bb0c485c..bf51ac94 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -38,7 +38,7 @@ const useCache = process.env.CACHE_REDIS_URL !== undefined; export const engines: Engine[] = [ - // ...(useCache ? [ "cache" as const ] : []), + ...(useCache ? [ "cache" as const ] : []), ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, From ef0fc8d0d302220e87a9150b232edf987f7edddb Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 2 Jan 2025 18:00:18 -0300 Subject: [PATCH 07/13] broader search if didnt find results --- .../api/src/lib/extract/extraction-service.ts | 2 +- apps/api/src/lib/extract/url-processor.ts | 52 ++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 0ca6a3de..2791df3c 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -101,7 +101,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise mode: "llm", systemPrompt: (request.systemPrompt ? `${request.systemPrompt}\n` : "") + - "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. 
Here are the urls the user provided of which he wants to extract information from: " + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: request.prompt, schema: request.schema, diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 9f255ad7..af250fcd 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -66,8 +66,56 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); let mappedLinks = mapResults.mapResults as MapDocument[]; - const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; - const uniqueUrls = removeDuplicateUrls(allUrls); + let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + let uniqueUrls = removeDuplicateUrls(allUrls); + + // Track all discovered URLs + uniqueUrls.forEach(discoveredUrl => { + if (!urlTraces.some(t => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: 'mapped', + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + + // retry if only one url is returned + if (uniqueUrls.length === 1) { + const retryMapResults = await getMapResults({ + url: baseUrl, + teamId: options.teamId, + plan: options.plan, + allowExternalLinks: options.allowExternalLinks, + origin: options.origin, + limit: options.limit, + ignoreSitemap: false, + includeMetadata: true, + includeSubdomains: options.includeSubdomains, + }); + + mappedLinks = retryMapResults.mapResults as MapDocument[]; + allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + uniqueUrls = removeDuplicateUrls(allUrls); + + // Track all discovered URLs + uniqueUrls.forEach(discoveredUrl => { + if (!urlTraces.some(t => t.url === discoveredUrl)) { + urlTraces.push({ + url: discoveredUrl, + status: 'mapped', + warning: 'Broader search. 
Not limiting map results to prompt.', + timing: { + discoveredAt: new Date().toISOString(), + }, + usedInCompletion: false, + }); + } + }); + } // Track all discovered URLs uniqueUrls.forEach(discoveredUrl => { From 6b2e1cbb281362405c4b8729e25eae169ec13851 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:19:40 -0300 Subject: [PATCH 08/13] Nick: cache /extract scrapes --- apps/api/src/lib/extract/document-scraper.ts | 15 ++++++++++----- apps/api/src/scraper/scrapeURL/engines/index.ts | 6 ++++++ apps/api/src/scraper/scrapeURL/index.ts | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 04194b0b..91d515df 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -14,10 +14,13 @@ interface ScrapeDocumentOptions { timeout: number; } -export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise { +export async function scrapeDocument( + options: ScrapeDocumentOptions, + urlTraces: URLTrace[], +): Promise { const trace = urlTraces.find((t) => t.url === options.url); if (trace) { - trace.status = 'scraped'; + trace.status = "scraped"; trace.timing.scrapedAt = new Date().toISOString(); } @@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: mode: "single_urls", team_id: options.teamId, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { + useCache: true, + }, plan: options.plan, origin: options.origin, is_scrape: true, @@ -61,9 +66,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: } catch (error) { logger.error(`Error in scrapeDocument: ${error}`); if (trace) { - trace.status = 'error'; + trace.status = "error"; trace.error = error.message; } return null; } -} \ No newline at end of file +} diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index bf51ac94..956fc3ab 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -298,6 +298,12 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { + + if (meta.internalOptions.useCache !== true) { + engines.splice(engines.indexOf("cache"), 1); + }else{ + meta.logger.debug("Cache engine enabled by useCache option"); + } const prioritySum = [...meta.featureFlags].reduce( (a, x) => a + featureFlagOptions[x].priority, 0, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 549ce9d1..b13f7d9a 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -151,7 +151,7 @@ export type InternalOptions = { v0CrawlOnlyUrls?: boolean; v0DisableJsDom?: boolean; - + useCache?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine isBackgroundIndex?: boolean; }; From 432b4106789d495769da3804228b915522f42fa5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:26:05 -0300 Subject: [PATCH 09/13] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f6a033cb..8408cc61 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -531,16 +531,16 @@ 
async function indexJob(job: Job & { id: string }, document: Document) { document.markdown && job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! ) { - indexPage({ - document: document, - originUrl: job.data.crawl_id - ? (await getCrawl(job.data.crawl_id))?.originUrl! - : document.metadata.sourceURL!, - crawlId: job.data.crawl_id, - teamId: job.data.team_id, - }).catch((error) => { - _logger.error("Error indexing page", { error }); - }); + // indexPage({ + // document: document, + // originUrl: job.data.crawl_id + // ? (await getCrawl(job.data.crawl_id))?.originUrl! + // : document.metadata.sourceURL!, + // crawlId: job.data.crawl_id, + // teamId: job.data.team_id, + // }).catch((error) => { + // _logger.error("Error indexing page", { error }); + // }); } } From 499479c85e9da40a86e3c2ef83eaf1f924682ae5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:28:52 -0300 Subject: [PATCH 10/13] Update url-processor.ts --- apps/api/src/lib/extract/url-processor.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index af250fcd..a5027fa9 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); // retry if only one url is returned - if (uniqueUrls.length === 1) { + if (uniqueUrls.length <= 1) { const retryMapResults = await getMapResults({ url: baseUrl, teamId: options.teamId, From 8df1c67961dded611cfe18c9a1c304852d428c9d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:48:28 -0300 Subject: [PATCH 11/13] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 8408cc61..4ea3ff84 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -50,6 +50,7 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; +import { supabase_service } from "../services/supabase"; configDotenv(); @@ -77,6 +78,30 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { + // Get all visited URLs from Redis + const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited"); + + // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) + if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) { + try { + const { error } = await supabase_service + .from('crawl_maps') + .insert({ + crawl_id: job.data.crawl_id, + team_id: job.data.team_id, + origin_url: sc.originUrl, + urls: visitedUrls, + created_at: new Date().toISOString() + }); + + if (error) { + _logger.error("Failed to save crawl map", { error }); + } + } catch (error) { + _logger.error("Error saving crawl map", { error }); + } + } + if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); From a4f7c38834426c441d7da0221b7f467195cd2350 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 22:15:23 -0300 Subject: [PATCH 12/13] Nick: fixed --- 
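Notes: two fixes land in this patch. First, the cache engine is only spliced out of the fallback list when it is actually present; with the previous code, `engines.indexOf("cache")` returning -1 would have removed the last engine in the list instead. Second, the crawl-map save now looks up any existing row for the origin URL and merges the URL sets rather than inserting blindly. A minimal sketch of that merge as a hypothetical standalone helper (not part of this patch, shown only to illustrate the dedup semantics):

    // Combine previously stored URLs with newly visited ones, dropping duplicates.
    function mergeCrawlMapUrls(existing: string[], visited: string[]): string[] {
      return [...new Set([...existing, ...visited])];
    }

    // mergeCrawlMapUrls(["a.com/x"], ["a.com/x", "a.com/y"]) => ["a.com/x", "a.com/y"]

A plain Supabase upsert would typically overwrite the stored urls array rather than merge it, which is why the worker does a select followed by an update.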
.../src/scraper/scrapeURL/engines/index.ts | 7 ++- apps/api/src/services/queue-worker.ts | 52 ++++++++++++++----- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 956fc3ab..e452f7fa 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -300,8 +300,11 @@ export function buildFallbackList(meta: Meta): { }[] { if (meta.internalOptions.useCache !== true) { - engines.splice(engines.indexOf("cache"), 1); - }else{ + const cacheIndex = engines.indexOf("cache"); + if (cacheIndex !== -1) { + engines.splice(cacheIndex, 1); + } + } else { meta.logger.debug("Cache engine enabled by useCache option"); } const prioritySum = [...meta.featureFlags].reduce( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 4ea3ff84..f6ff96a5 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -84,18 +84,43 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) { try { - const { error } = await supabase_service + // First check if entry exists for this origin URL + const { data: existingMap } = await supabase_service .from('crawl_maps') - .insert({ - crawl_id: job.data.crawl_id, - team_id: job.data.team_id, - origin_url: sc.originUrl, - urls: visitedUrls, - created_at: new Date().toISOString() - }); + .select('urls') + .eq('origin_url', sc.originUrl) + .single(); + + if (existingMap) { + // Merge URLs, removing duplicates + const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])]; - if (error) { - _logger.error("Failed to save crawl map", { error }); + const { error } = await supabase_service + .from('crawl_maps') + .update({ + urls: mergedUrls, + num_urls: mergedUrls.length, + updated_at: new Date().toISOString() + }) + .eq('origin_url', sc.originUrl); + + if (error) { + _logger.error("Failed to update crawl map", { error }); + } + } else { + // Insert new entry if none exists + const { error } = await supabase_service + .from('crawl_maps') + .insert({ + origin_url: sc.originUrl, + urls: visitedUrls, + num_urls: visitedUrls.length, + created_at: new Date().toISOString() + }); + + if (error) { + _logger.error("Failed to save crawl map", { error }); + } } } catch (error) { _logger.error("Error saving crawl map", { error }); @@ -802,9 +827,10 @@ async function processJob(job: Job & { id: string }, token: string) { newJobId: jobId, }); } else { - logger.debug("Could not lock URL " + JSON.stringify(link), { - url: link, - }); + // TODO: removed this, ok? too many 'not useful' logs (?) Mogery! 
+ // logger.debug("Could not lock URL " + JSON.stringify(link), { + // url: link, + // }); } } } From c655c6859f256b10cb1a4cdd9d4e039940dea89a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 22:50:53 -0300 Subject: [PATCH 13/13] Nick: fixed --- apps/api/src/lib/canonical-url.ts | 7 ++ apps/api/src/services/queue-worker.ts | 97 ++++++++++++++++----------- 2 files changed, 63 insertions(+), 41 deletions(-) create mode 100644 apps/api/src/lib/canonical-url.ts diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts new file mode 100644 index 00000000..cbb33f8b --- /dev/null +++ b/apps/api/src/lib/canonical-url.ts @@ -0,0 +1,7 @@ +export function normalizeUrl(url: string) { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; +} \ No newline at end of file diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f6ff96a5..4fb08337 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; import { supabase_service } from "../services/supabase"; +import { normalizeUrl } from "../lib/canonical-url"; configDotenv(); @@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { - // Get all visited URLs from Redis - const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited"); - - // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) - if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) { - try { - // First check if entry exists for this origin URL - const { data: existingMap } = await supabase_service - .from('crawl_maps') - .select('urls') - .eq('origin_url', sc.originUrl) - .single(); + (async () => { + const originUrl = sc.originUrl ? 
normalizeUrl(sc.originUrl) : undefined; + // Get all visited URLs from Redis + const visitedUrls = await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited", + ); + // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) + if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) { + // Fire and forget the upload to Supabase + try { + // Standardize URLs to canonical form (https, no www) + const standardizedUrls = [ + ...new Set( + visitedUrls.map((url) => { + return normalizeUrl(url); + }), + ), + ]; + // First check if entry exists for this origin URL + const { data: existingMap } = await supabase_service + .from("crawl_maps") + .select("urls") + .eq("origin_url", originUrl) + .single(); - if (existingMap) { - // Merge URLs, removing duplicates - const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])]; - - const { error } = await supabase_service - .from('crawl_maps') - .update({ - urls: mergedUrls, - num_urls: mergedUrls.length, - updated_at: new Date().toISOString() - }) - .eq('origin_url', sc.originUrl); + if (existingMap) { + // Merge URLs, removing duplicates + const mergedUrls = [ + ...new Set([...existingMap.urls, ...standardizedUrls]), + ]; - if (error) { - _logger.error("Failed to update crawl map", { error }); - } - } else { - // Insert new entry if none exists - const { error } = await supabase_service - .from('crawl_maps') - .insert({ - origin_url: sc.originUrl, - urls: visitedUrls, - num_urls: visitedUrls.length, - created_at: new Date().toISOString() + const { error } = await supabase_service + .from("crawl_maps") + .update({ + urls: mergedUrls, + num_urls: mergedUrls.length, + updated_at: new Date().toISOString(), + }) + .eq("origin_url", originUrl); + + if (error) { + _logger.error("Failed to update crawl map", { error }); + } + } else { + // Insert new entry if none exists + const { error } = await supabase_service.from("crawl_maps").insert({ + origin_url: originUrl, + urls: standardizedUrls, + num_urls: standardizedUrls.length, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), }); - if (error) { - _logger.error("Failed to save crawl map", { error }); + if (error) { + _logger.error("Failed to save crawl map", { error }); + } } + } catch (error) { + _logger.error("Error saving crawl map", { error }); } - } catch (error) { - _logger.error("Error saving crawl map", { error }); } - } + })(); if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id);
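For reference, the normalizeUrl helper added in canonical-url.ts strips the protocol, a leading www., and a single trailing slash, so both the origin URL and every visited URL collapse to one canonical key before the crawl map is stored. A quick usage sketch (the example URLs are illustrative; the import path matches how queue-worker.ts resolves the helper from src/services):

    import { normalizeUrl } from "../lib/canonical-url";

    normalizeUrl("https://www.example.com/");      // "example.com"
    normalizeUrl("http://example.com/docs/");      // "example.com/docs"
    normalizeUrl("https://blog.example.com/post"); // "blog.example.com/post"

The crawl-map upload itself is wrapped in a fire-and-forget async IIFE, so finishing the crawl job is not held up waiting on the Supabase write.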