This commit is contained in:
Nicolas 2025-01-03 22:55:42 -03:00
commit e8a9d8ddcd
18 changed files with 570 additions and 133 deletions

2
.gitignore vendored
View File

@ -34,5 +34,7 @@ apps/js-sdk/firecrawl/dist
/examples/sonnet_web_crawler/firecrawl_env
/examples/internal_link_assitant/firecrawl_env
/apps/api/logs/*
/apps/api/debug/*
.vscode

View File

@ -58,6 +58,7 @@
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.40.8",
"@pinecone-database/pinecone": "^4.0.0",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
"@sentry/profiling-node": "^8.26.0",

101
apps/api/pnpm-lock.yaml generated
View File

@ -10,7 +10,7 @@ importers:
dependencies:
'@anthropic-ai/sdk':
specifier: ^0.24.3
version: 0.24.3
version: 0.24.3(encoding@0.1.13)
'@brillout/import':
specifier: ^0.2.2
version: 0.2.3
@ -29,9 +29,12 @@ importers:
'@nangohq/node':
specifier: ^0.40.8
version: 0.40.8
'@pinecone-database/pinecone':
specifier: ^4.0.0
version: 4.0.0
'@sentry/cli':
specifier: ^2.33.1
version: 2.33.1
version: 2.33.1(encoding@0.1.13)
'@sentry/node':
specifier: ^8.26.0
version: 8.26.0
@ -79,7 +82,7 @@ importers:
version: 1.1.1
cohere-ai:
specifier: ^7.14.0
version: 7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))
version: 7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(encoding@0.1.13)
cors:
specifier: ^2.8.5
version: 2.8.5
@ -130,13 +133,13 @@ importers:
version: 2.9.0
langchain:
specifier: ^0.2.8
version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
version: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
languagedetect:
specifier: ^2.0.0
version: 2.0.0
logsnag:
specifier: ^1.0.0
version: 1.0.0
version: 1.0.0(encoding@0.1.13)
luxon:
specifier: ^3.4.3
version: 3.4.4
@ -157,7 +160,7 @@ importers:
version: 7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3)
openai:
specifier: ^4.57.0
version: 4.57.0(zod@3.23.8)
version: 4.57.0(encoding@0.1.13)(zod@3.23.8)
pdf-parse:
specifier: ^1.1.1
version: 1.1.1
@ -275,7 +278,7 @@ importers:
version: 29.7.0(@types/node@20.14.1)(ts-node@10.9.2(@types/node@20.14.1)(typescript@5.4.5))
jest-fetch-mock:
specifier: ^3.0.3
version: 3.0.3
version: 3.0.3(encoding@0.1.13)
mammoth:
specifier: ^1.7.2
version: 1.7.2
@ -1006,6 +1009,10 @@ packages:
'@pdf-lib/upng@1.0.1':
resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==}
'@pinecone-database/pinecone@4.0.0':
resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==}
engines: {node: '>=18.0.0'}
'@pkgjs/parseargs@0.11.0':
resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==}
engines: {node: '>=14'}
@ -2279,6 +2286,9 @@ packages:
resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==}
engines: {node: '>= 0.8'}
encoding@0.1.13:
resolution: {integrity: sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==}
end-of-stream@1.4.4:
resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==}
@ -4599,7 +4609,7 @@ snapshots:
'@jridgewell/gen-mapping': 0.3.5
'@jridgewell/trace-mapping': 0.3.25
'@anthropic-ai/sdk@0.24.3':
'@anthropic-ai/sdk@0.24.3(encoding@0.1.13)':
dependencies:
'@types/node': 18.19.39
'@types/node-fetch': 2.6.11
@ -4607,7 +4617,7 @@ snapshots:
agentkeepalive: 4.5.0
form-data-encoder: 1.7.2
formdata-node: 4.4.1
node-fetch: 2.7.0
node-fetch: 2.7.0(encoding@0.1.13)
web-streams-polyfill: 3.3.3
transitivePeerDependencies:
- encoding
@ -5577,13 +5587,13 @@ snapshots:
'@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.4.15
'@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
'@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
dependencies:
ansi-styles: 5.2.0
camelcase: 6.3.0
decamelize: 1.2.0
js-tiktoken: 1.0.12
langsmith: 0.1.34(zyeavx4tfqw3smbbpiinhfxxeu)
langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4)
ml-distance: 4.0.1
mustache: 4.2.0
p-queue: 6.6.2
@ -5595,20 +5605,20 @@ snapshots:
- langchain
- openai
'@langchain/openai@0.2.1(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
'@langchain/openai@0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
js-tiktoken: 1.0.12
openai: 4.57.0(zod@3.23.8)
openai: 4.57.0(encoding@0.1.13)(zod@3.23.8)
zod: 3.23.8
zod-to-json-schema: 3.23.1(zod@3.23.8)
transitivePeerDependencies:
- encoding
- langchain
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
js-tiktoken: 1.0.12
transitivePeerDependencies:
- langchain
@ -5866,6 +5876,10 @@ snapshots:
dependencies:
pako: 1.0.11
'@pinecone-database/pinecone@4.0.0':
dependencies:
encoding: 0.1.13
'@pkgjs/parseargs@0.11.0':
optional: true
@ -5950,10 +5964,10 @@ snapshots:
'@sentry/cli-win32-x64@2.33.1':
optional: true
'@sentry/cli@2.33.1':
'@sentry/cli@2.33.1(encoding@0.1.13)':
dependencies:
https-proxy-agent: 5.0.1
node-fetch: 2.7.0
node-fetch: 2.7.0(encoding@0.1.13)
progress: 2.0.3
proxy-from-env: 1.1.0
which: 2.0.2
@ -7088,7 +7102,7 @@ snapshots:
co@4.6.0: {}
cohere-ai@7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)):
cohere-ai@7.14.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(encoding@0.1.13):
dependencies:
'@aws-sdk/client-sagemaker': 3.679.0
'@aws-sdk/credential-providers': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))
@ -7098,7 +7112,7 @@ snapshots:
form-data-encoder: 4.0.2
formdata-node: 6.0.3
js-base64: 3.7.2
node-fetch: 2.7.0
node-fetch: 2.7.0(encoding@0.1.13)
qs: 6.11.2
readable-stream: 4.5.2
url-join: 4.0.1
@ -7208,9 +7222,9 @@ snapshots:
dependencies:
luxon: 3.4.4
cross-fetch@3.1.8:
cross-fetch@3.1.8(encoding@0.1.13):
dependencies:
node-fetch: 2.7.0
node-fetch: 2.7.0(encoding@0.1.13)
transitivePeerDependencies:
- encoding
@ -7365,6 +7379,10 @@ snapshots:
encodeurl@1.0.2: {}
encoding@0.1.13:
dependencies:
iconv-lite: 0.6.3
end-of-stream@1.4.4:
dependencies:
once: 1.4.0
@ -7899,9 +7917,9 @@ snapshots:
isexe@2.0.0: {}
isomorphic-fetch@3.0.0:
isomorphic-fetch@3.0.0(encoding@0.1.13):
dependencies:
node-fetch: 2.7.0
node-fetch: 2.7.0(encoding@0.1.13)
whatwg-fetch: 3.6.20
transitivePeerDependencies:
- encoding
@ -8070,9 +8088,9 @@ snapshots:
jest-mock: 29.7.0
jest-util: 29.7.0
jest-fetch-mock@3.0.3:
jest-fetch-mock@3.0.3(encoding@0.1.13):
dependencies:
cross-fetch: 3.1.8
cross-fetch: 3.1.8(encoding@0.1.13)
promise-polyfill: 8.3.0
transitivePeerDependencies:
- encoding
@ -8342,17 +8360,17 @@ snapshots:
kuler@2.0.0: {}
langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
'@langchain/openai': 0.2.1(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
'@langchain/openai': 0.2.1(encoding@0.1.13)(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
binary-extensions: 2.3.0
js-tiktoken: 1.0.12
js-yaml: 4.1.0
jsonpointer: 5.0.1
langchainhub: 0.0.11
langsmith: 0.1.34(zyeavx4tfqw3smbbpiinhfxxeu)
langsmith: 0.1.34(npkyd6f7wyl3urgrzoxaktl5a4)
ml-distance: 4.0.1
openapi-types: 12.1.3
p-retry: 4.6.2
@ -8362,6 +8380,7 @@ snapshots:
zod-to-json-schema: 3.23.1(zod@3.23.8)
optionalDependencies:
'@aws-sdk/credential-provider-node': 3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0)
'@pinecone-database/pinecone': 4.0.0
'@supabase/supabase-js': 2.44.2
axios: 1.7.2
cheerio: 1.0.0-rc.12
@ -8381,7 +8400,7 @@ snapshots:
langchainhub@0.0.11: {}
langsmith@0.1.34(zyeavx4tfqw3smbbpiinhfxxeu):
langsmith@0.1.34(npkyd6f7wyl3urgrzoxaktl5a4):
dependencies:
'@types/uuid': 9.0.8
commander: 10.0.1
@ -8390,9 +8409,9 @@ snapshots:
p-retry: 4.6.2
uuid: 9.0.1
optionalDependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
openai: 4.57.0(zod@3.23.8)
'@langchain/core': 0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))
langchain: 0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
openai: 4.57.0(encoding@0.1.13)(zod@3.23.8)
languagedetect@2.0.0: {}
@ -8442,9 +8461,9 @@ snapshots:
loglevel@1.9.1: {}
logsnag@1.0.0:
logsnag@1.0.0(encoding@0.1.13):
dependencies:
isomorphic-fetch: 3.0.0
isomorphic-fetch: 3.0.0(encoding@0.1.13)
transitivePeerDependencies:
- encoding
@ -8703,9 +8722,11 @@ snapshots:
node-ensure@0.0.0: {}
node-fetch@2.7.0:
node-fetch@2.7.0(encoding@0.1.13):
dependencies:
whatwg-url: 5.0.0
optionalDependencies:
encoding: 0.1.13
node-fetch@3.3.2:
dependencies:
@ -8780,7 +8801,7 @@ snapshots:
transitivePeerDependencies:
- debug
openai@4.57.0(zod@3.23.8):
openai@4.57.0(encoding@0.1.13)(zod@3.23.8):
dependencies:
'@types/node': 18.19.39
'@types/node-fetch': 2.6.11
@ -8789,7 +8810,7 @@ snapshots:
agentkeepalive: 4.5.0
form-data-encoder: 1.7.2
formdata-node: 4.4.1
node-fetch: 2.7.0
node-fetch: 2.7.0(encoding@0.1.13)
qs: 6.12.2
optionalDependencies:
zod: 3.23.8

View File

@ -254,3 +254,4 @@ logger.info(`Worker ${process.pid} started`);
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
//

View File

@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;
try {
await cacheRedis.set(key, JSON.stringify(entry));
await cacheRedis.set(key, JSON.stringify(entry), "EX", 3600); // 1 hour in seconds
} catch (error) {
logger.warn("Failed to save to cache", { key, error });
}

View File

@ -0,0 +1,7 @@
/**
 * Canonicalizes a URL for comparison purposes: strips the http(s)
 * scheme, a leading "www." prefix, and a single trailing slash.
 * Purely textual — the input is never parsed as a URL.
 */
export function normalizeUrl(url: string) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const withoutWww = withoutScheme.replace(/^www\./, "");
  return withoutWww.endsWith("/") ? withoutWww.slice(0, -1) : withoutWww;
}

View File

@ -0,0 +1,7 @@
// Tuning knobs for the extract pipeline's link-ranking stage.
export const extractConfig = {
  // Hard cap on how many candidate links enter the initial ranking pass.
  MAX_INITIAL_RANKING_LIMIT: 1000,
  // Maximum number of links kept after reranking.
  MAX_RANKING_LIMIT: 20,
  // Relevance score a link must exceed on the first filtering pass.
  INITIAL_SCORE_THRESHOLD: 0.75,
  // Relaxed score used when the initial pass yields too few links.
  FALLBACK_SCORE_THRESHOLD: 0.5,
  // Minimum number of links required before falling back to the
  // lower threshold (and ultimately to a top-N slice regardless of score).
  MIN_REQUIRED_LINKS: 1,
};

View File

@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
timeout: number;
}
export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
export async function scrapeDocument(
options: ScrapeDocumentOptions,
urlTraces: URLTrace[],
): Promise<Document | null> {
const trace = urlTraces.find((t) => t.url === options.url);
if (trace) {
trace.status = 'scraped';
trace.status = "scraped";
trace.timing.scrapedAt = new Date().toISOString();
}
@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
mode: "single_urls",
team_id: options.teamId,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
internalOptions: {
useCache: true,
},
plan: options.plan,
origin: options.origin,
is_scrape: true,
@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
} catch (error) {
logger.error(`Error in scrapeDocument: ${error}`);
if (trace) {
trace.status = 'error';
trace.status = "error";
trace.error = error.message;
}
return null;

View File

@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { saveCrawl, StoredCrawl } from "../crawl-redis";
interface ExtractServiceOptions {
request: ExtractRequest;
@ -24,6 +26,18 @@ interface ExtractResult {
error?: string;
}
/**
 * Reduces a URL to its scheme + hostname (e.g. "https://example.com").
 * A trailing "/*" crawl wildcard is removed first. If the cleaned input
 * cannot be parsed as a URL, it is returned as-is.
 */
function getRootDomain(url: string): string {
  const cleaned = url.endsWith("/*") ? url.slice(0, -2) : url;
  try {
    const { protocol, hostname } = new URL(cleaned);
    return `${protocol}//${hostname}`;
  } catch {
    return cleaned;
  }
}
export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
const { request, teamId, plan, subId } = options;
const scrapeId = crypto.randomUUID();
@ -87,7 +101,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
mode: "llm",
systemPrompt:
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
prompt: request.prompt,
schema: request.schema,
@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
});
}
// Kickoff background crawl for indexing root domains
// const rootDomains = new Set(request.urls.map(getRootDomain));
// rootDomains.forEach(async url => {
// const crawlId = crypto.randomUUID();
// // Create and save crawl configuration first
// const sc: StoredCrawl = {
// originUrl: url,
// crawlerOptions: {
// maxDepth: 15,
// limit: 5000,
// includePaths: [],
// excludePaths: [],
// ignoreSitemap: false,
// includeSubdomains: true,
// allowExternalLinks: false,
// allowBackwardLinks: true
// },
// scrapeOptions: {
// formats: ["markdown"],
// onlyMainContent: true,
// waitFor: 0,
// mobile: false,
// removeBase64Images: true,
// fastMode: false,
// parsePDF: true,
// skipTlsVerification: false,
// },
// internalOptions: {
// disableSmartWaitCache: true,
// isBackgroundIndex: true
// },
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// createdAt: Date.now(),
// plan: "hobby", // make it a low concurrency
// };
// // Save the crawl configuration
// await saveCrawl(crawlId, sc);
// // Then kick off the job
// await _addScrapeJobToBullMQ({
// url,
// mode: "kickoff" as const,
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// plan: "hobby", // make it a low concurrency
// crawlerOptions: sc.crawlerOptions,
// scrapeOptions: sc.scrapeOptions,
// internalOptions: sc.internalOptions,
// origin: "index",
// crawl_id: crawlId,
// webhook: null,
// v1: true,
// }, {}, crypto.randomUUID(), 50);
// });
// Bill team for usage
billTeam(teamId, subId, links.length * 5).catch((error) => {
logger.error(

View File

@ -0,0 +1,158 @@
import { Pinecone } from '@pinecone-database/pinecone';
import { Document } from '../../../controllers/v1/types';
import { logger } from '../../logger';
import OpenAI from "openai";
// Shared OpenAI client used to generate embeddings for indexing/search.
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// Shared Pinecone client; PINECONE_API_KEY must be set at startup.
const pinecone = new Pinecone({
  apiKey: process.env.PINECONE_API_KEY!,
});

// Target Pinecone index. Defaults to "" when the env var is unset —
// NOTE(review): pinecone.index("") would fail at call time; confirm
// this path only runs when PINECONE_INDEX_NAME is configured.
const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";

const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes — per-record metadata budget
// Shape of the metadata payload stored alongside each vector in Pinecone.
export interface PageMetadata {
  url: string; // normalized page URL (also used as the Pinecone record id)
  originUrl: string; // normalized origin URL the page was crawled from
  title?: string; // page title, when the scrape produced one
  description?: string; // page meta description, when available
  crawlId?: string; // crawl that produced this page, if any
  teamId?: string; // owning team, if any
  timestamp: number; // epoch millis when the page was indexed (Date.now())
  markdown?: string; // page markdown, possibly truncated to fit MAX_METADATA_SIZE
}
/**
 * Embeds `text` with OpenAI's text-embedding-3-small model and returns
 * the raw float vector from the first (only) result.
 */
async function getEmbedding(text: string) {
  const response = await openai.embeddings.create({
    encoding_format: "float",
    model: "text-embedding-3-small",
    input: text,
  });
  return response.data[0].embedding;
}
/**
 * Normalizes a URL for use as a Pinecone record id: forces a "www."
 * host prefix so the same page indexed with and without it maps to a
 * single record. Throws if the input is not a parseable URL.
 */
function normalizeUrl(url: string) {
  const parsed = new URL(url);
  parsed.hostname = parsed.hostname.startsWith("www.")
    ? parsed.hostname
    : `www.${parsed.hostname}`;
  return parsed.href;
}
/**
 * Embeds a scraped document (title + description + markdown) and upserts
 * it into the Pinecone index, keyed by its normalized source URL.
 * Failures are logged and swallowed so indexing never breaks the
 * calling scrape/crawl pipeline.
 */
export async function indexPage({
  document,
  originUrl,
  crawlId,
  teamId
}: {
  document: Document;
  originUrl: string;
  crawlId?: string;
  teamId?: string;
}
) {
  try {
    const index = pinecone.index(INDEX_NAME);

    // Trim markdown if it's too long — Pinecone limits per-record
    // metadata size, and the markdown dominates the payload.
    let trimmedMarkdown = document.markdown;
    if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
      trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
    }

    // Create text to embed; filter(Boolean) drops missing title/description.
    const textToEmbed = [
      document.metadata.title,
      document.metadata.description,
      trimmedMarkdown
    ].filter(Boolean).join('\n\n');

    // Get embedding from OpenAI
    const embedding = await getEmbedding(textToEmbed);

    // normalizeUrl forces a "www." host prefix; the result doubles as the record id.
    const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);

    // Prepare metadata
    const metadata: PageMetadata = {
      url: normalizedUrl,
      originUrl: normalizeUrl(originUrl),
      title: document.metadata.title,
      description: document.metadata.description,
      crawlId,
      teamId,
      markdown: trimmedMarkdown,
      timestamp: Date.now()
    };

    // Upsert to Pinecone.
    // NOTE(review): the spread also adds the raw (un-normalized) source
    // URL itself as a boolean-valued metadata key — presumably to make
    // the record findable by its original URL via a metadata filter;
    // confirm intent. Also verify the SDK tolerates undefined fields
    // (title/description/crawlId/teamId may be undefined here).
    await index.upsert([{
      id: normalizedUrl,
      values: embedding,
      metadata: {
        ...metadata,
        [document.metadata.sourceURL || document.metadata.url!]: true
      }
    }]);

    logger.debug('Successfully indexed page in Pinecone', {
      url: metadata.url,
      crawlId
    });
  } catch (error) {
    // Best-effort: log and continue; indexing is not critical-path.
    logger.error('Failed to index page in Pinecone', {
      error,
      url: document.metadata.sourceURL || document.metadata.url,
      crawlId
    });
  }
}
/**
 * Vector-searches the Pinecone index for pages similar to `query`,
 * optionally restricted to pages whose normalized originUrl matches.
 * Returns up to `limit` lightweight result objects; on any failure the
 * error is logged and an empty array is returned.
 */
export async function searchSimilarPages(
  query: string,
  originUrl?: string,
  limit: number = 10
) {
  try {
    const index = pinecone.index(INDEX_NAME);

    // Embed the query with the same model used at indexing time.
    const vector = await getEmbedding(query);

    const params: any = {
      vector,
      topK: limit,
      includeMetadata: true
    };
    if (originUrl) {
      // Scope results to one site via a metadata equality filter.
      params.filter = { originUrl: { $eq: normalizeUrl(originUrl) } };
    }

    const { matches } = await index.query(params);
    return matches.map((m) => ({
      url: m.metadata?.url,
      title: m.metadata?.title,
      description: m.metadata?.description,
      score: m.score,
      markdown: m.metadata?.markdown
    }));
  } catch (error) {
    logger.error('Failed to search similar pages in Pinecone', {
      error,
      query,
      originUrl
    });
    return [];
  }
}

View File

@ -3,15 +3,13 @@ import { performRanking } from "../ranker";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { logger } from "../logger";
import { CohereClient } from "cohere-ai";
import { extractConfig } from "./config";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
});
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;
interface RankingResult {
mappedLinks: MapDocument[];
@ -61,32 +59,35 @@ export async function rerankLinks(
searchQuery,
);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(
mappedLinks,
linksAndScores,
INITIAL_SCORE_THRESHOLD,
extractConfig.INITIAL_SCORE_THRESHOLD,
);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
logger.info(
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
`Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
);
filteredLinks = filterAndProcessLinks(
mappedLinks,
linksAndScores,
FALLBACK_SCORE_THRESHOLD,
extractConfig.FALLBACK_SCORE_THRESHOLD,
);
if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score
logger.warn(
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
`No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`,
);
filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score)
.slice(0, MIN_REQUIRED_LINKS)
.slice(0, extractConfig.MIN_REQUIRED_LINKS)
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
@ -108,7 +109,7 @@ export async function rerankLinks(
}
});
const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
// Mark URLs that will be used in completion
rankedLinks.forEach(link => {
@ -119,7 +120,7 @@ export async function rerankLinks(
});
// Mark URLs that were dropped due to ranking limit
filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
const trace = urlTraces.find(t => t.url === link.url);
if (trace) {
trace.warning = 'Excluded due to ranking limit';

View File

@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts";
import { logger } from "../logger";
import { rerankLinks } from "./reranker";
const MAX_EXTRACT_LIMIT = 100;
import { extractConfig } from "./config";
interface ProcessUrlOptions {
url: string;
@ -67,8 +66,56 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
});
let mappedLinks = mapResults.mapResults as MapDocument[];
const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
const uniqueUrls = removeDuplicateUrls(allUrls);
let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
let uniqueUrls = removeDuplicateUrls(allUrls);
// Track all discovered URLs
uniqueUrls.forEach(discoveredUrl => {
if (!urlTraces.some(t => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: 'mapped',
timing: {
discoveredAt: new Date().toISOString(),
},
usedInCompletion: false,
});
}
});
// retry if only one url is returned
if (uniqueUrls.length <= 1) {
const retryMapResults = await getMapResults({
url: baseUrl,
teamId: options.teamId,
plan: options.plan,
allowExternalLinks: options.allowExternalLinks,
origin: options.origin,
limit: options.limit,
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
});
mappedLinks = retryMapResults.mapResults as MapDocument[];
allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
uniqueUrls = removeDuplicateUrls(allUrls);
// Track all discovered URLs
uniqueUrls.forEach(discoveredUrl => {
if (!urlTraces.some(t => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: 'mapped',
warning: 'Broader search. Not limiting map results to prompt.',
timing: {
discoveredAt: new Date().toISOString(),
},
usedInCompletion: false,
});
}
});
}
// Track all discovered URLs
uniqueUrls.forEach(discoveredUrl => {
@ -96,8 +143,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
// Limit initial set of links
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
// Limit initial set of links (1000)
mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);
// Perform reranking if prompt is provided
if (options.prompt) {

View File

@ -10,7 +10,7 @@ const openai = new OpenAI({
async function getEmbedding(text: string) {
const embedding = await openai.embeddings.create({
model: "text-embedding-ada-002",
model: "text-embedding-3-small",
input: text,
encoding_format: "float",
});

View File

@ -16,6 +16,7 @@ import {
ScrapeUrlResponse,
} from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines";
import { indexPage } from "../lib/extract/index/pinecone";
configDotenv();
export async function startWebScraperPipeline({
@ -173,6 +174,12 @@ export async function runWebScraper({
creditsToBeBilled = 5;
}
// If the team is the background index team, return the response
if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
return response;
}
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,

View File

@ -38,7 +38,7 @@ const useCache =
process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
// ...(useCache ? [ "cache" as const ] : []),
...(useCache ? [ "cache" as const ] : []),
...(useFireEngine
? [
"fire-engine;chrome-cdp" as const,
@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
if (meta.internalOptions.useCache !== true) {
const cacheIndex = engines.indexOf("cache");
if (cacheIndex !== -1) {
engines.splice(cacheIndex, 1);
}
} else {
meta.logger.debug("Cache engine enabled by useCache option");
}
const prioritySum = [...meta.featureFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0,

View File

@ -151,8 +151,9 @@ export type InternalOptions = {
v0CrawlOnlyUrls?: boolean;
v0DisableJsDom?: boolean;
useCache?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
};
export type EngineResultsTracker = {

View File

@ -48,6 +48,11 @@ import {
} from "../lib/concurrency-limit";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
import { normalizeUrl } from "../lib/canonical-url";
configDotenv();
class RacedRedirectError extends Error {
@ -74,6 +79,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) {
(async () => {
const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
// Get all visited URLs from Redis
const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited",
);
// Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
// Fire and forget the upload to Supabase
try {
// Standardize URLs to canonical form (https, no www)
const standardizedUrls = [
...new Set(
visitedUrls.map((url) => {
return normalizeUrl(url);
}),
),
];
// First check if entry exists for this origin URL
const { data: existingMap } = await supabase_service
.from("crawl_maps")
.select("urls")
.eq("origin_url", originUrl)
.single();
if (existingMap) {
// Merge URLs, removing duplicates
const mergedUrls = [
...new Set([...existingMap.urls, ...standardizedUrls]),
];
const { error } = await supabase_service
.from("crawl_maps")
.update({
urls: mergedUrls,
num_urls: mergedUrls.length,
updated_at: new Date().toISOString(),
})
.eq("origin_url", originUrl);
if (error) {
_logger.error("Failed to update crawl map", { error });
}
} else {
// Insert new entry if none exists
const { error } = await supabase_service.from("crawl_maps").insert({
origin_url: originUrl,
urls: standardizedUrls,
num_urls: standardizedUrls.length,
created_at: new Date().toISOString(),
updated_at: new Date().toISOString(),
});
if (error) {
_logger.error("Failed to save crawl map", { error });
}
}
} catch (error) {
_logger.error("Error saving crawl map", { error });
}
}
})();
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
@ -209,7 +277,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
const result = await processJob(job, token);
if (result.success) {
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
if (
job.data.crawl_id &&
process.env.USE_DB_AUTHENTICATION === "true"
) {
logger.debug(
"Job succeeded -- has crawl associated, putting null in Redis",
);
@ -411,7 +482,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
: await crawler.tryGetSitemap(async (urls) => {
if (urls.length === 0) return;
logger.debug("Using sitemap chunk of length " + urls.length, {
@ -425,7 +496,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
});
logger.debug("Using job priority " + jobPriority, { jobPriority });
const jobs = urls.map(url => {
const jobs = urls.map((url) => {
const uuid = uuidv4();
return {
name: uuid,
@ -512,13 +583,32 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
);
}
return { success: true }
return { success: true };
} catch (error) {
logger.error("An error occurred!", { error })
logger.error("An error occurred!", { error });
return { success: false, error };
}
}
/**
 * Conditionally indexes a scraped document for background search indexing.
 *
 * Only acts when the job belongs to the dedicated background-index team
 * (BACKGROUND_INDEX_TEAM_ID) and the document actually produced markdown.
 *
 * NOTE(review): the Pinecone `indexPage` call below is currently commented
 * out, so this function is presently a gated no-op — the disabled code is
 * kept as a reference for re-enabling indexing later.
 *
 * @param job      - The queue job (must carry `data.team_id`; may carry
 *                   `data.crawl_id` used to resolve the crawl's origin URL).
 * @param document - The scraped document; only indexed if `markdown` is set.
 */
async function indexJob(job: Job & { id: string }, document: Document) {
  if (
    document &&
    document.markdown &&
    job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
  ) {
    // Disabled indexing call — fire-and-forget upsert into Pinecone, using
    // the crawl's origin URL when the job is part of a crawl, otherwise the
    // document's own source URL.
    // indexPage({
    //   document: document,
    //   originUrl: job.data.crawl_id
    //     ? (await getCrawl(job.data.crawl_id))?.originUrl!
    //     : document.metadata.sourceURL!,
    //   crawlId: job.data.crawl_id,
    //   teamId: job.data.team_id,
    // }).catch((error) => {
    //   _logger.error("Error indexing page", { error });
    // });
  }
}
async function processJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
@ -623,8 +713,12 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.sourceURL, sc)
) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
if (crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null) {
throw new Error("Redirected target URL is not allowed by crawlOptions"); // TODO: make this its own error type that is ignored by error tracking
if (
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
) {
throw new Error(
"Redirected target URL is not allowed by crawlOptions",
); // TODO: make this its own error type that is ignored by error tracking
}
if (isUrlBlocked(doc.metadata.url)) {
@ -675,6 +769,8 @@ async function processJob(job: Job & { id: string }, token: string) {
true,
);
indexJob(job, doc);
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, true);
@ -746,15 +842,18 @@ async function processJob(job: Job & { id: string }, token: string) {
newJobId: jobId,
});
} else {
logger.debug("Could not lock URL " + JSON.stringify(link), {
url: link,
});
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
}
}
}
}
await finishCrawlIfNeeded(job, sc);
} else {
indexJob(job, doc);
}
logger.info(`🐂 Job done ${job.id}`);

View File

@ -166,6 +166,7 @@ const testSuiteTokens = [
"4c2638d",
"cbb3462", // don't remove (s-ai)
"824abcd", // don't remove (s-ai)
"0966288",
];
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];