commit bf0b1c7ae0
parent 830d15f2f6

    wip
apps/api/package.json
@@ -51,6 +51,9 @@
     "typescript": "^5.4.2"
   },
   "dependencies": {
+    "@ai-sdk/anthropic": "^1.2.2",
+    "@ai-sdk/google": "^1.2.3",
+    "@ai-sdk/groq": "^1.2.1",
     "@ai-sdk/openai": "^1.1.13",
     "@anthropic-ai/sdk": "^0.24.3",
     "@apidevtools/json-schema-ref-parser": "^11.7.3",
@@ -67,7 +70,7 @@
     "@supabase/supabase-js": "^2.44.2",
     "@types/express-ws": "^3.0.4",
     "@types/ws": "^8.5.12",
-    "ai": "^4.1.45",
+    "ai": "^4.2.8",
     "ajv": "^8.16.0",
     "async": "^3.2.5",
     "async-mutex": "^0.5.0",
apps/api/pnpm-lock.yaml (generated): 160 lines changed
@@ -8,6 +8,15 @@ importers:
 
   .:
     dependencies:
+      '@ai-sdk/anthropic':
+        specifier: ^1.2.2
+        version: 1.2.2(zod@3.24.2)
+      '@ai-sdk/google':
+        specifier: ^1.2.3
+        version: 1.2.3(zod@3.24.2)
+      '@ai-sdk/groq':
+        specifier: ^1.2.1
+        version: 1.2.1(zod@3.24.2)
       '@ai-sdk/openai':
         specifier: ^1.1.13
         version: 1.1.13(zod@3.24.2)
@@ -57,8 +66,8 @@ importers:
         specifier: ^8.5.12
         version: 8.5.12
       ai:
-        specifier: ^4.1.45
-        version: 4.1.45(react@18.3.1)(zod@3.24.2)
+        specifier: ^4.2.8
+        version: 4.2.8(react@18.3.1)(zod@3.24.2)
       ajv:
         specifier: ^8.16.0
         version: 8.16.0
@@ -324,6 +333,24 @@ importers:
 
 packages:
 
+  '@ai-sdk/anthropic@1.2.2':
+    resolution: {integrity: sha512-BNZTtbP+zuCzUf8hkpTx7YhwjbQ9oLh2yhjgXC7X1QcNsn9TFRaweX7CP8USM0g6lm/Crm1Qn+4tOCe1p10NTA==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.0.0
+
+  '@ai-sdk/google@1.2.3':
+    resolution: {integrity: sha512-zsgwko7T+MFIdEfhg4fIXv6O2dnzTLFr6BOpAA21eo/moOBA5szVzOto1jTwIwoBYsF2ixPGNZBoc+k/fQ2AWw==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.0.0
+
+  '@ai-sdk/groq@1.2.1':
+    resolution: {integrity: sha512-e9Vn6sE6u+pm97YSK9+xiTgQ2ScRdipE5gAwXj/9HdgMnUyp3mDpWjFsmDM6bzyeb2iKOGv6f3eiRsLxOAPv4A==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.0.0
+
   '@ai-sdk/openai@1.1.13':
     resolution: {integrity: sha512-IdChK1pJTW3NQis02PG/hHTG0gZSyQIMOLPt7f7ES56C0xH2yaKOU1Tp2aib7pZzWGwDlzTOW2h5TtAB8+V6CQ==}
     engines: {node: '>=18'}
@@ -339,30 +366,35 @@ packages:
       zod:
         optional: true
 
+  '@ai-sdk/provider-utils@2.2.1':
+    resolution: {integrity: sha512-BuExLp+NcpwsAVj1F4bgJuQkSqO/+roV9wM7RdIO+NVrcT8RBUTdXzf5arHt5T58VpK7bZyB2V9qigjaPHE+Dg==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.23.8
+
   '@ai-sdk/provider@1.0.8':
     resolution: {integrity: sha512-f9jSYwKMdXvm44Dmab1vUBnfCDSFfI5rOtvV1W9oKB7WYHR5dGvCC6x68Mk3NUfrdmNoMVHGoh6JT9HCVMlMow==}
     engines: {node: '>=18'}
 
-  '@ai-sdk/react@1.1.17':
-    resolution: {integrity: sha512-NAuEflFvjw1uh1AOmpyi7rBF4xasWsiWUb86JQ8ScjDGxoGDYEdBnaHOxUpooLna0dGNbSPkvDMnVRhoLKoxPQ==}
+  '@ai-sdk/provider@1.1.0':
+    resolution: {integrity: sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==}
+    engines: {node: '>=18'}
+
+  '@ai-sdk/react@1.2.3':
+    resolution: {integrity: sha512-EQ6nmmQBBAal1yg72GB/Q7QnmDXMfgYvCo9Gym2mESXUHTqwpXU0JFHtk5Kq3EEkk7CVMf1oBWlNFNvU5ckQBg==}
     engines: {node: '>=18'}
     peerDependencies:
       react: ^18 || ^19 || ^19.0.0-rc
-      zod: ^3.0.0
+      zod: ^3.23.8
     peerDependenciesMeta:
-      react:
-        optional: true
       zod:
         optional: true
 
-  '@ai-sdk/ui-utils@1.1.15':
-    resolution: {integrity: sha512-NsV/3CMmjc4m53snzRdtZM6teTQUXIKi8u0Kf7GBruSzaMSuZ4DWaAAlUshhR3p2FpZgtsogW+vYG1/rXsGu+Q==}
+  '@ai-sdk/ui-utils@1.2.2':
+    resolution: {integrity: sha512-6rCx2jSEPuiF6fytfMNscSOinHQZp52aFCHyPVpPPkcWnOur1jPWhol+0TFCUruDl7dCfcSIfTexQUq2ioLwaA==}
     engines: {node: '>=18'}
     peerDependencies:
-      zod: ^3.0.0
-    peerDependenciesMeta:
-      zod:
-        optional: true
+      zod: ^3.23.8
 
   '@ampproject/remapping@2.3.0':
     resolution: {integrity: sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==}
@@ -1795,17 +1827,15 @@ packages:
     resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==}
     engines: {node: '>= 8.0.0'}
 
-  ai@4.1.45:
-    resolution: {integrity: sha512-nQkxQ2zCD+O/h8zJ+PxmBv9coyMaG1uP9kGJvhNaGAA25hbZRQWL0NbTsSJ/QMOUraXKLa+6fBm3VF1NkJK9Kg==}
+  ai@4.2.8:
+    resolution: {integrity: sha512-0gwfPZAuuQ+uTfk/GssrfnNTYxliCFKojbSQoEhzpbpSVaPao9NoU3iuE8vwBjWuDKqILRGzYGFE4+vTak0Oxg==}
     engines: {node: '>=18'}
     peerDependencies:
       react: ^18 || ^19 || ^19.0.0-rc
-      zod: ^3.0.0
+      zod: ^3.23.8
     peerDependenciesMeta:
       react:
         optional: true
-      zod:
-        optional: true
 
   ajv@8.16.0:
     resolution: {integrity: sha512-F0twR8U1ZU67JIEtekUcLkXkoO5mMMmgGD8sK/xUFzJ805jxHQl92hImFAqqXMyMYjSPOyUPAwHYhB72g5sTXw==}
@@ -2054,8 +2084,8 @@ packages:
     resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==}
     engines: {node: '>=10'}
 
-  chalk@5.3.0:
-    resolution: {integrity: sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==}
+  chalk@5.4.1:
+    resolution: {integrity: sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==}
     engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}
 
   char-regex@1.0.2:
@@ -4427,8 +4457,8 @@ packages:
     resolution: {integrity: sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==}
     engines: {node: '>= 0.4'}
 
-  swr@2.3.2:
-    resolution: {integrity: sha512-RosxFpiabojs75IwQ316DGoDRmOqtiAj0tg8wCcbEu4CiLZBs/a9QNtHV7TUfDXmmlgqij/NqzKq/eLelyv9xA==}
+  swr@2.3.3:
+    resolution: {integrity: sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==}
     peerDependencies:
       react: ^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
 
@@ -4584,8 +4614,8 @@ packages:
     engines: {node: '>=14.17'}
     hasBin: true
 
-  typescript@5.7.3:
-    resolution: {integrity: sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==}
+  typescript@5.8.2:
+    resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==}
     engines: {node: '>=14.17'}
     hasBin: true
 
@@ -4865,8 +4895,8 @@ packages:
     peerDependencies:
       zod: ^3.23.3
 
-  zod-to-json-schema@3.24.2:
-    resolution: {integrity: sha512-pNUqrcSxuuB3/+jBbU8qKUbTbDqYUaG1vf5cXFjbhGgoUuA1amO/y4Q8lzfOhHU8HNPK6VFJ18lBDKj3OHyDsg==}
+  zod-to-json-schema@3.24.5:
+    resolution: {integrity: sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==}
     peerDependencies:
       zod: ^3.24.1
 
@@ -4878,6 +4908,24 @@ packages:
 
 snapshots:
 
+  '@ai-sdk/anthropic@1.2.2(zod@3.24.2)':
+    dependencies:
+      '@ai-sdk/provider': 1.1.0
+      '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
+      zod: 3.24.2
+
+  '@ai-sdk/google@1.2.3(zod@3.24.2)':
+    dependencies:
+      '@ai-sdk/provider': 1.1.0
+      '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
+      zod: 3.24.2
+
+  '@ai-sdk/groq@1.2.1(zod@3.24.2)':
+    dependencies:
+      '@ai-sdk/provider': 1.1.0
+      '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
+      zod: 3.24.2
+
   '@ai-sdk/openai@1.1.13(zod@3.24.2)':
     dependencies:
       '@ai-sdk/provider': 1.0.8
@@ -4893,27 +4941,37 @@ snapshots:
     optionalDependencies:
       zod: 3.24.2
 
+  '@ai-sdk/provider-utils@2.2.1(zod@3.24.2)':
+    dependencies:
+      '@ai-sdk/provider': 1.1.0
+      nanoid: 3.3.8
+      secure-json-parse: 2.7.0
+      zod: 3.24.2
+
   '@ai-sdk/provider@1.0.8':
     dependencies:
       json-schema: 0.4.0
 
-  '@ai-sdk/react@1.1.17(react@18.3.1)(zod@3.24.2)':
+  '@ai-sdk/provider@1.1.0':
     dependencies:
-      '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2)
-      '@ai-sdk/ui-utils': 1.1.15(zod@3.24.2)
-      swr: 2.3.2(react@18.3.1)
+      json-schema: 0.4.0
+
+  '@ai-sdk/react@1.2.3(react@18.3.1)(zod@3.24.2)':
+    dependencies:
+      '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
+      '@ai-sdk/ui-utils': 1.2.2(zod@3.24.2)
+      react: 18.3.1
+      swr: 2.3.3(react@18.3.1)
       throttleit: 2.1.0
     optionalDependencies:
-      react: 18.3.1
       zod: 3.24.2
 
-  '@ai-sdk/ui-utils@1.1.15(zod@3.24.2)':
+  '@ai-sdk/ui-utils@1.2.2(zod@3.24.2)':
     dependencies:
-      '@ai-sdk/provider': 1.0.8
-      '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2)
-      zod-to-json-schema: 3.24.2(zod@3.24.2)
-    optionalDependencies:
+      '@ai-sdk/provider': 1.1.0
+      '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
       zod: 3.24.2
+      zod-to-json-schema: 3.24.5(zod@3.24.2)
 
   '@ampproject/remapping@2.3.0':
     dependencies:
@@ -5932,7 +5990,7 @@ snapshots:
       camelcase: 6.3.0
      decamelize: 1.2.0
       js-tiktoken: 1.0.12
-      langsmith: 0.1.34(7c31787ccbd7899ead3aa20aba61c53a)
+      langsmith: 0.1.34(shktx2gypnhlt5ehsbxjv4b3uq)
       ml-distance: 4.0.1
       mustache: 4.2.0
       p-queue: 6.6.2
@@ -7103,17 +7161,17 @@ snapshots:
     dependencies:
       humanize-ms: 1.2.1
 
-  ai@4.1.45(react@18.3.1)(zod@3.24.2):
+  ai@4.2.8(react@18.3.1)(zod@3.24.2):
     dependencies:
-      '@ai-sdk/provider': 1.0.8
-      '@ai-sdk/provider-utils': 2.1.9(zod@3.24.2)
-      '@ai-sdk/react': 1.1.17(react@18.3.1)(zod@3.24.2)
-      '@ai-sdk/ui-utils': 1.1.15(zod@3.24.2)
+      '@ai-sdk/provider': 1.1.0
+      '@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
+      '@ai-sdk/react': 1.2.3(react@18.3.1)(zod@3.24.2)
+      '@ai-sdk/ui-utils': 1.2.2(zod@3.24.2)
       '@opentelemetry/api': 1.9.0
       jsondiffpatch: 0.6.0
+      zod: 3.24.2
     optionalDependencies:
       react: 18.3.1
-      zod: 3.24.2
 
   ajv@8.16.0:
     dependencies:
@@ -7415,7 +7473,7 @@ snapshots:
       ansi-styles: 4.3.0
       supports-color: 7.2.0
 
-  chalk@5.3.0: {}
+  chalk@5.4.1: {}
 
   char-regex@1.0.2: {}
 
@@ -8778,7 +8836,7 @@ snapshots:
   jsondiffpatch@0.6.0:
     dependencies:
       '@types/diff-match-patch': 1.0.36
-      chalk: 5.3.0
+      chalk: 5.4.1
       diff-match-patch: 1.0.5
 
   jsonfile@6.1.0:
@@ -8816,7 +8874,7 @@ snapshots:
       js-yaml: 4.1.0
       jsonpointer: 5.0.1
       langchainhub: 0.0.11
-      langsmith: 0.1.34(7c31787ccbd7899ead3aa20aba61c53a)
+      langsmith: 0.1.34(shktx2gypnhlt5ehsbxjv4b3uq)
       ml-distance: 4.0.1
       openapi-types: 12.1.3
       p-retry: 4.6.2
@@ -8847,7 +8905,7 @@ snapshots:
 
   langchainhub@0.0.11: {}
 
-  langsmith@0.1.34(7c31787ccbd7899ead3aa20aba61c53a):
+  langsmith@0.1.34(shktx2gypnhlt5ehsbxjv4b3uq):
     dependencies:
       '@types/uuid': 9.0.8
       commander: 10.0.1
@@ -9507,7 +9565,7 @@ snapshots:
       csv-parse: 5.5.6
       gpt3-tokenizer: 1.1.5
       openai: 3.3.0
-      typescript: 5.7.3
+      typescript: 5.8.2
       uuid: 9.0.1
       zod: 3.24.2
     transitivePeerDependencies:
@@ -9979,7 +10037,7 @@ snapshots:
 
   supports-preserve-symlinks-flag@1.0.0: {}
 
-  swr@2.3.2(react@18.3.1):
+  swr@2.3.3(react@18.3.1):
     dependencies:
       dequal: 2.0.3
       react: 18.3.1
@@ -10121,7 +10179,7 @@ snapshots:
 
   typescript@5.4.5: {}
 
-  typescript@5.7.3: {}
+  typescript@5.8.2: {}
 
   typesense@1.8.2(@babel/runtime@7.24.6):
     dependencies:
@@ -10360,7 +10418,7 @@ snapshots:
     dependencies:
       zod: 3.24.2
 
-  zod-to-json-schema@3.24.2(zod@3.24.2):
+  zod-to-json-schema@3.24.5(zod@3.24.2):
     dependencies:
       zod: 3.24.2
 
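The lockfile changes above add three new Vercel AI SDK provider packages (@ai-sdk/anthropic, @ai-sdk/google, @ai-sdk/groq) alongside the existing @ai-sdk/openai, and bump ai to 4.2.8, whose peer range on zod tightens to ^3.23.8. A minimal sketch of what these providers give you, assuming API keys arrive via the environment variables used later in this commit (ANTHROPIC_API_KEY, GEMINI_API_KEY, GROQ_API_KEY); this mirrors the factories added in generic-ai.ts below rather than defining anything new:

// Sketch only: how the three new providers are constructed and used.
import { createAnthropic } from "@ai-sdk/anthropic";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createGroq } from "@ai-sdk/groq";
import { generateText } from "ai";

const anthropic = createAnthropic({ apiKey: process.env.ANTHROPIC_API_KEY ?? "" });
const gemini = createGoogleGenerativeAI({ apiKey: process.env.GEMINI_API_KEY });
const groq = createGroq({ apiKey: process.env.GROQ_API_KEY ?? "" });

// Each factory returns a provider; calling it with a model id yields a LanguageModel.
const { text } = await generateText({
  model: gemini("gemini-2.0-flash"),
  prompt: "Say hello.",
});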
apps/api/src/lib/extract/build-prompts.ts
@@ -10,7 +10,7 @@ Provide a rephrased search query that:
 4. Is concise and focused
 5. Short is better than long
 6. It is a search engine, not a chatbot
-7. Concise
+7. Concise, no more than 3 words besides the site
 
 Return only the rephrased search query, without any explanation or additional text.`;
 }
@@ -40,7 +40,20 @@ to determine their relevance to the user's query and intent.
 }
 
 export function buildRerankerUserPrompt(searchQuery: string): string {
-  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
+  return `Given these URLs and their content, analyze their relevance to this extraction request: "${searchQuery}".
+
+For each URL, consider:
+1. How well it matches the extraction needs
+2. The quantity and quality of extractable information
+3. Whether the content structure matches what we're looking for
+
+Score each URL from 0-1 based on the scoring guidelines provided in the system prompt.
+
+Provide detailed reasoning for each URL to explain why you assigned that score, considering:
+- Content relevance
+- Information completeness
+- Structure suitability
+- Potential extraction value`;
 }
 
 // Multi entity schema anlayzer
@@ -73,7 +86,7 @@ export function buildAnalyzeSchemaUserPrompt(
   urls: string[],
 ): string {
   return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
-Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;
+Schema: ${schemaString}\nPrompt: ${prompt}\n URLs: ${urls}`;
 }
 
 // Should Extract
@@ -97,8 +110,7 @@ export function buildBatchExtractSystemPrompt(
 ): string {
   return (
     (systemPrompt ? `${systemPrompt}\n` : "") +
-    `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
-    links.join(", ")
+    `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null.`
   );
 }
 
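The reworked buildRerankerUserPrompt above asks the model for a score and detailed reasoning per URL instead of a pre-filtered list, moving the cutoff decision into code. A hedged sketch of the output shape this implies, matching the result.url / result.relevanceScore / result.reason fields the reranker reads later in this commit (the authoritative schema is the `schema` value in reranker.ts and may differ):

import { z } from "zod";

// Assumed response shape; illustrative only.
const rerankResultSchema = z.object({
  links: z.array(
    z.object({
      url: z.string(),
      relevanceScore: z.number().min(0).max(1), // scored per the system prompt guidelines
      reason: z.string(), // the "detailed reasoning" the prompt now requires
    }),
  ),
});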
apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts
@@ -19,8 +19,8 @@ export async function analyzeSchemaAndPrompt(
 ): Promise<{
   isMultiEntity: boolean;
   multiEntityKeys: string[];
-  reasoning?: string;
-  keyIndicators?: string[];
+  reasoning: string;
+  keyIndicators: string[];
   tokenUsage: TokenUsage;
 }> {
   if (!schema) {
apps/api/src/lib/extract/completions/batchExtract.ts
@@ -7,7 +7,8 @@ import {
   buildBatchExtractPrompt,
   buildBatchExtractSystemPrompt,
 } from "../build-prompts";
+import { getGemini } from "../../generic-ai";
+import fs from "fs/promises";
 /**
  * Batch extract information from a list of URLs using a multi-entity schema.
  * @param multiEntitySchema - The schema for the multi-entity extraction
@@ -30,6 +31,7 @@ export async function batchExtractPromise(
   warning?: string;
   sources: string[];
 }> {
+  const gemini = getGemini();
   const completion = await generateCompletions({
     logger: logger.child({
       method: "extractService/generateCompletions",
@@ -45,8 +47,10 @@ export async function batchExtractPromise(
       schema: multiEntitySchema,
     },
     markdown: buildDocument(doc),
-    isExtractEndpoint: true
+    isExtractEndpoint: true,
+    model: gemini("gemini-2.0-flash"),
   });
+  await fs.writeFile(`logs/batchExtract-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
 
   return {
     extract: completion.extract,
apps/api/src/lib/extract/completions/singleAnswer.ts
@@ -2,6 +2,8 @@ import { logger } from "../../../lib/logger";
 import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { Document, TokenUsage } from "../../../controllers/v1/types";
+import { getGemini } from "../../../lib/generic-ai";
+import fs from "fs/promises";
 
 export async function singleAnswerCompletion({
   singleAnswerDocs,
@@ -20,20 +22,22 @@ export async function singleAnswerCompletion({
   tokenUsage: TokenUsage;
   sources: string[];
 }> {
+  const gemini = getGemini();
   const completion = await generateCompletions({
     logger: logger.child({ module: "extract", method: "generateCompletions" }),
     options: {
       mode: "llm",
       systemPrompt:
         (systemPrompt ? `${systemPrompt}\n` : "") +
-        "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
-        links.join(", "),
+        "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.",
       prompt: "Today is: " + new Date().toISOString() + "\n" + prompt,
       schema: rSchema,
     },
     markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
-    isExtractEndpoint: true
+    isExtractEndpoint: true,
+    model: gemini("gemini-2.0-flash"),
   });
+  await fs.writeFile(`logs/singleAnswer-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
   return {
     extract: completion.extract,
     tokenUsage: completion.totalUsage,
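Both completion paths above now pin the model explicitly instead of relying on the generateCompletions default (getModel("gpt-4o-mini")). A sketch of the pattern, with the logger, schema, and markdown input assumed to be in scope; it is illustrative, not the exact call from the commit:

import { getGemini } from "../../../lib/generic-ai";
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";

const gemini = getGemini();

// `model` overrides the gpt-4o-mini default for this call only.
const completion = await generateCompletions({
  logger,                     // assumed in scope
  options: { mode: "llm", systemPrompt: "...", prompt: "...", schema },
  markdown: documentMarkdown, // hypothetical pre-built document text
  isExtractEndpoint: true,
  model: gemini("gemini-2.0-flash"),
});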
apps/api/src/lib/extract/config.ts
@@ -2,8 +2,8 @@ export const extractConfig = {
   RERANKING: {
     MAX_INITIAL_RANKING_LIMIT: 1000,
     MAX_RANKING_LIMIT_FOR_RELEVANCE: 100,
-    INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.75,
-    FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.5,
+    INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001,
+    FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001,
     MIN_REQUIRED_LINKS: 1,
   },
   DEDUPLICATION: {
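Dropping both thresholds from 0.75/0.5 to 0.00000001 effectively disables similarity-based pre-filtering: any link with a positive score now survives this stage, leaving relevance decisions to the LLM reranker. A sketch of the comparison these constants feed, assuming the filter is a simple score cutoff (the real filtering code is not shown in this commit):

import { extractConfig } from "./config";

// Hypothetical shape of the filtering step; illustrative only.
const kept = scoredLinks.filter(
  (l) => l.score >= extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
); // with a 1e-8 threshold, essentially every scored link passes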
apps/api/src/lib/extract/extraction-service.ts
@@ -40,7 +40,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
 import { search } from "../../search";
 import { buildRephraseToSerpPrompt } from "./build-prompts";
+import fs from "fs/promises";
 interface ExtractServiceOptions {
   request: ExtractRequest;
   teamId: string;
@@ -86,6 +86,10 @@ export async function performExtraction(
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
 
+  let log = {
+    extractId,
+    request
+  };
+
   const logger = _logger.child({
     module: "extract",
@@ -148,6 +152,51 @@ export async function performExtraction(
     ],
   });
 
+  let reqSchema = request.schema;
+  if (!reqSchema && request.prompt) {
+    reqSchema = await generateSchemaFromPrompt(request.prompt);
+    logger.debug("Generated request schema.", {
+      originalSchema: request.schema,
+      schema: reqSchema,
+    });
+  }
+
+  if (reqSchema) {
+    reqSchema = await dereferenceSchema(reqSchema);
+  }
+
+  logger.debug("Transformed schema.", {
+    originalSchema: request.schema,
+    schema: reqSchema,
+  });
+
+  let rSchema = reqSchema;
+
+  // agent evaluates if the schema or the prompt has an array with big amount of items
+  // also it checks if the schema any other properties that are not arrays
+  // if so, it splits the results into 2 types of completions:
+  // 1. the first one is a completion that will extract the array of items
+  // 2. the second one is multiple completions that will extract the items from the array
+  let startAnalyze = Date.now();
+  const {
+    isMultiEntity,
+    multiEntityKeys,
+    reasoning,
+    keyIndicators,
+    tokenUsage: schemaAnalysisTokenUsage,
+  } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "");
+
+  logger.debug("Analyzed schema.", {
+    isMultiEntity,
+    multiEntityKeys,
+    reasoning,
+    keyIndicators,
+  });
+
+  // Track schema analysis tokens
+  tokenUsage.push(schemaAnalysisTokenUsage);
+
   let startMap = Date.now();
   let aggMapLinks: string[] = [];
   logger.debug("Processing URLs...", {
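Hoisting analyzeSchemaAndPrompt above the URL-mapping step lets its outputs (isMultiEntity, reasoning, multiEntityKeys, keyIndicators) steer link discovery and reranking, not just the final extraction. When the analysis flags a multi-entity request, the schema is later split by spreadSchemas; a hedged sketch of that split on a hypothetical schema, with the two-argument signature assumed from the destructured call further down:

// Hypothetical input: one array key ("products") plus a scalar key.
const exampleSchema = {
  type: "object",
  properties: {
    companyName: { type: "string" },
    products: { type: "array", items: { type: "object" } },
  },
};

// Conceptually: singleAnswerSchema keeps { companyName }, answered once from
// the best pages; multiEntitySchema keeps { products }, accumulated per page.
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
  exampleSchema,
  ["products"], // multiEntityKeys from analyzeSchemaAndPrompt
);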
@@ -166,6 +215,11 @@ export async function performExtraction(
       limit: request.limit,
       includeSubdomains: request.includeSubdomains,
       schema: request.schema,
+      log,
+      isMultiEntity,
+      reasoning,
+      multiEntityKeys,
+      keyIndicators,
     },
     urlTraces,
     (links: string[]) => {
@@ -191,6 +245,9 @@ export async function performExtraction(
       linkCount: links.length,
     });
 
+    log['links'] = links;
+    log['linksLength'] = links.length;
+
     if (links.length === 0) {
       logger.error("0 links! Bailing.", {
         linkCount: links.length,
@@ -217,55 +274,8 @@ export async function performExtraction(
       ],
     });
 
-  let reqSchema = request.schema;
-  if (!reqSchema && request.prompt) {
-    reqSchema = await generateSchemaFromPrompt(request.prompt);
-    logger.debug("Generated request schema.", {
-      originalSchema: request.schema,
-      schema: reqSchema,
-    });
-  }
-
-  if (reqSchema) {
-    reqSchema = await dereferenceSchema(reqSchema);
-  }
-
-  logger.debug("Transformed schema.", {
-    originalSchema: request.schema,
-    schema: reqSchema,
-  });
-
-  // agent evaluates if the schema or the prompt has an array with big amount of items
-  // also it checks if the schema any other properties that are not arrays
-  // if so, it splits the results into 2 types of completions:
-  // 1. the first one is a completion that will extract the array of items
-  // 2. the second one is multiple completions that will extract the items from the array
-  let startAnalyze = Date.now();
-  const {
-    isMultiEntity,
-    multiEntityKeys,
-    reasoning,
-    keyIndicators,
-    tokenUsage: schemaAnalysisTokenUsage,
-  } = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
-
-  logger.debug("Analyzed schema.", {
-    isMultiEntity,
-    multiEntityKeys,
-    reasoning,
-    keyIndicators,
-  });
-
-  // Track schema analysis tokens
-  tokenUsage.push(schemaAnalysisTokenUsage);
-
-  // console.log("\nIs Multi Entity:", isMultiEntity);
-  // console.log("\nMulti Entity Keys:", multiEntityKeys);
-  // console.log("\nReasoning:", reasoning);
-  // console.log("\nKey Indicators:", keyIndicators);
-
-  let rSchema = reqSchema;
   if (isMultiEntity && reqSchema) {
+    log['isMultiEntity'] = true;
     logger.debug("=== MULTI-ENTITY ===");
 
     const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
@@ -303,6 +313,7 @@ export async function performExtraction(
 
     logger.debug("Starting multi-entity scrape...");
     let startScrape = Date.now();
+    log['docsSizeBeforeMultiEntityScrape'] = docsMap.size;
 
     const scrapePromises = links.map((url) => {
       if (!docsMap.has(normalizeUrl(url))) {
@@ -336,6 +347,8 @@ export async function performExtraction(
       (doc): doc is Document => doc !== null,
     );
 
+    log['docsSizeAfterMultiEntityScrape'] = scrapePromises.length;
+
     logger.debug("Multi-entity scrape finished.", {
       docCount: multyEntityDocs.length,
     });
@@ -387,50 +400,50 @@ export async function performExtraction(
         });
 
         // Check if page should be extracted before proceeding
-        const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract(
-          request.prompt ?? "",
-          multiEntitySchema,
-          doc,
-        );
-
-        tokenUsage.push(shouldExtractCheckTokenUsage);
-
-        if (!extract) {
-          logger.info(
-            `Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
-          );
-          return null;
-        }
-        // Add confidence score to schema with 5 levels
-        const schemaWithConfidence = {
-          ...multiEntitySchema,
-          properties: {
-            ...multiEntitySchema.properties,
-            is_content_relevant: {
-              type: "boolean",
-              description:
-                "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
-            },
-          },
-          required: [
-            ...(multiEntitySchema.required || []),
-            "is_content_relevant",
-          ],
-        };
-
-        await updateExtract(extractId, {
-          status: "processing",
-          steps: [
-            {
-              step: ExtractStep.MULTI_ENTITY_EXTRACT,
-              startedAt: startScrape,
-              finishedAt: Date.now(),
-              discoveredLinks: [
-                doc.metadata.url || doc.metadata.sourceURL || "",
-              ],
-            },
-          ],
-        });
+        // const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract(
+        //   request.prompt ?? "",
+        //   multiEntitySchema,
+        //   doc,
+        // );
+
+        // tokenUsage.push(shouldExtractCheckTokenUsage);
+
+        // if (!extract) {
+        //   logger.info(
+        //     `Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
+        //   );
+        //   return null;
+        // }
+        // // Add confidence score to schema with 5 levels
+        // const schemaWithConfidence = {
+        //   ...multiEntitySchema,
+        //   properties: {
+        //     ...multiEntitySchema.properties,
+        //     is_content_relevant: {
+        //       type: "boolean",
+        //       description:
+        //         "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
+        //     },
+        //   },
+        //   required: [
+        //     ...(multiEntitySchema.required || []),
+        //     "is_content_relevant",
+        //   ],
+        // };
+
+        // await updateExtract(extractId, {
+        //   status: "processing",
+        //   steps: [
+        //     {
+        //       step: ExtractStep.MULTI_ENTITY_EXTRACT,
+        //       startedAt: startScrape,
+        //       finishedAt: Date.now(),
+        //       discoveredLinks: [
+        //         doc.metadata.url || doc.metadata.sourceURL || "",
+        //       ],
+        //     },
+        //   ],
+        // });
 
         const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc);
 
@@ -502,6 +515,7 @@ export async function performExtraction(
     logger.debug("All multi-entity completion chunks finished.", {
       completionCount: multiEntityCompletions.length,
     });
+    log['multiEntityCompletionsLength'] = multiEntityCompletions.length;
   }
 
   try {
@@ -545,6 +559,7 @@ export async function performExtraction(
     rSchema.properties &&
     Object.keys(rSchema.properties).length > 0
   ) {
+    log['isSingleEntity'] = true;
     logger.debug("=== SINGLE PAGES ===", {
       linkCount: links.length,
      schema: rSchema,
@@ -567,6 +582,7 @@ export async function performExtraction(
         },
       ],
     });
+    log['docsSizeBeforeSingleEntityScrape'] = docsMap.size;
    const scrapePromises = links.map((url) => {
       if (!docsMap.has(normalizeUrl(url))) {
         return scrapeDocument(
@@ -592,6 +608,7 @@ export async function performExtraction(
 
     try {
       const results = await Promise.all(scrapePromises);
+      log['docsSizeAfterSingleEntityScrape'] = docsMap.size;
 
       for (const doc of results) {
         if (doc?.metadata?.url) {
@@ -644,6 +661,7 @@ export async function performExtraction(
 
       // Generate completions
       logger.debug("Generating singleAnswer completions...");
+      log['singleAnswerDocsLength'] = singleAnswerDocs.length;
       let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({
         singleAnswerDocs,
         rSchema,
@@ -690,6 +708,9 @@ export async function performExtraction(
     // }
   }
 
+  log['singleAnswerResult'] = singleAnswerResult;
+  log['multiEntityResult'] = multiEntityResult;
+
   let finalResult = reqSchema
     ? await mixSchemaObjects(
         reqSchema,
@@ -803,6 +824,8 @@ export async function performExtraction(
     }
   }
 
+  fs.writeFile(`logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`, JSON.stringify(log, null, 2));
+
   return {
     success: true,
     data: finalResult ?? {},
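The `log` object threaded through performExtraction collects checkpoints (link counts, docsMap sizes, final results) and is flushed once to logs/<url>-extract-<id>.json at the end. A minimal sketch of the same pattern; note that the commit does not await the final fs.writeFile, so write errors go unobserved and the write can race with process exit:

import fs from "fs/promises";

// extractId, request, and links are assumed in scope, as in the function above.
const log: Record<string, unknown> = { extractId, request }; // seeded up front
log["linksLength"] = links.length;                           // checkpoints appended as the run progresses

// Fire-and-forget flush, mirroring the commit's unawaited call.
fs.writeFile(`logs/extract-${extractId}.json`, JSON.stringify(log, null, 2));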
apps/api/src/lib/extract/reranker.ts
@@ -9,6 +9,11 @@ import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { buildRerankerUserPrompt } from "./build-prompts";
 import { buildRerankerSystemPrompt } from "./build-prompts";
 import { dumpToFile } from "./helpers/dump-to-file";
+import { getAnthropic, getGemini, getGroq, getModel, getOpenAI } from "../generic-ai";
+import fs from "fs/promises";
+
+const THRESHOLD_FOR_SINGLEPAGE = 0.6;
+const THRESHOLD_FOR_MULTIENTITY = 0.45;
 
 const cohere = new CohereClient({
   token: process.env.COHERE_API_KEY,
@@ -167,16 +172,22 @@ export type RerankerOptions = {
   links: MapDocument[];
   searchQuery: string;
   urlTraces: URLTrace[];
+  isMultiEntity: boolean;
+  reasoning: string;
+  multiEntityKeys: string[];
+  keyIndicators: string[];
 };
 
 export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
-  const { links, searchQuery, urlTraces } = options;
-  const chunkSize = 100;
+  const { links, searchQuery, urlTraces, isMultiEntity, reasoning, multiEntityKeys, keyIndicators } = options;
+  const chunkSize = 5000;
   const chunks: MapDocument[][] = [];
-  const TIMEOUT_MS = 20000;
+  const TIMEOUT_MS = 60000;
   const MAX_RETRIES = 2;
   let totalTokensUsed = 0;
 
+  await fs.writeFile(`logs/links-${crypto.randomUUID()}.txt`, JSON.stringify(links, null, 2));
+
   // Split links into chunks of 200
   for (let i = 0; i < links.length; i += chunkSize) {
     chunks.push(links.slice(i, i + chunkSize));
@@ -207,6 +218,7 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
     chunks.map(async (chunk, chunkIndex) => {
       // console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
 
+
       const linksContent = chunk
         .map(
           (link) =>
@@ -214,14 +226,48 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
         )
         .join("\n\n");
 
+      fs.writeFile(`logs/links-content-${crypto.randomUUID()}.txt`, linksContent);
+
       for (let retry = 0; retry <= MAX_RETRIES; retry++) {
         try {
           const timeoutPromise = new Promise<null>((resolve) => {
             setTimeout(() => resolve(null), TIMEOUT_MS);
           });
 
+          const systemPrompt = `You are analyzing URLs for ${isMultiEntity ? 'collecting multiple items' : 'specific information'}.
+          The user's query is: ${searchQuery}
+          ${isMultiEntity
+            ? `IMPORTANT: This is a multi-entity extraction task looking for ${multiEntityKeys.join(', ')}.
+          Score URLs higher if they contain ANY instance of the target entities.
+          Key indicators to look for: ${keyIndicators.join(', ')}`
+            : `IMPORTANT: This is a specific information task.
+          Score URLs based on precision and relevance to answering the query.`
+          }
+
+          Scoring guidelines:
+          ${isMultiEntity ? `
+          - 1.0: Contains ANY instance of target entities, even just one. Give this score if page has any relevant entity. If you are not sure if this page is relevant or not, give it a score of 1.0
+          - 0.8: Contains entity but may be incomplete information
+          - 0.6: Mentions entity type but no clear instance
+          - 0.4: Only tangentially related to entity type
+          - Below 0.4: No mention of relevant entities, or duplicates
+
+          Reason: ${reasoning}
+          ` : `
+          - 1.0: Contains direct, authoritative answer to query. Give this score if unsure about relevance. If you are not sure if this page is relevant or not, give it a score of 1.0
+          - 0.8: Contains information that directly helps answer the query
+          - 0.6: Contains related information that partially answers query
+          - Below 0.6: Information too general or not focused on query
+          `}`;
+
           // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
+          // const gemini = getGemini();
+          const model = getOpenAI()
+          // const model = getGemini()
+          let completion: any;
+          try {
           const completionPromise = generateCompletions({
+            model: model("o3-mini"),
             logger: logger.child({
               method: "rerankLinksWithLLM",
               chunk: chunkIndex + 1,
@@ -229,18 +275,47 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
             }),
             options: {
               mode: "llm",
-              systemPrompt: buildRerankerSystemPrompt(),
+              systemPrompt: systemPrompt,
               prompt: buildRerankerUserPrompt(searchQuery),
               schema: schema,
+              // temperature: isMultiEntity ? 0.5 : 0.3,
             },
+            // providerOptions: {
+            //   anthropic: {
+            //     thinking: { type: 'enabled', budgetTokens: 12000 },
+            //     tool_choice: "auto",
+            //   },
+            // },
             markdown: linksContent,
             isExtractEndpoint: true
           });
 
-          const completion = await Promise.race([
-            completionPromise,
-            timeoutPromise,
-          ]);
+          completion = await completionPromise
+          // completion = await Promise.race([
+          //   completionPromise,
+          //   timeoutPromise,
+          // ]);
+
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!˜")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log({ completion })
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+          console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+
+          } catch (error) {
+            console.warn(
+              `Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
+              error,
+            );
+          }
+
+          await fs.writeFile(`logs/reranker-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
+
           if (!completion) {
             // console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
@@ -278,17 +353,34 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
         .sort((a, b) => b.relevanceScore - a.relevanceScore);
       // console.log(`Total relevant links found: ${flattenedResults.length}`);
 
-      // Map back to MapDocument format, keeping only relevant links
+      // Map back to MapDocument format, keeping ALL links for testing
       const relevantLinks = flattenedResults
         .map((result) => {
+          if (result.relevanceScore > (isMultiEntity ? THRESHOLD_FOR_MULTIENTITY : THRESHOLD_FOR_SINGLEPAGE)) {
           const link = links.find((link) => link.url === result.url);
           if (link) {
-            return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason };
+            return {
+              ...link,
+              relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0,
+              reason: result.reason
+            };
+          }
           }
           return undefined;
         })
         .filter((link): link is NonNullable<typeof link> => link !== undefined);
 
+      // Add debug logging for testing
+      fs.writeFile(`logs/reranker-aaa-${crypto.randomUUID()}.json`, JSON.stringify(
+        {
+          totalResults: relevantLinks.length,
+          scores: relevantLinks.map(l => ({
+            url: l.url,
+            score: l.relevanceScore,
+            reason: l.reason
+          }))
+        }, null, 2));
+
       return {
         mapDocument: relevantLinks,
         tokensUsed: totalTokensUsed,
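One side effect worth noting in the hunk above: `completion = await completionPromise` bypasses the old Promise.race, so the new 60-second TIMEOUT_MS is defined but never enforced, and the `if (!completion)` retry branch can now only trigger when the inner try/catch swallows an error. A sketch of how the timeout guard would be restored if wanted, reusing the names already in scope in this function:

// Sketch: re-enable the timeout while keeping the structured-output call intact.
const timeoutPromise = new Promise<null>((resolve) =>
  setTimeout(() => resolve(null), TIMEOUT_MS),
);
const completion = await Promise.race([completionPromise, timeoutPromise]);
if (!completion) {
  // Treated as a timeout; control falls through to the retry loop.
}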
apps/api/src/lib/extract/url-processor.ts
@@ -8,13 +8,21 @@ import { rerankLinksWithLLM } from "./reranker";
 import { extractConfig } from "./config";
 import type { Logger } from "winston";
 import { generateText } from "ai";
-import { getModel } from "../generic-ai";
+import { getAnthropic, getGemini, getGroq, getModel } from "../generic-ai";
 
 export async function generateBasicCompletion(prompt: string) {
+
+  const anthropic = getAnthropic();
+
   const { text } = await generateText({
-    model: getModel("gpt-4o"),
+    model: anthropic("claude-3-7-sonnet-latest"),
     prompt: prompt,
-    temperature: 0
+    providerOptions: {
+      anthropic: {
+        thinking: { type: 'enabled', budgetTokens: 12000 },
+      }
+    },
+    // temperature: 0.7
   });
   return text;
 }
@@ -28,6 +36,11 @@ interface ProcessUrlOptions {
   origin?: string;
   limit?: number;
   includeSubdomains?: boolean;
+  log?: any;
+  isMultiEntity: boolean;
+  reasoning: string;
+  multiEntityKeys: string[];
+  keyIndicators: string[];
 }
 
 export async function processUrl(
@@ -96,6 +109,7 @@ export async function processUrl(
     linkCount: allUrls.length,
     uniqueLinkCount: uniqueUrls.length,
   });
+  options.log['uniqueUrlsLength-1'] = uniqueUrls.length;
 
   // Track all discovered URLs
   uniqueUrls.forEach((discoveredUrl) => {
@@ -150,6 +164,8 @@ export async function processUrl(
     });
   }
 
+  options.log['uniqueUrlsLength-2'] = uniqueUrls.length;
+
   // Track all discovered URLs
   uniqueUrls.forEach((discoveredUrl) => {
     if (!urlTraces.some((t) => t.url === discoveredUrl)) {
@@ -215,13 +231,17 @@ export async function processUrl(
     links: mappedLinks,
     searchQuery: rephrasedPrompt,
     urlTraces,
+    isMultiEntity: options.isMultiEntity,
+    reasoning: options.reasoning,
+    multiEntityKeys: options.multiEntityKeys,
+    keyIndicators: options.keyIndicators,
   });
   mappedLinks = rerankerResult.mapDocument;
   let tokensUsed = rerankerResult.tokensUsed;
   logger.info("Reranked! (pass 1)", {
     linkCount: mappedLinks.length,
   });
+  options.log['rerankerResult-1'] = mappedLinks.length;
   // 2nd Pass, useful for when the first pass returns too many links
   if (mappedLinks.length > 100) {
     logger.info("Reranking (pass 2)...");
@@ -229,6 +249,10 @@ export async function processUrl(
       links: mappedLinks,
       searchQuery: rephrasedPrompt,
       urlTraces,
+      isMultiEntity: options.isMultiEntity,
+      reasoning: options.reasoning,
+      multiEntityKeys: options.multiEntityKeys,
+      keyIndicators: options.keyIndicators,
     });
     mappedLinks = rerankerResult.mapDocument;
     tokensUsed += rerankerResult.tokensUsed;
@@ -236,6 +260,7 @@ export async function processUrl(
       linkCount: mappedLinks.length,
    });
   }
+  options.log['rerankerResult-2'] = mappedLinks.length;
 
   // dumpToFile(
   //   "llm-links.txt",
|
|||||||
import { createOpenAI } from '@ai-sdk/openai';
|
import { createOpenAI } from '@ai-sdk/openai';
|
||||||
import { createOllama } from "ollama-ai-provider";
|
import { createOllama } from "ollama-ai-provider";
|
||||||
|
import { createGoogleGenerativeAI } from "@ai-sdk/google";
|
||||||
|
import { createGroq } from "@ai-sdk/groq";
|
||||||
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
||||||
|
|
||||||
const modelAdapter = process.env.OLLAMA_BASE_URL ? createOllama({
|
const modelAdapter = process.env.OLLAMA_BASE_URL ? createOllama({
|
||||||
baseURL: process.env.OLLAMA_BASE_URL,
|
baseURL: process.env.OLLAMA_BASE_URL,
|
||||||
@@ -12,6 +15,30 @@ export function getModel(name: string) {
   return process.env.MODEL_NAME ? modelAdapter(process.env.MODEL_NAME) : modelAdapter(name);
 }

+export function getGemini() {
+  return createGoogleGenerativeAI({
+    apiKey: process.env.GEMINI_API_KEY,
+  });
+}
+
+export function getGroq() {
+  return createGroq({
+    apiKey: process.env.GROQ_API_KEY ?? "",
+  });
+}
+
+export function getAnthropic() {
+  return createAnthropic({
+    apiKey: process.env.ANTHROPIC_API_KEY ?? "",
+  });
+}
+
+export function getOpenAI() {
+  return createOpenAI({
+    apiKey: process.env.OPENAI_API_KEY ?? ""
+  });
+} // claude-3-7-sonnet-latest
+
 export function getEmbeddingModel(name: string) {
   return process.env.MODEL_EMBEDDING_NAME ? modelAdapter.embedding(process.env.MODEL_EMBEDDING_NAME) : modelAdapter.embedding(name);
 }
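Each of these factories returns a provider instance from the Vercel AI SDK; calling the provider with a model id binds a concrete LanguageModel. A usage sketch — the model id is illustrative, echoing the stray inline comment above:

import { generateText } from "ai";

// Sketch: obtain the provider, then bind a model id to it.
const anthropic = getAnthropic();
const model = anthropic("claude-3-7-sonnet-latest");

const { text } = await generateText({
  model,
  prompt: "Say hello in one word.",
});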
@@ -13,6 +13,18 @@ import { generateObject, generateText, LanguageModel } from 'ai';
 import { jsonSchema } from 'ai';
 import { getModel } from "../../../lib/generic-ai";
 import { z } from "zod";
+import fs from 'fs/promises';
+
+// TODO: fix this, it's horrible
+type LanguageModelV1ProviderMetadata = {
+  anthropic?: {
+    thinking?: {
+      type: 'enabled' | 'disabled';
+      budgetTokens?: number;
+    };
+    tool_choice?: "auto" | "none" | "required";
+  };
+};

 // Get max tokens from model prices
 const getModelLimits = (model: string) => {
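The TODO flags the hand-rolled metadata type. The AI SDK's provider package ships a general-purpose equivalent, so one way to resolve it — assuming the type is re-exported by the version in this lockfile — would be:

// Assumption: @ai-sdk/provider at this version exports the type.
import type { LanguageModelV1ProviderMetadata } from "@ai-sdk/provider";

The local alias is narrower, which does have the upside of keeping call sites from passing metadata for providers this code never uses.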
@@ -157,6 +169,7 @@ export async function generateCompletions({
   isExtractEndpoint,
   model = getModel("gpt-4o-mini"),
   mode = "object",
+  providerOptions
 }: {
   model?: LanguageModel;
   logger: Logger;
@@ -165,6 +178,7 @@ export async function generateCompletions({
   previousWarning?: string;
   isExtractEndpoint?: boolean;
   mode?: "object" | "no-object";
+  providerOptions?: LanguageModelV1ProviderMetadata;
 }): Promise<{
   extract: any;
   numTokens: number;
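With the new parameter threaded through, a caller can opt an individual request into Anthropic extended thinking. A sketch of such a call — every argument other than providerOptions is abbreviated or assumed:

// Sketch: surrounding variable names are assumptions, not from the diff.
const { extract, numTokens } = await generateCompletions({
  logger,
  options: extractOptions,
  markdown,
  model: getModel("claude-3-7-sonnet-latest"),
  providerOptions: {
    anthropic: {
      thinking: { type: "enabled", budgetTokens: 12000 },
    },
  },
});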
@@ -191,8 +205,9 @@ export async function generateCompletions({
     previousWarning
   );

-  markdown = trimmedMarkdown;
-  warning = trimWarning;
+  // WE USE BIG MODELS NOW
+  // markdown = trimmedMarkdown;
+  // warning = trimWarning;

   try {
     const prompt = options.prompt !== undefined
@@ -203,8 +218,13 @@ export async function generateCompletions({
       const result = await generateText({
         model: model,
         prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
-        temperature: options.temperature ?? 0,
+        // temperature: options.temperature ?? 0,
         system: options.systemPrompt,
+        providerOptions: {
+          anthropic: {
+            thinking: { type: 'enabled', budgetTokens: 12000 },
+          }
+        }
       });

       extract = result.text;
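Commenting out temperature while enabling thinking is consistent with Anthropic's documented constraint that extended thinking rejects temperature overrides. Hardcoding the anthropic block here will, however, be sent on every call regardless of provider; guarding on the bound model is one way to scope it — a sketch, not something in the diff:

// Sketch: only attach thinking metadata when a Claude model is bound.
const providerOptions = model.modelId.includes("claude")
  ? { anthropic: { thinking: { type: "enabled" as const, budgetTokens: 12000 } } }
  : undefined;

const result = await generateText({
  model,
  prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
  system: options.systemPrompt,
  providerOptions,
});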
@@ -279,7 +299,12 @@ export async function generateCompletions({
         const { text: fixedText } = await generateText({
           model: model,
           prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
-          system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON."
+          system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON.",
+          providerOptions: {
+            anthropic: {
+              thinking: { type: 'enabled', budgetTokens: 12000 },
+            }
+          }
         });
         return fixedText;
       }
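A 12000-token thinking budget is generous for a mechanical JSON fix-up; the same retry works with thinking disabled, which is cheaper and is permitted by the local metadata type. A leaner sketch:

// Sketch: thinking off for the repair fallback; prompt reused from above.
const { text: fixedText } = await generateText({
  model,
  prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
  system: "Return only the corrected JSON, no commentary.",
  providerOptions: {
    anthropic: { thinking: { type: "disabled" } },
  },
});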
@@ -288,7 +313,8 @@ export async function generateCompletions({
     const generateObjectConfig = {
       model: model,
       prompt: prompt,
-      temperature: options.temperature ?? 0,
+      providerOptions: providerOptions || undefined,
+      // temperature: options.temperature ?? 0,
       system: options.systemPrompt,
       ...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }),
       ...(!schema && { output: 'no-schema' as const }),
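Note the asymmetry introduced here: the generateText path above hardcodes Anthropic thinking, while this object path defers entirely to whatever the caller passed. A sketch of giving both paths one shared default — whether the fallback should apply at all is a product decision the diff does not settle:

// Sketch: shared default so text and object modes behave alike.
const effectiveProviderOptions =
  providerOptions ??
  { anthropic: { thinking: { type: "enabled" as const, budgetTokens: 12000 } } };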
@@ -300,9 +326,21 @@ export async function generateCompletions({
       })
     } satisfies Parameters<typeof generateObject>[0];

+    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+    const now = new Date().getTime()
+    console.log(now)
+    console.log({generateObjectConfig})
+
+    await fs.writeFile(`logs/generateObjectConfig-${now}.json`, JSON.stringify(generateObjectConfig, null, 2))
+
     const result = await generateObject(generateObjectConfig);
     extract = result.object;

+    const now2 = new Date().getTime()
+    console.log('>>>>>>', now2-now)
+    console.log({extract})
+    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+
     // If the users actually wants the items object, they can specify it as 'required' in the schema
     // otherwise, we just return the items array
     if (
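The console banners and the fs.writeFile dump are clearly work-in-progress instrumentation; as written they run on every extraction and assume a logs/ directory exists. A sketch of gating them behind an opt-in flag — the flag name is hypothetical — and creating the directory first:

import fs from "fs/promises";

// Sketch: opt-in debug dump; flag name is an assumption.
if (process.env.EXTRACT_DEBUG === "true") {
  const startedAt = Date.now();
  await fs.mkdir("logs", { recursive: true });
  await fs.writeFile(
    `logs/generateObjectConfig-${startedAt}.json`,
    JSON.stringify(generateObjectConfig, null, 2),
  );
}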