Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-15 15:25:57 +08:00)

wip

This commit is contained in:
parent 830d15f2f6
commit bf0b1c7ae0
@@ -51,6 +51,9 @@
"typescript": "^5.4.2"
},
"dependencies": {
"@ai-sdk/anthropic": "^1.2.2",
"@ai-sdk/google": "^1.2.3",
"@ai-sdk/groq": "^1.2.1",
"@ai-sdk/openai": "^1.1.13",
"@anthropic-ai/sdk": "^0.24.3",
"@apidevtools/json-schema-ref-parser": "^11.7.3",
@@ -67,7 +70,7 @@
"@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ai": "^4.1.45",
"ai": "^4.2.8",
"ajv": "^8.16.0",
"async": "^3.2.5",
"async-mutex": "^0.5.0",
160 apps/api/pnpm-lock.yaml (generated)
@@ -8,6 +8,15 @@ importers:

.:
dependencies:
'@ai-sdk/anthropic':
specifier: ^1.2.2
version: 1.2.2(zod@3.24.2)
'@ai-sdk/google':
specifier: ^1.2.3
version: 1.2.3(zod@3.24.2)
'@ai-sdk/groq':
specifier: ^1.2.1
version: 1.2.1(zod@3.24.2)
'@ai-sdk/openai':
specifier: ^1.1.13
version: 1.1.13(zod@3.24.2)
@@ -57,8 +66,8 @@ importers:
specifier: ^8.5.12
version: 8.5.12
ai:
specifier: ^4.1.45
version: 4.1.45(react@18.3.1)(zod@3.24.2)
specifier: ^4.2.8
version: 4.2.8(react@18.3.1)(zod@3.24.2)
ajv:
specifier: ^8.16.0
version: 8.16.0
@@ -324,6 +333,24 @@ importers:

packages:

'@ai-sdk/anthropic@1.2.2':
resolution: {integrity: sha512-BNZTtbP+zuCzUf8hkpTx7YhwjbQ9oLh2yhjgXC7X1QcNsn9TFRaweX7CP8USM0g6lm/Crm1Qn+4tOCe1p10NTA==}
engines: {node: '>=18'}
peerDependencies:
zod: ^3.0.0

'@ai-sdk/google@1.2.3':
resolution: {integrity: sha512-zsgwko7T+MFIdEfhg4fIXv6O2dnzTLFr6BOpAA21eo/moOBA5szVzOto1jTwIwoBYsF2ixPGNZBoc+k/fQ2AWw==}
engines: {node: '>=18'}
peerDependencies:
zod: ^3.0.0

'@ai-sdk/groq@1.2.1':
resolution: {integrity: sha512-e9Vn6sE6u+pm97YSK9+xiTgQ2ScRdipE5gAwXj/9HdgMnUyp3mDpWjFsmDM6bzyeb2iKOGv6f3eiRsLxOAPv4A==}
engines: {node: '>=18'}
peerDependencies:
zod: ^3.0.0

'@ai-sdk/openai@1.1.13':
resolution: {integrity: sha512-IdChK1pJTW3NQis02PG/hHTG0gZSyQIMOLPt7f7ES56C0xH2yaKOU1Tp2aib7pZzWGwDlzTOW2h5TtAB8+V6CQ==}
engines: {node: '>=18'}
@@ -339,30 +366,35 @@ packages:
zod:
optional: true

'@ai-sdk/provider-utils@2.2.1':
resolution: {integrity: sha512-BuExLp+NcpwsAVj1F4bgJuQkSqO/+roV9wM7RdIO+NVrcT8RBUTdXzf5arHt5T58VpK7bZyB2V9qigjaPHE+Dg==}
engines: {node: '>=18'}
peerDependencies:
zod: ^3.23.8

'@ai-sdk/provider@1.0.8':
resolution: {integrity: sha512-f9jSYwKMdXvm44Dmab1vUBnfCDSFfI5rOtvV1W9oKB7WYHR5dGvCC6x68Mk3NUfrdmNoMVHGoh6JT9HCVMlMow==}
engines: {node: '>=18'}

'@ai-sdk/react@1.1.17':
resolution: {integrity: sha512-NAuEflFvjw1uh1AOmpyi7rBF4xasWsiWUb86JQ8ScjDGxoGDYEdBnaHOxUpooLna0dGNbSPkvDMnVRhoLKoxPQ==}
'@ai-sdk/provider@1.1.0':
resolution: {integrity: sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==}
engines: {node: '>=18'}

'@ai-sdk/react@1.2.3':
resolution: {integrity: sha512-EQ6nmmQBBAal1yg72GB/Q7QnmDXMfgYvCo9Gym2mESXUHTqwpXU0JFHtk5Kq3EEkk7CVMf1oBWlNFNvU5ckQBg==}
engines: {node: '>=18'}
peerDependencies:
react: ^18 || ^19 || ^19.0.0-rc
zod: ^3.0.0
zod: ^3.23.8
peerDependenciesMeta:
react:
optional: true
zod:
optional: true

'@ai-sdk/ui-utils@1.1.15':
resolution: {integrity: sha512-NsV/3CMmjc4m53snzRdtZM6teTQUXIKi8u0Kf7GBruSzaMSuZ4DWaAAlUshhR3p2FpZgtsogW+vYG1/rXsGu+Q==}
'@ai-sdk/ui-utils@1.2.2':
resolution: {integrity: sha512-6rCx2jSEPuiF6fytfMNscSOinHQZp52aFCHyPVpPPkcWnOur1jPWhol+0TFCUruDl7dCfcSIfTexQUq2ioLwaA==}
engines: {node: '>=18'}
peerDependencies:
zod: ^3.0.0
peerDependenciesMeta:
zod:
optional: true
zod: ^3.23.8

'@ampproject/remapping@2.3.0':
resolution: {integrity: sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==}
@@ -1795,17 +1827,15 @@ packages:
resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==}
engines: {node: '>= 8.0.0'}

ai@4.1.45:
resolution: {integrity: sha512-nQkxQ2zCD+O/h8zJ+PxmBv9coyMaG1uP9kGJvhNaGAA25hbZRQWL0NbTsSJ/QMOUraXKLa+6fBm3VF1NkJK9Kg==}
ai@4.2.8:
resolution: {integrity: sha512-0gwfPZAuuQ+uTfk/GssrfnNTYxliCFKojbSQoEhzpbpSVaPao9NoU3iuE8vwBjWuDKqILRGzYGFE4+vTak0Oxg==}
engines: {node: '>=18'}
peerDependencies:
react: ^18 || ^19 || ^19.0.0-rc
zod: ^3.0.0
zod: ^3.23.8
peerDependenciesMeta:
react:
optional: true
zod:
optional: true

ajv@8.16.0:
resolution: {integrity: sha512-F0twR8U1ZU67JIEtekUcLkXkoO5mMMmgGD8sK/xUFzJ805jxHQl92hImFAqqXMyMYjSPOyUPAwHYhB72g5sTXw==}
@@ -2054,8 +2084,8 @@ packages:
resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==}
engines: {node: '>=10'}

chalk@5.3.0:
resolution: {integrity: sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==}
chalk@5.4.1:
resolution: {integrity: sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==}
engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}

char-regex@1.0.2:
@@ -4427,8 +4457,8 @@ packages:
resolution: {integrity: sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==}
engines: {node: '>= 0.4'}

swr@2.3.2:
resolution: {integrity: sha512-RosxFpiabojs75IwQ316DGoDRmOqtiAj0tg8wCcbEu4CiLZBs/a9QNtHV7TUfDXmmlgqij/NqzKq/eLelyv9xA==}
swr@2.3.3:
resolution: {integrity: sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==}
peerDependencies:
react: ^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0

@@ -4584,8 +4614,8 @@ packages:
engines: {node: '>=14.17'}
hasBin: true

typescript@5.7.3:
resolution: {integrity: sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==}
typescript@5.8.2:
resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==}
engines: {node: '>=14.17'}
hasBin: true

@@ -4865,8 +4895,8 @@ packages:
peerDependencies:
zod: ^3.23.3

zod-to-json-schema@3.24.2:
resolution: {integrity: sha512-pNUqrcSxuuB3/+jBbU8qKUbTbDqYUaG1vf5cXFjbhGgoUuA1amO/y4Q8lzfOhHU8HNPK6VFJ18lBDKj3OHyDsg==}
zod-to-json-schema@3.24.5:
resolution: {integrity: sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==}
peerDependencies:
zod: ^3.24.1

@@ -4878,6 +4908,24 @@ packages:

snapshots:

'@ai-sdk/anthropic@1.2.2(zod@3.24.2)':
dependencies:
'@ai-sdk/provider': 1.1.0
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
zod: 3.24.2

'@ai-sdk/google@1.2.3(zod@3.24.2)':
dependencies:
'@ai-sdk/provider': 1.1.0
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
zod: 3.24.2

'@ai-sdk/groq@1.2.1(zod@3.24.2)':
dependencies:
'@ai-sdk/provider': 1.1.0
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
zod: 3.24.2

'@ai-sdk/openai@1.1.13(zod@3.24.2)':
dependencies:
'@ai-sdk/provider': 1.0.8
@@ -4893,27 +4941,37 @@ snapshots:
optionalDependencies:
zod: 3.24.2

'@ai-sdk/provider-utils@2.2.1(zod@3.24.2)':
dependencies:
'@ai-sdk/provider': 1.1.0
nanoid: 3.3.8
secure-json-parse: 2.7.0
zod: 3.24.2

'@ai-sdk/provider@1.0.8':
dependencies:
json-schema: 0.4.0

'@ai-sdk/react@1.1.17(react@18.3.1)(zod@3.24.2)':
'@ai-sdk/provider@1.1.0':
dependencies:
'@ai-sdk/provider-utils': 2.1.9(zod@3.24.2)
'@ai-sdk/ui-utils': 1.1.15(zod@3.24.2)
swr: 2.3.2(react@18.3.1)
json-schema: 0.4.0

'@ai-sdk/react@1.2.3(react@18.3.1)(zod@3.24.2)':
dependencies:
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
'@ai-sdk/ui-utils': 1.2.2(zod@3.24.2)
react: 18.3.1
swr: 2.3.3(react@18.3.1)
throttleit: 2.1.0
optionalDependencies:
react: 18.3.1
zod: 3.24.2

'@ai-sdk/ui-utils@1.1.15(zod@3.24.2)':
'@ai-sdk/ui-utils@1.2.2(zod@3.24.2)':
dependencies:
'@ai-sdk/provider': 1.0.8
'@ai-sdk/provider-utils': 2.1.9(zod@3.24.2)
zod-to-json-schema: 3.24.2(zod@3.24.2)
optionalDependencies:
'@ai-sdk/provider': 1.1.0
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
zod: 3.24.2
zod-to-json-schema: 3.24.5(zod@3.24.2)

'@ampproject/remapping@2.3.0':
dependencies:
@@ -5932,7 +5990,7 @@ snapshots:
camelcase: 6.3.0
decamelize: 1.2.0
js-tiktoken: 1.0.12
langsmith: 0.1.34(7c31787ccbd7899ead3aa20aba61c53a)
langsmith: 0.1.34(shktx2gypnhlt5ehsbxjv4b3uq)
ml-distance: 4.0.1
mustache: 4.2.0
p-queue: 6.6.2
@@ -7103,17 +7161,17 @@ snapshots:
dependencies:
humanize-ms: 1.2.1

ai@4.1.45(react@18.3.1)(zod@3.24.2):
ai@4.2.8(react@18.3.1)(zod@3.24.2):
dependencies:
'@ai-sdk/provider': 1.0.8
'@ai-sdk/provider-utils': 2.1.9(zod@3.24.2)
'@ai-sdk/react': 1.1.17(react@18.3.1)(zod@3.24.2)
'@ai-sdk/ui-utils': 1.1.15(zod@3.24.2)
'@ai-sdk/provider': 1.1.0
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
'@ai-sdk/react': 1.2.3(react@18.3.1)(zod@3.24.2)
'@ai-sdk/ui-utils': 1.2.2(zod@3.24.2)
'@opentelemetry/api': 1.9.0
jsondiffpatch: 0.6.0
zod: 3.24.2
optionalDependencies:
react: 18.3.1
zod: 3.24.2

ajv@8.16.0:
dependencies:
@@ -7415,7 +7473,7 @@ snapshots:
ansi-styles: 4.3.0
supports-color: 7.2.0

chalk@5.3.0: {}
chalk@5.4.1: {}

char-regex@1.0.2: {}

@@ -8778,7 +8836,7 @@ snapshots:
jsondiffpatch@0.6.0:
dependencies:
'@types/diff-match-patch': 1.0.36
chalk: 5.3.0
chalk: 5.4.1
diff-match-patch: 1.0.5

jsonfile@6.1.0:
@@ -8816,7 +8874,7 @@ snapshots:
js-yaml: 4.1.0
jsonpointer: 5.0.1
langchainhub: 0.0.11
langsmith: 0.1.34(7c31787ccbd7899ead3aa20aba61c53a)
langsmith: 0.1.34(shktx2gypnhlt5ehsbxjv4b3uq)
ml-distance: 4.0.1
openapi-types: 12.1.3
p-retry: 4.6.2
@@ -8847,7 +8905,7 @@ snapshots:

langchainhub@0.0.11: {}

langsmith@0.1.34(7c31787ccbd7899ead3aa20aba61c53a):
langsmith@0.1.34(shktx2gypnhlt5ehsbxjv4b3uq):
dependencies:
'@types/uuid': 9.0.8
commander: 10.0.1
@@ -9507,7 +9565,7 @@ snapshots:
csv-parse: 5.5.6
gpt3-tokenizer: 1.1.5
openai: 3.3.0
typescript: 5.7.3
typescript: 5.8.2
uuid: 9.0.1
zod: 3.24.2
transitivePeerDependencies:
@@ -9979,7 +10037,7 @@ snapshots:

supports-preserve-symlinks-flag@1.0.0: {}

swr@2.3.2(react@18.3.1):
swr@2.3.3(react@18.3.1):
dependencies:
dequal: 2.0.3
react: 18.3.1
@@ -10121,7 +10179,7 @@ snapshots:

typescript@5.4.5: {}

typescript@5.7.3: {}
typescript@5.8.2: {}

typesense@1.8.2(@babel/runtime@7.24.6):
dependencies:
@@ -10360,7 +10418,7 @@ snapshots:
dependencies:
zod: 3.24.2

zod-to-json-schema@3.24.2(zod@3.24.2):
zod-to-json-schema@3.24.5(zod@3.24.2):
dependencies:
zod: 3.24.2

@@ -10,7 +10,7 @@ Provide a rephrased search query that:
4. Is concise and focused
5. Short is better than long
6. It is a search engine, not a chatbot
7. Concise
7. Concise, no more than 3 words besides the site

Return only the rephrased search query, without any explanation or additional text.`;
}
@@ -40,7 +40,20 @@ to determine their relevance to the user's query and intent.
}

export function buildRerankerUserPrompt(searchQuery: string): string {
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
return `Given these URLs and their content, analyze their relevance to this extraction request: "${searchQuery}".

For each URL, consider:
1. How well it matches the extraction needs
2. The quantity and quality of extractable information
3. Whether the content structure matches what we're looking for

Score each URL from 0-1 based on the scoring guidelines provided in the system prompt.

Provide detailed reasoning for each URL to explain why you assigned that score, considering:
- Content relevance
- Information completeness
- Structure suitability
- Potential extraction value`;
}

// Multi entity schema anlayzer
@@ -73,7 +86,7 @@ export function buildAnalyzeSchemaUserPrompt(
urls: string[],
): string {
return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;
Schema: ${schemaString}\nPrompt: ${prompt}\n URLs: ${urls}`;
}

// Should Extract
@@ -97,8 +110,7 @@ export function buildBatchExtractSystemPrompt(
): string {
return (
(systemPrompt ? `${systemPrompt}\n` : "") +
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
links.join(", ")
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null.`
);
}

@@ -19,8 +19,8 @@ export async function analyzeSchemaAndPrompt(
): Promise<{
isMultiEntity: boolean;
multiEntityKeys: string[];
reasoning?: string;
keyIndicators?: string[];
reasoning: string;
keyIndicators: string[];
tokenUsage: TokenUsage;
}> {
if (!schema) {
@@ -7,7 +7,8 @@ import {
buildBatchExtractPrompt,
buildBatchExtractSystemPrompt,
} from "../build-prompts";

import { getGemini } from "../../generic-ai";
import fs from "fs/promises";
/**
* Batch extract information from a list of URLs using a multi-entity schema.
* @param multiEntitySchema - The schema for the multi-entity extraction
@@ -30,6 +31,7 @@ export async function batchExtractPromise(
warning?: string;
sources: string[];
}> {
const gemini = getGemini();
const completion = await generateCompletions({
logger: logger.child({
method: "extractService/generateCompletions",
@@ -45,8 +47,10 @@ export async function batchExtractPromise(
schema: multiEntitySchema,
},
markdown: buildDocument(doc),
isExtractEndpoint: true
isExtractEndpoint: true,
model: gemini("gemini-2.0-flash"),
});
await fs.writeFile(`logs/batchExtract-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));

return {
extract: completion.extract,
@@ -2,6 +2,8 @@ import { logger } from "../../../lib/logger";
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
import { getGemini } from "../../../lib/generic-ai";
import fs from "fs/promises";

export async function singleAnswerCompletion({
singleAnswerDocs,
@@ -20,20 +22,22 @@ export async function singleAnswerCompletion({
tokenUsage: TokenUsage;
sources: string[];
}> {
const gemini = getGemini();
const completion = await generateCompletions({
logger: logger.child({ module: "extract", method: "generateCompletions" }),
options: {
mode: "llm",
systemPrompt:
(systemPrompt ? `${systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.",
prompt: "Today is: " + new Date().toISOString() + "\n" + prompt,
schema: rSchema,
},
markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
isExtractEndpoint: true
isExtractEndpoint: true,
model: gemini("gemini-2.0-flash"),
});
await fs.writeFile(`logs/singleAnswer-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
return {
extract: completion.extract,
tokenUsage: completion.totalUsage,
@@ -2,8 +2,8 @@ export const extractConfig = {
RERANKING: {
MAX_INITIAL_RANKING_LIMIT: 1000,
MAX_RANKING_LIMIT_FOR_RELEVANCE: 100,
INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.75,
FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.5,
INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001,
FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001,
MIN_REQUIRED_LINKS: 1,
},
DEDUPLICATION: {
@@ -40,7 +40,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
import { normalizeUrl } from "../canonical-url";
import { search } from "../../search";
import { buildRephraseToSerpPrompt } from "./build-prompts";

import fs from "fs/promises";
interface ExtractServiceOptions {
request: ExtractRequest;
teamId: string;
@@ -86,6 +86,10 @@ export async function performExtraction(
let totalUrlsScraped = 0;
let sources: Record<string, string[]> = {};

let log = {
extractId,
request
};

const logger = _logger.child({
module: "extract",
@@ -148,6 +152,51 @@ export async function performExtraction(
],
});

let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
reqSchema = await generateSchemaFromPrompt(request.prompt);
logger.debug("Generated request schema.", {
originalSchema: request.schema,
schema: reqSchema,
});
}

if (reqSchema) {
reqSchema = await dereferenceSchema(reqSchema);
}

logger.debug("Transformed schema.", {
originalSchema: request.schema,
schema: reqSchema,
});


let rSchema = reqSchema;

// agent evaluates if the schema or the prompt has an array with big amount of items
// also it checks if the schema any other properties that are not arrays
// if so, it splits the results into 2 types of completions:
// 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage: schemaAnalysisTokenUsage,
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "");

logger.debug("Analyzed schema.", {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
});

// Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage);

let startMap = Date.now();
let aggMapLinks: string[] = [];
logger.debug("Processing URLs...", {
@@ -166,6 +215,11 @@ export async function performExtraction(
limit: request.limit,
includeSubdomains: request.includeSubdomains,
schema: request.schema,
log,
isMultiEntity,
reasoning,
multiEntityKeys,
keyIndicators,
},
urlTraces,
(links: string[]) => {
@@ -191,6 +245,9 @@ export async function performExtraction(
linkCount: links.length,
});

log['links'] = links;
log['linksLength'] = links.length;

if (links.length === 0) {
logger.error("0 links! Bailing.", {
linkCount: links.length,
@@ -217,55 +274,8 @@ export async function performExtraction(
],
});

let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
reqSchema = await generateSchemaFromPrompt(request.prompt);
logger.debug("Generated request schema.", {
originalSchema: request.schema,
schema: reqSchema,
});
}

if (reqSchema) {
reqSchema = await dereferenceSchema(reqSchema);
}

logger.debug("Transformed schema.", {
originalSchema: request.schema,
schema: reqSchema,
});

// agent evaluates if the schema or the prompt has an array with big amount of items
// also it checks if the schema any other properties that are not arrays
// if so, it splits the results into 2 types of completions:
// 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage: schemaAnalysisTokenUsage,
} = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");

logger.debug("Analyzed schema.", {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
});

// Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage);

// console.log("\nIs Multi Entity:", isMultiEntity);
// console.log("\nMulti Entity Keys:", multiEntityKeys);
// console.log("\nReasoning:", reasoning);
// console.log("\nKey Indicators:", keyIndicators);

let rSchema = reqSchema;
if (isMultiEntity && reqSchema) {
log['isMultiEntity'] = true;
logger.debug("=== MULTI-ENTITY ===");

const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
@@ -303,6 +313,7 @@ export async function performExtraction(

logger.debug("Starting multi-entity scrape...");
let startScrape = Date.now();
log['docsSizeBeforeMultiEntityScrape'] = docsMap.size;

const scrapePromises = links.map((url) => {
if (!docsMap.has(normalizeUrl(url))) {
@@ -336,6 +347,8 @@ export async function performExtraction(
(doc): doc is Document => doc !== null,
);

log['docsSizeAfterMultiEntityScrape'] = scrapePromises.length;

logger.debug("Multi-entity scrape finished.", {
docCount: multyEntityDocs.length,
});
@@ -387,50 +400,50 @@ export async function performExtraction(
});

// Check if page should be extracted before proceeding
const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract(
request.prompt ?? "",
multiEntitySchema,
doc,
);
// const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract(
// request.prompt ?? "",
// multiEntitySchema,
// doc,
// );

tokenUsage.push(shouldExtractCheckTokenUsage);
// tokenUsage.push(shouldExtractCheckTokenUsage);

if (!extract) {
logger.info(
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
);
return null;
}
// Add confidence score to schema with 5 levels
const schemaWithConfidence = {
...multiEntitySchema,
properties: {
...multiEntitySchema.properties,
is_content_relevant: {
type: "boolean",
description:
"Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
},
},
required: [
...(multiEntitySchema.required || []),
"is_content_relevant",
],
};
// if (!extract) {
// logger.info(
// `Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
// );
// return null;
// }
// // Add confidence score to schema with 5 levels
// const schemaWithConfidence = {
// ...multiEntitySchema,
// properties: {
// ...multiEntitySchema.properties,
// is_content_relevant: {
// type: "boolean",
// description:
// "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
// },
// },
// required: [
// ...(multiEntitySchema.required || []),
// "is_content_relevant",
// ],
// };

await updateExtract(extractId, {
status: "processing",
steps: [
{
step: ExtractStep.MULTI_ENTITY_EXTRACT,
startedAt: startScrape,
finishedAt: Date.now(),
discoveredLinks: [
doc.metadata.url || doc.metadata.sourceURL || "",
],
},
],
});
// await updateExtract(extractId, {
// status: "processing",
// steps: [
// {
// step: ExtractStep.MULTI_ENTITY_EXTRACT,
// startedAt: startScrape,
// finishedAt: Date.now(),
// discoveredLinks: [
// doc.metadata.url || doc.metadata.sourceURL || "",
// ],
// },
// ],
// });

const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc);

@@ -502,6 +515,7 @@ export async function performExtraction(
logger.debug("All multi-entity completion chunks finished.", {
completionCount: multiEntityCompletions.length,
});
log['multiEntityCompletionsLength'] = multiEntityCompletions.length;
}

try {
@@ -545,6 +559,7 @@ export async function performExtraction(
rSchema.properties &&
Object.keys(rSchema.properties).length > 0
) {
log['isSingleEntity'] = true;
logger.debug("=== SINGLE PAGES ===", {
linkCount: links.length,
schema: rSchema,
@@ -567,6 +582,7 @@ export async function performExtraction(
},
],
});
log['docsSizeBeforeSingleEntityScrape'] = docsMap.size;
const scrapePromises = links.map((url) => {
if (!docsMap.has(normalizeUrl(url))) {
return scrapeDocument(
@@ -592,6 +608,7 @@ export async function performExtraction(

try {
const results = await Promise.all(scrapePromises);
log['docsSizeAfterSingleEntityScrape'] = docsMap.size;

for (const doc of results) {
if (doc?.metadata?.url) {
@@ -644,6 +661,7 @@ export async function performExtraction(

// Generate completions
logger.debug("Generating singleAnswer completions...");
log['singleAnswerDocsLength'] = singleAnswerDocs.length;
let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({
singleAnswerDocs,
rSchema,
@@ -690,6 +708,9 @@ export async function performExtraction(
// }
}

log['singleAnswerResult'] = singleAnswerResult;
log['multiEntityResult'] = multiEntityResult;

let finalResult = reqSchema
? await mixSchemaObjects(
reqSchema,
@@ -803,6 +824,8 @@ export async function performExtraction(
}
}

fs.writeFile(`logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`, JSON.stringify(log, null, 2));

return {
success: true,
data: finalResult ?? {},
@@ -9,6 +9,11 @@ import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExt
import { buildRerankerUserPrompt } from "./build-prompts";
import { buildRerankerSystemPrompt } from "./build-prompts";
import { dumpToFile } from "./helpers/dump-to-file";
import { getAnthropic, getGemini, getGroq, getModel, getOpenAI } from "../generic-ai";
import fs from "fs/promises";

const THRESHOLD_FOR_SINGLEPAGE = 0.6;
const THRESHOLD_FOR_MULTIENTITY = 0.45;

const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
@@ -167,16 +172,22 @@ export type RerankerOptions = {
links: MapDocument[];
searchQuery: string;
urlTraces: URLTrace[];
isMultiEntity: boolean;
reasoning: string;
multiEntityKeys: string[];
keyIndicators: string[];
};

export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
const { links, searchQuery, urlTraces } = options;
const chunkSize = 100;
const { links, searchQuery, urlTraces, isMultiEntity, reasoning, multiEntityKeys, keyIndicators } = options;
const chunkSize = 5000;
const chunks: MapDocument[][] = [];
const TIMEOUT_MS = 20000;
const TIMEOUT_MS = 60000;
const MAX_RETRIES = 2;
let totalTokensUsed = 0;

await fs.writeFile(`logs/links-${crypto.randomUUID()}.txt`, JSON.stringify(links, null, 2));

// Split links into chunks of 200
for (let i = 0; i < links.length; i += chunkSize) {
chunks.push(links.slice(i, i + chunkSize));
@@ -207,6 +218,7 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
chunks.map(async (chunk, chunkIndex) => {
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);


const linksContent = chunk
.map(
(link) =>
@@ -214,33 +226,96 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
)
.join("\n\n");

fs.writeFile(`logs/links-content-${crypto.randomUUID()}.txt`, linksContent);

for (let retry = 0; retry <= MAX_RETRIES; retry++) {
try {
const timeoutPromise = new Promise<null>((resolve) => {
setTimeout(() => resolve(null), TIMEOUT_MS);
});

// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
const completionPromise = generateCompletions({
logger: logger.child({
method: "rerankLinksWithLLM",
chunk: chunkIndex + 1,
retry,
}),
options: {
mode: "llm",
systemPrompt: buildRerankerSystemPrompt(),
prompt: buildRerankerUserPrompt(searchQuery),
schema: schema,
},
markdown: linksContent,
isExtractEndpoint: true
});
const systemPrompt = `You are analyzing URLs for ${isMultiEntity ? 'collecting multiple items' : 'specific information'}.
The user's query is: ${searchQuery}
${isMultiEntity
? `IMPORTANT: This is a multi-entity extraction task looking for ${multiEntityKeys.join(', ')}.
Score URLs higher if they contain ANY instance of the target entities.
Key indicators to look for: ${keyIndicators.join(', ')}`
: `IMPORTANT: This is a specific information task.
Score URLs based on precision and relevance to answering the query.`
}

Scoring guidelines:
${isMultiEntity ? `
- 1.0: Contains ANY instance of target entities, even just one. Give this score if page has any relevant entity. If you are not sure if this page is relevant or not, give it a score of 1.0
- 0.8: Contains entity but may be incomplete information
- 0.6: Mentions entity type but no clear instance
- 0.4: Only tangentially related to entity type
- Below 0.4: No mention of relevant entities, or duplicates

Reason: ${reasoning}
` : `
- 1.0: Contains direct, authoritative answer to query. Give this score if unsure about relevance. If you are not sure if this page is relevant or not, give it a score of 1.0
- 0.8: Contains information that directly helps answer the query
- 0.6: Contains related information that partially answers query
- Below 0.6: Information too general or not focused on query
`}`;

const completion = await Promise.race([
completionPromise,
timeoutPromise,
]);
// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
// const gemini = getGemini();
const model = getOpenAI()
// const model = getGemini()
let completion: any;
try {
const completionPromise = generateCompletions({
model: model("o3-mini"),
logger: logger.child({
method: "rerankLinksWithLLM",
chunk: chunkIndex + 1,
retry,
}),
options: {
mode: "llm",
systemPrompt: systemPrompt,
prompt: buildRerankerUserPrompt(searchQuery),
schema: schema,
// temperature: isMultiEntity ? 0.5 : 0.3,
},
// providerOptions: {
// anthropic: {
// thinking: { type: 'enabled', budgetTokens: 12000 },
// tool_choice: "auto",
// },
// },
markdown: linksContent,
isExtractEndpoint: true
});

completion = await completionPromise
// completion = await Promise.race([
// completionPromise,
// timeoutPromise,
// ]);

console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!˜")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log({ completion })
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

} catch (error) {
console.warn(
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
error,
);
}

await fs.writeFile(`logs/reranker-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));

if (!completion) {
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
@@ -278,17 +353,34 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
.sort((a, b) => b.relevanceScore - a.relevanceScore);
// console.log(`Total relevant links found: ${flattenedResults.length}`);

// Map back to MapDocument format, keeping only relevant links
// Map back to MapDocument format, keeping ALL links for testing
const relevantLinks = flattenedResults
.map((result) => {
const link = links.find((link) => link.url === result.url);
if (link) {
return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason };
if (result.relevanceScore > (isMultiEntity ? THRESHOLD_FOR_MULTIENTITY : THRESHOLD_FOR_SINGLEPAGE)) {
const link = links.find((link) => link.url === result.url);
if (link) {
return {
...link,
relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0,
reason: result.reason
};
}
}
return undefined;
})
.filter((link): link is NonNullable<typeof link> => link !== undefined);

// Add debug logging for testing
fs.writeFile(`logs/reranker-aaa-${crypto.randomUUID()}.json`, JSON.stringify(
{
totalResults: relevantLinks.length,
scores: relevantLinks.map(l => ({
url: l.url,
score: l.relevanceScore,
reason: l.reason
}))
}, null, 2));

return {
mapDocument: relevantLinks,
tokensUsed: totalTokensUsed,
@@ -8,13 +8,21 @@ import { rerankLinksWithLLM } from "./reranker";
import { extractConfig } from "./config";
import type { Logger } from "winston";
import { generateText } from "ai";
import { getModel } from "../generic-ai";
import { getAnthropic, getGemini, getGroq, getModel } from "../generic-ai";

export async function generateBasicCompletion(prompt: string) {

const anthropic = getAnthropic();

const { text } = await generateText({
model: getModel("gpt-4o"),
model: anthropic("claude-3-7-sonnet-latest"),
prompt: prompt,
temperature: 0
providerOptions: {
anthropic: {
thinking: { type: 'enabled', budgetTokens: 12000 },
}
},
// temperature: 0.7
});
return text;
}
@@ -28,6 +36,11 @@ interface ProcessUrlOptions {
origin?: string;
limit?: number;
includeSubdomains?: boolean;
log?: any;
isMultiEntity: boolean;
reasoning: string;
multiEntityKeys: string[];
keyIndicators: string[];
}

export async function processUrl(
@@ -96,6 +109,7 @@ export async function processUrl(
linkCount: allUrls.length,
uniqueLinkCount: uniqueUrls.length,
});
options.log['uniqueUrlsLength-1'] = uniqueUrls.length;

// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
@@ -150,6 +164,8 @@ export async function processUrl(
});
}

options.log['uniqueUrlsLength-2'] = uniqueUrls.length;

// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
@@ -215,13 +231,17 @@ export async function processUrl(
links: mappedLinks,
searchQuery: rephrasedPrompt,
urlTraces,
isMultiEntity: options.isMultiEntity,
reasoning: options.reasoning,
multiEntityKeys: options.multiEntityKeys,
keyIndicators: options.keyIndicators,
});
mappedLinks = rerankerResult.mapDocument;
let tokensUsed = rerankerResult.tokensUsed;
logger.info("Reranked! (pass 1)", {
linkCount: mappedLinks.length,
});

options.log['rerankerResult-1'] = mappedLinks.length;
// 2nd Pass, useful for when the first pass returns too many links
if (mappedLinks.length > 100) {
logger.info("Reranking (pass 2)...");
@@ -229,6 +249,10 @@ export async function processUrl(
links: mappedLinks,
searchQuery: rephrasedPrompt,
urlTraces,
isMultiEntity: options.isMultiEntity,
reasoning: options.reasoning,
multiEntityKeys: options.multiEntityKeys,
keyIndicators: options.keyIndicators,
});
mappedLinks = rerankerResult.mapDocument;
tokensUsed += rerankerResult.tokensUsed;
@@ -236,6 +260,7 @@ export async function processUrl(
linkCount: mappedLinks.length,
});
}
options.log['rerankerResult-2'] = mappedLinks.length;

// dumpToFile(
// "llm-links.txt",
@@ -1,5 +1,8 @@
import { createOpenAI } from '@ai-sdk/openai';
import { createOllama } from "ollama-ai-provider";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createGroq } from "@ai-sdk/groq";
import { createAnthropic } from "@ai-sdk/anthropic";

const modelAdapter = process.env.OLLAMA_BASE_URL ? createOllama({
baseURL: process.env.OLLAMA_BASE_URL,
@@ -12,6 +15,30 @@ export function getModel(name: string) {
return process.env.MODEL_NAME ? modelAdapter(process.env.MODEL_NAME) : modelAdapter(name);
}

export function getGemini() {
return createGoogleGenerativeAI({
apiKey: process.env.GEMINI_API_KEY,
});
}

export function getGroq() {
return createGroq({
apiKey: process.env.GROQ_API_KEY ?? "",
});
}

export function getAnthropic() {
return createAnthropic({
apiKey: process.env.ANTHROPIC_API_KEY ?? "",
});
} // claude-3-7-sonnet-latest

export function getOpenAI() {
return createOpenAI({
apiKey: process.env.OPENAI_API_KEY ?? ""
});
}

export function getEmbeddingModel(name: string) {
return process.env.MODEL_EMBEDDING_NAME ? modelAdapter.embedding(process.env.MODEL_EMBEDDING_NAME) : modelAdapter.embedding(name);
}
@@ -13,6 +13,18 @@ import { generateObject, generateText, LanguageModel } from 'ai';
import { jsonSchema } from 'ai';
import { getModel } from "../../../lib/generic-ai";
import { z } from "zod";
import fs from 'fs/promises';

// TODO: fix this, it's horrible
type LanguageModelV1ProviderMetadata = {
anthropic?: {
thinking?: {
type: 'enabled' | 'disabled';
budgetTokens?: number;
};
tool_choice?: "auto" | "none" | "required";
};
};

// Get max tokens from model prices
const getModelLimits = (model: string) => {
@@ -157,6 +169,7 @@ export async function generateCompletions({
isExtractEndpoint,
model = getModel("gpt-4o-mini"),
mode = "object",
providerOptions
}: {
model?: LanguageModel;
logger: Logger;
@@ -165,6 +178,7 @@ export async function generateCompletions({
previousWarning?: string;
isExtractEndpoint?: boolean;
mode?: "object" | "no-object";
providerOptions?: LanguageModelV1ProviderMetadata;
}): Promise<{
extract: any;
numTokens: number;
@@ -191,8 +205,9 @@ export async function generateCompletions({
previousWarning
);

markdown = trimmedMarkdown;
warning = trimWarning;
// WE USE BIG MODELS NOW
// markdown = trimmedMarkdown;
// warning = trimWarning;

try {
const prompt = options.prompt !== undefined
@@ -203,8 +218,13 @@ export async function generateCompletions({
const result = await generateText({
model: model,
prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
temperature: options.temperature ?? 0,
// temperature: options.temperature ?? 0,
system: options.systemPrompt,
providerOptions: {
anthropic: {
thinking: { type: 'enabled', budgetTokens: 12000 },
}
}
});

extract = result.text;
@@ -279,7 +299,12 @@ export async function generateCompletions({
const { text: fixedText } = await generateText({
model: model,
prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON."
system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON.",
providerOptions: {
anthropic: {
thinking: { type: 'enabled', budgetTokens: 12000 },
}
}
});
return fixedText;
}
@@ -288,7 +313,8 @@ export async function generateCompletions({
const generateObjectConfig = {
model: model,
prompt: prompt,
temperature: options.temperature ?? 0,
providerOptions: providerOptions || undefined,
// temperature: options.temperature ?? 0,
system: options.systemPrompt,
...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }),
...(!schema && { output: 'no-schema' as const }),
@@ -300,9 +326,21 @@ export async function generateCompletions({
})
} satisfies Parameters<typeof generateObject>[0];

console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
const now = new Date().getTime()
console.log(now)
console.log({generateObjectConfig})

await fs.writeFile(`logs/generateObjectConfig-${now}.json`, JSON.stringify(generateObjectConfig, null, 2))

const result = await generateObject(generateObjectConfig);
extract = result.object;

const now2 = new Date().getTime()
console.log('>>>>>>', now2-now)
console.log({extract})
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

// If the users actually wants the items object, they can specify it as 'required' in the schema
// otherwise, we just return the items array
if (