Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-15 15:05:57 +08:00)

Commit c0fe770520 (parent bf0b1c7ae0): integrating smart-scrape
@@ -51,7 +51,9 @@
"typescript": "^5.4.2"
},
"dependencies": {
"@ai-sdk/anthropic": "^1.2.2",
"@ai-sdk/anthropic": "^1.2.4",
"@ai-sdk/deepinfra": "^0.2.4",
"@ai-sdk/fireworks": "^0.2.4",
"@ai-sdk/google": "^1.2.3",
"@ai-sdk/groq": "^1.2.1",
"@ai-sdk/openai": "^1.1.13",
@@ -63,6 +65,7 @@
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.40.8",
"@openrouter/ai-sdk-provider": "^0.4.5",
"@pinecone-database/pinecone": "^4.0.0",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
@@ -70,7 +73,7 @@
"@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ai": "^4.2.8",
"ai": "^4.2.10",
"ajv": "^8.16.0",
"async": "^3.2.5",
"async-mutex": "^0.5.0",
apps/api/pnpm-lock.yaml (generated, 152 changes)

The lockfile diff mirrors the package.json changes above:
- '@ai-sdk/anthropic': specifier ^1.2.2 → ^1.2.4, resolved 1.2.2 → 1.2.4 (now built on '@ai-sdk/provider' 1.1.0 and '@ai-sdk/provider-utils' 2.2.3)
- '@ai-sdk/deepinfra' ^0.2.4 and '@ai-sdk/fireworks' ^0.2.4 added, each depending on '@ai-sdk/openai-compatible' 0.2.4, '@ai-sdk/provider' 1.1.0 and '@ai-sdk/provider-utils' 2.2.3
- '@openrouter/ai-sdk-provider' ^0.4.5 added (depends on '@ai-sdk/provider' 1.0.9 and '@ai-sdk/provider-utils' 2.1.10)
- 'ai': specifier ^4.2.8 → ^4.2.10, pulling in '@ai-sdk/react' 1.2.3 → 1.2.5 and '@ai-sdk/ui-utils' 1.2.2 → 1.2.4
- new package entries and snapshots for '@ai-sdk/openai-compatible' 0.2.4, '@ai-sdk/provider' 1.0.9 and 1.1.0, '@ai-sdk/provider-utils' 2.1.10 and 2.2.3
- 'use-sync-external-store': 1.4.0 → 1.5.0
@@ -62,7 +62,17 @@ export const extractOptions = z
.string()
.max(10000)
.default(
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required.",
`You are an expert web data extractor. Your task is to analyze the provided markdown content from a web page and generate a JSON object based *strictly* on the provided schema.

Key Instructions:
1. **Schema Adherence:** Populate the JSON object according to the structure defined in the schema.
2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information.
3. **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value.
4. **SmartScrape Recommendation:**
* Assess if the *full* required data seems unavailable in the current markdown likely because user interaction (like clicking or scrolling) is needed to reveal it.
* If interaction seems necessary to get the complete data, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
* Otherwise, set \`shouldUseSmartscrape\` to \`false\`.
5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.`,
),
prompt: z.string().max(10000).optional(),
temperature: z.number().optional(),
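For illustration only (not part of the commit): under the new default instructions above, a model response for a hypothetical schema with productName and price fields, where the price is hidden behind a button, might look like the sketch below. The SmartScrape field names follow the wording of this prompt; the exact wrapper schema the scraper uses appears later in this commit.

// Illustrative output shape; "Acme Widget" and the button text are made up.
const exampleModelOutput = {
  productName: "Acme Widget",
  price: null, // not present in the initially rendered markdown
  shouldUseSmartscrape: true,
  reasoning: "Pricing is only revealed after clicking the 'Show pricing' button.",
  prompt: "Click the 'Show pricing' button and extract the listed price.",
};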
@@ -246,11 +256,9 @@ const extractRefine = (obj) => {
const hasJsonFormat = obj.formats?.includes("json");
const hasJsonOptions = obj.jsonOptions !== undefined;
return (
(hasExtractFormat && hasExtractOptions)
|| (!hasExtractFormat && !hasExtractOptions)
) && (
(hasJsonFormat && hasJsonOptions)
|| (!hasJsonFormat && !hasJsonOptions)
((hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)) &&
((hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions))
);
};
const extractRefineOpts = {
@@ -264,7 +272,7 @@ const extractTransform = (obj) => {
obj.extract ||
obj.formats?.includes("json") ||
obj.jsonOptions) &&
(obj.timeout === 30000)
obj.timeout === 30000
) {
obj = { ...obj, timeout: 60000 };
}
@@ -356,12 +364,9 @@ export const extractV1Options = z
.optional(),
})
.strict(strictMessage)
.refine(
(obj) => obj.urls || obj.prompt,
{
message: "Either 'urls' or 'prompt' must be provided.",
},
)
.refine((obj) => obj.urls || obj.prompt, {
message: "Either 'urls' or 'prompt' must be provided.",
})
.transform((obj) => ({
...obj,
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
@@ -542,8 +547,8 @@ export type Document = {
screenshots?: string[];
scrapes?: ScrapeActionContent[];
javascriptReturns?: {
type: string,
value: unknown
type: string;
value: unknown;
}[];
};
metadata: {
@@ -831,7 +836,7 @@ export function fromLegacyCrawlerOptions(x: any): {
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
maxDiscoveryDepth: x.maxDiscoveryDepth,
}),
}),
internalOptions: {
v0CrawlOnlyUrls: x.returnOnlyUrls,
},
@@ -1030,6 +1035,6 @@ export type GenerateLLMsTextRequest = z.infer<

export class TimeoutSignal extends Error {
constructor() {
super("Operation timed out")
super("Operation timed out");
}
}
@@ -7,7 +7,8 @@ import {
buildBatchExtractPrompt,
buildBatchExtractSystemPrompt,
} from "../build-prompts";
import { getGemini } from "../../generic-ai";
import { getModel } from "../../generic-ai";

import fs from "fs/promises";
/**
* Batch extract information from a list of URLs using a multi-entity schema.
@@ -31,7 +32,7 @@ export async function batchExtractPromise(
warning?: string;
sources: string[];
}> {
const gemini = getGemini();
const gemini = getModel("gemini-2.0-flash", "google");
const completion = await generateCompletions({
logger: logger.child({
method: "extractService/generateCompletions",
@@ -50,12 +51,15 @@ export async function batchExtractPromise(
isExtractEndpoint: true,
model: gemini("gemini-2.0-flash"),
});
await fs.writeFile(`logs/batchExtract-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
await fs.writeFile(
`logs/batchExtract-${crypto.randomUUID()}.json`,
JSON.stringify(completion, null, 2),
);

return {
extract: completion.extract,
numTokens: completion.numTokens,
totalUsage: completion.totalUsage,
sources: [doc.metadata.url || doc.metadata.sourceURL || ""]
sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
};
}
@@ -2,7 +2,7 @@ import { logger } from "../../../lib/logger";
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
import { getGemini } from "../../../lib/generic-ai";
import { getModel } from "../../../lib/generic-ai";
import fs from "fs/promises";

export async function singleAnswerCompletion({
@@ -22,7 +22,6 @@ export async function singleAnswerCompletion({
tokenUsage: TokenUsage;
sources: string[];
}> {
const gemini = getGemini();
const completion = await generateCompletions({
logger: logger.child({ module: "extract", method: "generateCompletions" }),
options: {
@@ -35,12 +34,17 @@ export async function singleAnswerCompletion({
},
markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
isExtractEndpoint: true,
model: gemini("gemini-2.0-flash"),
model: getModel("gemini-2.0-flash", "google"),
});
await fs.writeFile(`logs/singleAnswer-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
return {
extract: completion.extract,
await fs.writeFile(
`logs/singleAnswer-${crypto.randomUUID()}.json`,
JSON.stringify(completion, null, 2),
);
return {
extract: completion.extract,
tokenUsage: completion.totalUsage,
sources: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "")
sources: singleAnswerDocs.map(
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
),
};
}
@@ -27,10 +27,7 @@ import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
import { areMergeable } from "./helpers/merge-null-val-objs";
import { CUSTOM_U_TEAMS } from "./config";
import {
calculateFinalResultCost,
estimateTotalCost,
} from "./usage/llm-cost";
import { calculateFinalResultCost, estimateTotalCost } from "./usage/llm-cost";
import { analyzeSchemaAndPrompt } from "./completions/analyzeSchemaAndPrompt";
import { checkShouldExtract } from "./completions/checkShouldExtract";
import { batchExtractPromise } from "./completions/batchExtract";
@@ -71,7 +68,6 @@ type completions = {
sources?: string[];
};

export async function performExtraction(
extractId: string,
options: ExtractServiceOptions,
@@ -88,7 +84,7 @@ export async function performExtraction(

let log = {
extractId,
request
request,
};

const logger = _logger.child({
@@ -102,13 +98,15 @@ export async function performExtraction(
logger.debug("Generating URLs from prompt...", {
prompt: request.prompt,
});
const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt));
const rephrasedPrompt = await generateBasicCompletion(
buildRephraseToSerpPrompt(request.prompt),
);
const searchResults = await search({
query: rephrasedPrompt.replace('"', "").replace("'", ""),
query: rephrasedPrompt.replace('"', "").replace("'", ""),
num_results: 10,
});

request.urls = searchResults.map(result => result.url) as string[];
request.urls = searchResults.map((result) => result.url) as string[];
}
if (request.urls && request.urls.length === 0) {
logger.error("No search results found", {
@@ -123,7 +121,11 @@ export async function performExtraction(

const urls = request.urls || ([] as string[]);

if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) {
if (
request.__experimental_cacheMode == "load" &&
request.__experimental_cacheKey &&
urls
) {
logger.debug("Loading cached docs...");
try {
const cache = await getCachedDocs(urls, request.__experimental_cacheKey);
@@ -170,7 +172,6 @@ export async function performExtraction(
schema: reqSchema,
});

let rSchema = reqSchema;

// agent evaluates if the schema or the prompt has an array with big amount of items
@@ -202,7 +203,7 @@ export async function performExtraction(
logger.debug("Processing URLs...", {
urlCount: request.urls?.length || 0,
});

const urlPromises = urls.map((url) =>
processUrl(
{
@@ -245,8 +246,8 @@ export async function performExtraction(
linkCount: links.length,
});

log['links'] = links;
log['linksLength'] = links.length;
log["links"] = links;
log["linksLength"] = links.length;

if (links.length === 0) {
logger.error("0 links! Bailing.", {
@@ -275,7 +276,7 @@ export async function performExtraction(
});

if (isMultiEntity && reqSchema) {
log['isMultiEntity'] = true;
log["isMultiEntity"] = true;
logger.debug("=== MULTI-ENTITY ===");

const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
@@ -313,8 +314,8 @@ export async function performExtraction(

logger.debug("Starting multi-entity scrape...");
let startScrape = Date.now();
log['docsSizeBeforeMultiEntityScrape'] = docsMap.size;

log["docsSizeBeforeMultiEntityScrape"] = docsMap.size;

const scrapePromises = links.map((url) => {
if (!docsMap.has(normalizeUrl(url))) {
return scrapeDocument(
@@ -337,7 +338,7 @@ export async function performExtraction(

// Needs to be true for multi-entity to work properly
onlyMainContent: true,
}
},
);
}
return docsMap.get(normalizeUrl(url));
@@ -347,7 +348,7 @@ export async function performExtraction(
(doc): doc is Document => doc !== null,
);

log['docsSizeAfterMultiEntityScrape'] = scrapePromises.length;
log["docsSizeAfterMultiEntityScrape"] = scrapePromises.length;

logger.debug("Multi-entity scrape finished.", {
docCount: multyEntityDocs.length,
@@ -381,7 +382,7 @@ export async function performExtraction(
const chunkSize = 50;
const timeoutCompletion = 45000; // 45 second timeout
const chunks: Document[][] = [];
const extractionResults: {extract: any, url: string}[] = [];
const extractionResults: { extract: any; url: string }[] = [];

// Split into chunks
for (let i = 0; i < multyEntityDocs.length; i += chunkSize) {
@@ -445,7 +446,13 @@ export async function performExtraction(
// ],
// });

const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc);
const completionPromise = batchExtractPromise(
multiEntitySchema,
links,
request.prompt ?? "",
request.systemPrompt ?? "",
doc,
);

// Race between timeout and completion
const multiEntityCompletion = (await Promise.race([
@@ -456,11 +463,11 @@ export async function performExtraction(
// Track multi-entity extraction tokens
if (multiEntityCompletion) {
tokenUsage.push(multiEntityCompletion.totalUsage);

if (multiEntityCompletion.extract) {
return {
extract: multiEntityCompletion.extract,
url: doc.metadata.url || doc.metadata.sourceURL || ""
url: doc.metadata.url || doc.metadata.sourceURL || "",
};
}
}
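For context (not part of the commit): the multi-entity path above splits documents into chunks of 50 and races each batchExtractPromise against a 45-second timeout so one slow page cannot stall the whole batch. A simplified, self-contained sketch of that pattern, with illustrative names, is below; only the chunk size, the timeout, and the race come from the surrounding diff.

// Minimal sketch of the chunk-and-race pattern; extractOne stands in for batchExtractPromise.
async function runChunked<T>(
  docs: T[],
  extractOne: (doc: T) => Promise<unknown>,
  chunkSize = 50,
  timeoutMs = 45000,
): Promise<unknown[]> {
  const results: unknown[] = [];
  for (let i = 0; i < docs.length; i += chunkSize) {
    const chunk = docs.slice(i, i + chunkSize);
    // Each extraction races a timeout that resolves to null instead of rejecting.
    const settled = await Promise.all(
      chunk.map((doc) =>
        Promise.race([
          extractOne(doc),
          new Promise((resolve) => setTimeout(() => resolve(null), timeoutMs)),
        ]),
      ),
    );
    results.push(...settled.filter((r) => r !== null));
  }
  return results;
}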
@@ -509,38 +516,46 @@ export async function performExtraction(

// Wait for current chunk to complete before processing next chunk
const chunkResults = await Promise.all(chunkPromises);
const validResults = chunkResults.filter((result): result is {extract: any, url: string} => result !== null);
const validResults = chunkResults.filter(
(result): result is { extract: any; url: string } => result !== null,
);
extractionResults.push(...validResults);
multiEntityCompletions.push(...validResults.map(r => r.extract));
multiEntityCompletions.push(...validResults.map((r) => r.extract));
logger.debug("All multi-entity completion chunks finished.", {
completionCount: multiEntityCompletions.length,
});
log['multiEntityCompletionsLength'] = multiEntityCompletions.length;
log["multiEntityCompletionsLength"] = multiEntityCompletions.length;
}

try {
// Use SourceTracker to handle source tracking
const sourceTracker = new SourceTracker();

// Transform and merge results while preserving sources
sourceTracker.transformResults(extractionResults, multiEntitySchema, false);

sourceTracker.transformResults(
extractionResults,
multiEntitySchema,
false,
);

multiEntityResult = transformArrayToObject(
multiEntitySchema,
multiEntityCompletions,
);

// Track sources before deduplication
sourceTracker.trackPreDeduplicationSources(multiEntityResult);

// Apply deduplication and merge
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
multiEntityResult = mergeNullValObjs(multiEntityResult);

// Map sources to final deduplicated/merged items
const multiEntitySources = sourceTracker.mapSourcesToFinalItems(multiEntityResult, multiEntityKeys);
Object.assign(sources, multiEntitySources);

// Map sources to final deduplicated/merged items
const multiEntitySources = sourceTracker.mapSourcesToFinalItems(
multiEntityResult,
multiEntityKeys,
);
Object.assign(sources, multiEntitySources);
} catch (error) {
logger.error(`Failed to transform array to object`, { error });
return {
@@ -559,7 +574,7 @@ export async function performExtraction(
rSchema.properties &&
Object.keys(rSchema.properties).length > 0
) {
log['isSingleEntity'] = true;
log["isSingleEntity"] = true;
logger.debug("=== SINGLE PAGES ===", {
linkCount: links.length,
schema: rSchema,
@@ -582,7 +597,7 @@ export async function performExtraction(
},
],
});
log['docsSizeBeforeSingleEntityScrape'] = docsMap.size;
log["docsSizeBeforeSingleEntityScrape"] = docsMap.size;
const scrapePromises = links.map((url) => {
if (!docsMap.has(normalizeUrl(url))) {
return scrapeDocument(
@@ -600,7 +615,7 @@ export async function performExtraction(
url,
isMultiEntity: false,
}),
request.scrapeOptions
request.scrapeOptions,
);
}
return docsMap.get(normalizeUrl(url));
@@ -608,7 +623,7 @@ export async function performExtraction(

try {
const results = await Promise.all(scrapePromises);
log['docsSizeAfterSingleEntityScrape'] = docsMap.size;
log["docsSizeAfterSingleEntityScrape"] = docsMap.size;

for (const doc of results) {
if (doc?.metadata?.url) {
@@ -661,8 +676,12 @@ export async function performExtraction(

// Generate completions
logger.debug("Generating singleAnswer completions...");
log['singleAnswerDocsLength'] = singleAnswerDocs.length;
let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({
log["singleAnswerDocsLength"] = singleAnswerDocs.length;
let {
extract: completionResult,
tokenUsage: singleAnswerTokenUsage,
sources: singleAnswerSources,
} = await singleAnswerCompletion({
singleAnswerDocs,
rSchema,
links,
@@ -674,12 +693,16 @@ export async function performExtraction(
// Track single answer extraction tokens and sources
if (completionResult) {
tokenUsage.push(singleAnswerTokenUsage);

// Add sources for top-level properties in single answer
if (rSchema?.properties) {
Object.keys(rSchema.properties).forEach(key => {
Object.keys(rSchema.properties).forEach((key) => {
if (completionResult[key] !== undefined) {
sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "");
sources[key] =
singleAnswerSources ||
singleAnswerDocs.map(
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
);
}
});
}
@@ -708,8 +731,8 @@ export async function performExtraction(
// }
}

log['singleAnswerResult'] = singleAnswerResult;
log['multiEntityResult'] = multiEntityResult;
log["singleAnswerResult"] = singleAnswerResult;
log["multiEntityResult"] = multiEntityResult;

let finalResult = reqSchema
? await mixSchemaObjects(
@@ -815,16 +838,25 @@ export async function performExtraction(

logger.debug("Done!");

if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) {
if (
request.__experimental_cacheMode == "save" &&
request.__experimental_cacheKey
) {
logger.debug("Saving cached docs...");
try {
await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey);
await saveCachedDocs(
[...docsMap.values()],
request.__experimental_cacheKey,
);
} catch (error) {
logger.error("Error saving cached docs", { error });
}
}

fs.writeFile(`logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`, JSON.stringify(log, null, 2));
// fs.writeFile(
// `logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`,
// JSON.stringify(log, null, 2),
// );

return {
success: true,
@@ -9,7 +9,7 @@ import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildRerankerUserPrompt } from "./build-prompts";
import { buildRerankerSystemPrompt } from "./build-prompts";
import { dumpToFile } from "./helpers/dump-to-file";
import { getAnthropic, getGemini, getGroq, getModel, getOpenAI } from "../generic-ai";
import { getModel } from "../generic-ai";
import fs from "fs/promises";

const THRESHOLD_FOR_SINGLEPAGE = 0.6;
@@ -178,15 +178,28 @@ export type RerankerOptions = {
keyIndicators: string[];
};

export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
const { links, searchQuery, urlTraces, isMultiEntity, reasoning, multiEntityKeys, keyIndicators } = options;
export async function rerankLinksWithLLM(
options: RerankerOptions,
): Promise<RerankerResult> {
const {
links,
searchQuery,
urlTraces,
isMultiEntity,
reasoning,
multiEntityKeys,
keyIndicators,
} = options;
const chunkSize = 5000;
const chunks: MapDocument[][] = [];
const TIMEOUT_MS = 60000;
const MAX_RETRIES = 2;
let totalTokensUsed = 0;

await fs.writeFile(`logs/links-${crypto.randomUUID()}.txt`, JSON.stringify(links, null, 2));
await fs.writeFile(
`logs/links-${crypto.randomUUID()}.txt`,
JSON.stringify(links, null, 2),
);

// Split links into chunks of 200
for (let i = 0; i < links.length; i += chunkSize) {
@@ -205,7 +218,11 @@ export async function rerankLinksWithLLM(
properties: {
url: { type: "string" },
relevanceScore: { type: "number" },
reason: { type: "string", description: "The reason why you chose the score for this link given the intent." },
reason: {
type: "string",
description:
"The reason why you chose the score for this link given the intent.",
},
},
required: ["url", "relevanceScore", "reason"],
},
@@ -218,7 +235,6 @@ export async function rerankLinksWithLLM(
chunks.map(async (chunk, chunkIndex) => {
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);

const linksContent = chunk
.map(
(link) =>
@@ -226,7 +242,10 @@ export async function rerankLinksWithLLM(
)
.join("\n\n");

fs.writeFile(`logs/links-content-${crypto.randomUUID()}.txt`, linksContent);
fs.writeFile(
`logs/links-content-${crypto.randomUUID()}.txt`,
linksContent,
);

for (let retry = 0; retry <= MAX_RETRIES; retry++) {
try {
@@ -234,18 +253,21 @@ export async function rerankLinksWithLLM(
setTimeout(() => resolve(null), TIMEOUT_MS);
});

const systemPrompt = `You are analyzing URLs for ${isMultiEntity ? 'collecting multiple items' : 'specific information'}.
const systemPrompt = `You are analyzing URLs for ${isMultiEntity ? "collecting multiple items" : "specific information"}.
The user's query is: ${searchQuery}
${isMultiEntity
? `IMPORTANT: This is a multi-entity extraction task looking for ${multiEntityKeys.join(', ')}.
${
isMultiEntity
? `IMPORTANT: This is a multi-entity extraction task looking for ${multiEntityKeys.join(", ")}.
Score URLs higher if they contain ANY instance of the target entities.
Key indicators to look for: ${keyIndicators.join(', ')}`
: `IMPORTANT: This is a specific information task.
Key indicators to look for: ${keyIndicators.join(", ")}`
: `IMPORTANT: This is a specific information task.
Score URLs based on precision and relevance to answering the query.`
}

Scoring guidelines:
${isMultiEntity ? `
${
isMultiEntity
? `
- 1.0: Contains ANY instance of target entities, even just one. Give this score if page has any relevant entity. If you are not sure if this page is relevant or not, give it a score of 1.0
- 0.8: Contains entity but may be incomplete information
- 0.6: Mentions entity type but no clear instance
@@ -253,21 +275,22 @@ export async function rerankLinksWithLLM(
- Below 0.4: No mention of relevant entities, or duplicates

Reason: ${reasoning}
` : `
`
: `
- 1.0: Contains direct, authoritative answer to query. Give this score if unsure about relevance. If you are not sure if this page is relevant or not, give it a score of 1.0
- 0.8: Contains information that directly helps answer the query
- 0.6: Contains related information that partially answers query
- Below 0.6: Information too general or not focused on query
`}`;
`
}`;

// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
// const gemini = getGemini();
const model = getOpenAI()
// const model = getGemini()
let completion: any;
try {
const completionPromise = generateCompletions({
model: model("o3-mini"),
model: getModel("o3-mini", "openai"),
logger: logger.child({
method: "rerankLinksWithLLM",
chunk: chunkIndex + 1,
@@ -287,27 +310,46 @@ export async function rerankLinksWithLLM(
// },
// },
markdown: linksContent,
isExtractEndpoint: true
isExtractEndpoint: true,
});

completion = await completionPromise
completion = await completionPromise;
// completion = await Promise.race([
// completionPromise,
// timeoutPromise,
// ]);

console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!˜")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log({ completion })
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!˜",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log({ completion });
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
console.log(
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
);
} catch (error) {
console.warn(
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
@@ -315,7 +357,10 @@ export async function rerankLinksWithLLM(
);
}

await fs.writeFile(`logs/reranker-${crypto.randomUUID()}.json`, JSON.stringify(completion, null, 2));
await fs.writeFile(
`logs/reranker-${crypto.randomUUID()}.json`,
JSON.stringify(completion, null, 2),
);

if (!completion) {
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
@@ -356,13 +401,18 @@ export async function rerankLinksWithLLM(
// Map back to MapDocument format, keeping ALL links for testing
const relevantLinks = flattenedResults
.map((result) => {
if (result.relevanceScore > (isMultiEntity ? THRESHOLD_FOR_MULTIENTITY : THRESHOLD_FOR_SINGLEPAGE)) {
if (
result.relevanceScore >
(isMultiEntity ? THRESHOLD_FOR_MULTIENTITY : THRESHOLD_FOR_SINGLEPAGE)
) {
const link = links.find((link) => link.url === result.url);
if (link) {
return {
...link,
relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0,
reason: result.reason
return {
...link,
relevanceScore: result.relevanceScore
? parseFloat(result.relevanceScore)
: 0,
reason: result.reason,
};
}
}
@@ -371,15 +421,21 @@ export async function rerankLinksWithLLM(
.filter((link): link is NonNullable<typeof link> => link !== undefined);

// Add debug logging for testing
fs.writeFile(`logs/reranker-aaa-${crypto.randomUUID()}.json`, JSON.stringify(
{
totalResults: relevantLinks.length,
scores: relevantLinks.map(l => ({
url: l.url,
score: l.relevanceScore,
reason: l.reason
}))
}, null, 2));
fs.writeFile(
`logs/reranker-aaa-${crypto.randomUUID()}.json`,
JSON.stringify(
{
totalResults: relevantLinks.length,
scores: relevantLinks.map((l) => ({
url: l.url,
score: l.relevanceScore,
reason: l.reason,
})),
},
null,
2,
),
);

return {
mapDocument: relevantLinks,
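For illustration only (not part of the commit): the reranker schema above asks the model for one object per link, and links are then kept only if their score clears the relevant threshold (THRESHOLD_FOR_SINGLEPAGE = 0.6 here; the multi-entity threshold is defined elsewhere in the file). A made-up example of one such item:

// Illustrative item matching the { url, relevanceScore, reason } schema; values are invented.
const exampleRankedLink = {
  url: "https://example.com/pricing",
  relevanceScore: 0.8,
  reason: "Directly lists the product prices the query asks about.",
};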
@@ -8,19 +8,16 @@ import { rerankLinksWithLLM } from "./reranker";
import { extractConfig } from "./config";
import type { Logger } from "winston";
import { generateText } from "ai";
import { getAnthropic, getGemini, getGroq, getModel } from "../generic-ai";
import { getModel } from "../generic-ai";

export async function generateBasicCompletion(prompt: string) {

const anthropic = getAnthropic();

const { text } = await generateText({
model: anthropic("claude-3-7-sonnet-latest"),
model: getModel("claude-3-7-sonnet-latest", "anthropic"),
prompt: prompt,
providerOptions: {
anthropic: {
thinking: { type: 'enabled', budgetTokens: 12000 },
}
thinking: { type: "enabled", budgetTokens: 12000 },
},
},
// temperature: 0.7
});
@@ -109,7 +106,7 @@ export async function processUrl(
linkCount: allUrls.length,
uniqueLinkCount: uniqueUrls.length,
});
options.log['uniqueUrlsLength-1'] = uniqueUrls.length;
options.log["uniqueUrlsLength-1"] = uniqueUrls.length;

// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
@@ -164,7 +161,7 @@ export async function processUrl(
});
}

options.log['uniqueUrlsLength-2'] = uniqueUrls.length;
options.log["uniqueUrlsLength-2"] = uniqueUrls.length;

// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
@@ -241,7 +238,7 @@ export async function processUrl(
logger.info("Reranked! (pass 1)", {
linkCount: mappedLinks.length,
});
options.log['rerankerResult-1'] = mappedLinks.length;
options.log["rerankerResult-1"] = mappedLinks.length;
// 2nd Pass, useful for when the first pass returns too many links
if (mappedLinks.length > 100) {
logger.info("Reranking (pass 2)...");
@@ -260,7 +257,7 @@ export async function processUrl(
linkCount: mappedLinks.length,
});
}
options.log['rerankerResult-2'] = mappedLinks.length;
options.log["rerankerResult-2"] = mappedLinks.length;

// dumpToFile(
// "llm-links.txt",
@@ -1,44 +1,50 @@
import { createOpenAI } from '@ai-sdk/openai';
import { openai } from "@ai-sdk/openai";
import { createOllama } from "ollama-ai-provider";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createGroq } from "@ai-sdk/groq";
import { createAnthropic } from "@ai-sdk/anthropic";
import { anthropic } from "@ai-sdk/anthropic";
import { groq } from "@ai-sdk/groq";
import { google } from "@ai-sdk/google";
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
import { fireworks } from "@ai-sdk/fireworks";
import { deepinfra } from "@ai-sdk/deepinfra";
type Provider =
| "openai"
| "ollama"
| "anthropic"
| "groq"
| "google"
| "openrouter"
| "fireworks"
| "deepinfra";
const defaultProvider: Provider = process.env.OLLAMA_BASE_URL
? "ollama"
: "openai";

const modelAdapter = process.env.OLLAMA_BASE_URL ? createOllama({
const providerList: Record<Provider, any> = {
openai, //OPENAI_API_KEY
ollama: createOllama({
baseURL: process.env.OLLAMA_BASE_URL,
}) : createOpenAI({
apiKey: process.env.OPENAI_API_KEY,
baseURL: process.env.OPENAI_BASE_URL,
});
}),
anthropic, //ANTHROPIC_API_KEY
groq, //GROQ_API_KEY
google, //GOOGLE_GENERATIVE_AI_API_KEY
openrouter: createOpenRouter({
apiKey: process.env.OPENROUTER_API_KEY,
}),
fireworks, //FIREWORKS_API_KEY
deepinfra, //DEEPINFRA_API_KEY
};

export function getModel(name: string) {
return process.env.MODEL_NAME ? modelAdapter(process.env.MODEL_NAME) : modelAdapter(name);
export function getModel(name: string, provider: Provider = defaultProvider) {
return process.env.MODEL_NAME
? providerList[provider](process.env.MODEL_NAME)
: providerList[provider](name);
}

export function getGemini() {
return createGoogleGenerativeAI({
apiKey: process.env.GEMINI_API_KEY,
});
}

export function getGroq() {
return createGroq({
apiKey: process.env.GROQ_API_KEY ?? "",
});
}

export function getAnthropic() {
return createAnthropic({
apiKey: process.env.ANTHROPIC_API_KEY ?? "",
});
} // claude-3-7-sonnet-latest

export function getOpenAI() {
return createOpenAI({
apiKey: process.env.OPENAI_API_KEY ?? ""
});
}

export function getEmbeddingModel(name: string) {
return process.env.MODEL_EMBEDDING_NAME ? modelAdapter.embedding(process.env.MODEL_EMBEDDING_NAME) : modelAdapter.embedding(name);
export function getEmbeddingModel(
name: string,
provider: Provider = defaultProvider,
) {
return process.env.MODEL_EMBEDDING_NAME
? providerList[provider].embedding(process.env.MODEL_EMBEDDING_NAME)
: providerList[provider].embedding(name);
}
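For context (not part of the commit): a minimal sketch of how call sites move from the old per-provider helpers to the new getModel(name, provider) signature defined above. The model names are illustrative, and the sketch assumes the matching provider API keys (for example GOOGLE_GENERATIVE_AI_API_KEY or ANTHROPIC_API_KEY) are set in the environment.

// Old pattern: build a provider client, then resolve a model from it.
//   const gemini = getGemini();
//   const model = gemini("gemini-2.0-flash");
// New pattern: one helper looks the provider up in providerList.
import { getModel } from "./generic-ai";

const gemini = getModel("gemini-2.0-flash", "google");
const claude = getModel("claude-3-7-sonnet-latest", "anthropic");
// With no provider argument, defaultProvider applies: "ollama" when OLLAMA_BASE_URL is set, otherwise "openai".
const fallback = getModel("gpt-4o-mini");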
@@ -9,17 +9,18 @@ import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
import { modelPrices } from "../../../lib/extract/usage/model-prices";
import { generateObject, generateText, LanguageModel } from 'ai';
import { jsonSchema } from 'ai';
import { generateObject, generateText, LanguageModel } from "ai";
import { jsonSchema } from "ai";
import { getModel } from "../../../lib/generic-ai";
import { z } from "zod";
import fs from 'fs/promises';
import fs from "fs/promises";
import Ajv from "ajv";

// TODO: fix this, it's horrible
type LanguageModelV1ProviderMetadata = {
anthropic?: {
thinking?: {
type: 'enabled' | 'disabled';
type: "enabled" | "disabled";
budgetTokens?: number;
};
tool_choice?: "auto" | "none" | "required";
@@ -101,21 +102,24 @@ function normalizeSchema(x: any): any {
}
}

interface TrimResult {
text: string;
numTokens: number;
warning?: string;
}

export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
export function trimToTokenLimit(
text: string,
maxTokens: number,
modelId: string = "gpt-4o",
previousWarning?: string,
): TrimResult {
try {
const encoder = encoding_for_model(modelId as TiktokenModel);
try {
const tokens = encoder.encode(text);
const numTokens = tokens.length;

if (numTokens <= maxTokens) {
return { text, numTokens };
}
@@ -123,7 +127,7 @@ export function trimToTokenLimit(
const modifier = 3;
// Start with 3 chars per token estimation
let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);

// Keep trimming until we're under the token limit
while (true) {
const currentTokens = encoder.encode(currentText);
@@ -132,14 +136,18 @@ export function trimToTokenLimit(
return {
text: currentText,
numTokens: currentTokens.length,
warning: previousWarning ? `${warning} ${previousWarning}` : warning
warning: previousWarning
? `${warning} ${previousWarning}`
: warning,
};
}
const overflow = currentTokens.length * modifier - maxTokens - 1;
// If still over limit, remove another chunk
currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
currentText = currentText.slice(
0,
Math.floor(currentText.length - overflow),
);
}

} catch (e) {
throw e;
} finally {
@@ -150,13 +158,13 @@ export function trimToTokenLimit(
const estimatedCharsPerToken = 2.8;
const safeLength = maxTokens * estimatedCharsPerToken;
const trimmedText = text.slice(0, Math.floor(safeLength));

const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;

return {
text: trimmedText,
numTokens: maxTokens, // We assume we hit the max in this fallback case
warning: previousWarning ? `${warning} ${previousWarning}` : warning
warning: previousWarning ? `${warning} ${previousWarning}` : warning,
};
}
}
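For context (not part of the commit): a minimal usage sketch of trimToTokenLimit as declared above, using its default gpt-4o tokenizer. longMarkdown and logger are placeholders assumed to exist in the calling scope.

// Cap a long markdown document at 8,000 tokens before building a prompt.
const { text, numTokens, warning } = trimToTokenLimit(longMarkdown, 8000);
if (warning) {
  // The input was cut down to fit the limit; surface that to the caller.
  logger.warn(warning);
}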
@ -169,9 +177,9 @@ export async function generateCompletions({
|
||||
isExtractEndpoint,
|
||||
model = getModel("gpt-4o-mini"),
|
||||
mode = "object",
|
||||
providerOptions
|
||||
providerOptions,
|
||||
}: {
|
||||
model?: LanguageModel;
|
||||
model?: LanguageModel;
|
||||
logger: Logger;
|
||||
options: ExtractOptions;
|
||||
markdown?: string;
|
||||
@ -198,21 +206,21 @@ export async function generateCompletions({
|
||||
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
|
||||
|
||||
// Use the new trimming function
|
||||
const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit(
|
||||
markdown,
|
||||
maxTokensSafe,
|
||||
model.modelId,
|
||||
previousWarning
|
||||
);
|
||||
const {
|
||||
text: trimmedMarkdown,
|
||||
numTokens,
|
||||
warning: trimWarning,
|
||||
} = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning);
|
||||
|
||||
// WE USE BIG MODELS NOW
|
||||
// markdown = trimmedMarkdown;
|
||||
// warning = trimWarning;
|
||||
|
||||
try {
|
||||
const prompt = options.prompt !== undefined
|
||||
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
|
||||
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
|
||||
const prompt =
|
||||
options.prompt !== undefined
|
||||
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
|
||||
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
|
||||
|
||||
if (mode === "no-object") {
|
||||
const result = await generateText({
|
||||
@ -222,13 +230,13 @@ export async function generateCompletions({
|
||||
system: options.systemPrompt,
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: 'enabled', budgetTokens: 12000 },
|
||||
}
|
||||
}
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
extract = result.text;
|
||||
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
@ -299,15 +307,16 @@ export async function generateCompletions({
|
||||
const { text: fixedText } = await generateText({
|
||||
model: model,
|
||||
prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
|
||||
system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON.",
|
||||
system:
|
||||
"You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON.",
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: 'enabled', budgetTokens: 12000 },
|
||||
}
|
||||
}
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
},
|
||||
});
|
||||
return fixedText;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
const generateObjectConfig = {
|
||||
@ -316,30 +325,39 @@ export async function generateCompletions({
|
||||
providerOptions: providerOptions || undefined,
|
||||
// temperature: options.temperature ?? 0,
|
||||
system: options.systemPrompt,
|
||||
...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }),
|
||||
...(!schema && { output: 'no-schema' as const }),
|
||||
...(schema && {
|
||||
schema: schema instanceof z.ZodType ? schema : jsonSchema(schema),
|
||||
}),
|
||||
...(!schema && { output: "no-schema" as const }),
|
||||
...repairConfig,
|
||||
...(!schema && {
|
||||
onError: (error: Error) => {
|
||||
console.error(error);
|
||||
}
|
||||
})
|
||||
},
|
||||
}),
|
||||
} satisfies Parameters<typeof generateObject>[0];
|
||||
|
||||
    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    const now = new Date().getTime()
    console.log(now)
    console.log({generateObjectConfig})
    console.log(
      "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
    );
    const now = new Date().getTime();
    console.log(now);
    console.log({ generateObjectConfig });

    await fs.writeFile(`logs/generateObjectConfig-${now}.json`, JSON.stringify(generateObjectConfig, null, 2))
    await fs.writeFile(
      `logs/generateObjectConfig-${now}.json`,
      JSON.stringify(generateObjectConfig, null, 2),
    );

    const result = await generateObject(generateObjectConfig);
    extract = result.object;

    const now2 = new Date().getTime()
    console.log('>>>>>>', now2-now)
    console.log({extract})
    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    const now2 = new Date().getTime();
    console.log(">>>>>>", now2 - now);
    console.log({ extract });
    console.log(
      "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
    );

    // If the users actually wants the items object, they can specify it as 'required' in the schema
    // otherwise, we just return the items array
@ -367,7 +385,7 @@ export async function generateCompletions({
      model: model.modelId,
    };
  } catch (error) {
    if (error.message?.includes('refused')) {
    if (error.message?.includes("refused")) {
      throw new LLMRefusalError(error.message);
    }
    throw error;
@ -379,20 +397,116 @@ export async function performLLMExtract(
  document: Document,
): Promise<Document> {
  if (meta.options.formats.includes("extract")) {
    const originalOptions = meta.options.extract!;
    let generationOptions = { ...originalOptions }; // Start with original options
    let schemaWasWrapped = false;

    if (originalOptions.schema) {
      const wrappedSchema = {
        type: "object",
        properties: {
          extractedData: originalOptions.schema, // Nest the original schema
          shouldUseSmartscrape: {
            type: "boolean",
            description:
              "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
          },
          smartscrape_reasoning: {
            type: "string",
            description:
              "Fill this only if shouldUseSmartscrape is true. Reasoning for why you think the page requires or doesnt require smartscrape. If it does explain which data you can't get with the initial page load.",
          },
          smartscrape_prompt: {
            type: "string",
            description:
              "Prompt to use for Smartscrape refinement if shouldUseSmartscrape is true. Explain exactly what actions smartscrape should do. Smartscrape is a tool that can perform actions on the page like clicking, scrolling, etc.",
          },
        },
        required: ["reasoning", "shouldUseSmartscrape"],
        // Conditionally require 'prompt' if 'shouldUseSmartscrape' is true
        // if: {
        //   properties: {
        //     shouldUseSmartscrape: { const: true },
        //   },
        //   required: ["shouldUseSmartscrape"],
        // },
        // then: {
        //   required: ["prompt"],
        // },
      };

      // Update generationOptions to use the wrapped schema
      generationOptions.schema = wrappedSchema;
      schemaWasWrapped = true;
      meta.logger.info("Using wrapped schema for LLM extraction.", {
        wrappedSchema,
      });
    } else {
      meta.logger.info(
        "No original schema provided, proceeding without wrapping.",
      );
    }

    meta.internalOptions.abort?.throwIfAborted();
    const { extract, warning } = await generateCompletions({
    const { extract, warning, totalUsage, model } = await generateCompletions({
      logger: meta.logger.child({
        method: "performLLMExtract/generateCompletions",
      }),
      options: meta.options.extract!,
      options: generationOptions, // Pass potentially modified options
      markdown: document.markdown,
      previousWarning: document.warning
      previousWarning: document.warning,
      // model: getModel("deepseek-ai/DeepSeek-R1", "deepinfra"),
      // model: getModel("deepseek-ai/DeepSeek-V3-0324", "deepinfra"),

      // model: getModel("gemini-2.5-pro-exp-03-25", "google"),
      // model: getModel("o3-mini", "openai"),

      // model: getModel("gemini-2.0-flash", "google"),
      // model: getModel("accounts/fireworks/models/deepseek-r1", "fireworks"),
      // model: getModel("gpt-4o-mini", "openai"),
      // model: getModel("gemini-2.5-pro-exp-03-25", "google"),
      // model: getModel("o3-mini", "openai"),
      model: getModel("qwen-qwq-32b", "groq"),

      // model: getModel("claude-3-7-sonnet", "anthropic"),
      providerOptions: {
        anthropic: {
          thinking: { type: "enabled", budgetTokens: 12000 },
        },
      },
    });

    // Log token usage
    meta.logger.info("LLM extraction token usage", {
      model: model,
      promptTokens: totalUsage.promptTokens,
      completionTokens: totalUsage.completionTokens,
      totalTokens: totalUsage.totalTokens,
    });

    // Extract the actual data if the schema was wrapped
    let finalExtract = schemaWasWrapped ? extract?.extractedData : extract;
    console.log({ extract });
    // Double-check extraction if wrapping occurred but extractedData is missing
    if (
      schemaWasWrapped &&
      finalExtract === undefined &&
      extract?.hasOwnProperty("extractedData")
    ) {
      finalExtract = extract.extractedData;
    } else if (schemaWasWrapped && finalExtract === undefined) {
      // Log a warning if wrapping occurred but the expected structure wasn't returned
      meta.logger.warn(
        "Schema was wrapped, but LLM result did not contain expected 'extractedData' property.",
        { extractResult: extract },
      );
    }

    // Assign the final extracted data
    if (meta.options.formats.includes("json")) {
      document.json = extract;
      document.json = finalExtract;
    } else {
      document.extract = extract;
      document.extract = finalExtract;
    }
    document.warning = warning;
  }
@ -404,7 +518,7 @@ export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  const rest = { ...schema };


  // unsupported global keys
  delete rest.default;

@ -486,13 +600,12 @@ DO NOT USE FORMATS.
Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
        prompt: `Generate a JSON schema for extracting the following information: ${prompt}`,
        temperature: temp
        temperature: temp,
      },
      markdown: prompt
      markdown: prompt,
    });

    return extract;

  } catch (error) {
    lastError = error as Error;
    logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
3611
apps/js-sdk/firecrawl/pnpm-lock.yaml
generated
Normal file
File diff suppressed because it is too large