mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 13:52:01 +08:00
(feat/fire-1) FIRE-1 (#1462)
* wip
* integrating smart-scrape
* integrate smartscrape into llmExtract
* wip
* smart scrape multiple links
* fixes
* fix
* wip
* it worked!
* wip. there's a bug on the batchExtract TypeError: Converting circular structure to JSON
* wip
* retry model
* retry models
* feat/scrape+json+extract interfaces ready
* vertex -> googleapi
* fix/transformArrayToObject. required params on schema is still a bug
* change model
* o3-mini -> gemini
* Update extractSmartScrape.ts
* sessionId
* sessionId
* Nick: f-0 start
* Update extraction-service-f0.ts
* Update types.ts
* Nick:
* Update queue-worker.ts
* Nick: new interface
* rename analyzeSchemaAndPrompt -> F0
* refactor: rename agent ID to model in types and extract logic
* agent
* id->model
* id->model
* refactor: standardize agent model handling and validation across extraction logic
* livecast agent
* (feat/f1) sdks (#1459)
  * feat: add FIRE-1 agent support to Python and JavaScript SDKs
    Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>
  * feat: add FIRE-1 agent support to scrape methods in both SDKs
    Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>
  * feat: add prompt and sessionId to AgentOptions interface
    Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>
  * Update index.ts
  ---------
  Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
  Co-authored-by: hello@sideguide.dev <hello@sideguide.dev>
  Co-authored-by: Nicolas <nicolascamara29@gmail.com>
* feat(v1): rate limits
* Update types.ts
* Update llmExtract.ts
* add cost tracking
* remove
* Update requests.http
* fix smart scrape cost calc
* log sm cost
* fix counts
* fix
* expose cost tracking
* models fix
* temp: skipLibcheck
* get rid of it
* fix ts
* dont skip lib check
* Update extractSmartScrape.ts
* Update queue-worker.ts
* Update smartScrape.ts
* Update requests.http
* fix(rate-limiter):
* types: fire-1 refine
* bill 150
* fix credits used on crawl
* ban from crawl
* route cost limit warning
* Update generic-ai.ts
* genres
* Update llmExtract.ts
* test server diff
* cletu

---------

Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Co-authored-by: Thomas Kosmas <thomas510111@gmail.com>
Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com>
Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello@sideguide.dev>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in:
parent
e2c4b0e72f
commit
6634d236bf
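For orientation, here is a minimal sketch of how the FIRE-1 agent option introduced by this commit could be sent to the v1 scrape endpoint. The request shape follows requests.http and the scrapeRequestSchema changes further down in this diff; the target URL and the agent prompt are illustrative, not values from the commit.

// Sketch only: POST /v1/scrape with the new optional `agent` block.
// Per this diff, agent.model defaults to "fire-1" (agentExtractModelValue),
// specifying an agent raises the default timeout to 300000 ms, and a
// FIRE-1 scrape is billed at 150 credits.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.TEST_API_KEY}`, // same env var used in requests.http
    "content-type": "application/json",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    agent: {
      model: "fire-1",
      prompt: "Navigate any pagination and gather the page content", // illustrative prompt
    },
  }),
});
const { success, data } = await res.json();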
4 .github/workflows/test-server.yml vendored
@@ -31,6 +31,10 @@ env:
RUNPOD_MU_API_KEY: ${{ secrets.RUNPOD_MU_API_KEY }}
GCS_CREDENTIALS: ${{ secrets.GCS_CREDENTIALS }}
GCS_BUCKET_NAME: ${{ secrets.GCS_BUCKET_NAME }}
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }}
USE_GO_MARKDOWN_PARSER: true

jobs:
3 .gitignore vendored
@@ -39,4 +39,5 @@ apps/js-sdk/firecrawl/dist

.vscode
llm-links.txt
mapped-links.txt
mapped-links.txt
gke-key.json
@@ -35,7 +35,8 @@
"@types/body-parser": "^1.19.2",
"@types/cors": "^2.8.13",
"@types/escape-html": "^1.0.4",
"@types/express": "^4.17.17",
"@types/express": "^4.17.21",
"@types/express-ws": "^3.0.5",
"@types/jest": "^29.5.12",
"@types/lodash": "^4.17.14",
"@types/node": "^20.14.1",
@@ -51,7 +52,13 @@
"typescript": "^5.8.3"
},
"dependencies": {
"@ai-sdk/openai": "^1.3.10",
"@ai-sdk/anthropic": "^1.2.4",
"@ai-sdk/deepinfra": "^0.2.4",
"@ai-sdk/fireworks": "^0.2.4",
"@ai-sdk/google": "^1.2.3",
"@ai-sdk/google-vertex": "^2.2.15",
"@ai-sdk/groq": "^1.2.1",
"@ai-sdk/openai": "^1.3.12",
"@anthropic-ai/sdk": "^0.24.3",
"@apidevtools/json-schema-ref-parser": "^11.7.3",
"@brillout/import": "^0.2.2",
@@ -61,12 +68,12 @@
"@dqbd/tiktoken": "^1.0.17",
"@google-cloud/storage": "^7.16.0",
"@nangohq/node": "^0.40.8",
"@openrouter/ai-sdk-provider": "^0.4.5",
"@pinecone-database/pinecone": "^4.0.0",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
"@sentry/profiling-node": "^8.26.0",
"@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ai": "^4.3.4",
"ajv": "^8.16.0",
298 apps/api/pnpm-lock.yaml generated
@ -8,9 +8,27 @@ importers:
|
||||
|
||||
.:
|
||||
dependencies:
|
||||
'@ai-sdk/anthropic':
|
||||
specifier: ^1.2.4
|
||||
version: 1.2.4(zod@3.24.2)
|
||||
'@ai-sdk/deepinfra':
|
||||
specifier: ^0.2.4
|
||||
version: 0.2.4(zod@3.24.2)
|
||||
'@ai-sdk/fireworks':
|
||||
specifier: ^0.2.4
|
||||
version: 0.2.4(zod@3.24.2)
|
||||
'@ai-sdk/google':
|
||||
specifier: ^1.2.3
|
||||
version: 1.2.3(zod@3.24.2)
|
||||
'@ai-sdk/google-vertex':
|
||||
specifier: ^2.2.15
|
||||
version: 2.2.15(encoding@0.1.13)(zod@3.24.2)
|
||||
'@ai-sdk/groq':
|
||||
specifier: ^1.2.1
|
||||
version: 1.2.1(zod@3.24.2)
|
||||
'@ai-sdk/openai':
|
||||
specifier: ^1.3.10
|
||||
version: 1.3.10(zod@3.24.2)
|
||||
specifier: ^1.3.12
|
||||
version: 1.3.12(zod@3.24.2)
|
||||
'@anthropic-ai/sdk':
|
||||
specifier: ^0.24.3
|
||||
version: 0.24.3(encoding@0.1.13)
|
||||
@ -38,6 +56,9 @@ importers:
|
||||
'@nangohq/node':
|
||||
specifier: ^0.40.8
|
||||
version: 0.40.8
|
||||
'@openrouter/ai-sdk-provider':
|
||||
specifier: ^0.4.5
|
||||
version: 0.4.5(zod@3.24.2)
|
||||
'@pinecone-database/pinecone':
|
||||
specifier: ^4.0.0
|
||||
version: 4.0.0
|
||||
@ -53,15 +74,12 @@ importers:
|
||||
'@supabase/supabase-js':
|
||||
specifier: ^2.44.2
|
||||
version: 2.44.2
|
||||
'@types/express-ws':
|
||||
specifier: ^3.0.4
|
||||
version: 3.0.4
|
||||
'@types/ws':
|
||||
specifier: ^8.5.12
|
||||
version: 8.5.12
|
||||
ai:
|
||||
specifier: ^4.3.4
|
||||
version: 4.3.4(react@18.3.1)(zod@3.24.2)
|
||||
version: 4.3.5(react@18.3.1)(zod@3.24.2)
|
||||
ajv:
|
||||
specifier: ^8.16.0
|
||||
version: 8.16.0
|
||||
@ -286,8 +304,11 @@ importers:
|
||||
specifier: ^1.0.4
|
||||
version: 1.0.4
|
||||
'@types/express':
|
||||
specifier: ^4.17.17
|
||||
specifier: ^4.17.21
|
||||
version: 4.17.21
|
||||
'@types/express-ws':
|
||||
specifier: ^3.0.5
|
||||
version: 3.0.5
|
||||
'@types/jest':
|
||||
specifier: ^29.5.12
|
||||
version: 29.5.12
|
||||
@ -330,12 +351,75 @@ importers:
|
||||
|
||||
packages:
|
||||
|
||||
'@ai-sdk/openai@1.3.10':
|
||||
resolution: {integrity: sha512-XO0wF2lmAMWCYjkM5bLpWTKoXet61fBiIimTi+blqEGiLUjAvivt/1zZL1Lzhrv9+p19IC1rn9EWZI1dCelV8w==}
|
||||
'@ai-sdk/anthropic@1.2.10':
|
||||
resolution: {integrity: sha512-PyE7EC2fPjs9DnzRAHDrPQmcnI2m2Eojr8pfhckOejOlDEh2w7NnSJr1W3qe5hUWzKr+6d7NG1ZKR9fhmpDdEQ==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/anthropic@1.2.4':
|
||||
resolution: {integrity: sha512-dAN6MXvLffeFVAr2gz3RGvOTgX1KL/Yn5q1l4/Dt0TUeDjQgCt4AbbYxZZB2qIAYzQvoyAFPhlw0sB3nNizG/g==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/deepinfra@0.2.4':
|
||||
resolution: {integrity: sha512-JBF3tUOLYgQDCwkvN9I5ZbSqsAxTJWOKmIpyJXJl5RpLXOEviJUqpKSZufs11J9S4Z0U9vZX9jfhO1+DBjS56w==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/fireworks@0.2.4':
|
||||
resolution: {integrity: sha512-tNXJfEyyXHBD4hMoYjZW/IrsZNcTlmZkQFx3hFRwhiz35rT9TC9QG/RuKCz+UtziQU765g7NP4G/t7f0cJ154Q==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/google-vertex@2.2.15':
|
||||
resolution: {integrity: sha512-XTl0dQ1rvLjhrkifSy/483qw3O7vCI6H2b4aAJnzQMfy0vzczMXmvQFS5RA8KmnO+YvsKTuZwBM2xRCNvKw1oQ==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/google@1.2.11':
|
||||
resolution: {integrity: sha512-gjGcxKcRri/Jbkujs9nVwP4qOW5GI4rYQ6vQ17uLAvGMo3qnwr26Q2KUqUWuVHQYtboXVSrxC/Kb6sm3hE5WUQ==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/google@1.2.3':
|
||||
resolution: {integrity: sha512-zsgwko7T+MFIdEfhg4fIXv6O2dnzTLFr6BOpAA21eo/moOBA5szVzOto1jTwIwoBYsF2ixPGNZBoc+k/fQ2AWw==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/groq@1.2.1':
|
||||
resolution: {integrity: sha512-e9Vn6sE6u+pm97YSK9+xiTgQ2ScRdipE5gAwXj/9HdgMnUyp3mDpWjFsmDM6bzyeb2iKOGv6f3eiRsLxOAPv4A==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/openai-compatible@0.2.4':
|
||||
resolution: {integrity: sha512-hLQnBn5e69rUXvXW+9SOkiL+S4yQX62hjtlX3zKXBI/3VnfOTcGKMamK51GoQB7uQCN1h7l9orvWqWpuQXxzRg==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/openai@1.3.12':
|
||||
resolution: {integrity: sha512-ueAP69p8a/ZR2ns+pmlr9h/nyV2/DAwzfnPUGZiLpXbxWnLXd2g3a7l38CuEhBydH/nOfDb/byMgpS8+bnJHTg==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@ai-sdk/provider-utils@2.1.10':
|
||||
resolution: {integrity: sha512-4GZ8GHjOFxePFzkl3q42AU0DQOtTQ5w09vmaWUf/pKFXJPizlnzKSUkF0f+VkapIUfDugyMqPMT1ge8XQzVI7Q==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
peerDependenciesMeta:
|
||||
zod:
|
||||
optional: true
|
||||
|
||||
'@ai-sdk/provider-utils@2.1.9':
|
||||
resolution: {integrity: sha512-NerKjTuuUUs6glJGaentaXEBH52jRM0pR+cRCzc7aWke/K5jYBD6Frv1JYBpcxS7gnnCqSQZR9woiyS+6jrdjw==}
|
||||
engines: {node: '>=18'}
|
||||
@ -345,20 +429,50 @@ packages:
|
||||
zod:
|
||||
optional: true
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.1':
|
||||
resolution: {integrity: sha512-BuExLp+NcpwsAVj1F4bgJuQkSqO/+roV9wM7RdIO+NVrcT8RBUTdXzf5arHt5T58VpK7bZyB2V9qigjaPHE+Dg==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.23.8
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.3':
|
||||
resolution: {integrity: sha512-o3fWTzkxzI5Af7U7y794MZkYNEsxbjLam2nxyoUZSScqkacb7vZ3EYHLh21+xCcSSzEC161C7pZAGHtC0hTUMw==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.23.8
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.6':
|
||||
resolution: {integrity: sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.23.8
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.7':
|
||||
resolution: {integrity: sha512-kM0xS3GWg3aMChh9zfeM+80vEZfXzR3JEUBdycZLtbRZ2TRT8xOj3WodGHPb06sUK5yD7pAXC/P7ctsi2fvUGQ==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.23.8
|
||||
|
||||
'@ai-sdk/provider@1.0.8':
|
||||
resolution: {integrity: sha512-f9jSYwKMdXvm44Dmab1vUBnfCDSFfI5rOtvV1W9oKB7WYHR5dGvCC6x68Mk3NUfrdmNoMVHGoh6JT9HCVMlMow==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@ai-sdk/provider@1.0.9':
|
||||
resolution: {integrity: sha512-jie6ZJT2ZR0uVOVCDc9R2xCX5I/Dum/wEK28lx21PJx6ZnFAN9EzD2WsPhcDWfCgGx3OAZZ0GyM3CEobXpa9LA==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@ai-sdk/provider@1.1.0':
|
||||
resolution: {integrity: sha512-0M+qjp+clUD0R1E5eWQFhxEvWLNaOtGQRUaBn8CUABnSKredagq92hUS9VjOzGsTm37xLfpaxl97AVtbeOsHew==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@ai-sdk/provider@1.1.2':
|
||||
resolution: {integrity: sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@ai-sdk/provider@1.1.3':
|
||||
resolution: {integrity: sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@ai-sdk/react@1.2.8':
|
||||
resolution: {integrity: sha512-S2FzCSi4uTF0JuSN6zYMXyiAWVAzi/Hho8ISYgHpGZiICYLNCP2si4DuXQOsnWef3IXzQPLVoE11C63lILZIkw==}
|
||||
engines: {node: '>=18'}
|
||||
@ -949,6 +1063,12 @@ packages:
|
||||
'@one-ini/wasm@0.1.1':
|
||||
resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==}
|
||||
|
||||
'@openrouter/ai-sdk-provider@0.4.5':
|
||||
resolution: {integrity: sha512-gbCOcSjNhyWlLHyYZX2rIFnpJi3C2RXNyyzJj+d6pMRfTS/mdvEEOsU66KxK9H8Qju2i9YRLOn/FdQT26K7bIQ==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.0.0
|
||||
|
||||
'@opentelemetry/api-logs@0.52.1':
|
||||
resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==}
|
||||
engines: {node: '>=14'}
|
||||
@ -1644,8 +1764,8 @@ packages:
|
||||
'@types/express-serve-static-core@4.19.3':
|
||||
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==}
|
||||
'@types/express-ws@3.0.5':
|
||||
resolution: {integrity: sha512-lbWMjoHrm/v85j81UCmb/GNZFO3genxRYBW1Ob7rjRI+zxUBR+4tcFuOpKKsYQ1LYTYiy3356epLeYi/5zxUwA==}
|
||||
|
||||
'@types/express@4.17.21':
|
||||
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
|
||||
@ -1820,8 +1940,8 @@ packages:
|
||||
resolution: {integrity: sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==}
|
||||
engines: {node: '>= 8.0.0'}
|
||||
|
||||
ai@4.3.4:
|
||||
resolution: {integrity: sha512-uMjzrowIqfU8CCCxhx8QGl7ETydHBROeNL0VoEwetkmDCY6Q8ZTacj6jNNqGJOiCk595aUrGR9VHPY9Ylvy1fg==}
|
||||
ai@4.3.5:
|
||||
resolution: {integrity: sha512-hxJ+6YCdGOK1MVPGITmz1if+LXR/aW72w8TI8kiV+3R7lpK1hfpApR8EjqN2ag6cWa0R7OEI3gb/srWkQ3hT2Q==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
react: ^18 || ^19 || ^19.0.0-rc
|
||||
@ -2091,8 +2211,8 @@ packages:
|
||||
resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
chalk@5.3.0:
|
||||
resolution: {integrity: sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==}
|
||||
chalk@5.4.1:
|
||||
resolution: {integrity: sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==}
|
||||
engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}
|
||||
|
||||
char-regex@1.0.2:
|
||||
@ -4350,8 +4470,8 @@ packages:
|
||||
resolution: {integrity: sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
swr@2.3.2:
|
||||
resolution: {integrity: sha512-RosxFpiabojs75IwQ316DGoDRmOqtiAj0tg8wCcbEu4CiLZBs/a9QNtHV7TUfDXmmlgqij/NqzKq/eLelyv9xA==}
|
||||
swr@2.3.3:
|
||||
resolution: {integrity: sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==}
|
||||
peerDependencies:
|
||||
react: ^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||
|
||||
@ -4565,8 +4685,8 @@ packages:
|
||||
urlpattern-polyfill@10.0.0:
|
||||
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
|
||||
|
||||
use-sync-external-store@1.4.0:
|
||||
resolution: {integrity: sha512-9WXSPC5fMv61vaupRkCKCxsPxBocVnwakBEkMIHHpkTTg6icbJtg6jzgtLDm4bl3cSHAca52rYWih0k4K3PfHw==}
|
||||
use-sync-external-store@1.5.0:
|
||||
resolution: {integrity: sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A==}
|
||||
peerDependencies:
|
||||
react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||
|
||||
@ -4773,8 +4893,8 @@ packages:
|
||||
resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
zod-to-json-schema@3.24.2:
|
||||
resolution: {integrity: sha512-pNUqrcSxuuB3/+jBbU8qKUbTbDqYUaG1vf5cXFjbhGgoUuA1amO/y4Q8lzfOhHU8HNPK6VFJ18lBDKj3OHyDsg==}
|
||||
zod-to-json-schema@3.24.5:
|
||||
resolution: {integrity: sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==}
|
||||
peerDependencies:
|
||||
zod: ^3.24.1
|
||||
|
||||
@ -4786,10 +4906,81 @@ packages:
|
||||
|
||||
snapshots:
|
||||
|
||||
'@ai-sdk/openai@1.3.10(zod@3.24.2)':
|
||||
'@ai-sdk/anthropic@1.2.10(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.2
|
||||
'@ai-sdk/provider-utils': 2.2.6(zod@3.24.2)
|
||||
'@ai-sdk/provider': 1.1.3
|
||||
'@ai-sdk/provider-utils': 2.2.7(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/anthropic@1.2.4(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
'@ai-sdk/provider-utils': 2.2.3(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/deepinfra@0.2.4(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/openai-compatible': 0.2.4(zod@3.24.2)
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
'@ai-sdk/provider-utils': 2.2.3(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/fireworks@0.2.4(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/openai-compatible': 0.2.4(zod@3.24.2)
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
'@ai-sdk/provider-utils': 2.2.3(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/google-vertex@2.2.15(encoding@0.1.13)(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/anthropic': 1.2.10(zod@3.24.2)
|
||||
'@ai-sdk/google': 1.2.11(zod@3.24.2)
|
||||
'@ai-sdk/provider': 1.1.3
|
||||
'@ai-sdk/provider-utils': 2.2.7(zod@3.24.2)
|
||||
google-auth-library: 9.15.1(encoding@0.1.13)
|
||||
zod: 3.24.2
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- supports-color
|
||||
|
||||
'@ai-sdk/google@1.2.11(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.3
|
||||
'@ai-sdk/provider-utils': 2.2.7(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/google@1.2.3(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/groq@1.2.1(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
'@ai-sdk/provider-utils': 2.2.1(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/openai-compatible@0.2.4(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
'@ai-sdk/provider-utils': 2.2.3(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/openai@1.3.12(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.3
|
||||
'@ai-sdk/provider-utils': 2.2.7(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider-utils@2.1.10(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.0.9
|
||||
eventsource-parser: 3.0.0
|
||||
nanoid: 3.3.8
|
||||
secure-json-parse: 2.7.0
|
||||
optionalDependencies:
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider-utils@2.1.9(zod@3.24.2)':
|
||||
@ -4801,6 +4992,20 @@ snapshots:
|
||||
optionalDependencies:
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.1(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
nanoid: 3.3.8
|
||||
secure-json-parse: 2.7.0
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.3(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.0
|
||||
nanoid: 3.3.8
|
||||
secure-json-parse: 2.7.0
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.6(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.2
|
||||
@ -4808,20 +5013,39 @@ snapshots:
|
||||
secure-json-parse: 2.7.0
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider-utils@2.2.7(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.3
|
||||
nanoid: 3.3.8
|
||||
secure-json-parse: 2.7.0
|
||||
zod: 3.24.2
|
||||
|
||||
'@ai-sdk/provider@1.0.8':
|
||||
dependencies:
|
||||
json-schema: 0.4.0
|
||||
|
||||
'@ai-sdk/provider@1.0.9':
|
||||
dependencies:
|
||||
json-schema: 0.4.0
|
||||
|
||||
'@ai-sdk/provider@1.1.0':
|
||||
dependencies:
|
||||
json-schema: 0.4.0
|
||||
|
||||
'@ai-sdk/provider@1.1.2':
|
||||
dependencies:
|
||||
json-schema: 0.4.0
|
||||
|
||||
'@ai-sdk/provider@1.1.3':
|
||||
dependencies:
|
||||
json-schema: 0.4.0
|
||||
|
||||
'@ai-sdk/react@1.2.8(react@18.3.1)(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider-utils': 2.2.6(zod@3.24.2)
|
||||
'@ai-sdk/ui-utils': 1.2.7(zod@3.24.2)
|
||||
react: 18.3.1
|
||||
swr: 2.3.2(react@18.3.1)
|
||||
swr: 2.3.3(react@18.3.1)
|
||||
throttleit: 2.1.0
|
||||
optionalDependencies:
|
||||
zod: 3.24.2
|
||||
@ -4831,7 +5055,7 @@ snapshots:
|
||||
'@ai-sdk/provider': 1.1.2
|
||||
'@ai-sdk/provider-utils': 2.2.6(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
zod-to-json-schema: 3.24.2(zod@3.24.2)
|
||||
zod-to-json-schema: 3.24.5(zod@3.24.2)
|
||||
|
||||
'@ampproject/remapping@2.3.0':
|
||||
dependencies:
|
||||
@ -5906,6 +6130,12 @@ snapshots:
|
||||
|
||||
'@one-ini/wasm@0.1.1': {}
|
||||
|
||||
'@openrouter/ai-sdk-provider@0.4.5(zod@3.24.2)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.0.9
|
||||
'@ai-sdk/provider-utils': 2.1.10(zod@3.24.2)
|
||||
zod: 3.24.2
|
||||
|
||||
'@opentelemetry/api-logs@0.52.1':
|
||||
dependencies:
|
||||
'@opentelemetry/api': 1.9.0
|
||||
@ -6837,7 +7067,7 @@ snapshots:
|
||||
'@types/range-parser': 1.2.7
|
||||
'@types/send': 0.17.4
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
'@types/express-ws@3.0.5':
|
||||
dependencies:
|
||||
'@types/express': 4.17.21
|
||||
'@types/express-serve-static-core': 4.19.3
|
||||
@ -7025,7 +7255,7 @@ snapshots:
|
||||
dependencies:
|
||||
humanize-ms: 1.2.1
|
||||
|
||||
ai@4.3.4(react@18.3.1)(zod@3.24.2):
|
||||
ai@4.3.5(react@18.3.1)(zod@3.24.2):
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 1.1.2
|
||||
'@ai-sdk/provider-utils': 2.2.6(zod@3.24.2)
|
||||
@ -7350,7 +7580,7 @@ snapshots:
|
||||
ansi-styles: 4.3.0
|
||||
supports-color: 7.2.0
|
||||
|
||||
chalk@5.3.0: {}
|
||||
chalk@5.4.1: {}
|
||||
|
||||
char-regex@1.0.2: {}
|
||||
|
||||
@ -8829,7 +9059,7 @@ snapshots:
|
||||
jsondiffpatch@0.6.0:
|
||||
dependencies:
|
||||
'@types/diff-match-patch': 1.0.36
|
||||
chalk: 5.3.0
|
||||
chalk: 5.4.1
|
||||
diff-match-patch: 1.0.5
|
||||
|
||||
jsonfile@6.1.0:
|
||||
@ -9956,11 +10186,11 @@ snapshots:
|
||||
|
||||
supports-preserve-symlinks-flag@1.0.0: {}
|
||||
|
||||
swr@2.3.2(react@18.3.1):
|
||||
swr@2.3.3(react@18.3.1):
|
||||
dependencies:
|
||||
dequal: 2.0.3
|
||||
react: 18.3.1
|
||||
use-sync-external-store: 1.4.0(react@18.3.1)
|
||||
use-sync-external-store: 1.5.0(react@18.3.1)
|
||||
|
||||
sylvester@0.0.12: {}
|
||||
|
||||
@ -10158,7 +10388,7 @@ snapshots:
|
||||
|
||||
urlpattern-polyfill@10.0.0: {}
|
||||
|
||||
use-sync-external-store@1.4.0(react@18.3.1):
|
||||
use-sync-external-store@1.5.0(react@18.3.1):
|
||||
dependencies:
|
||||
react: 18.3.1
|
||||
|
||||
@ -10336,7 +10566,7 @@ snapshots:
|
||||
|
||||
yocto-queue@0.1.0: {}
|
||||
|
||||
zod-to-json-schema@3.24.2(zod@3.24.2):
|
||||
zod-to-json-schema@3.24.5(zod@3.24.2):
|
||||
dependencies:
|
||||
zod: 3.24.2
|
||||
|
||||
|
@@ -1,6 +1,6 @@
# Pick your baseUrl here:
@baseUrl = http://localhost:3002
#@baseUrl = https://api.firecrawl.dev
# @baseUrl = http://localhost:3002
@baseUrl = https://api.firecrawl.dev

### Scrape Website
# @name scrape
@@ -9,7 +9,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
"url":"https://firecrawl.dev"
"url": "https://firecrawl.dev"
}

### Crawl Website
@@ -65,38 +65,6 @@ content-type: application/json
"sitemapOnly": true
}

### Extract Firecrawl Title
# @name extractFirecrawl
POST {{baseUrl}}/v1/extract HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
"urls": [
"https://firecrawl.dev/blog"
],
"origin": "api-sdk",
"prompt": "Extract all the blog titles from the page, is multity entity = true",
"schema": {
"type": "object",
"properties": {
"blog_titles": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["blog_titles"]
}
}

###
@extractFirecrawlId = {{extractFirecrawl.response.body.$.id}}
# @name extractFirecrawlStatus
GET {{baseUrl}}/v1/extract/{{extractFirecrawlId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}

###
DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
@@ -1,5 +1,5 @@
import { parseApi } from "../lib/parseApi";
import { getRateLimiter, isTestSuiteToken } from "../services/rate-limiter";
import { getRateLimiter } from "../services/rate-limiter";
import {
AuthResponse,
NotificationType,
@@ -89,6 +89,8 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage
preview: 5,
crawlStatus: 500,
extractStatus: 500,
extractAgentPreview: 1,
scrapeAgentPreview: 5,
},
price_credits: 99999999,
credits_used: 0,
@@ -121,6 +123,8 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({
preview: 99999999,
crawlStatus: 99999999,
extractStatus: 99999999,
extractAgentPreview: 99999999,
scrapeAgentPreview: 99999999,
},
price_credits: 99999999,
credits_used: 0,
@ -64,7 +64,7 @@ export async function getJobs(crawlId: string, ids: string[]): Promise<PseudoJob
|
||||
timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(),
|
||||
failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined,
|
||||
}
|
||||
|
||||
|
||||
jobs.push(job);
|
||||
}
|
||||
|
||||
|
@ -292,13 +292,6 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
}
|
||||
}
|
||||
|
||||
const { scrapeOptions } = fromLegacyScrapeOptions(
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
team_id,
|
||||
);
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
|
@@ -279,7 +279,11 @@ export async function crawlStatusController(
status,
completed: doneJobsLength,
total: totalCount,
creditsUsed: totalCount,
creditsUsed: totalCount * (
sc.scrapeOptions?.extract
? 5
: 1
),
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
next:
status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this
@ -72,5 +72,6 @@ export async function extractStatusController(
|
||||
steps: extract.showSteps ? extract.steps : undefined,
|
||||
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
|
||||
sources: extract.showSources ? extract.sources : undefined,
|
||||
costTracking: extract.showCostTracking ? extract.costTracking : undefined,
|
||||
});
|
||||
}
|
||||
|
@ -53,6 +53,7 @@ export async function extractController(
|
||||
teamId: req.auth.team_id,
|
||||
subId: req.acuc?.sub_id,
|
||||
extractId,
|
||||
agent: req.body.agent,
|
||||
};
|
||||
|
||||
if (
|
||||
@ -71,6 +72,7 @@ export async function extractController(
|
||||
showSteps: req.body.__experimental_streamSteps,
|
||||
showLLMUsage: req.body.__experimental_llmUsage,
|
||||
showSources: req.body.__experimental_showSources || req.body.showSources,
|
||||
showCostTracking: req.body.__experimental_showCostTracking,
|
||||
});
|
||||
|
||||
if (Sentry.isInitialized()) {
|
||||
|
@ -27,8 +27,10 @@ export async function scrapeStatusController(req: any, res: any) {
|
||||
});
|
||||
}
|
||||
|
||||
const data = job?.docs[0];
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: job?.docs[0],
|
||||
data,
|
||||
});
|
||||
}
|
||||
|
@@ -109,6 +109,9 @@ export async function scrapeController(
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
creditsToBeBilled = 5;
}
if (req.body.agent?.model?.toLowerCase() === "fire-1") {
creditsToBeBilled = 150;
}

billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
(error) => {
@@ -125,6 +128,12 @@
}
}

const cost_tracking = doc?.metadata?.costTracking;

if (doc && doc.metadata) {
delete doc.metadata.costTracking;
}

return res.status(200).json({
success: true,
data: doc,
@@ -55,6 +55,17 @@ export const url = z.preprocess(
const strictMessage =
"Unrecognized key in body -- please review the v1 API documentation for request body changes";

export const agentExtractModelValue = 'fire-1'
export const isAgentExtractModelValid = (x: string | undefined) => x?.toLowerCase() === agentExtractModelValue;

export const agentOptionsExtract = z
.object({
model: z.string().default(agentExtractModelValue),
})
.strict(strictMessage);

export type AgentOptions = z.infer<typeof agentOptionsExtract>;

export const extractOptions = z
.object({
mode: z.enum(["llm"]).default("llm"),
@ -62,13 +73,53 @@ export const extractOptions = z
|
||||
systemPrompt: z
|
||||
.string()
|
||||
.max(10000)
|
||||
.default(
|
||||
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required.",
|
||||
),
|
||||
.default(""),
|
||||
prompt: z.string().max(10000).optional(),
|
||||
temperature: z.number().optional(),
|
||||
})
|
||||
.strict(strictMessage);
|
||||
.strict(strictMessage)
|
||||
.transform((data) => ({
|
||||
...data,
|
||||
systemPrompt: "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."
|
||||
}));
|
||||
|
||||
export const extractOptionsWithAgent = z
|
||||
.object({
|
||||
mode: z.enum(["llm"]).default("llm"),
|
||||
schema: z.any().optional(),
|
||||
systemPrompt: z
|
||||
.string()
|
||||
.max(10000)
|
||||
.default(""),
|
||||
prompt: z.string().max(10000).optional(),
|
||||
temperature: z.number().optional(),
|
||||
agent: z
|
||||
.object({
|
||||
model: z.string().default(agentExtractModelValue),
|
||||
prompt: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.transform((data) => ({
|
||||
...data,
|
||||
systemPrompt: isAgentExtractModelValid(data.agent?.model)
|
||||
? `You are an expert web data extractor. Your task is to analyze the provided markdown content from a web page and generate a JSON object based *strictly* on the provided schema.
|
||||
|
||||
Key Instructions:
|
||||
1. **Schema Adherence:** Populate the JSON object according to the structure defined in the schema.
|
||||
2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information.
|
||||
3. **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value.
|
||||
4. **SmartScrape Recommendation:**
|
||||
* Assess if the *full* required data seems unavailable in the current markdown likely because:
|
||||
- Content requires user interaction to reveal (e.g., clicking buttons, hovering, scrolling)
|
||||
- Content uses pagination (e.g., "Load More" buttons, numbered pagination, infinite scroll)
|
||||
- Content is dynamically loaded after user actions
|
||||
* If the content requires user interaction or pagination to be fully accessible, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
|
||||
* If the content is simply JavaScript rendered but doesn't require interaction, set \`shouldUseSmartscrape\` to \`false\`.
|
||||
5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.`
|
||||
: "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."
|
||||
}));
|
||||
|
||||
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||
|
||||
@@ -253,17 +304,24 @@ const baseScrapeOptions = z
})
.strict(strictMessage);

const fire1Refine = (obj) => {
if (obj.agent?.model?.toLowerCase() === "fire-1" && obj.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
return false;
}
return true;
}
const fire1RefineOpts = {
message: "You may only specify the FIRE-1 model in agent or jsonOptions.agent, but not both.",
};
const extractRefine = (obj) => {
|
||||
const hasExtractFormat = obj.formats?.includes("extract");
|
||||
const hasExtractOptions = obj.extract !== undefined;
|
||||
const hasJsonFormat = obj.formats?.includes("json");
|
||||
const hasJsonOptions = obj.jsonOptions !== undefined;
|
||||
return (
|
||||
(hasExtractFormat && hasExtractOptions)
|
||||
|| (!hasExtractFormat && !hasExtractOptions)
|
||||
) && (
|
||||
(hasJsonFormat && hasJsonOptions)
|
||||
|| (!hasJsonFormat && !hasJsonOptions)
|
||||
((hasExtractFormat && hasExtractOptions) ||
|
||||
(!hasExtractFormat && !hasExtractOptions)) &&
|
||||
((hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions))
|
||||
);
|
||||
};
|
||||
const extractRefineOpts = {
|
||||
@ -277,7 +335,7 @@ const extractTransform = (obj) => {
|
||||
obj.extract ||
|
||||
obj.formats?.includes("json") ||
|
||||
obj.jsonOptions) &&
|
||||
(obj.timeout === 30000)
|
||||
obj.timeout === 30000
|
||||
) {
|
||||
obj = { ...obj, timeout: 60000 };
|
||||
}
|
||||
@ -290,6 +348,10 @@ const extractTransform = (obj) => {
|
||||
obj = { ...obj, timeout: 60000 };
|
||||
}
|
||||
|
||||
if (obj.agent) {
|
||||
obj = { ...obj, timeout: 300000 };
|
||||
}
|
||||
|
||||
if (obj.formats?.includes("json")) {
|
||||
obj.formats.push("extract");
|
||||
}
|
||||
@ -302,6 +364,7 @@ const extractTransform = (obj) => {
|
||||
prompt: obj.jsonOptions.prompt,
|
||||
systemPrompt: obj.jsonOptions.systemPrompt,
|
||||
schema: obj.jsonOptions.schema,
|
||||
agent: obj.jsonOptions.agent,
|
||||
mode: "llm",
|
||||
},
|
||||
};
|
||||
@ -311,6 +374,16 @@ const extractTransform = (obj) => {
|
||||
};
|
||||
|
||||
export const scrapeOptions = baseScrapeOptions
|
||||
.extend({
|
||||
agent: z
|
||||
.object({
|
||||
model: z.string().default(agentExtractModelValue),
|
||||
prompt: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
extract: extractOptionsWithAgent.optional(),
|
||||
jsonOptions: extractOptionsWithAgent.optional(),
|
||||
})
|
||||
.refine(
|
||||
(obj) => {
|
||||
if (!obj.actions) return true;
|
||||
@ -324,11 +397,13 @@ export const scrapeOptions = baseScrapeOptions
|
||||
},
|
||||
)
|
||||
.refine(extractRefine, extractRefineOpts)
|
||||
.refine(fire1Refine, fire1RefineOpts)
|
||||
.transform(extractTransform);
|
||||
|
||||
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
|
||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||
|
||||
import Ajv from "ajv";
|
||||
import type { CostTracking } from "../../lib/extract/extraction-service";
|
||||
|
||||
const ajv = new Ajv();
|
||||
|
||||
@ -362,7 +437,7 @@ export const extractV1Options = z
|
||||
includeSubdomains: z.boolean().default(true),
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
enableWebSearch: z.boolean().default(false),
|
||||
scrapeOptions: scrapeOptions.default({ onlyMainContent: false }).optional(),
|
||||
scrapeOptions: baseScrapeOptions.default({ onlyMainContent: false }).optional(),
|
||||
origin: z.string().optional().default("api"),
|
||||
urlTrace: z.boolean().default(false),
|
||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||
@ -375,14 +450,13 @@ export const extractV1Options = z
|
||||
.enum(["direct", "save", "load"])
|
||||
.default("direct")
|
||||
.optional(),
|
||||
agent: agentOptionsExtract.optional(),
|
||||
__experimental_showCostTracking: z.boolean().default(false),
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.refine(
|
||||
(obj) => obj.urls || obj.prompt,
|
||||
{
|
||||
message: "Either 'urls' or 'prompt' must be provided.",
|
||||
},
|
||||
)
|
||||
.refine((obj) => obj.urls || obj.prompt, {
|
||||
message: "Either 'urls' or 'prompt' must be provided.",
|
||||
})
|
||||
.transform((obj) => ({
|
||||
...obj,
|
||||
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
|
||||
@ -391,6 +465,10 @@ export const extractV1Options = z
|
||||
(x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true),
|
||||
extractRefineOpts,
|
||||
)
|
||||
.refine(
|
||||
(x) => (x.scrapeOptions ? fire1Refine(x.scrapeOptions) : true),
|
||||
fire1RefineOpts,
|
||||
)
|
||||
.transform((x) => ({
|
||||
...x,
|
||||
scrapeOptions: x.scrapeOptions
|
||||
@ -407,11 +485,20 @@ export const scrapeRequestSchema = baseScrapeOptions
|
||||
.omit({ timeout: true })
|
||||
.extend({
|
||||
url,
|
||||
agent: z
|
||||
.object({
|
||||
model: z.string().default(agentExtractModelValue),
|
||||
prompt: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
extract: extractOptionsWithAgent.optional(),
|
||||
jsonOptions: extractOptionsWithAgent.optional(),
|
||||
origin: z.string().optional().default("api"),
|
||||
timeout: z.number().int().positive().finite().safe().default(30000),
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.refine(extractRefine, extractRefineOpts)
|
||||
.refine(fire1Refine, fire1RefineOpts)
|
||||
.transform(extractTransform);
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
@ -447,6 +534,7 @@ export const batchScrapeRequestSchema = baseScrapeOptions
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.refine(extractRefine, extractRefineOpts)
|
||||
.refine(fire1Refine, fire1RefineOpts)
|
||||
.transform(extractTransform);
|
||||
|
||||
export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
|
||||
@ -459,6 +547,7 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.refine(extractRefine, extractRefineOpts)
|
||||
.refine(fire1Refine, fire1RefineOpts)
|
||||
.transform(extractTransform);
|
||||
|
||||
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
||||
@ -498,12 +587,13 @@ export const crawlRequestSchema = crawlerOptions
|
||||
.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
scrapeOptions: scrapeOptions.default({}),
|
||||
scrapeOptions: baseScrapeOptions.default({}),
|
||||
webhook: webhookSchema.optional(),
|
||||
limit: z.number().default(10000),
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||
.refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts)
|
||||
.transform((x) => ({
|
||||
...x,
|
||||
scrapeOptions: extractTransform(x.scrapeOptions),
|
||||
@ -563,8 +653,8 @@ export type Document = {
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
javascriptReturns?: {
|
||||
type: string,
|
||||
value: unknown
|
||||
type: string;
|
||||
value: unknown;
|
||||
}[];
|
||||
};
|
||||
changeTracking?: {
|
||||
@ -609,6 +699,7 @@ export type Document = {
|
||||
ogLocaleAlternate?: string[];
|
||||
ogSiteName?: string;
|
||||
ogVideo?: string;
|
||||
favicon?: string;
|
||||
dcTermsCreated?: string;
|
||||
dcDateCreated?: string;
|
||||
dcDate?: string;
|
||||
@ -628,7 +719,8 @@ export type Document = {
|
||||
statusCode: number;
|
||||
scrapeId?: string;
|
||||
error?: string;
|
||||
[key: string]: string | string[] | number | undefined;
|
||||
costTracking?: CostTracking;
|
||||
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
||||
};
|
||||
serpResults?: {
|
||||
title: string;
|
||||
@ -798,6 +890,8 @@ export type AuthCreditUsageChunk = {
|
||||
preview: number;
|
||||
crawlStatus: number;
|
||||
extractStatus: number;
|
||||
extractAgentPreview?: number;
|
||||
scrapeAgentPreview?: number;
|
||||
};
|
||||
concurrency: number;
|
||||
|
||||
@ -895,7 +989,7 @@ export function fromLegacyCrawlerOptions(x: any, teamId: string): {
|
||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||
regexOnFullURL: x.regexOnFullURL,
|
||||
maxDiscoveryDepth: x.maxDiscoveryDepth,
|
||||
}),
|
||||
}),
|
||||
internalOptions: {
|
||||
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
||||
teamId,
|
||||
@ -1054,6 +1148,7 @@ export const searchRequestSchema = z
|
||||
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
|
||||
)
|
||||
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||
.refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts)
|
||||
.transform((x) => ({
|
||||
...x,
|
||||
scrapeOptions: extractTransform(x.scrapeOptions),
|
||||
@ -1099,6 +1194,6 @@ export type GenerateLLMsTextRequest = z.infer<
|
||||
|
||||
export class TimeoutSignal extends Error {
|
||||
constructor() {
|
||||
super("Operation timed out")
|
||||
super("Operation timed out");
|
||||
}
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { RateLimiterMode } from "./types";
|
||||
import { attachWsProxy } from "./services/agentLivecastWS";
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
@ -39,7 +40,9 @@ const cacheable = new CacheableLookup();
|
||||
cacheable.install(http.globalAgent);
|
||||
cacheable.install(https.globalAgent);
|
||||
|
||||
const ws = expressWs(express());
|
||||
// Initialize Express with WebSocket support
|
||||
const expressApp = express();
|
||||
const ws = expressWs(expressApp);
|
||||
const app = ws.app;
|
||||
|
||||
global.isProduction = process.env.IS_PRODUCTION === "true";
|
||||
@ -87,6 +90,9 @@ const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
const HOST = process.env.HOST ?? "localhost";
|
||||
|
||||
function startServer(port = DEFAULT_PORT) {
|
||||
// Attach WebSocket proxy to the Express app
|
||||
attachWsProxy(app);
|
||||
|
||||
const server = app.listen(Number(port), HOST, () => {
|
||||
logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
});
|
||||
|
@ -5,9 +5,12 @@ import {
|
||||
DeepResearchSource,
|
||||
updateDeepResearch,
|
||||
} from "./deep-research-redis";
|
||||
import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import {
|
||||
generateCompletions,
|
||||
trimToTokenLimit,
|
||||
} from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import { ExtractOptions } from "../../controllers/v1/types";
|
||||
import { openai } from "@ai-sdk/openai/dist";
|
||||
|
||||
import { getModel } from "../generic-ai";
|
||||
interface AnalysisResult {
|
||||
gaps: string[];
|
||||
@ -52,7 +55,7 @@ export class ResearchStateManager {
|
||||
}
|
||||
|
||||
async addActivity(activities: DeepResearchActivity[]): Promise<void> {
|
||||
if (activities.some(activity => activity.status === "complete")) {
|
||||
if (activities.some((activity) => activity.status === "complete")) {
|
||||
this.completedSteps++;
|
||||
}
|
||||
|
||||
@ -190,7 +193,7 @@ export class ResearchLLMService {
|
||||
Every search query is a new SERP query so make sure the whole context is added without overwhelming the search engine.
|
||||
The first SERP query you generate should be a very concise, simple version of the topic. `,
|
||||
},
|
||||
markdown: ""
|
||||
markdown: "",
|
||||
});
|
||||
|
||||
return extract.queries;
|
||||
@ -260,31 +263,31 @@ export class ResearchLLMService {
|
||||
formats?: string[],
|
||||
jsonOptions?: ExtractOptions,
|
||||
): Promise<any> {
|
||||
if(!formats) {
|
||||
formats = ['markdown'];
|
||||
if (!formats) {
|
||||
formats = ["markdown"];
|
||||
}
|
||||
if(!jsonOptions) {
|
||||
if (!jsonOptions) {
|
||||
jsonOptions = undefined;
|
||||
}
|
||||
|
||||
|
||||
const { extract } = await generateCompletions({
|
||||
logger: this.logger.child({
|
||||
method: "generateFinalAnalysis",
|
||||
}),
|
||||
mode: formats.includes('json') ? 'object' : 'no-object',
|
||||
mode: formats.includes("json") ? "object" : "no-object",
|
||||
options: {
|
||||
mode: "llm",
|
||||
...(formats.includes('json') && {
|
||||
...jsonOptions
|
||||
...(formats.includes("json") && {
|
||||
...jsonOptions,
|
||||
}),
|
||||
systemPrompt: formats.includes('json')
|
||||
systemPrompt: formats.includes("json")
|
||||
? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
|
||||
: "You are an expert research analyst who creates comprehensive, well-structured reports. Don't begin the report by saying 'Here is the report', nor 'Below is the report', nor something similar. ALWAYS start with a great title that reflects the research topic and findings. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
|
||||
new Date().toISOString().split("T")[0],
|
||||
prompt: trimToTokenLimit(
|
||||
analysisPrompt
|
||||
? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
|
||||
: formats.includes('json')
|
||||
: formats.includes("json")
|
||||
? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
|
||||
: `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
|
||||
|
||||
@ -308,7 +311,7 @@ export class ResearchLLMService {
|
||||
).text,
|
||||
},
|
||||
markdown: "",
|
||||
model: getModel('o3-mini'),
|
||||
model: getModel("o3-mini"),
|
||||
});
|
||||
|
||||
return extract;
|
||||
|
@ -10,7 +10,7 @@ Provide a rephrased search query that:
|
||||
4. Is concise and focused
|
||||
5. Short is better than long
|
||||
6. It is a search engine, not a chatbot
|
||||
7. Concise
|
||||
7. Concise, no more than 3 words besides the site
|
||||
|
||||
Return only the rephrased search query, without any explanation or additional text.`;
|
||||
}
|
||||
@ -40,7 +40,20 @@ to determine their relevance to the user's query and intent.
|
||||
}
|
||||
|
||||
export function buildRerankerUserPrompt(searchQuery: string): string {
|
||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
|
||||
return `Given these URLs and their content, analyze their relevance to this extraction request: "${searchQuery}".
|
||||
|
||||
For each URL, consider:
|
||||
1. How well it matches the extraction needs
|
||||
2. The quantity and quality of extractable information
|
||||
3. Whether the content structure matches what we're looking for
|
||||
|
||||
Score each URL from 0-1 based on the scoring guidelines provided in the system prompt.
|
||||
|
||||
Provide detailed reasoning for each URL to explain why you assigned that score, considering:
|
||||
- Content relevance
|
||||
- Information completeness
|
||||
- Structure suitability
|
||||
- Potential extraction value`;
|
||||
}
|
||||
|
||||
// Multi entity schema anlayzer
|
||||
@ -73,7 +86,7 @@ export function buildAnalyzeSchemaUserPrompt(
|
||||
urls: string[],
|
||||
): string {
|
||||
return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
|
||||
Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;
|
||||
Schema: ${schemaString}\nPrompt: ${prompt}\n URLs: ${urls}`;
|
||||
}
|
||||
|
||||
// Should Extract
|
||||
@ -97,8 +110,7 @@ export function buildBatchExtractSystemPrompt(
|
||||
): string {
|
||||
return (
|
||||
(systemPrompt ? `${systemPrompt}\n` : "") +
|
||||
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
|
||||
links.join(", ")
|
||||
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null.`
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -19,12 +19,16 @@ export async function analyzeSchemaAndPrompt(
|
||||
): Promise<{
|
||||
isMultiEntity: boolean;
|
||||
multiEntityKeys: string[];
|
||||
reasoning?: string;
|
||||
keyIndicators?: string[];
|
||||
reasoning: string;
|
||||
keyIndicators: string[];
|
||||
tokenUsage: TokenUsage;
|
||||
cost: number;
|
||||
}> {
|
||||
let cost = 0;
|
||||
if (!schema) {
|
||||
schema = await generateSchemaFromPrompt(prompt);
|
||||
const genRes = await generateSchemaFromPrompt(prompt);
|
||||
schema = genRes.extract;
|
||||
cost = genRes.cost;
|
||||
}
|
||||
|
||||
const schemaString = JSON.stringify(schema);
|
||||
@ -44,7 +48,7 @@ export async function analyzeSchemaAndPrompt(
|
||||
);
|
||||
|
||||
try {
|
||||
const { extract: result, totalUsage } = await generateCompletions({
|
||||
const { extract: result, totalUsage, cost: cost2 } = await generateCompletions({
|
||||
logger,
|
||||
options: {
|
||||
mode: "llm",
|
||||
@ -55,6 +59,7 @@ export async function analyzeSchemaAndPrompt(
|
||||
markdown: "",
|
||||
model,
|
||||
});
|
||||
cost += cost2;
|
||||
|
||||
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
|
||||
checkSchema.parse(result);
|
||||
@ -65,6 +70,7 @@ export async function analyzeSchemaAndPrompt(
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
tokenUsage: totalUsage,
|
||||
cost,
|
||||
};
|
||||
} catch (e) {
|
||||
logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", {
|
||||
@ -83,5 +89,6 @@ export async function analyzeSchemaAndPrompt(
|
||||
totalTokens: 0,
|
||||
model: model.modelId,
|
||||
},
|
||||
cost: 0,
|
||||
};
|
||||
}
|
||||
|
@ -1,5 +1,8 @@
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import {
|
||||
generateCompletions,
|
||||
GenerateCompletionsOptions,
|
||||
} from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import { buildDocument } from "../build-document";
|
||||
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
|
||||
import { Document } from "../../../controllers/v1/types";
|
||||
@ -7,6 +10,19 @@ import {
|
||||
buildBatchExtractPrompt,
|
||||
buildBatchExtractSystemPrompt,
|
||||
} from "../build-prompts";
|
||||
import { getModel } from "../../generic-ai";
|
||||
|
||||
import fs from "fs/promises";
|
||||
import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
|
||||
|
||||
type BatchExtractOptions = {
|
||||
multiEntitySchema: any;
|
||||
links: string[];
|
||||
prompt: string;
|
||||
systemPrompt: string;
|
||||
doc: Document;
|
||||
useAgent: boolean;
|
||||
};
|
||||
|
||||
/**
|
||||
* Batch extract information from a list of URLs using a multi-entity schema.
|
||||
@ -17,20 +33,21 @@ import {
|
||||
* @param doc - The document to extract information from
|
||||
* @returns The completion promise
|
||||
*/
|
||||
export async function batchExtractPromise(
|
||||
multiEntitySchema: any,
|
||||
links: string[],
|
||||
prompt: string,
|
||||
systemPrompt: string,
|
||||
doc: Document,
|
||||
): Promise<{
|
||||
extract: any;
|
||||
export async function batchExtractPromise(options: BatchExtractOptions): Promise<{
|
||||
extract: any; // array of extracted data
|
||||
numTokens: number;
|
||||
totalUsage: TokenUsage;
|
||||
warning?: string;
|
||||
sources: string[];
|
||||
smartScrapeCost: number;
|
||||
otherCost: number;
|
||||
smartScrapeCallCount: number;
|
||||
otherCallCount: number;
|
||||
}> {
|
||||
const completion = await generateCompletions({
|
||||
const { multiEntitySchema, links, prompt, systemPrompt, doc, useAgent } = options;
|
||||
|
||||
|
||||
const generationOptions: GenerateCompletionsOptions = {
|
||||
logger: logger.child({
|
||||
method: "extractService/generateCompletions",
|
||||
}),
|
||||
@ -45,13 +62,49 @@ export async function batchExtractPromise(
|
||||
schema: multiEntitySchema,
|
||||
},
|
||||
markdown: buildDocument(doc),
|
||||
isExtractEndpoint: true
|
||||
});
|
||||
isExtractEndpoint: true,
|
||||
model: getModel("gemini-2.0-flash", "google"),
|
||||
};
|
||||
|
||||
let extractedDataArray: any[] = [];
|
||||
let warning: string | undefined;
|
||||
let smCost = 0, oCost = 0, smCallCount = 0, oCallCount = 0;
|
||||
try {
|
||||
const { extractedDataArray: e, warning: w, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
|
||||
extractOptions: generationOptions,
|
||||
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
|
||||
useAgent,
|
||||
});
|
||||
extractedDataArray = e;
|
||||
warning = w;
|
||||
smCost = smartScrapeCost;
|
||||
oCost = otherCost;
|
||||
smCallCount = smartScrapeCallCount;
|
||||
oCallCount = otherCallCount;
|
||||
} catch (error) {
|
||||
console.error(">>>>>>>error>>>>>\n", error);
|
||||
}
|
||||
|
||||
// await fs.writeFile(
|
||||
// `logs/extractedDataArray-${crypto.randomUUID()}.json`,
|
||||
// JSON.stringify(extractedDataArray, null, 2),
|
||||
// );
|
||||
|
||||
// TODO: fix this
|
||||
return {
|
||||
extract: completion.extract,
|
||||
numTokens: completion.numTokens,
|
||||
totalUsage: completion.totalUsage,
|
||||
sources: [doc.metadata.url || doc.metadata.sourceURL || ""]
|
||||
extract: extractedDataArray,
|
||||
numTokens: 0,
|
||||
totalUsage: {
|
||||
promptTokens: 0,
|
||||
completionTokens: 0,
|
||||
totalTokens: 0,
|
||||
model: "gemini-2.0-flash",
|
||||
},
|
||||
warning: warning,
|
||||
sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
|
||||
smartScrapeCost: smCost,
|
||||
otherCost: oCost,
|
||||
smartScrapeCallCount: smCallCount,
|
||||
otherCallCount: oCallCount,
|
||||
};
|
||||
}
|
||||
|
@ -12,7 +12,7 @@ export async function checkShouldExtract(
prompt: string,
multiEntitySchema: any,
doc: Document,
): Promise<{ tokenUsage: TokenUsage; extract: boolean }> {
): Promise<{ tokenUsage: TokenUsage; extract: boolean; cost: number }> {
const shouldExtractCheck = await generateCompletions({
logger: logger.child({ method: "extractService/checkShouldExtract" }),
options: {
@ -37,5 +37,6 @@ export async function checkShouldExtract(
return {
tokenUsage: shouldExtractCheck.totalUsage,
extract: shouldExtractCheck.extract["extract"],
cost: shouldExtractCheck.cost,
};
}

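checkShouldExtract now also returns the cost of the relevance check, so callers can bill it like any other LLM call. A short sketch, assuming the CostTracking shape introduced in extraction-service.ts below:

const { extract: shouldExtract, tokenUsage: checkUsage, cost } = await checkShouldExtract(
  request.prompt ?? "",
  multiEntitySchema,
  doc,
);
tokenUsage.push(checkUsage);      // same token bookkeeping as before
costTracking.otherCallCount += 1; // new: bill the relevance check
costTracking.otherCost += cost;
costTracking.totalCost += cost;
if (!shouldExtract) {
  // content judged irrelevant; skip extraction for this document
}
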
@ -1,7 +1,12 @@
import { logger } from "../../../lib/logger";
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
import {
generateCompletions,
GenerateCompletionsOptions,
} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
import { getModel } from "../../../lib/generic-ai";
import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";

export async function singleAnswerCompletion({
singleAnswerDocs,
@ -9,34 +14,85 @@ export async function singleAnswerCompletion({
links,
prompt,
systemPrompt,
useAgent
}: {
singleAnswerDocs: Document[];
rSchema: any;
links: string[];
prompt: string;
systemPrompt: string;
useAgent: boolean;
}): Promise<{
extract: any;
tokenUsage: TokenUsage;
sources: string[];
smartScrapeCallCount: number;
smartScrapeCost: number;
otherCallCount: number;
otherCost: number;
}> {
const completion = await generateCompletions({
const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
const generationOptions: GenerateCompletionsOptions = {
logger: logger.child({ module: "extract", method: "generateCompletions" }),
options: {
mode: "llm",
systemPrompt:
(systemPrompt ? `${systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
prompt: "Today is: " + new Date().toISOString() + "\n" + prompt,
schema: rSchema,
},
markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
isExtractEndpoint: true
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.",
prompt: docsPrompt,
schema: rSchema,
},
markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`,
isExtractEndpoint: true,
model: getModel("gemini-2.0-flash", "google"),
};

const { extractedDataArray, warning, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
extractOptions: generationOptions,
urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""),
useAgent,
});
return {
extract: completion.extract,
tokenUsage: completion.totalUsage,
sources: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "")

const completion = {
extract: extractedDataArray,
tokenUsage: {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0,
model: "gemini-2.0-flash",
},
sources: singleAnswerDocs.map(
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
),
};

// const completion = await generateCompletions({
// logger: logger.child({ module: "extract", method: "generateCompletions" }),
// options: {
// mode: "llm",
// systemPrompt:
// (systemPrompt ? `${systemPrompt}\n` : "") +
// "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided.",
// prompt: "Today is: " + new Date().toISOString() + "\n" + prompt,
// schema: rSchema,
// },
// markdown: singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
// isExtractEndpoint: true,
// model: getModel("gemini-2.0-flash", "google"),
// });
// await fs.writeFile(
// `logs/singleAnswer-${crypto.randomUUID()}.json`,
// JSON.stringify(completion, null, 2),
// );
return {
extract: completion.extract,
tokenUsage: completion.tokenUsage,
sources: singleAnswerDocs.map(
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
),
smartScrapeCost,
otherCost,
smartScrapeCallCount,
otherCallCount,
};
}

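singleAnswerCompletion now builds GenerateCompletionsOptions, delegates to extractData with a useAgent flag, and reports cost counters instead of real token usage. A sketch of how a caller consumes the new signature; it mirrors the call added to extraction-service.ts later in this diff:

const {
  extract,
  tokenUsage: singleAnswerTokenUsage,
  sources,
  smartScrapeCost,
  otherCost,
  smartScrapeCallCount,
  otherCallCount,
} = await singleAnswerCompletion({
  singleAnswerDocs,
  rSchema,
  links,
  prompt: request.prompt ?? "",
  systemPrompt: request.systemPrompt ?? "",
  useAgent: isAgentExtractModelValid(request.agent?.model),
});
// tokenUsage here is a zeroed placeholder; billing relies on the cost counters instead.
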
@ -2,8 +2,8 @@ export const extractConfig = {
RERANKING: {
MAX_INITIAL_RANKING_LIMIT: 1000,
MAX_RANKING_LIMIT_FOR_RELEVANCE: 100,
INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.75,
FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.5,
INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001,
FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.00000001,
MIN_REQUIRED_LINKS: 1,
},
DEDUPLICATION: {

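Lowering both relevance thresholds from 0.75 / 0.5 to 0.00000001 effectively turns off score-based filtering: any link with a non-zero rerank score now passes. A sketch of the kind of filter these constants feed; the filter shape is assumed, not shown in this diff:

// With a 1e-8 threshold, essentially every scored link survives this filter.
const relevantLinks = rankedLinks.filter(
  (link) => link.relevanceScore >= extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
);
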
@ -1,5 +1,6 @@
import { redisConnection } from "../../services/queue-service";
import { logger as _logger } from "../logger";
import { CostTracking } from "./extraction-service";

export enum ExtractStep {
INITIAL = "initial",
@ -32,6 +33,8 @@ export type StoredExtract = {
showLLMUsage?: boolean;
showSources?: boolean;
llmUsage?: number;
showCostTracking?: boolean;
costTracking?: CostTracking;
sources?: {
[key: string]: string[];
};

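With costTracking stored on StoredExtract, extract status updates can persist the running spend to Redis. A sketch of an updateExtract call using the new fields; flag and value choices are illustrative:

await updateExtract(extractId, {
  status: "processing",
  showCostTracking: true, // new optional flag from this hunk
  costTracking,           // CostTracking value accumulated by the extraction service
});
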
@ -1,6 +1,7 @@
import {
Document,
ExtractRequest,
isAgentExtractModelValid,
TokenUsage,
URLTrace,
} from "../../controllers/v1/types";
@ -26,12 +27,8 @@ import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
import { areMergeable } from "./helpers/merge-null-val-objs";
import { CUSTOM_U_TEAMS } from "./config";
import {
calculateFinalResultCost,
estimateTotalCost,
} from "./usage/llm-cost";
import { calculateFinalResultCost, estimateTotalCost } from "./usage/llm-cost";
import { analyzeSchemaAndPrompt } from "./completions/analyzeSchemaAndPrompt";
import { checkShouldExtract } from "./completions/checkShouldExtract";
import { batchExtractPromise } from "./completions/batchExtract";
import { singleAnswerCompletion } from "./completions/singleAnswer";
import { SourceTracker } from "./helpers/source-tracker";
@ -39,13 +36,14 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
import { normalizeUrl } from "../canonical-url";
import { search } from "../../search";
import { buildRephraseToSerpPrompt } from "./build-prompts";

import fs from "fs/promises";
interface ExtractServiceOptions {
request: ExtractRequest;
teamId: string;
subId?: string;
cacheMode?: "load" | "save" | "direct";
cacheKey?: string;
agent?: boolean;
}

export interface ExtractResult {
@ -69,6 +67,14 @@ type completions = {
sources?: string[];
};

export type CostTracking = {
smartScrapeCallCount: number;
smartScrapeCost: number;
otherCallCount: number;
otherCost: number;
totalCost: number;
costLimitExceededTokenUsage?: number;
};

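The CostTracking record above is updated inline in many places below (schema generation, schema analysis, batch and single-answer extraction). A small helper like this could centralize that bookkeeping; it is a sketch, not part of the diff:

// Hypothetical helper mirroring the repeated `costTracking.* +=` updates in this file.
function addCost(
  tracking: CostTracking,
  bucket: "smartScrape" | "other",
  cost: number,
  calls: number = 1,
): void {
  if (bucket === "smartScrape") {
    tracking.smartScrapeCost += cost;
    tracking.smartScrapeCallCount += calls;
  } else {
    tracking.otherCost += cost;
    tracking.otherCallCount += calls;
  }
  tracking.totalCost += cost;
}
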
export async function performExtraction(
extractId: string,
@ -83,7 +89,18 @@ export async function performExtraction(
let singleAnswerResult: any = {};
let totalUrlsScraped = 0;
let sources: Record<string, string[]> = {};
let costTracking: CostTracking = {
smartScrapeCallCount: 0,
smartScrapeCost: 0,
otherCallCount: 0,
otherCost: 0,
totalCost: 0,
};

let log = {
extractId,
request,
};

const logger = _logger.child({
module: "extract",
@ -97,13 +114,21 @@ export async function performExtraction(
logger.debug("Generating URLs from prompt...", {
prompt: request.prompt,
});
const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt));
const rephrasedPrompt = await generateBasicCompletion(
buildRephraseToSerpPrompt(request.prompt),
);
let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || "";
if (rephrasedPrompt) {
costTracking.otherCallCount++;
costTracking.otherCost += rephrasedPrompt.cost;
costTracking.totalCost += rephrasedPrompt.cost;
}
const searchResults = await search({
query: rephrasedPrompt.replace('"', "").replace("'", ""),
query: rptxt,
num_results: 10,
});

request.urls = searchResults.map(result => result.url) as string[];
request.urls = searchResults.map((result) => result.url) as string[];
}
if (request.urls && request.urls.length === 0) {
logger.error("No search results found", {
@ -118,7 +143,11 @@ export async function performExtraction(

const urls = request.urls || ([] as string[]);

if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) {
if (
request.__experimental_cacheMode == "load" &&
request.__experimental_cacheKey &&
urls
) {
logger.debug("Loading cached docs...");
try {
const cache = await getCachedDocs(urls, request.__experimental_cacheKey);
@ -147,12 +176,66 @@ export async function performExtraction(
],
});

let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
const schemaGenRes = await generateSchemaFromPrompt(request.prompt);
reqSchema = schemaGenRes.extract;
costTracking.otherCallCount++;
costTracking.otherCost += schemaGenRes.cost;
costTracking.totalCost += schemaGenRes.cost;

logger.debug("Generated request schema.", {
originalSchema: request.schema,
schema: reqSchema,
});
}

if (reqSchema) {
reqSchema = await dereferenceSchema(reqSchema);
}

logger.debug("Transformed schema.", {
originalSchema: request.schema,
schema: reqSchema,
});

let rSchema = reqSchema;

// agent evaluates if the schema or the prompt has an array with big amount of items
// also it checks if the schema any other properties that are not arrays
// if so, it splits the results into 2 types of completions:
// 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage: schemaAnalysisTokenUsage,
cost: schemaAnalysisCost,
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "");

logger.debug("Analyzed schema.", {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
});

costTracking.otherCallCount++;
costTracking.otherCost += schemaAnalysisCost;
costTracking.totalCost += schemaAnalysisCost;

// Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage);

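When the analysis marks the request as multi-entity, reqSchema is later split along multiEntityKeys via spreadSchemas (see the multi-entity branch below). A small worked example of what that split produces, using an illustrative schema:

// Illustrative input for spreadSchemas (the real helper lives in ./helpers/spread-schemas).
const exampleSchema = {
  type: "object",
  properties: {
    company_name: { type: "string" },                       // answered once, across pages
    products: { type: "array", items: { type: "object" } }, // many items, extracted per page
  },
};
// With multiEntityKeys = ["products"], the split yields roughly:
//   singleAnswerSchema -> { type: "object", properties: { company_name: { type: "string" } } }
//   multiEntitySchema  -> { type: "object", properties: { products: { ... } } }
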
let startMap = Date.now();
|
||||
let aggMapLinks: string[] = [];
|
||||
logger.debug("Processing URLs...", {
|
||||
urlCount: request.urls?.length || 0,
|
||||
});
|
||||
|
||||
|
||||
const urlPromises = urls.map((url) =>
|
||||
processUrl(
|
||||
{
|
||||
@ -164,6 +247,11 @@ export async function performExtraction(
|
||||
limit: request.limit,
|
||||
includeSubdomains: request.includeSubdomains,
|
||||
schema: request.schema,
|
||||
log,
|
||||
isMultiEntity,
|
||||
reasoning,
|
||||
multiEntityKeys,
|
||||
keyIndicators,
|
||||
},
|
||||
urlTraces,
|
||||
(links: string[]) => {
|
||||
@ -180,6 +268,7 @@ export async function performExtraction(
|
||||
});
|
||||
},
|
||||
logger.child({ module: "extract", method: "processUrl", url }),
|
||||
costTracking,
|
||||
),
|
||||
);
|
||||
|
||||
@ -189,6 +278,9 @@ export async function performExtraction(
|
||||
linkCount: links.length,
|
||||
});
|
||||
|
||||
log["links"] = links;
|
||||
log["linksLength"] = links.length;
|
||||
|
||||
if (links.length === 0) {
|
||||
logger.error("0 links! Bailing.", {
|
||||
linkCount: links.length,
|
||||
@ -215,55 +307,8 @@ export async function performExtraction(
|
||||
],
|
||||
});
|
||||
|
||||
let reqSchema = request.schema;
|
||||
if (!reqSchema && request.prompt) {
|
||||
reqSchema = await generateSchemaFromPrompt(request.prompt);
|
||||
logger.debug("Generated request schema.", {
|
||||
originalSchema: request.schema,
|
||||
schema: reqSchema,
|
||||
});
|
||||
}
|
||||
|
||||
if (reqSchema) {
|
||||
reqSchema = await dereferenceSchema(reqSchema);
|
||||
}
|
||||
|
||||
logger.debug("Transformed schema.", {
|
||||
originalSchema: request.schema,
|
||||
schema: reqSchema,
|
||||
});
|
||||
|
||||
// agent evaluates if the schema or the prompt has an array with big amount of items
|
||||
// also it checks if the schema any other properties that are not arrays
|
||||
// if so, it splits the results into 2 types of completions:
|
||||
// 1. the first one is a completion that will extract the array of items
|
||||
// 2. the second one is multiple completions that will extract the items from the array
|
||||
let startAnalyze = Date.now();
|
||||
const {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
tokenUsage: schemaAnalysisTokenUsage,
|
||||
} = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
|
||||
|
||||
logger.debug("Analyzed schema.", {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
});
|
||||
|
||||
// Track schema analysis tokens
|
||||
tokenUsage.push(schemaAnalysisTokenUsage);
|
||||
|
||||
// console.log("\nIs Multi Entity:", isMultiEntity);
|
||||
// console.log("\nMulti Entity Keys:", multiEntityKeys);
|
||||
// console.log("\nReasoning:", reasoning);
|
||||
// console.log("\nKey Indicators:", keyIndicators);
|
||||
|
||||
let rSchema = reqSchema;
|
||||
if (isMultiEntity && reqSchema) {
|
||||
log["isMultiEntity"] = true;
|
||||
logger.debug("=== MULTI-ENTITY ===");
|
||||
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
@ -301,7 +346,8 @@ export async function performExtraction(
|
||||
|
||||
logger.debug("Starting multi-entity scrape...");
|
||||
let startScrape = Date.now();
|
||||
|
||||
log["docsSizeBeforeMultiEntityScrape"] = docsMap.size;
|
||||
|
||||
const scrapePromises = links.map((url) => {
|
||||
if (!docsMap.has(normalizeUrl(url))) {
|
||||
return scrapeDocument(
|
||||
@ -323,7 +369,7 @@ export async function performExtraction(
|
||||
|
||||
// Needs to be true for multi-entity to work properly
|
||||
onlyMainContent: true,
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
return docsMap.get(normalizeUrl(url));
|
||||
@ -333,6 +379,8 @@ export async function performExtraction(
|
||||
(doc): doc is Document => doc !== null,
|
||||
);
|
||||
|
||||
log["docsSizeAfterMultiEntityScrape"] = scrapePromises.length;
|
||||
|
||||
logger.debug("Multi-entity scrape finished.", {
|
||||
docCount: multyEntityDocs.length,
|
||||
});
|
||||
@ -365,7 +413,7 @@ export async function performExtraction(
|
||||
const chunkSize = 50;
|
||||
const timeoutCompletion = 45000; // 45 second timeout
|
||||
const chunks: Document[][] = [];
|
||||
const extractionResults: {extract: any, url: string}[] = [];
|
||||
const extractionResults: { extract: any; url: string }[] = [];
|
||||
|
||||
// Split into chunks
|
||||
for (let i = 0; i < multyEntityDocs.length; i += chunkSize) {
|
||||
@ -383,68 +431,36 @@ export async function performExtraction(
|
||||
setTimeout(() => resolve(null), timeoutCompletion);
|
||||
});
|
||||
|
||||
// Check if page should be extracted before proceeding
|
||||
const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract(
|
||||
request.prompt ?? "",
|
||||
const completionPromise = batchExtractPromise({
|
||||
multiEntitySchema,
|
||||
links,
|
||||
prompt: request.prompt ?? "",
|
||||
systemPrompt: request.systemPrompt ?? "",
|
||||
doc,
|
||||
);
|
||||
|
||||
tokenUsage.push(shouldExtractCheckTokenUsage);
|
||||
|
||||
if (!extract) {
|
||||
logger.info(
|
||||
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
// Add confidence score to schema with 5 levels
|
||||
const schemaWithConfidence = {
|
||||
...multiEntitySchema,
|
||||
properties: {
|
||||
...multiEntitySchema.properties,
|
||||
is_content_relevant: {
|
||||
type: "boolean",
|
||||
description:
|
||||
"Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
|
||||
},
|
||||
},
|
||||
required: [
|
||||
...(multiEntitySchema.required || []),
|
||||
"is_content_relevant",
|
||||
],
|
||||
};
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MULTI_ENTITY_EXTRACT,
|
||||
startedAt: startScrape,
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: [
|
||||
doc.metadata.url || doc.metadata.sourceURL || "",
|
||||
],
|
||||
},
|
||||
],
|
||||
useAgent: isAgentExtractModelValid(request.agent?.model)
|
||||
});
|
||||
|
||||
const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc);
|
||||
|
||||
// Race between timeout and completion
|
||||
const multiEntityCompletion = (await Promise.race([
|
||||
completionPromise,
|
||||
timeoutPromise,
|
||||
])) as Awaited<ReturnType<typeof generateCompletions>>;
|
||||
const multiEntityCompletion = (await completionPromise) as Awaited<
|
||||
ReturnType<typeof batchExtractPromise>
|
||||
>;
|
||||
|
||||
// TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
|
||||
|
||||
// Track multi-entity extraction tokens
|
||||
if (multiEntityCompletion) {
|
||||
tokenUsage.push(multiEntityCompletion.totalUsage);
|
||||
|
||||
|
||||
costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount;
|
||||
costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost;
|
||||
costTracking.otherCallCount += multiEntityCompletion.otherCallCount;
|
||||
costTracking.otherCost += multiEntityCompletion.otherCost;
|
||||
costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost;
|
||||
|
||||
if (multiEntityCompletion.extract) {
|
||||
return {
|
||||
extract: multiEntityCompletion.extract,
|
||||
url: doc.metadata.url || doc.metadata.sourceURL || ""
|
||||
url: doc.metadata.url || doc.metadata.sourceURL || "",
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -490,42 +506,115 @@ export async function performExtraction(
|
||||
return null;
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for current chunk to complete before processing next chunk
|
||||
const chunkResults = await Promise.all(chunkPromises);
|
||||
const validResults = chunkResults.filter((result): result is {extract: any, url: string} => result !== null);
|
||||
const validResults = chunkResults.filter(
|
||||
(result): result is { extract: any; url: string } => result !== null,
|
||||
);
|
||||
extractionResults.push(...validResults);
|
||||
multiEntityCompletions.push(...validResults.map(r => r.extract));
|
||||
// Merge all extracts from valid results into a single array
|
||||
const extractArrays = validResults.map((r) =>
|
||||
Array.isArray(r.extract) ? r.extract : [r.extract],
|
||||
);
|
||||
const mergedExtracts = extractArrays.flat();
|
||||
multiEntityCompletions.push(...mergedExtracts);
|
||||
multiEntityCompletions = multiEntityCompletions.filter((c) => c !== null);
|
||||
logger.debug("All multi-entity completion chunks finished.", {
|
||||
completionCount: multiEntityCompletions.length,
|
||||
});
|
||||
log["multiEntityCompletionsLength"] = multiEntityCompletions.length;
|
||||
}
|
||||
|
||||
try {
|
||||
// Use SourceTracker to handle source tracking
|
||||
const sourceTracker = new SourceTracker();
|
||||
|
||||
// Transform and merge results while preserving sources
|
||||
sourceTracker.transformResults(extractionResults, multiEntitySchema, false);
|
||||
|
||||
multiEntityResult = transformArrayToObject(
|
||||
multiEntitySchema,
|
||||
multiEntityCompletions,
|
||||
);
|
||||
|
||||
// Track sources before deduplication
|
||||
sourceTracker.trackPreDeduplicationSources(multiEntityResult);
|
||||
|
||||
// Apply deduplication and merge
|
||||
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
|
||||
multiEntityResult = mergeNullValObjs(multiEntityResult);
|
||||
|
||||
// Map sources to final deduplicated/merged items
|
||||
const multiEntitySources = sourceTracker.mapSourcesToFinalItems(multiEntityResult, multiEntityKeys);
|
||||
Object.assign(sources, multiEntitySources);
|
||||
logger.debug("Created SourceTracker instance");
|
||||
|
||||
// Transform and merge results while preserving sources
|
||||
try {
|
||||
sourceTracker.transformResults(
|
||||
extractionResults,
|
||||
multiEntitySchema,
|
||||
false,
|
||||
);
|
||||
logger.debug("Successfully transformed results with sourceTracker");
|
||||
} catch (error) {
|
||||
const errorLog = `[${new Date().toISOString()}] Error in sourceTracker.transformResults: ${JSON.stringify(error, null, 2)}\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Error in sourceTracker.transformResults:`, { error });
|
||||
throw error;
|
||||
}
|
||||
|
||||
try {
|
||||
multiEntityResult = transformArrayToObject(
|
||||
multiEntitySchema,
|
||||
multiEntityCompletions,
|
||||
);
|
||||
logger.debug("Successfully transformed array to object");
|
||||
} catch (error) {
|
||||
const errorLog = `[${new Date().toISOString()}] Error in transformArrayToObject: ${JSON.stringify(error, null, 2)}\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Error in transformArrayToObject:`, { error });
|
||||
throw error;
|
||||
}
|
||||
|
||||
// Track sources before deduplication
|
||||
try {
|
||||
sourceTracker.trackPreDeduplicationSources(multiEntityResult);
|
||||
logger.debug("Successfully tracked pre-deduplication sources");
|
||||
} catch (error) {
|
||||
const errorLog = `[${new Date().toISOString()}] Error in trackPreDeduplicationSources: ${JSON.stringify(error, null, 2)}\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Error in trackPreDeduplicationSources:`, { error });
|
||||
throw error;
|
||||
}
|
||||
|
||||
// Apply deduplication and merge
|
||||
try {
|
||||
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
|
||||
logger.debug("Successfully deduplicated objects array");
|
||||
} catch (error) {
|
||||
const errorLog = `[${new Date().toISOString()}] Error in deduplicateObjectsArray: ${JSON.stringify(error, null, 2)}\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Error in deduplicateObjectsArray:`, { error });
|
||||
throw error;
|
||||
}
|
||||
|
||||
try {
|
||||
multiEntityResult = mergeNullValObjs(multiEntityResult);
|
||||
logger.debug("Successfully merged null value objects");
|
||||
} catch (error) {
|
||||
const errorLog = `[${new Date().toISOString()}] Error in mergeNullValObjs: ${JSON.stringify(error, null, 2)}\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Error in mergeNullValObjs:`, { error });
|
||||
throw error;
|
||||
}
|
||||
|
||||
// Map sources to final deduplicated/merged items
|
||||
try {
|
||||
const multiEntitySources = sourceTracker.mapSourcesToFinalItems(
|
||||
multiEntityResult,
|
||||
multiEntityKeys,
|
||||
);
|
||||
Object.assign(sources, multiEntitySources);
|
||||
logger.debug("Successfully mapped sources to final items");
|
||||
} catch (error) {
|
||||
const errorLog = `[${new Date().toISOString()}] Error in mapSourcesToFinalItems: ${JSON.stringify(error, null, 2)}\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Error in mapSourcesToFinalItems:`, { error });
|
||||
throw error;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to transform array to object`, { error });
|
||||
const errorLog = `[${new Date().toISOString()}] Failed to transform array to object\nError: ${JSON.stringify(error, null, 2)}\nStack: ${error.stack}\nMultiEntityResult: ${JSON.stringify(multiEntityResult, null, 2)}\nMultiEntityCompletions: ${JSON.stringify(multiEntityCompletions, null, 2)}\nMultiEntitySchema: ${JSON.stringify(multiEntitySchema, null, 2)}\n\n`;
|
||||
await fs.appendFile('logs/extraction-errors.log', errorLog);
|
||||
logger.error(`Failed to transform array to object`, {
|
||||
error,
|
||||
errorMessage: error.message,
|
||||
errorStack: error.stack,
|
||||
multiEntityResult: JSON.stringify(multiEntityResult),
|
||||
multiEntityCompletions: JSON.stringify(multiEntityCompletions),
|
||||
multiEntitySchema: JSON.stringify(multiEntitySchema)
|
||||
});
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
@ -542,6 +631,7 @@ export async function performExtraction(
|
||||
rSchema.properties &&
|
||||
Object.keys(rSchema.properties).length > 0
|
||||
) {
|
||||
log["isSingleEntity"] = true;
|
||||
logger.debug("=== SINGLE PAGES ===", {
|
||||
linkCount: links.length,
|
||||
schema: rSchema,
|
||||
@ -564,6 +654,7 @@ export async function performExtraction(
|
||||
},
|
||||
],
|
||||
});
|
||||
log["docsSizeBeforeSingleEntityScrape"] = docsMap.size;
|
||||
const scrapePromises = links.map((url) => {
|
||||
if (!docsMap.has(normalizeUrl(url))) {
|
||||
return scrapeDocument(
|
||||
@ -580,7 +671,7 @@ export async function performExtraction(
|
||||
url,
|
||||
isMultiEntity: false,
|
||||
}),
|
||||
request.scrapeOptions
|
||||
request.scrapeOptions,
|
||||
);
|
||||
}
|
||||
return docsMap.get(normalizeUrl(url));
|
||||
@ -588,6 +679,7 @@ export async function performExtraction(
|
||||
|
||||
try {
|
||||
const results = await Promise.all(scrapePromises);
|
||||
log["docsSizeAfterSingleEntityScrape"] = docsMap.size;
|
||||
|
||||
for (const doc of results) {
|
||||
if (doc?.metadata?.url) {
|
||||
@ -640,31 +732,53 @@ export async function performExtraction(
|
||||
|
||||
// Generate completions
|
||||
logger.debug("Generating singleAnswer completions...");
|
||||
let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({
|
||||
log["singleAnswerDocsLength"] = singleAnswerDocs.length;
|
||||
let {
|
||||
extract: completionResult,
|
||||
tokenUsage: singleAnswerTokenUsage,
|
||||
sources: singleAnswerSources,
|
||||
smartScrapeCost: singleAnswerSmartScrapeCost,
|
||||
otherCost: singleAnswerOtherCost,
|
||||
smartScrapeCallCount: singleAnswerSmartScrapeCallCount,
|
||||
otherCallCount: singleAnswerOtherCallCount,
|
||||
} = await singleAnswerCompletion({
|
||||
singleAnswerDocs,
|
||||
rSchema,
|
||||
links,
|
||||
prompt: request.prompt ?? "",
|
||||
systemPrompt: request.systemPrompt ?? "",
|
||||
useAgent: isAgentExtractModelValid(request.agent?.model),
|
||||
});
|
||||
costTracking.smartScrapeCost += singleAnswerSmartScrapeCost;
|
||||
costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount;
|
||||
costTracking.otherCost += singleAnswerOtherCost;
|
||||
costTracking.otherCallCount += singleAnswerOtherCallCount;
|
||||
costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost;
|
||||
logger.debug("Done generating singleAnswer completions.");
|
||||
|
||||
singleAnswerResult = transformArrayToObject(rSchema, completionResult);
|
||||
|
||||
singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
|
||||
// Track single answer extraction tokens and sources
|
||||
if (completionResult) {
|
||||
tokenUsage.push(singleAnswerTokenUsage);
|
||||
|
||||
|
||||
// Add sources for top-level properties in single answer
|
||||
if (rSchema?.properties) {
|
||||
Object.keys(rSchema.properties).forEach(key => {
|
||||
Object.keys(rSchema.properties).forEach((key) => {
|
||||
if (completionResult[key] !== undefined) {
|
||||
sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "");
|
||||
sources[key] =
|
||||
singleAnswerSources ||
|
||||
singleAnswerDocs.map(
|
||||
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
singleAnswerResult = completionResult;
|
||||
singleAnswerCompletions = singleAnswerResult;
|
||||
// singleAnswerResult = completionResult;
|
||||
// singleAnswerCompletions = singleAnswerResult;
|
||||
|
||||
// Update token usage in traces
|
||||
// if (completions && completions.numTokens) {
|
||||
@ -686,6 +800,9 @@ export async function performExtraction(
|
||||
// }
|
||||
}
|
||||
|
||||
log["singleAnswerResult"] = singleAnswerResult;
|
||||
log["multiEntityResult"] = multiEntityResult;
|
||||
|
||||
let finalResult = reqSchema
|
||||
? await mixSchemaObjects(
|
||||
reqSchema,
|
||||
@ -776,11 +893,13 @@ export async function performExtraction(
|
||||
num_tokens: totalTokensUsed,
|
||||
tokens_billed: tokensToBill,
|
||||
sources,
|
||||
cost_tracking: costTracking,
|
||||
}).then(() => {
|
||||
updateExtract(extractId, {
|
||||
status: "completed",
|
||||
llmUsage,
|
||||
sources,
|
||||
costTracking,
|
||||
}).catch((error) => {
|
||||
logger.error(
|
||||
`Failed to update extract ${extractId} status to completed: ${error}`,
|
||||
@ -790,15 +909,26 @@ export async function performExtraction(
|
||||
|
||||
logger.debug("Done!");
|
||||
|
||||
if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) {
|
||||
if (
|
||||
request.__experimental_cacheMode == "save" &&
|
||||
request.__experimental_cacheKey
|
||||
) {
|
||||
logger.debug("Saving cached docs...");
|
||||
try {
|
||||
await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey);
|
||||
await saveCachedDocs(
|
||||
[...docsMap.values()],
|
||||
request.__experimental_cacheKey,
|
||||
);
|
||||
} catch (error) {
|
||||
logger.error("Error saving cached docs", { error });
|
||||
}
|
||||
}
|
||||
|
||||
// fs.writeFile(
|
||||
// `logs/${request.urls?.[0].replaceAll("https://", "").replaceAll("http://", "").replaceAll("/", "-").replaceAll(".", "-")}-extract-${extractId}.json`,
|
||||
// JSON.stringify(log, null, 2),
|
||||
// );
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: finalResult ?? {},
|
||||
|
17 apps/api/src/lib/extract/fire-0/build-document-f0.ts Normal file
@ -0,0 +1,17 @@
import { Document } from "../../../controllers/v1/types";

export function buildDocument_F0(document: Document): string {
const metadata = document.metadata;
const markdown = document.markdown;

// for each key in the metadata allow up to 250 characters
const metadataString = Object.entries(metadata)
.map(([key, value]) => {
return `${key}: ${value?.toString().slice(0, 250)}`;
})
.join("\n");

const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
const documentString = `${markdown}${documentMetadataString}`;
return documentString;
}

115 apps/api/src/lib/extract/fire-0/build-prompts-f0.ts Normal file
@ -0,0 +1,115 @@
|
||||
export function buildRefrasedPrompt_F0(prompt: string, url: string): string {
|
||||
return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}.
|
||||
|
||||
Original prompt: "${prompt}"
|
||||
|
||||
Provide a rephrased search query that:
|
||||
1. Maintains the core intent of the original prompt with ONLY the keywords
|
||||
2. Uses relevant keywords
|
||||
3. Is optimized for search engine results
|
||||
4. Is concise and focused
|
||||
5. Short is better than long
|
||||
6. It is a search engine, not a chatbot
|
||||
7. Concise
|
||||
|
||||
Return only the rephrased search query, without any explanation or additional text.`;
|
||||
}
|
||||
|
||||
export function buildPreRerankPrompt_F0(
|
||||
prompt: string | undefined,
|
||||
schema: any,
|
||||
url: string,
|
||||
): string {
|
||||
const schemaString = JSON.stringify(schema, null, 2);
|
||||
return `Create a concise search query that combines the key data points from both the schema and prompt. Focus on the core information needed while keeping it general enough to find relevant matches.
|
||||
|
||||
Schema: ${schemaString}
|
||||
Prompt: ${prompt}
|
||||
Website to get content from: ${url}
|
||||
|
||||
Return only a concise sentece or 2 focused on the essential data points that the user wants to extract. This will be used by an LLM to determine how releavant the links that are present are to the user's request.`;
|
||||
}
|
||||
|
||||
export function buildRerankerSystemPrompt_F0(): string {
|
||||
return `You are a relevance expert scoring links from a website the user is trying to extract information from. Analyze the provided URLs and their content
|
||||
to determine their relevance to the user's query and intent.
|
||||
For each URL, assign a relevance score between 0 and 1, where 1
|
||||
means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it.
|
||||
Always return all the links scored that you are giving. Do not omit links.
|
||||
Always return the links in the same order they were provided. If the user wants the content from all the links, all links should be scored 1.`;
|
||||
}
|
||||
|
||||
export function buildRerankerUserPrompt_F0(searchQuery: string): string {
|
||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
|
||||
}
|
||||
|
||||
// Multi entity schema anlayzer
|
||||
export function buildAnalyzeSchemaPrompt_F0(): string {
|
||||
return `You are a query classifier for a web scraping system. Classify the data extraction query as either:
|
||||
A) Single-Answer: One answer across a few pages, possibly containing small arrays.
|
||||
B) Multi-Entity: Many items across many pages, often involving large arrays.
|
||||
|
||||
Consider:
|
||||
1. Answer Cardinality: Single or multiple items?
|
||||
2. Page Distribution: Found on 1-3 pages or many?
|
||||
3. Verification Needs: Cross-page verification or independent extraction?
|
||||
|
||||
Provide:
|
||||
- Method: [Single-Answer/Multi-Entity]
|
||||
- Confidence: [0-100%]
|
||||
- Reasoning: Why this classification?
|
||||
- Key Indicators: Specific aspects leading to this decision.
|
||||
|
||||
Examples:
|
||||
- "Is this company a non-profit?" -> Single-Answer
|
||||
- "Extract all product prices" -> Multi-Entity
|
||||
|
||||
For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').`;
|
||||
}
|
||||
|
||||
export function buildAnalyzeSchemaUserPrompt_F0(
|
||||
schemaString: string,
|
||||
prompt: string,
|
||||
urls: string[],
|
||||
): string {
|
||||
return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
|
||||
Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;
|
||||
}
|
||||
|
||||
// Should Extract
|
||||
|
||||
export function buildShouldExtractSystemPrompt_F0(): string {
|
||||
return `You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.`;
|
||||
}
|
||||
|
||||
export function buildShouldExtractUserPrompt_F0(
|
||||
prompt: string,
|
||||
schema: any,
|
||||
): string {
|
||||
return `Should the following content be used to extract information for this prompt: "${prompt}" User schema is: ${JSON.stringify(schema)}\nReturn only true or false.`;
|
||||
}
|
||||
|
||||
// Batch extract
|
||||
export function buildBatchExtractSystemPrompt_F0(
|
||||
systemPrompt: string,
|
||||
multiEntitySchema: any,
|
||||
links: string[],
|
||||
): string {
|
||||
return (
|
||||
(systemPrompt ? `${systemPrompt}\n` : "") +
|
||||
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
|
||||
links.join(", ")
|
||||
);
|
||||
}
|
||||
|
||||
export function buildBatchExtractPrompt_F0(prompt: string): string {
|
||||
return `Today is: ${new Date().toISOString()}\n${prompt}`;
|
||||
}
|
||||
|
||||
|
||||
export function buildRephraseToSerpPrompt_F0(prompt: string): string {
|
||||
return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results:
|
||||
|
||||
Original Prompt: "${prompt}"`;
|
||||
}
|
||||
|
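The F0 prompt builders above come in system/user pairs that are fed to generateCompletions_F0 in the completion modules that follow. A sketch of the should-extract pairing, matching the call shown in checkShouldExtract-f0.ts below:

// Sketch; markdown is the output of buildDocument_F0(doc).
const relevanceCheck = await generateCompletions_F0({
  logger,
  options: {
    mode: "llm",
    systemPrompt: buildShouldExtractSystemPrompt_F0(),
    prompt: buildShouldExtractUserPrompt_F0(prompt, schema),
    schema: { type: "object", properties: { extract: { type: "boolean" } }, required: ["extract"] },
  },
  markdown: buildDocument_F0(doc),
  isExtractEndpoint: true,
});
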
@ -0,0 +1,87 @@
|
||||
import { TokenUsage } from "../../../../controllers/v1/types";
|
||||
import { z } from "zod";
|
||||
import {
|
||||
buildAnalyzeSchemaPrompt,
|
||||
buildAnalyzeSchemaUserPrompt,
|
||||
} from "../../build-prompts";
|
||||
import { logger } from "../../../logger";
|
||||
import { jsonSchema } from "ai";
|
||||
import { getModel } from "../../../generic-ai";
|
||||
import {
|
||||
generateCompletions_F0,
|
||||
generateSchemaFromPrompt_F0,
|
||||
} from "../llmExtract-f0";
|
||||
|
||||
export async function analyzeSchemaAndPrompt_F0(
|
||||
urls: string[],
|
||||
schema: any,
|
||||
prompt: string,
|
||||
): Promise<{
|
||||
isMultiEntity: boolean;
|
||||
multiEntityKeys: string[];
|
||||
reasoning?: string;
|
||||
keyIndicators?: string[];
|
||||
tokenUsage: TokenUsage;
|
||||
}> {
|
||||
if (!schema) {
|
||||
schema = await generateSchemaFromPrompt_F0(prompt);
|
||||
}
|
||||
|
||||
const schemaString = JSON.stringify(schema);
|
||||
|
||||
const model = getModel("gpt-4o");
|
||||
|
||||
const checkSchema = z
|
||||
.object({
|
||||
isMultiEntity: z.boolean(),
|
||||
multiEntityKeys: z.array(z.string()).optional().default([]),
|
||||
reasoning: z.string(),
|
||||
keyIndicators: z.array(z.string()),
|
||||
})
|
||||
.refine(
|
||||
(x) => !x.isMultiEntity || x.multiEntityKeys.length > 0,
|
||||
"isMultiEntity was true, but no multiEntityKeys",
|
||||
);
|
||||
|
||||
try {
|
||||
const { extract: result, totalUsage } = await generateCompletions_F0({
|
||||
logger,
|
||||
options: {
|
||||
mode: "llm",
|
||||
schema: checkSchema,
|
||||
prompt: buildAnalyzeSchemaUserPrompt(schemaString, prompt, urls),
|
||||
systemPrompt: buildAnalyzeSchemaPrompt(),
|
||||
},
|
||||
markdown: "",
|
||||
model,
|
||||
});
|
||||
|
||||
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
|
||||
checkSchema.parse(result);
|
||||
|
||||
return {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
tokenUsage: totalUsage,
|
||||
};
|
||||
} catch (e) {
|
||||
logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", {
|
||||
error: e,
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
isMultiEntity: false,
|
||||
multiEntityKeys: [],
|
||||
reasoning: "",
|
||||
keyIndicators: [],
|
||||
tokenUsage: {
|
||||
promptTokens: 0,
|
||||
completionTokens: 0,
|
||||
totalTokens: 0,
|
||||
model: model.modelId,
|
||||
},
|
||||
};
|
||||
}
|
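The zod refine() on checkSchema above rejects a "multi-entity" verdict that names no keys, which is what sends the function down its fallback return. A small sketch of that guard in isolation, reusing the checkSchema shape defined above:

// safeParse keeps the failure instead of throwing, which makes the guard easy to see.
const verdict = checkSchema.safeParse({
  isMultiEntity: true,
  multiEntityKeys: [], // empty -> violates the refine() condition
  reasoning: "example",
  keyIndicators: [],
});
// verdict.success === false, so a caller would fall back to the single-answer defaults.
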
@ -0,0 +1,54 @@
|
||||
import { logger } from "../../../../lib/logger";
|
||||
import { ExtractResponse, TokenUsage } from "../../../../controllers/v1/types";
|
||||
import { Document } from "../../../../controllers/v1/types";
|
||||
import { generateCompletions_F0 } from "../llmExtract-f0";
|
||||
import { buildBatchExtractPrompt_F0, buildBatchExtractSystemPrompt_F0 } from "../build-prompts-f0";
|
||||
import { buildDocument_F0 } from "../build-document-f0";
|
||||
|
||||
/**
|
||||
* Batch extract information from a list of URLs using a multi-entity schema.
|
||||
* @param multiEntitySchema - The schema for the multi-entity extraction
|
||||
* @param links - The URLs to extract information from
|
||||
* @param prompt - The prompt for the extraction
|
||||
* @param systemPrompt - The system prompt for the extraction
|
||||
* @param doc - The document to extract information from
|
||||
* @returns The completion promise
|
||||
*/
|
||||
export async function batchExtractPromise_F0(
|
||||
multiEntitySchema: any,
|
||||
links: string[],
|
||||
prompt: string,
|
||||
systemPrompt: string,
|
||||
doc: Document,
|
||||
): Promise<{
|
||||
extract: any;
|
||||
numTokens: number;
|
||||
totalUsage: TokenUsage;
|
||||
warning?: string;
|
||||
sources: string[];
|
||||
}> {
|
||||
const completion = await generateCompletions_F0({
|
||||
logger: logger.child({
|
||||
method: "extractService/generateCompletions",
|
||||
}),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: buildBatchExtractSystemPrompt_F0(
|
||||
systemPrompt,
|
||||
multiEntitySchema,
|
||||
links,
|
||||
),
|
||||
prompt: buildBatchExtractPrompt_F0(prompt),
|
||||
schema: multiEntitySchema,
|
||||
},
|
||||
markdown: buildDocument_F0(doc),
|
||||
isExtractEndpoint: true
|
||||
});
|
||||
|
||||
return {
|
||||
extract: completion.extract,
|
||||
numTokens: completion.numTokens,
|
||||
totalUsage: completion.totalUsage,
|
||||
sources: [doc.metadata.url || doc.metadata.sourceURL || ""]
|
||||
};
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
import { logger } from "../../../../lib/logger";
|
||||
import { buildDocument } from "../../build-document";
|
||||
import { Document, TokenUsage } from "../../../../controllers/v1/types";
|
||||
import { generateCompletions_F0 } from "../llmExtract-f0";
|
||||
import { buildShouldExtractSystemPrompt_F0, buildShouldExtractUserPrompt_F0 } from "../build-prompts-f0";
|
||||
import { getModel } from "../../../../lib/generic-ai";
|
||||
|
||||
|
||||
export async function checkShouldExtract_F0(
|
||||
prompt: string,
|
||||
multiEntitySchema: any,
|
||||
doc: Document,
|
||||
): Promise<{ tokenUsage: TokenUsage; extract: boolean }> {
|
||||
const shouldExtractCheck = await generateCompletions_F0({
|
||||
logger: logger.child({ method: "extractService/checkShouldExtract" }),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: buildShouldExtractSystemPrompt_F0(),
|
||||
prompt: buildShouldExtractUserPrompt_F0(prompt, multiEntitySchema),
|
||||
schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
extract: {
|
||||
type: "boolean",
|
||||
},
|
||||
},
|
||||
required: ["extract"],
|
||||
},
|
||||
},
|
||||
markdown: buildDocument(doc),
|
||||
isExtractEndpoint: true,
|
||||
model: getModel("gpt-4o-mini"),
|
||||
});
|
||||
|
||||
return {
|
||||
tokenUsage: shouldExtractCheck.totalUsage,
|
||||
extract: shouldExtractCheck.extract["extract"],
|
||||
};
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
import { logger } from "../../../../lib/logger";
|
||||
import { generateCompletions_F0 } from "../llmExtract-f0";
|
||||
import { buildDocument_F0 } from "../build-document-f0";
|
||||
import { Document, TokenUsage } from "../../../../controllers/v1/types";
|
||||
|
||||
export async function singleAnswerCompletion_F0({
|
||||
singleAnswerDocs,
|
||||
rSchema,
|
||||
links,
|
||||
prompt,
|
||||
systemPrompt,
|
||||
}: {
|
||||
singleAnswerDocs: Document[];
|
||||
rSchema: any;
|
||||
links: string[];
|
||||
prompt: string;
|
||||
systemPrompt: string;
|
||||
}): Promise<{
|
||||
extract: any;
|
||||
tokenUsage: TokenUsage;
|
||||
sources: string[];
|
||||
}> {
|
||||
const completion = await generateCompletions_F0({
|
||||
logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
(systemPrompt ? `${systemPrompt}\n` : "") +
|
||||
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not speficied', return an empty string: '', if it's not a string and you can't find the information, return null. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
|
||||
links.join(", "),
|
||||
prompt: "Today is: " + new Date().toISOString() + "\n" + prompt,
|
||||
schema: rSchema,
|
||||
},
|
||||
markdown: singleAnswerDocs.map((x) => buildDocument_F0(x)).join("\n"),
|
||||
isExtractEndpoint: true
|
||||
});
|
||||
return {
|
||||
extract: completion.extract,
|
||||
tokenUsage: completion.totalUsage,
|
||||
sources: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "")
|
||||
};
|
||||
}

98 apps/api/src/lib/extract/fire-0/document-scraper-f0.ts Normal file
@ -0,0 +1,98 @@
|
||||
import { Document, ScrapeOptions, URLTrace, scrapeOptions } from "../../../controllers/v1/types";
|
||||
import { logger } from "../../logger";
|
||||
import { getScrapeQueue } from "../../../services/queue-service";
|
||||
import { waitForJob } from "../../../services/queue-jobs";
|
||||
import { addScrapeJob } from "../../../services/queue-jobs";
|
||||
import { getJobPriority } from "../../job-priority";
|
||||
import type { Logger } from "winston";
|
||||
|
||||
interface ScrapeDocumentOptions {
|
||||
url: string;
|
||||
teamId: string;
|
||||
origin: string;
|
||||
timeout: number;
|
||||
isSingleUrl?: boolean;
|
||||
}
|
||||
|
||||
export async function scrapeDocument_F0(
|
||||
options: ScrapeDocumentOptions,
|
||||
urlTraces: URLTrace[],
|
||||
logger: Logger,
|
||||
internalScrapeOptions: Partial<ScrapeOptions> = { onlyMainContent: false },
|
||||
): Promise<Document | null> {
|
||||
const trace = urlTraces.find((t) => t.url === options.url);
|
||||
if (trace) {
|
||||
trace.status = "scraped";
|
||||
trace.timing.scrapedAt = new Date().toISOString();
|
||||
}
|
||||
|
||||
async function attemptScrape(timeout: number) {
|
||||
const jobId = crypto.randomUUID();
|
||||
const jobPriority = await getJobPriority({
|
||||
team_id: options.teamId,
|
||||
basePriority: 10,
|
||||
from_extract: true,
|
||||
});
|
||||
|
||||
await addScrapeJob(
|
||||
{
|
||||
url: options.url,
|
||||
mode: "single_urls",
|
||||
team_id: options.teamId,
|
||||
scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }),
|
||||
internalOptions: {
|
||||
useCache: true,
|
||||
teamId: options.teamId,
|
||||
},
|
||||
origin: options.origin,
|
||||
is_scrape: true,
|
||||
from_extract: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority,
|
||||
);
|
||||
|
||||
const doc = await waitForJob(jobId, timeout);
|
||||
await getScrapeQueue().remove(jobId);
|
||||
|
||||
if (trace) {
|
||||
trace.timing.completedAt = new Date().toISOString();
|
||||
trace.contentStats = {
|
||||
rawContentLength: doc.markdown?.length || 0,
|
||||
processedContentLength: doc.markdown?.length || 0,
|
||||
tokensUsed: 0,
|
||||
};
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
try {
|
||||
try {
|
||||
logger.debug("Attempting scrape...");
|
||||
const x = await attemptScrape(options.timeout);
|
||||
logger.debug("Scrape finished!");
|
||||
return x;
|
||||
} catch (timeoutError) {
|
||||
logger.warn("Scrape failed.", { error: timeoutError });
|
||||
|
||||
if (options.isSingleUrl) {
|
||||
// For single URLs, try again with double timeout
|
||||
logger.debug("Attempting scrape...");
|
||||
const x = await attemptScrape(options.timeout * 2);
|
||||
logger.debug("Scrape finished!");
|
||||
return x;
|
||||
}
|
||||
|
||||
throw timeoutError;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`error in scrapeDocument`, { error });
|
||||
if (trace) {
|
||||
trace.status = "error";
|
||||
trace.error = error.message;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
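scrapeDocument_F0 queues a scrape job, waits for it, and for single URLs retries once with double the timeout before giving up and returning null. A sketch of a call site; option values are illustrative:

const doc = await scrapeDocument_F0(
  {
    url,
    teamId,
    origin: "extract",  // illustrative origin label
    timeout: 30_000,    // first attempt budget; single URLs get one retry at 60_000
    isSingleUrl: true,
  },
  urlTraces,
  logger,
  { onlyMainContent: false },
);
if (doc === null) {
  // scrape failed or timed out; the URL trace already carries the error
}
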
807 apps/api/src/lib/extract/fire-0/extraction-service-f0.ts Normal file
@ -0,0 +1,807 @@
|
||||
import {
|
||||
Document,
|
||||
ExtractRequest,
|
||||
TokenUsage,
|
||||
URLTrace,
|
||||
} from "../../../controllers/v1/types";
|
||||
import { logger as _logger } from "../../logger";
|
||||
import { scrapeDocument_F0 } from "./document-scraper-f0";
|
||||
import { billTeam } from "../../../services/billing/credit_billing";
|
||||
import { logJob } from "../../../services/logging/log_job";
|
||||
import { _addScrapeJobToBullMQ } from "../../../services/queue-jobs";
|
||||
import { spreadSchemas_F0 } from "./helpers/spread-schemas-f0";
|
||||
import Ajv from "ajv";
|
||||
const ajv = new Ajv();
|
||||
|
||||
import { ExtractStep, updateExtract } from "../extract-redis";
|
||||
import { CUSTOM_U_TEAMS } from "../config";
|
||||
import { getCachedDocs, saveCachedDocs } from "../helpers/cached-docs";
|
||||
import { normalizeUrl } from "../../canonical-url";
|
||||
import { search } from "../../../search";
|
||||
import { buildRephraseToSerpPrompt_F0 } from "./build-prompts-f0";
|
||||
import { processUrl_F0, generateBasicCompletion_FO } from "./url-processor-f0";
|
||||
import { generateCompletions_F0, generateSchemaFromPrompt_F0 } from "./llmExtract-f0";
|
||||
import { dereferenceSchema_F0 } from "./helpers/dereference-schema-f0";
|
||||
import { analyzeSchemaAndPrompt_F0 } from "./completions/analyzeSchemaAndPrompt-f0";
|
||||
import { checkShouldExtract_F0 } from "./completions/checkShouldExtract-f0";
|
||||
import { batchExtractPromise_F0 } from "./completions/batchExtract-f0";
|
||||
import { transformArrayToObject_F0 } from "./helpers/transform-array-to-obj-f0";
|
||||
import { deduplicateObjectsArray_F0 } from "./helpers/deduplicate-objs-array-f0";
|
||||
import { mergeNullValObjs_F0 } from "./helpers/merge-null-val-objs-f0";
|
||||
import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0";
|
||||
import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0";
|
||||
import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0";
|
||||
import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
|
||||
|
||||
|
||||
interface ExtractServiceOptions {
|
||||
request: ExtractRequest;
|
||||
teamId: string;
|
||||
subId?: string;
|
||||
cacheMode?: "load" | "save" | "direct";
|
||||
cacheKey?: string;
|
||||
}
|
||||
|
||||
export interface ExtractResult {
|
||||
success: boolean;
|
||||
data?: any;
|
||||
extractId: string;
|
||||
warning?: string;
|
||||
urlTrace?: URLTrace[];
|
||||
error?: string;
|
||||
tokenUsageBreakdown?: TokenUsage[];
|
||||
llmUsage?: number;
|
||||
totalUrlsScraped?: number;
|
||||
sources?: Record<string, string[]>;
|
||||
}
|
||||
|
||||
type completions = {
|
||||
extract: Record<string, any>;
|
||||
numTokens: number;
|
||||
totalUsage: TokenUsage;
|
||||
warning?: string;
|
||||
sources?: string[];
|
||||
};
|
||||
|
||||
|
||||
export async function performExtraction_F0(
|
||||
extractId: string,
|
||||
options: ExtractServiceOptions,
|
||||
): Promise<ExtractResult> {
|
||||
const { request, teamId, subId } = options;
|
||||
const urlTraces: URLTrace[] = [];
|
||||
let docsMap: Map<string, Document> = new Map();
|
||||
let singleAnswerCompletions: completions | null = null;
|
||||
let multiEntityCompletions: completions[] = [];
|
||||
let multiEntityResult: any = {};
|
||||
let singleAnswerResult: any = {};
|
||||
let totalUrlsScraped = 0;
|
||||
let sources: Record<string, string[]> = {};
|
||||
|
||||
|
||||
const logger = _logger.child({
|
||||
module: "extract",
|
||||
method: "performExtraction",
|
||||
extractId,
|
||||
teamId,
|
||||
});
|
||||
|
||||
// If no URLs are provided, generate URLs from the prompt
|
||||
if ((!request.urls || request.urls.length === 0) && request.prompt) {
|
||||
logger.debug("Generating URLs from prompt...", {
|
||||
prompt: request.prompt,
|
||||
});
|
||||
const rephrasedPrompt = await generateBasicCompletion_FO(buildRephraseToSerpPrompt_F0(request.prompt));
|
||||
const searchResults = await search({
|
||||
query: rephrasedPrompt.replace('"', "").replace("'", ""),
|
||||
num_results: 10,
|
||||
});
|
||||
|
||||
request.urls = searchResults.map(result => result.url) as string[];
|
||||
}
|
||||
if (request.urls && request.urls.length === 0) {
|
||||
logger.error("No search results found", {
|
||||
query: request.prompt,
|
||||
});
|
||||
return {
|
||||
success: false,
|
||||
error: "No search results found",
|
||||
extractId,
|
||||
};
|
||||
}
|
||||
|
||||
const urls = request.urls || ([] as string[]);
|
||||
|
||||
if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) {
|
||||
logger.debug("Loading cached docs...");
|
||||
try {
|
||||
const cache = await getCachedDocs(urls, request.__experimental_cacheKey);
|
||||
for (const doc of cache) {
|
||||
if (doc.metadata.url) {
|
||||
docsMap.set(normalizeUrl(doc.metadata.url), doc);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error("Error loading cached docs", { error });
|
||||
}
|
||||
}
|
||||
|
||||
// Token tracking
|
||||
let tokenUsage: TokenUsage[] = [];
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.INITIAL,
|
||||
startedAt: Date.now(),
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: request.urls,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
let startMap = Date.now();
|
||||
let aggMapLinks: string[] = [];
|
||||
logger.debug("Processing URLs...", {
|
||||
urlCount: request.urls?.length || 0,
|
||||
});
|
||||
|
||||
const urlPromises = urls.map((url) =>
|
||||
processUrl_F0(
|
||||
{
|
||||
url,
|
||||
prompt: request.prompt,
|
||||
teamId,
|
||||
allowExternalLinks: request.allowExternalLinks,
|
||||
origin: request.origin,
|
||||
limit: request.limit,
|
||||
includeSubdomains: request.includeSubdomains,
|
||||
schema: request.schema,
|
||||
},
|
||||
urlTraces,
|
||||
(links: string[]) => {
|
||||
aggMapLinks.push(...links);
|
||||
updateExtract(extractId, {
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MAP,
|
||||
startedAt: startMap,
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: aggMapLinks,
|
||||
},
|
||||
],
|
||||
});
|
||||
},
|
||||
logger.child({ module: "extract", method: "processUrl", url }),
|
||||
),
|
||||
);
|
||||
|
||||
const processedUrls = await Promise.all(urlPromises);
|
||||
const links = processedUrls.flat().filter((url) => url);
|
||||
logger.debug("Processed URLs.", {
|
||||
linkCount: links.length,
|
||||
});
|
||||
|
||||
if (links.length === 0) {
|
||||
logger.error("0 links! Bailing.", {
|
||||
linkCount: links.length,
|
||||
});
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
totalUrlsScraped: 0,
|
||||
};
|
||||
}
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MAP_RERANK,
|
||||
startedAt: startMap,
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: links,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
let reqSchema = request.schema;
|
||||
if (!reqSchema && request.prompt) {
|
||||
reqSchema = await generateSchemaFromPrompt_F0(request.prompt);
|
||||
logger.debug("Generated request schema.", {
|
||||
originalSchema: request.schema,
|
||||
schema: reqSchema,
|
||||
});
|
||||
}
|
||||
|
||||
if (reqSchema) {
|
||||
reqSchema = await dereferenceSchema_F0(reqSchema);
|
||||
}
|
||||
|
||||
logger.debug("Transformed schema.", {
|
||||
originalSchema: request.schema,
|
||||
schema: reqSchema,
|
||||
});
|
||||
|
||||
// agent evaluates if the schema or the prompt has an array with a large number of items
// it also checks whether the schema has any other properties that are not arrays
// if so, it splits the results into 2 types of completions:
// 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that extract the items from the array
|
||||
let startAnalyze = Date.now();
|
||||
const {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
tokenUsage: schemaAnalysisTokenUsage,
|
||||
} = await analyzeSchemaAndPrompt_F0(links, reqSchema, request.prompt ?? "");
|
||||
|
||||
logger.debug("Analyzed schema.", {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
});
|
||||
|
||||
// Track schema analysis tokens
|
||||
tokenUsage.push(schemaAnalysisTokenUsage);
|
||||
|
||||
// console.log("\nIs Multi Entity:", isMultiEntity);
|
||||
// console.log("\nMulti Entity Keys:", multiEntityKeys);
|
||||
// console.log("\nReasoning:", reasoning);
|
||||
// console.log("\nKey Indicators:", keyIndicators);
|
||||
|
||||
let rSchema = reqSchema;
|
||||
if (isMultiEntity && reqSchema) {
|
||||
logger.debug("=== MULTI-ENTITY ===");
|
||||
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas_F0(
|
||||
reqSchema,
|
||||
multiEntityKeys,
|
||||
);
|
||||
rSchema = singleAnswerSchema;
|
||||
logger.debug("Spread schemas.", { singleAnswerSchema, multiEntitySchema });
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MULTI_ENTITY,
|
||||
startedAt: startAnalyze,
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: [],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const timeout = 60000;
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MULTI_ENTITY_SCRAPE,
|
||||
startedAt: startAnalyze,
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: links,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
logger.debug("Starting multi-entity scrape...");
|
||||
let startScrape = Date.now();
|
||||
|
||||
const scrapePromises = links.map((url) => {
|
||||
if (!docsMap.has(normalizeUrl(url))) {
|
||||
return scrapeDocument_F0(
|
||||
{
|
||||
url,
|
||||
teamId,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
},
|
||||
urlTraces,
|
||||
logger.child({
|
||||
module: "extract",
|
||||
method: "scrapeDocument",
|
||||
url,
|
||||
isMultiEntity: true,
|
||||
}),
|
||||
{
|
||||
...request.scrapeOptions,
|
||||
|
||||
// Needs to be true for multi-entity to work properly
|
||||
onlyMainContent: true,
|
||||
}
|
||||
);
|
||||
}
|
||||
return docsMap.get(normalizeUrl(url));
|
||||
});
|
||||
|
||||
let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
|
||||
(doc): doc is Document => doc !== null,
|
||||
);
|
||||
|
||||
logger.debug("Multi-entity scrape finished.", {
|
||||
docCount: multyEntityDocs.length,
|
||||
});
|
||||
|
||||
totalUrlsScraped += multyEntityDocs.length;
|
||||
|
||||
let endScrape = Date.now();
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MULTI_ENTITY_SCRAPE,
|
||||
startedAt: startScrape,
|
||||
finishedAt: endScrape,
|
||||
discoveredLinks: links,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
for (const doc of multyEntityDocs) {
|
||||
if (doc?.metadata?.url) {
|
||||
docsMap.set(normalizeUrl(doc.metadata.url), doc);
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing
|
||||
|
||||
// Process docs in chunks with queue style processing
|
||||
const chunkSize = 50;
|
||||
const timeoutCompletion = 45000; // 45 second timeout
|
||||
const chunks: Document[][] = [];
|
||||
const extractionResults: {extract: any, url: string}[] = [];
|
||||
|
||||
// Split into chunks
|
||||
for (let i = 0; i < multyEntityDocs.length; i += chunkSize) {
|
||||
chunks.push(multyEntityDocs.slice(i, i + chunkSize));
|
||||
}
|
||||
|
||||
// Process chunks sequentially with timeout
|
||||
for (const chunk of chunks) {
|
||||
const chunkPromises = chunk.map(async (doc) => {
|
||||
try {
|
||||
ajv.compile(multiEntitySchema);
|
||||
|
||||
// Wrap in timeout promise
|
||||
const timeoutPromise = new Promise((resolve) => {
|
||||
setTimeout(() => resolve(null), timeoutCompletion);
|
||||
});
|
||||
|
||||
// Check if page should be extracted before proceeding
|
||||
const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract_F0(
|
||||
request.prompt ?? "",
|
||||
multiEntitySchema,
|
||||
doc,
|
||||
);
|
||||
|
||||
tokenUsage.push(shouldExtractCheckTokenUsage);
|
||||
|
||||
if (!extract) {
|
||||
logger.info(
|
||||
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
// Add confidence score to schema with 5 levels
|
||||
const schemaWithConfidence = {
|
||||
...multiEntitySchema,
|
||||
properties: {
|
||||
...multiEntitySchema.properties,
|
||||
is_content_relevant: {
|
||||
type: "boolean",
|
||||
description:
|
||||
"Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
|
||||
},
|
||||
},
|
||||
required: [
|
||||
...(multiEntitySchema.required || []),
|
||||
"is_content_relevant",
|
||||
],
|
||||
};
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.MULTI_ENTITY_EXTRACT,
|
||||
startedAt: startScrape,
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: [
|
||||
doc.metadata.url || doc.metadata.sourceURL || "",
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const completionPromise = batchExtractPromise_F0(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc);
|
||||
|
||||
// Race between timeout and completion
|
||||
const multiEntityCompletion = (await Promise.race([
|
||||
completionPromise,
|
||||
timeoutPromise,
|
||||
])) as Awaited<ReturnType<typeof generateCompletions_F0>>;
|
||||
|
||||
// Track multi-entity extraction tokens
|
||||
if (multiEntityCompletion) {
|
||||
tokenUsage.push(multiEntityCompletion.totalUsage);
|
||||
|
||||
if (multiEntityCompletion.extract) {
|
||||
return {
|
||||
extract: multiEntityCompletion.extract,
|
||||
url: doc.metadata.url || doc.metadata.sourceURL || ""
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// console.log(multiEntityCompletion.extract)
|
||||
// if (!multiEntityCompletion.extract?.is_content_relevant) {
|
||||
// console.log(`Skipping extraction for ${doc.metadata.url} as content is not relevant`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
// Update token usage in traces
|
||||
// if (multiEntityCompletion && multiEntityCompletion.numTokens) {
|
||||
// const totalLength = docs.reduce(
|
||||
// (sum, doc) => sum + (doc.markdown?.length || 0),
|
||||
// 0,
|
||||
// );
|
||||
// docs.forEach((doc) => {
|
||||
// if (doc.metadata?.sourceURL) {
|
||||
// const trace = urlTraces.find(
|
||||
// (t) => t.url === doc.metadata.sourceURL,
|
||||
// );
|
||||
// if (trace && trace.contentStats) {
|
||||
// trace.contentStats.tokensUsed = Math.floor(
|
||||
// ((doc.markdown?.length || 0) / totalLength) *
|
||||
// (multiEntityCompletion?.numTokens || 0),
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
|
||||
// if (multiEntityCompletion.extract && multiEntityCompletion.extract.extraction_confidence < 3) {
|
||||
// console.log(`Skipping extraction for ${doc.metadata.url} as confidence is too low (${multiEntityCompletion.extract.extraction_confidence})`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to process document.`, {
|
||||
error,
|
||||
url: doc.metadata.url ?? doc.metadata.sourceURL!,
|
||||
});
|
||||
return null;
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for current chunk to complete before processing next chunk
|
||||
const chunkResults = await Promise.all(chunkPromises);
|
||||
const validResults = chunkResults.filter((result): result is {extract: any, url: string} => result !== null);
|
||||
extractionResults.push(...validResults);
|
||||
multiEntityCompletions.push(...validResults.map(r => r.extract));
|
||||
logger.debug("All multi-entity completion chunks finished.", {
|
||||
completionCount: multiEntityCompletions.length,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
// Use SourceTracker to handle source tracking
|
||||
const sourceTracker = new SourceTracker_F0();
|
||||
|
||||
// Transform and merge results while preserving sources
|
||||
sourceTracker.transformResults_F0(extractionResults, multiEntitySchema, false);
|
||||
|
||||
multiEntityResult = transformArrayToObject_F0(
|
||||
multiEntitySchema,
|
||||
multiEntityCompletions,
|
||||
);
|
||||
|
||||
// Track sources before deduplication
|
||||
sourceTracker.trackPreDeduplicationSources_F0(multiEntityResult);
|
||||
|
||||
// Apply deduplication and merge
|
||||
multiEntityResult = deduplicateObjectsArray_F0(multiEntityResult);
|
||||
multiEntityResult = mergeNullValObjs_F0(multiEntityResult);
|
||||
|
||||
// Map sources to final deduplicated/merged items
|
||||
const multiEntitySources = sourceTracker.mapSourcesToFinalItems_F0(multiEntityResult, multiEntityKeys);
|
||||
Object.assign(sources, multiEntitySources);
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to transform array to object`, { error });
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"An unexpected error occurred. Please contact help@firecrawl.com for help.",
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
totalUrlsScraped,
|
||||
};
|
||||
}
|
||||
}
|
||||
if (
|
||||
rSchema &&
|
||||
Object.keys(rSchema).length > 0 &&
|
||||
rSchema.properties &&
|
||||
Object.keys(rSchema.properties).length > 0
|
||||
) {
|
||||
logger.debug("=== SINGLE PAGES ===", {
|
||||
linkCount: links.length,
|
||||
schema: rSchema,
|
||||
});
|
||||
|
||||
// Scrape documents
|
||||
const timeout = 60000;
|
||||
let singleAnswerDocs: Document[] = [];
|
||||
|
||||
// let rerank = await rerankLinks(links.map((url) => ({ url })), request.prompt ?? JSON.stringify(request.schema), urlTraces);
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.SCRAPE,
|
||||
startedAt: Date.now(),
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: links,
|
||||
},
|
||||
],
|
||||
});
|
||||
const scrapePromises = links.map((url) => {
|
||||
if (!docsMap.has(normalizeUrl(url))) {
|
||||
return scrapeDocument_F0(
|
||||
{
|
||||
url,
|
||||
teamId,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
},
|
||||
urlTraces,
|
||||
logger.child({
|
||||
module: "extract",
|
||||
method: "scrapeDocument",
|
||||
url,
|
||||
isMultiEntity: false,
|
||||
}),
|
||||
request.scrapeOptions
|
||||
);
|
||||
}
|
||||
return docsMap.get(normalizeUrl(url));
|
||||
});
|
||||
|
||||
try {
|
||||
const results = await Promise.all(scrapePromises);
|
||||
|
||||
for (const doc of results) {
|
||||
if (doc?.metadata?.url) {
|
||||
docsMap.set(normalizeUrl(doc.metadata.url), doc);
|
||||
}
|
||||
}
|
||||
logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing
|
||||
|
||||
const validResults = results.filter(
|
||||
(doc): doc is Document => doc !== null,
|
||||
);
|
||||
singleAnswerDocs.push(...validResults);
|
||||
totalUrlsScraped += validResults.length;
|
||||
|
||||
logger.debug("Scrapes finished.", { docCount: validResults.length });
|
||||
} catch (error) {
|
||||
return {
|
||||
success: false,
|
||||
error: error.message,
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
totalUrlsScraped,
|
||||
};
|
||||
}
|
||||
|
||||
if (docsMap.size == 0) {
|
||||
// All urls are invalid
|
||||
logger.error("All provided URLs are invalid!");
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"All provided URLs are invalid. Please check your input and try again.",
|
||||
extractId,
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
totalUrlsScraped: 0,
|
||||
};
|
||||
}
|
||||
|
||||
await updateExtract(extractId, {
|
||||
status: "processing",
|
||||
steps: [
|
||||
{
|
||||
step: ExtractStep.EXTRACT,
|
||||
startedAt: Date.now(),
|
||||
finishedAt: Date.now(),
|
||||
discoveredLinks: links,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// Generate completions
|
||||
logger.debug("Generating singleAnswer completions...");
|
||||
let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion_F0({
|
||||
singleAnswerDocs,
|
||||
rSchema,
|
||||
links,
|
||||
prompt: request.prompt ?? "",
|
||||
systemPrompt: request.systemPrompt ?? ""
|
||||
});
|
||||
logger.debug("Done generating singleAnswer completions.");
|
||||
|
||||
// Track single answer extraction tokens and sources
|
||||
if (completionResult) {
|
||||
tokenUsage.push(singleAnswerTokenUsage);
|
||||
|
||||
// Add sources for top-level properties in single answer
|
||||
if (rSchema?.properties) {
|
||||
Object.keys(rSchema.properties).forEach(key => {
|
||||
if (completionResult[key] !== undefined) {
|
||||
sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "");
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
singleAnswerResult = completionResult;
|
||||
singleAnswerCompletions = singleAnswerResult;
|
||||
|
||||
// Update token usage in traces
|
||||
// if (completions && completions.numTokens) {
|
||||
// const totalLength = docs.reduce(
|
||||
// (sum, doc) => sum + (doc.markdown?.length || 0),
|
||||
// 0,
|
||||
// );
|
||||
// docs.forEach((doc) => {
|
||||
// if (doc.metadata?.sourceURL) {
|
||||
// const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
|
||||
// if (trace && trace.contentStats) {
|
||||
// trace.contentStats.tokensUsed = Math.floor(
|
||||
// ((doc.markdown?.length || 0) / totalLength) *
|
||||
// (completions?.numTokens || 0),
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
}
|
||||
|
||||
let finalResult = reqSchema
|
||||
? await mixSchemaObjects_F0(
|
||||
reqSchema,
|
||||
singleAnswerResult,
|
||||
multiEntityResult,
|
||||
logger.child({ method: "mixSchemaObjects" }),
|
||||
)
|
||||
: singleAnswerResult || multiEntityResult;
|
||||
|
||||
// Tokenize final result to get token count
|
||||
// let finalResultTokens = 0;
|
||||
// if (finalResult) {
|
||||
// const finalResultStr = JSON.stringify(finalResult);
|
||||
// finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
|
||||
|
||||
// }
|
||||
// // Deduplicate and validate final result against schema
|
||||
// if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) {
|
||||
// const schemaValidation = await generateCompletions(
|
||||
// logger.child({ method: "extractService/validateAndDeduplicate" }),
|
||||
// {
|
||||
// mode: "llm",
|
||||
// systemPrompt: `You are a data validator and deduplicator. Your task is to:
|
||||
// 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided schema
|
||||
// 2. Ensure all data matches the provided schema
|
||||
// 3. Keep only the highest quality and most complete entries when duplicates are found.
|
||||
|
||||
// Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`,
|
||||
// prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n
|
||||
|
||||
// <start of extract data>
|
||||
|
||||
// ${JSON.stringify(finalResult)}
|
||||
|
||||
// <end of extract data>
|
||||
|
||||
// <start of schema>
|
||||
|
||||
// ${JSON.stringify(reqSchema)}
|
||||
|
||||
// <end of schema>
|
||||
// `,
|
||||
// schema: reqSchema,
|
||||
// },
|
||||
// undefined,
|
||||
// undefined,
|
||||
// true,
|
||||
// "gpt-4o"
|
||||
// );
|
||||
// console.log("schemaValidation", schemaValidation);
|
||||
|
||||
// console.log("schemaValidation", finalResult);
|
||||
|
||||
// if (schemaValidation?.extract) {
|
||||
// tokenUsage.push(schemaValidation.totalUsage);
|
||||
// finalResult = schemaValidation.extract;
|
||||
// }
|
||||
// }
|
||||
|
||||
const totalTokensUsed = tokenUsage.reduce((a, b) => a + b.totalTokens, 0);
|
||||
const llmUsage = estimateTotalCost_F0(tokenUsage);
|
||||
let tokensToBill = calculateFinalResultCost_F0(finalResult);
|
||||
|
||||
if (CUSTOM_U_TEAMS.includes(teamId)) {
|
||||
tokensToBill = 1;
|
||||
}
|
||||
|
||||
// Bill team for usage
|
||||
billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => {
|
||||
logger.error(
|
||||
`Failed to bill team ${teamId} for ${tokensToBill} tokens: ${error}`,
|
||||
);
|
||||
});
|
||||
|
||||
// Log job with token usage and sources
|
||||
logJob({
|
||||
job_id: extractId,
|
||||
success: true,
|
||||
message: "Extract completed",
|
||||
num_docs: 1,
|
||||
docs: finalResult ?? {},
|
||||
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
||||
team_id: teamId,
|
||||
mode: "extract",
|
||||
url: request.urls?.join(", ") || "",
|
||||
scrapeOptions: request,
|
||||
origin: request.origin ?? "api",
|
||||
num_tokens: totalTokensUsed,
|
||||
tokens_billed: tokensToBill,
|
||||
sources,
|
||||
}).then(() => {
|
||||
updateExtract(extractId, {
|
||||
status: "completed",
|
||||
llmUsage,
|
||||
sources,
|
||||
}).catch((error) => {
|
||||
logger.error(
|
||||
`Failed to update extract ${extractId} status to completed: ${error}`,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
logger.debug("Done!");
|
||||
|
||||
if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) {
|
||||
logger.debug("Saving cached docs...");
|
||||
try {
|
||||
await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey);
|
||||
} catch (error) {
|
||||
logger.error("Error saving cached docs", { error });
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: finalResult ?? {},
|
||||
extractId,
|
||||
warning: undefined,
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
llmUsage,
|
||||
totalUrlsScraped,
|
||||
sources,
|
||||
};
|
||||
}
|
||||
|
@ -0,0 +1,29 @@
|
||||
export function deduplicateObjectsArray_F0(objArray: { [key: string]: any[] }): {
|
||||
[key: string]: any[];
|
||||
} {
|
||||
const deduplicatedObjArray: { [key: string]: any[] } = {};
|
||||
|
||||
for (const key in objArray) {
|
||||
if (Array.isArray(objArray[key])) {
|
||||
const seen = new Set();
|
||||
deduplicatedObjArray[key] = objArray[key].filter((item) => {
|
||||
// Create a unique identifier for each item based on its properties
|
||||
const identifier = JSON.stringify(item);
|
||||
|
||||
// Check if this identifier has been seen before
|
||||
if (seen.has(identifier)) {
|
||||
return false; // Duplicate found, filter it out
|
||||
}
|
||||
|
||||
// Add the identifier to the set and keep the item
|
||||
seen.add(identifier);
|
||||
return true;
|
||||
});
|
||||
} else {
|
||||
// If the value is not an array, just copy it as is
|
||||
deduplicatedObjArray[key] = objArray[key];
|
||||
}
|
||||
}
|
||||
|
||||
return deduplicatedObjArray;
|
||||
}
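
A minimal usage sketch of deduplicateObjectsArray_F0 with made-up sample data (the "products" key and values are illustrative): exact duplicates inside each array are dropped because items are compared by their JSON.stringify identity, so only identical objects with the same keys in the same order collapse.

const sample = {
  products: [
    { name: "Widget", price: 10 },
    { name: "Widget", price: 10 }, // exact duplicate, filtered out
    { name: "Gadget", price: 25 },
  ],
};

const deduped = deduplicateObjectsArray_F0(sample);
// deduped.products -> [{ name: "Widget", price: 10 }, { name: "Gadget", price: 25 }]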
|
@ -0,0 +1,10 @@
|
||||
import { dereference } from "@apidevtools/json-schema-ref-parser";
|
||||
|
||||
export async function dereferenceSchema_F0(schema: any): Promise<any> {
|
||||
try {
|
||||
return await dereference(schema);
|
||||
} catch (error) {
|
||||
console.error("Failed to dereference schema:", error);
|
||||
throw error;
|
||||
}
|
||||
}
|
@ -0,0 +1,153 @@
|
||||
import { deduplicateObjectsArray_F0 } from "./deduplicate-objs-array-f0";
|
||||
|
||||
/**
|
||||
* Convert "null" strings to actual null values for easier comparison.
|
||||
*/
|
||||
function unifyValue(val: any): any {
|
||||
return val === "null" ? null : val;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert all "null" strings in an object to actual null values.
|
||||
*/
|
||||
function unifyItemValues<T extends object>(item: T): T {
|
||||
const unifiedItem: any = {};
|
||||
for (const key of Object.keys(item)) {
|
||||
unifiedItem[key] = unifyValue(item[key]);
|
||||
}
|
||||
return unifiedItem;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if two objects are mergeable by comparing their non-null values
|
||||
*/
|
||||
export function areMergeable_F0(obj1: any, obj2: any): boolean {
|
||||
const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]);
|
||||
let matchingNonNullValues = 0;
|
||||
let nonNullComparisons = 0;
|
||||
|
||||
for (const key of allKeys) {
|
||||
const val1 = obj1[key];
|
||||
const val2 = obj2[key];
|
||||
|
||||
// Skip array comparisons - they'll be merged separately
|
||||
if (Array.isArray(val1) || Array.isArray(val2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If both values exist and are not null
|
||||
if (val1 !== null && val2 !== null) {
|
||||
nonNullComparisons++;
|
||||
if (val1 === val2) {
|
||||
matchingNonNullValues++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Objects are mergeable if they have at least one matching non-null value
|
||||
// and all their non-null values match when both objects have them
|
||||
return nonNullComparisons > 0 && matchingNonNullValues === nonNullComparisons;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge arrays and remove duplicates
|
||||
*/
|
||||
function mergeArrays(arr1: any[], arr2: any[]): any[] {
|
||||
const combined = [...arr1, ...arr2];
|
||||
return combined.filter((item, index) => {
|
||||
const stringified = JSON.stringify(item);
|
||||
return (
|
||||
combined.findIndex((other) => JSON.stringify(other) === stringified) ===
|
||||
index
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge two objects, taking non-null values over null values
|
||||
*/
|
||||
function mergeObjects(obj1: any, obj2: any): any {
|
||||
const result = { ...obj1 };
|
||||
|
||||
for (const key in obj2) {
|
||||
if (obj2.hasOwnProperty(key)) {
|
||||
// If obj2's value is non-null, it should override obj1's value
|
||||
if (obj2[key] !== null) {
|
||||
if (Array.isArray(obj2[key])) {
|
||||
// If both are arrays, merge them
|
||||
if (Array.isArray(result[key])) {
|
||||
result[key] = mergeArrays(result[key], obj2[key]);
|
||||
} else {
|
||||
// If only obj2's value is an array, use it
|
||||
result[key] = [...obj2[key]];
|
||||
}
|
||||
} else if (typeof obj2[key] === "object") {
|
||||
// If both are objects (but not arrays), merge them
|
||||
if (typeof result[key] === "object" && !Array.isArray(result[key])) {
|
||||
result[key] = mergeObjects(result[key], obj2[key]);
|
||||
} else {
|
||||
result[key] = { ...obj2[key] };
|
||||
}
|
||||
} else {
|
||||
// For primitive values, obj2's non-null value always wins
|
||||
result[key] = obj2[key];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges arrays of objects by combining those that are identical except for
|
||||
* null-equivalent fields, filling in null fields with the corresponding
|
||||
* non-null fields from the other object.
|
||||
*/
|
||||
export function mergeNullValObjs_F0(objArray: { [key: string]: any[] }): {
|
||||
[key: string]: any[];
|
||||
} {
|
||||
const result: { [key: string]: any[] } = {};
|
||||
|
||||
for (const key in objArray) {
|
||||
if (Array.isArray(objArray[key])) {
|
||||
// If array contains only primitive values, return as is
|
||||
if (
|
||||
objArray[key].every((item) => typeof item !== "object" || item === null)
|
||||
) {
|
||||
result[key] = [...objArray[key]];
|
||||
continue;
|
||||
}
|
||||
|
||||
const items = objArray[key].map(unifyItemValues);
|
||||
const mergedItems: any[] = [];
|
||||
|
||||
for (const item of items) {
|
||||
let merged = false;
|
||||
|
||||
for (let i = 0; i < mergedItems.length; i++) {
|
||||
if (areMergeable_F0(mergedItems[i], item)) {
|
||||
mergedItems[i] = mergeObjects(mergedItems[i], item);
|
||||
merged = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!merged) {
|
||||
mergedItems.push({ ...item });
|
||||
}
|
||||
}
|
||||
|
||||
// Final deduplication pass
|
||||
result[key] = deduplicateObjectsArray_F0({ [key]: mergedItems })[key];
|
||||
} else {
|
||||
console.warn(
|
||||
`Expected an array at objArray[${key}], but found:`,
|
||||
objArray[key],
|
||||
);
|
||||
return objArray;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
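
A usage sketch of mergeNullValObjs_F0 with assumed contact data: the two entries agree on every non-null field (the name), so areMergeable_F0 lets them merge and the null fields are filled in from the counterpart.

const input = {
  contacts: [
    { name: "Ada Lovelace", email: null, phone: "555-0100" },
    { name: "Ada Lovelace", email: "ada@example.com", phone: null },
  ],
};

const merged = mergeNullValObjs_F0(input);
// merged.contacts -> [{ name: "Ada Lovelace", email: "ada@example.com", phone: "555-0100" }]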
|
@ -0,0 +1,48 @@
|
||||
import type { Logger } from "winston";
|
||||
|
||||
export async function mixSchemaObjects_F0(
|
||||
finalSchema: any,
|
||||
singleAnswerResult: any,
|
||||
multiEntityResult: any,
|
||||
logger?: Logger
|
||||
) {
|
||||
const finalResult: any = {};
|
||||
logger?.debug("Mixing schema objects.");
|
||||
|
||||
// Recursive helper function to merge results based on schema
|
||||
function mergeResults(schema: any, singleResult: any, multiResult: any) {
|
||||
const result: any = {};
|
||||
for (const key in schema.properties) {
|
||||
if (
|
||||
schema.properties[key].type === "object" &&
|
||||
schema.properties[key].properties
|
||||
) {
|
||||
// If the property is an object, recursively merge its properties
|
||||
result[key] = mergeResults(
|
||||
schema.properties[key],
|
||||
singleResult[key] || {},
|
||||
multiResult[key] || {},
|
||||
);
|
||||
} else if (
|
||||
schema.properties[key].type === "array" &&
|
||||
Array.isArray(multiResult[key])
|
||||
) {
|
||||
// If the property is an array, flatten the arrays from multiResult
|
||||
result[key] = multiResult[key].flat();
|
||||
} else if (singleResult.hasOwnProperty(key)) {
|
||||
result[key] = singleResult[key];
|
||||
} else if (multiResult.hasOwnProperty(key)) {
|
||||
result[key] = multiResult[key];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Merge the properties from the final schema
|
||||
Object.assign(
|
||||
finalResult,
|
||||
mergeResults(finalSchema, singleAnswerResult, multiEntityResult),
|
||||
);
|
||||
|
||||
return finalResult;
|
||||
}
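
A sketch of mixSchemaObjects_F0 (run inside an async context) with an assumed schema and partial results: array properties are taken from the multi-entity result, while the remaining keys come from whichever result provides them.

const schema = {
  type: "object",
  properties: {
    company: { type: "string" },
    products: { type: "array", items: { type: "object" } },
  },
};

const mixed = await mixSchemaObjects_F0(
  schema,
  { company: "Acme" },                                   // single-answer result
  { products: [{ name: "Widget" }, { name: "Gizmo" }] }, // multi-entity result
);
// mixed -> { company: "Acme", products: [{ name: "Widget" }, { name: "Gizmo" }] }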
|
151
apps/api/src/lib/extract/fire-0/helpers/source-tracker-f0.ts
Normal file
@ -0,0 +1,151 @@
|
||||
import { logger } from "../../../../lib/logger";
|
||||
import { areMergeable_F0 } from "./merge-null-val-objs-f0";
|
||||
import { transformArrayToObject_F0 } from "./transform-array-to-obj-f0";
|
||||
|
||||
interface TransformedResult {
|
||||
transformed: { [key: string]: any[] } | any[];
|
||||
url: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tracks sources through the transformation, deduplication, and merging process
|
||||
*/
|
||||
export class SourceTracker_F0 {
|
||||
private transformedResults: TransformedResult[];
|
||||
private preDedupeSourceMap: Map<string, string[]>;
|
||||
|
||||
constructor() {
|
||||
this.transformedResults = [];
|
||||
this.preDedupeSourceMap = new Map();
|
||||
}
|
||||
|
||||
/**
|
||||
* Transform raw extraction results into a format that preserves source information
|
||||
*/
|
||||
transformResults_F0(extractionResults: { extract: any; url: string }[], schema: any, withTransform: boolean = true) {
|
||||
// Handle array outputs
|
||||
if (Array.isArray(extractionResults[0]?.extract)) {
|
||||
this.transformedResults = extractionResults.map(result => ({
|
||||
transformed: result.extract,
|
||||
url: result.url
|
||||
}));
|
||||
|
||||
if (withTransform) {
|
||||
// Combine all extracts to match original behavior
|
||||
const combinedExtracts = extractionResults.map(r => r.extract).flat();
|
||||
return combinedExtracts;
|
||||
}
|
||||
return this.transformedResults;
|
||||
}
|
||||
|
||||
// Handle object outputs (original behavior)
|
||||
this.transformedResults = extractionResults.map(result => ({
|
||||
transformed: transformArrayToObject_F0(schema, [result.extract]),
|
||||
url: result.url
|
||||
}));
|
||||
|
||||
if (withTransform) {
|
||||
// Then combine all extracts and transform them together to match original behavior
|
||||
const combinedExtracts = extractionResults.map(r => r.extract);
|
||||
return transformArrayToObject_F0(schema, combinedExtracts);
|
||||
}
|
||||
return this.transformedResults;
|
||||
}
|
||||
|
||||
/**
|
||||
* Track sources for each item before deduplication
|
||||
*/
|
||||
trackPreDeduplicationSources_F0(multiEntityResult: { [key: string]: any[] } | any[]) {
|
||||
try {
|
||||
if (Array.isArray(multiEntityResult)) {
|
||||
// Handle array outputs
|
||||
multiEntityResult.forEach((item: any) => {
|
||||
const itemKey = JSON.stringify(item);
|
||||
const matchingSources = this.transformedResults
|
||||
.filter(result =>
|
||||
Array.isArray(result.transformed) &&
|
||||
result.transformed.some((resultItem: any) =>
|
||||
JSON.stringify(resultItem) === itemKey
|
||||
)
|
||||
)
|
||||
.map(result => result.url);
|
||||
this.preDedupeSourceMap.set(itemKey, matchingSources);
|
||||
});
|
||||
} else {
|
||||
// Handle object outputs (original behavior)
|
||||
Object.keys(multiEntityResult).forEach(key => {
|
||||
multiEntityResult[key].forEach((item: any) => {
|
||||
const itemKey = JSON.stringify(item);
|
||||
const matchingSources = this.transformedResults
|
||||
.filter(result =>
|
||||
result.transformed[key]?.some((resultItem: any) =>
|
||||
JSON.stringify(resultItem) === itemKey
|
||||
)
|
||||
)
|
||||
.map(result => result.url);
|
||||
this.preDedupeSourceMap.set(itemKey, matchingSources);
|
||||
});
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to track pre-deduplication sources`, { error });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Map sources to final deduplicated/merged items
|
||||
*/
|
||||
mapSourcesToFinalItems_F0(
|
||||
multiEntityResult: { [key: string]: any[] } | any[],
|
||||
multiEntityKeys: string[]
|
||||
): Record<string, string[]> {
|
||||
try {
|
||||
const sources: Record<string, string[]> = {};
|
||||
|
||||
if (Array.isArray(multiEntityResult)) {
|
||||
// Handle array outputs
|
||||
multiEntityResult.forEach((item: any, finalIndex: number) => {
|
||||
const sourceKey = `[${finalIndex}]`;
|
||||
const itemSources = new Set<string>();
|
||||
|
||||
this.transformedResults.forEach(result => {
|
||||
if (Array.isArray(result.transformed)) {
|
||||
result.transformed.forEach((originalItem: any) => {
|
||||
if (areMergeable_F0(item, originalItem)) {
|
||||
itemSources.add(result.url);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
sources[sourceKey] = Array.from(itemSources);
|
||||
});
|
||||
} else {
|
||||
// Handle object outputs (original behavior)
|
||||
multiEntityKeys.forEach(key => {
|
||||
if (multiEntityResult[key] && Array.isArray(multiEntityResult[key])) {
|
||||
multiEntityResult[key].forEach((item: any, finalIndex: number) => {
|
||||
const sourceKey = `${key}[${finalIndex}]`;
|
||||
const itemSources = new Set<string>();
|
||||
|
||||
this.transformedResults.forEach(result => {
|
||||
result.transformed[key]?.forEach((originalItem: any) => {
|
||||
if (areMergeable_F0(item, originalItem)) {
|
||||
itemSources.add(result.url);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
sources[sourceKey] = Array.from(itemSources);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return sources;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to map sources to final items`, { error });
|
||||
return {};
|
||||
}
|
||||
}
|
||||
}
|
82
apps/api/src/lib/extract/fire-0/helpers/spread-schemas-f0.ts
Normal file
@ -0,0 +1,82 @@
|
||||
export async function spreadSchemas_F0(
|
||||
schema: any,
|
||||
keys: string[],
|
||||
): Promise<{
|
||||
singleAnswerSchema: any;
|
||||
multiEntitySchema: any;
|
||||
}> {
|
||||
let singleAnswerSchema = { ...schema, properties: { ...schema.properties } };
|
||||
let multiEntitySchema: any = {
|
||||
type: "object",
|
||||
properties: {},
|
||||
...(schema.required ? { required: [] } : {})
|
||||
};
|
||||
|
||||
// Helper function to check if a property path exists in schema
|
||||
const hasPropertyPath = (schema: any, path: string[]): boolean => {
|
||||
let current = schema.properties;
|
||||
for (let i = 0; i < path.length; i++) {
|
||||
if (!current[path[i]]) return false;
|
||||
if (current[path[i]].type === "array" && current[path[i]].items) {
|
||||
current = current[path[i]].items.properties;
|
||||
} else {
|
||||
current = current[path[i]].properties;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
// Helper function to get the root property of a dot path
|
||||
const getRootProperty = (path: string): string => {
|
||||
return path.split('.')[0];
|
||||
};
|
||||
|
||||
keys.forEach((key) => {
|
||||
const rootProperty = getRootProperty(key);
|
||||
if (singleAnswerSchema.properties[rootProperty]) {
|
||||
multiEntitySchema.properties[rootProperty] = singleAnswerSchema.properties[rootProperty];
|
||||
delete singleAnswerSchema.properties[rootProperty];
|
||||
|
||||
// Move required field if it exists
|
||||
if (schema.required?.includes(rootProperty)) {
|
||||
multiEntitySchema.required.push(rootProperty);
|
||||
singleAnswerSchema.required = schema.required.filter((k: string) => k !== rootProperty);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Recursively delete empty properties in singleAnswerSchema
|
||||
const deleteEmptyProperties = (schema: any) => {
|
||||
for (const key in schema.properties) {
|
||||
if (
|
||||
schema.properties[key].properties &&
|
||||
Object.keys(schema.properties[key].properties).length === 0
|
||||
) {
|
||||
delete schema.properties[key];
|
||||
} else if (schema.properties[key].properties) {
|
||||
deleteEmptyProperties(schema.properties[key]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
deleteEmptyProperties(singleAnswerSchema);
|
||||
deleteEmptyProperties(multiEntitySchema);
|
||||
|
||||
// If singleAnswerSchema has no properties left, return an empty object
|
||||
if (Object.keys(singleAnswerSchema.properties).length === 0) {
|
||||
singleAnswerSchema = {};
|
||||
} else if (singleAnswerSchema.required?.length === 0) {
|
||||
delete singleAnswerSchema.required;
|
||||
}
|
||||
|
||||
if (Object.keys(multiEntitySchema.properties).length === 0) {
|
||||
multiEntitySchema = {};
|
||||
} else if (multiEntitySchema.required?.length === 0) {
|
||||
delete multiEntitySchema.required;
|
||||
}
|
||||
|
||||
return {
|
||||
singleAnswerSchema,
|
||||
multiEntitySchema,
|
||||
};
|
||||
}
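
A sketch of spreadSchemas_F0 (inside an async context) with an assumed schema: keys identified as multi-entity are moved, together with their "required" entries, into multiEntitySchema, and whatever is left stays in singleAnswerSchema.

const schema = {
  type: "object",
  properties: {
    company_name: { type: "string" },
    products: {
      type: "array",
      items: { type: "object", properties: { name: { type: "string" } } },
    },
  },
  required: ["company_name", "products"],
};

const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas_F0(schema, ["products"]);
// singleAnswerSchema keeps company_name (required: ["company_name"])
// multiEntitySchema holds products (required: ["products"])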
|
@ -0,0 +1,167 @@
|
||||
import isEqual from "lodash/isEqual";
|
||||
|
||||
/**
|
||||
* Transforms an array of objects into a single object, merging properties with the same name.
|
||||
* @param originalSchema - The schema of the original data.
|
||||
* @param arrayData - The array of objects to transform.
|
||||
* @returns A single object with merged properties.
|
||||
*/
|
||||
export function transformArrayToObject_F0(
|
||||
originalSchema: any,
|
||||
arrayData: any[],
|
||||
): any {
|
||||
if (Object.keys(originalSchema).length == 0) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const transformedResult: any = {};
|
||||
|
||||
// Function to find the array key in a nested schema
|
||||
function findArrayKey(schema: any): string | null {
|
||||
for (const key in schema.properties) {
|
||||
if (schema.properties[key].type === "array") {
|
||||
return key;
|
||||
} else if (schema.properties[key].type === "object") {
|
||||
const nestedKey = findArrayKey(schema.properties[key]);
|
||||
if (nestedKey) {
|
||||
return `${key}.${nestedKey}`;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
const arrayKeyPath = findArrayKey(originalSchema);
|
||||
if (!arrayKeyPath) {
|
||||
return arrayData.reduce((acc, item) => {
|
||||
for (const key in item) {
|
||||
if (!acc[key]) {
|
||||
acc[key] = item[key];
|
||||
} else if (
|
||||
typeof acc[key] === "object" &&
|
||||
typeof item[key] === "object"
|
||||
) {
|
||||
acc[key] = { ...acc[key], ...item[key] };
|
||||
}
|
||||
}
|
||||
return acc;
|
||||
}, {});
|
||||
}
|
||||
|
||||
const arrayKeyParts = arrayKeyPath.split(".");
|
||||
const arrayKey = arrayKeyParts.pop();
|
||||
if (!arrayKey) {
|
||||
throw new Error("Array key not found in schema");
|
||||
}
|
||||
|
||||
const parentSchema = arrayKeyParts.reduce(
|
||||
(schema, key) => schema.properties[key],
|
||||
originalSchema,
|
||||
);
|
||||
const itemSchema = parentSchema.properties[arrayKey].items;
|
||||
if (!itemSchema) {
|
||||
throw new Error("Item schema not found for array key");
|
||||
}
|
||||
|
||||
// Initialize the array in the transformed result
|
||||
let currentLevel = transformedResult;
|
||||
arrayKeyParts.forEach((part) => {
|
||||
if (!currentLevel[part]) {
|
||||
currentLevel[part] = {};
|
||||
}
|
||||
currentLevel = currentLevel[part];
|
||||
});
|
||||
currentLevel[arrayKey] = [];
|
||||
|
||||
// Helper function to check if an object is already in the array
|
||||
function isDuplicateObject(array: any[], obj: any): boolean {
|
||||
return array.some((existingItem) => isEqual(existingItem, obj));
|
||||
}
|
||||
|
||||
// Helper function to validate if an object follows the schema
|
||||
function isValidObject(obj: any, schema: any): boolean {
|
||||
return Object.keys(schema.properties).every((key) => {
|
||||
return (
|
||||
obj.hasOwnProperty(key) &&
|
||||
typeof obj[key] === schema.properties[key].type
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
// Iterate over each item in the arrayData
|
||||
arrayData.forEach((item) => {
|
||||
let currentItem = item;
|
||||
arrayKeyParts.forEach((part) => {
|
||||
if (currentItem[part]) {
|
||||
currentItem = currentItem[part];
|
||||
}
|
||||
});
|
||||
|
||||
// Copy non-array properties from the parent object
|
||||
for (const key in parentSchema.properties) {
|
||||
if (
|
||||
key !== arrayKey &&
|
||||
currentItem.hasOwnProperty(key) &&
|
||||
!currentLevel.hasOwnProperty(key)
|
||||
) {
|
||||
currentLevel[key] = currentItem[key];
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the currentItem[arrayKey] is an array before mapping
|
||||
if (Array.isArray(currentItem[arrayKey])) {
|
||||
currentItem[arrayKey].forEach((subItem: any) => {
|
||||
if (
|
||||
typeof subItem === "object" &&
|
||||
subItem !== null &&
|
||||
isValidObject(subItem, itemSchema)
|
||||
) {
|
||||
// For arrays of objects, add only unique objects
|
||||
const transformedItem: any = {};
|
||||
let hasValidData = false;
|
||||
|
||||
for (const key in itemSchema.properties) {
|
||||
if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) {
|
||||
transformedItem[key] = subItem[key];
|
||||
hasValidData = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
hasValidData &&
|
||||
!isDuplicateObject(currentLevel[arrayKey], transformedItem)
|
||||
) {
|
||||
currentLevel[arrayKey].push(transformedItem);
|
||||
}
|
||||
}
|
||||
});
|
||||
} else {
|
||||
console.warn(
|
||||
`Expected an array at ${arrayKey}, but found:`,
|
||||
currentItem[arrayKey],
|
||||
);
|
||||
}
|
||||
|
||||
// Handle merging of array properties
|
||||
for (const key in parentSchema.properties) {
|
||||
if (
|
||||
parentSchema.properties[key].type === "array" &&
|
||||
Array.isArray(currentItem[key])
|
||||
) {
|
||||
if (!currentLevel[key]) {
|
||||
currentLevel[key] = [];
|
||||
}
|
||||
currentItem[key].forEach((value: any) => {
|
||||
if (
|
||||
!currentLevel[key].includes(value) &&
|
||||
!isDuplicateObject(currentLevel[arrayKey], value)
|
||||
) {
|
||||
currentLevel[key].push(value);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return transformedResult;
|
||||
}
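
A sketch of transformArrayToObject_F0 with an assumed schema and two per-page extractions: the array key found in the schema collects the unique items from every extraction, and sibling scalar properties are copied from the first object that supplies them.

const schema = {
  type: "object",
  properties: {
    company: { type: "string" },
    products: {
      type: "array",
      items: { type: "object", properties: { name: { type: "string" } } },
    },
  },
};

const combined = transformArrayToObject_F0(schema, [
  { company: "Acme", products: [{ name: "Widget" }] },
  { company: "Acme", products: [{ name: "Widget" }, { name: "Gizmo" }] },
]);
// combined -> { company: "Acme", products: [{ name: "Widget" }, { name: "Gizmo" }] }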
|
469
apps/api/src/lib/extract/fire-0/llmExtract-f0.ts
Normal file
@ -0,0 +1,469 @@
|
||||
import { encoding_for_model } from "@dqbd/tiktoken";
|
||||
import { TiktokenModel } from "@dqbd/tiktoken";
|
||||
import {
|
||||
Document,
|
||||
ExtractOptions,
|
||||
TokenUsage,
|
||||
} from "../../../controllers/v1/types";
|
||||
import { Logger } from "winston";
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { modelPrices } from "../../../lib/extract/usage/model-prices";
|
||||
import { generateObject, generateText, LanguageModel } from 'ai';
|
||||
import { jsonSchema } from 'ai';
|
||||
import { getModel } from "../../../lib/generic-ai";
|
||||
import { z } from "zod";
|
||||
import { EngineResultsTracker, Meta } from "../../../scraper/scrapeURL";
|
||||
|
||||
// Get max tokens from model prices
|
||||
const getModelLimits_F0 = (model: string) => {
|
||||
const modelConfig = modelPrices[model];
|
||||
if (!modelConfig) {
|
||||
// Default fallback values
|
||||
return {
|
||||
maxInputTokens: 8192,
|
||||
maxOutputTokens: 4096,
|
||||
maxTokens: 12288,
|
||||
};
|
||||
}
|
||||
return {
|
||||
maxInputTokens: modelConfig.max_input_tokens || modelConfig.max_tokens,
|
||||
maxOutputTokens: modelConfig.max_output_tokens || modelConfig.max_tokens,
|
||||
maxTokens: modelConfig.max_tokens,
|
||||
};
|
||||
};
|
||||
|
||||
export class LLMRefusalError extends Error {
|
||||
public refusal: string;
|
||||
public results: EngineResultsTracker | undefined;
|
||||
|
||||
constructor(refusal: string) {
|
||||
super("LLM refused to extract the website's content");
|
||||
this.refusal = refusal;
|
||||
}
|
||||
}
|
||||
|
||||
function normalizeSchema(x: any): any {
|
||||
if (typeof x !== "object" || x === null) return x;
|
||||
|
||||
if (x["$defs"] !== null && typeof x["$defs"] === "object") {
|
||||
x["$defs"] = Object.fromEntries(
|
||||
Object.entries(x["$defs"]).map(([name, schema]) => [
|
||||
name,
|
||||
normalizeSchema(schema),
|
||||
]),
|
||||
);
|
||||
}
|
||||
|
||||
if (x && x.anyOf) {
|
||||
x.anyOf = x.anyOf.map((x) => normalizeSchema(x));
|
||||
}
|
||||
|
||||
if (x && x.oneOf) {
|
||||
x.oneOf = x.oneOf.map((x) => normalizeSchema(x));
|
||||
}
|
||||
|
||||
if (x && x.allOf) {
|
||||
x.allOf = x.allOf.map((x) => normalizeSchema(x));
|
||||
}
|
||||
|
||||
if (x && x.not) {
|
||||
x.not = normalizeSchema(x.not);
|
||||
}
|
||||
|
||||
if (x && x.type === "object") {
|
||||
return {
|
||||
...x,
|
||||
properties: Object.fromEntries(
|
||||
Object.entries(x.properties || {}).map(([k, v]) => [k, normalizeSchema(v)]),
|
||||
),
|
||||
required: Object.keys(x.properties || {}),
|
||||
additionalProperties: false,
|
||||
};
|
||||
} else if (x && x.type === "array") {
|
||||
return {
|
||||
...x,
|
||||
items: normalizeSchema(x.items),
|
||||
};
|
||||
} else {
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
interface TrimResult {
|
||||
text: string;
|
||||
numTokens: number;
|
||||
warning?: string;
|
||||
}
|
||||
|
||||
export function trimToTokenLimit_F0(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
|
||||
try {
|
||||
const encoder = encoding_for_model(modelId as TiktokenModel);
|
||||
try {
|
||||
const tokens = encoder.encode(text);
|
||||
const numTokens = tokens.length;
|
||||
|
||||
if (numTokens <= maxTokens) {
|
||||
return { text, numTokens };
|
||||
}
|
||||
|
||||
const modifier = 3;
|
||||
// Start with 3 chars per token estimation
|
||||
let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);
|
||||
|
||||
// Keep trimming until we're under the token limit
|
||||
while (true) {
|
||||
const currentTokens = encoder.encode(currentText);
|
||||
if (currentTokens.length <= maxTokens) {
|
||||
const warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
|
||||
return {
|
||||
text: currentText,
|
||||
numTokens: currentTokens.length,
|
||||
warning: previousWarning ? `${warning} ${previousWarning}` : warning
|
||||
};
|
||||
}
|
||||
const overflow = currentTokens.length * modifier - maxTokens - 1;
|
||||
// If still over limit, remove another chunk
|
||||
currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
throw e;
|
||||
} finally {
|
||||
encoder.free();
|
||||
}
|
||||
} catch (error) {
|
||||
// Fallback to a more conservative character-based approach
|
||||
const estimatedCharsPerToken = 2.8;
|
||||
const safeLength = maxTokens * estimatedCharsPerToken;
|
||||
const trimmedText = text.slice(0, Math.floor(safeLength));
|
||||
|
||||
const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
|
||||
|
||||
return {
|
||||
text: trimmedText,
|
||||
numTokens: maxTokens, // We assume we hit the max in this fallback case
|
||||
warning: previousWarning ? `${warning} ${previousWarning}` : warning
|
||||
};
|
||||
}
|
||||
}
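
A sketch of trimToTokenLimit_F0 behaviour: text under the limit is returned untouched, while longer text is cut down and annotated with a warning. The exact counts depend on the tiktoken encoding, so the numbers in the comments are indicative only.

const short = trimToTokenLimit_F0("hello world", 1000, "gpt-4o");
// short.text === "hello world" and short.warning is undefined

const long = trimToTokenLimit_F0("lorem ipsum ".repeat(5000), 100, "gpt-4o");
// long.numTokens is at most 100 and long.warning notes that the input was trimmed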
|
||||
|
||||
export async function generateCompletions_F0({
|
||||
logger,
|
||||
options,
|
||||
markdown,
|
||||
previousWarning,
|
||||
isExtractEndpoint,
|
||||
model = getModel("gpt-4o-mini"),
|
||||
mode = "object",
|
||||
}: {
|
||||
model?: LanguageModel;
|
||||
logger: Logger;
|
||||
options: ExtractOptions;
|
||||
markdown?: string;
|
||||
previousWarning?: string;
|
||||
isExtractEndpoint?: boolean;
|
||||
mode?: "object" | "no-object";
|
||||
}): Promise<{
|
||||
extract: any;
|
||||
numTokens: number;
|
||||
warning: string | undefined;
|
||||
totalUsage: TokenUsage;
|
||||
model: string;
|
||||
}> {
|
||||
let extract: any;
|
||||
let warning: string | undefined;
|
||||
|
||||
if (markdown === undefined) {
|
||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||
}
|
||||
|
||||
const { maxInputTokens, maxOutputTokens } = getModelLimits_F0(model.modelId);
|
||||
// Calculate 80% of max input tokens (for content)
|
||||
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
|
||||
|
||||
// Use the new trimming function
|
||||
const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit_F0(
|
||||
markdown,
|
||||
maxTokensSafe,
|
||||
model.modelId,
|
||||
previousWarning
|
||||
);
|
||||
|
||||
markdown = trimmedMarkdown;
|
||||
warning = trimWarning;
|
||||
|
||||
try {
|
||||
const prompt = options.prompt !== undefined
|
||||
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
|
||||
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
|
||||
|
||||
if (mode === "no-object") {
|
||||
const result = await generateText({
|
||||
model: model,
|
||||
prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
|
||||
temperature: options.temperature ?? 0,
|
||||
system: options.systemPrompt,
|
||||
});
|
||||
|
||||
extract = result.text;
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
totalUsage: {
|
||||
promptTokens: numTokens,
|
||||
completionTokens: result.usage?.completionTokens ?? 0,
|
||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
||||
},
|
||||
model: model.modelId,
|
||||
};
|
||||
}
|
||||
|
||||
let schema = options.schema;
|
||||
// Normalize the bad json schema users write (mogery)
|
||||
if (schema && !(schema instanceof z.ZodType)) {
|
||||
// let schema = options.schema;
|
||||
if (schema) {
|
||||
schema = removeDefaultProperty_F0(schema);
|
||||
}
|
||||
|
||||
if (schema && schema.type === "array") {
|
||||
schema = {
|
||||
type: "object",
|
||||
properties: {
|
||||
items: options.schema,
|
||||
},
|
||||
required: ["items"],
|
||||
additionalProperties: false,
|
||||
};
|
||||
} else if (schema && typeof schema === "object" && !schema.type) {
|
||||
schema = {
|
||||
type: "object",
|
||||
properties: Object.fromEntries(
|
||||
Object.entries(schema).map(([key, value]) => {
|
||||
return [key, removeDefaultProperty_F0(value)];
|
||||
}),
|
||||
),
|
||||
required: Object.keys(schema),
|
||||
additionalProperties: false,
|
||||
};
|
||||
}
|
||||
|
||||
schema = normalizeSchema(schema);
|
||||
}
|
||||
|
||||
const repairConfig = {
|
||||
experimental_repairText: async ({ text, error }) => {
|
||||
// AI may output a markdown JSON code block. Remove it - mogery
|
||||
if (typeof text === "string" && text.trim().startsWith("```")) {
|
||||
if (text.trim().startsWith("```json")) {
|
||||
text = text.trim().slice("```json".length).trim();
|
||||
} else {
|
||||
text = text.trim().slice("```".length).trim();
|
||||
}
|
||||
|
||||
if (text.trim().endsWith("```")) {
|
||||
text = text.trim().slice(0, -"```".length).trim();
|
||||
}
|
||||
|
||||
// If this fixes the JSON, just return it. If not, continue - mogery
|
||||
try {
|
||||
JSON.parse(text);
|
||||
return text;
|
||||
} catch (_) {}
|
||||
}
|
||||
|
||||
const { text: fixedText } = await generateText({
|
||||
model: model,
|
||||
prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
|
||||
system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON."
|
||||
});
|
||||
return fixedText;
|
||||
}
|
||||
};
|
||||
|
||||
const generateObjectConfig = {
|
||||
model: model,
|
||||
prompt: prompt,
|
||||
temperature: options.temperature ?? 0,
|
||||
system: options.systemPrompt,
|
||||
...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }),
|
||||
...(!schema && { output: 'no-schema' as const }),
|
||||
...repairConfig,
|
||||
...(!schema && {
|
||||
onError: (error: Error) => {
|
||||
console.error(error);
|
||||
}
|
||||
})
|
||||
} satisfies Parameters<typeof generateObject>[0];
|
||||
|
||||
const result = await generateObject(generateObjectConfig);
|
||||
extract = result.object;
|
||||
|
||||
// If the user actually wants the items object, they can specify it as 'required' in the schema;
// otherwise, we just return the items array
|
||||
if (
|
||||
options.schema &&
|
||||
options.schema.type === "array" &&
|
||||
!schema?.required?.includes("items")
|
||||
) {
|
||||
extract = extract?.items;
|
||||
}
|
||||
|
||||
// Since generateObject doesn't provide token usage, we'll estimate it
|
||||
const promptTokens = numTokens;
|
||||
const completionTokens = result?.usage?.completionTokens ?? 0;
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
totalUsage: {
|
||||
promptTokens,
|
||||
completionTokens,
|
||||
totalTokens: promptTokens + completionTokens,
|
||||
},
|
||||
model: model.modelId,
|
||||
};
|
||||
} catch (error) {
|
||||
if (error.message?.includes('refused')) {
|
||||
throw new LLMRefusalError(error.message);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
export async function performLLMExtract(
|
||||
meta: Meta,
|
||||
document: Document,
|
||||
): Promise<Document> {
|
||||
if (meta.options.formats.includes("extract")) {
|
||||
meta.internalOptions.abort?.throwIfAborted();
|
||||
const { extract, warning } = await generateCompletions_F0({
|
||||
logger: meta.logger.child({
|
||||
method: "performLLMExtract/generateCompletions",
|
||||
}),
|
||||
options: meta.options.extract!,
|
||||
markdown: document.markdown,
|
||||
previousWarning: document.warning
|
||||
});
|
||||
|
||||
if (meta.options.formats.includes("json")) {
|
||||
document.json = extract;
|
||||
} else {
|
||||
document.extract = extract;
|
||||
}
|
||||
document.warning = warning;
|
||||
}
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
export function removeDefaultProperty_F0(schema: any): any {
|
||||
if (typeof schema !== "object" || schema === null) return schema;
|
||||
|
||||
const rest = { ...schema };
|
||||
|
||||
// unsupported global keys
|
||||
delete rest.default;
|
||||
|
||||
// unsupported object keys
|
||||
delete rest.patternProperties;
|
||||
delete rest.unevaluatedProperties;
|
||||
delete rest.propertyNames;
|
||||
delete rest.minProperties;
|
||||
delete rest.maxProperties;
|
||||
|
||||
// unsupported string keys
|
||||
delete rest.minLength;
|
||||
delete rest.maxLength;
|
||||
delete rest.pattern;
|
||||
delete rest.format;
|
||||
|
||||
// unsupported number keys
|
||||
delete rest.minimum;
|
||||
delete rest.maximum;
|
||||
delete rest.multipleOf;
|
||||
|
||||
// unsupported array keys
|
||||
delete rest.unevaluatedItems;
|
||||
delete rest.contains;
|
||||
delete rest.minContains;
|
||||
delete rest.maxContains;
|
||||
delete rest.minItems;
|
||||
delete rest.maxItems;
|
||||
delete rest.uniqueItems;
|
||||
|
||||
for (const key in rest) {
|
||||
if (Array.isArray(rest[key])) {
|
||||
rest[key] = rest[key].map((item: any) => removeDefaultProperty_F0(item));
|
||||
} else if (typeof rest[key] === "object" && rest[key] !== null) {
|
||||
rest[key] = removeDefaultProperty_F0(rest[key]);
|
||||
}
|
||||
}
|
||||
|
||||
return rest;
|
||||
}
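
A sketch of removeDefaultProperty_F0 with an assumed schema: keys listed above as unsupported (default, minLength, format, and so on) are stripped recursively, leaving only the plain type/description shape.

const cleaned = removeDefaultProperty_F0({
  type: "object",
  properties: {
    name: { type: "string", minLength: 2, default: "n/a" },
  },
});
// cleaned.properties.name -> { type: "string" }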
|
||||
|
||||
export async function generateSchemaFromPrompt_F0(prompt: string): Promise<any> {
|
||||
const model = getModel("gpt-4o");
|
||||
const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
|
||||
let lastError: Error | null = null;
|
||||
|
||||
for (const temp of temperatures) {
|
||||
try {
|
||||
const { extract } = await generateCompletions_F0({
|
||||
logger: logger.child({
|
||||
method: "generateSchemaFromPrompt/generateCompletions",
|
||||
}),
|
||||
model: model,
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
|
||||
Consider:
|
||||
1. The type of data being requested
|
||||
2. Required fields vs optional fields
|
||||
3. Appropriate data types for each field
|
||||
4. Nested objects and arrays where appropriate
|
||||
|
||||
Valid JSON schema, has to be simple. No crazy properties. OpenAI has to support it.
|
||||
Supported types
|
||||
The following types are supported for Structured Outputs:
|
||||
|
||||
String
|
||||
Number
|
||||
Boolean
|
||||
Integer
|
||||
Object
|
||||
Array
|
||||
Enum
|
||||
anyOf
|
||||
|
||||
Formats are not supported. Min/max are not supported. Anything beyond the above is not supported. Keep it simple with types and descriptions.
|
||||
Optionals are not supported.
|
||||
DO NOT USE FORMATS.
|
||||
Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.
|
||||
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
|
||||
prompt: `Generate a JSON schema for extracting the following information: ${prompt}`,
|
||||
temperature: temp
|
||||
},
|
||||
markdown: prompt
|
||||
});
|
||||
|
||||
return extract;
|
||||
|
||||
} catch (error) {
|
||||
lastError = error as Error;
|
||||
logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// If we get here, all attempts failed
|
||||
throw new Error(
|
||||
`Failed to generate schema after all attempts. Last error: ${lastError?.message}`,
|
||||
);
|
||||
}
|
86 apps/api/src/lib/extract/fire-0/ranker-f0.ts Normal file
@ -0,0 +1,86 @@
|
||||
import { embed } from "ai";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { getEmbeddingModel } from "../../../lib/generic-ai";
|
||||
|
||||
configDotenv();
|
||||
|
||||
async function getEmbedding(text: string) {
|
||||
const { embedding } = await embed({
|
||||
model: getEmbeddingModel("text-embedding-3-small"),
|
||||
value: text,
|
||||
});
|
||||
|
||||
return embedding;
|
||||
}
|
||||
|
||||
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
|
||||
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
|
||||
const magnitude1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0));
|
||||
const magnitude2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0));
|
||||
if (magnitude1 === 0 || magnitude2 === 0) return 0;
|
||||
return dotProduct / (magnitude1 * magnitude2);
|
||||
};
|
||||
|
||||
// Function to convert text to vector
|
||||
const textToVector = (searchQuery: string, text: string): number[] => {
|
||||
const words = searchQuery.toLowerCase().split(/\W+/);
|
||||
return words.map((word) => {
|
||||
const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
|
||||
.length;
|
||||
return count / text.length;
|
||||
});
|
||||
};
|
||||
|
||||
async function performRanking_F0(
|
||||
linksWithContext: string[],
|
||||
links: string[],
|
||||
searchQuery: string,
|
||||
) {
|
||||
try {
|
||||
// Handle invalid inputs
|
||||
if (!searchQuery || !linksWithContext.length || !links.length) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Sanitize the search query by removing null characters
const sanitizedQuery = searchQuery.replace(/\0/g, "");
|
||||
|
||||
// Generate embeddings for the search query
|
||||
const queryEmbedding = await getEmbedding(sanitizedQuery);
|
||||
|
||||
// Generate embeddings for each link and calculate similarity in parallel
|
||||
const linksAndScores = await Promise.all(
|
||||
linksWithContext.map((linkWithContext, index) =>
|
||||
getEmbedding(linkWithContext)
|
||||
.then((linkEmbedding) => {
|
||||
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
||||
return {
|
||||
link: links[index],
|
||||
linkWithContext,
|
||||
score,
|
||||
originalIndex: index,
|
||||
};
|
||||
})
|
||||
.catch(() => ({
|
||||
link: links[index],
|
||||
linkWithContext,
|
||||
score: 0,
|
||||
originalIndex: index,
|
||||
})),
|
||||
),
|
||||
);
|
||||
|
||||
// Sort links based on similarity scores while preserving original order for equal scores
|
||||
linksAndScores.sort((a, b) => {
|
||||
const scoreDiff = b.score - a.score;
|
||||
return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
|
||||
});
|
||||
|
||||
return linksAndScores;
|
||||
} catch (error) {
|
||||
console.error(`Error performing semantic search: ${error}`);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
export { performRanking_F0 };
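
// Usage sketch (hypothetical URLs): ranks candidate links against a query by
// embedding cosine similarity; requires the embedding model configured in
// getEmbeddingModel (OpenAI's text-embedding-3-small by default).
const exampleLinks = ["https://example.com/pricing", "https://example.com/blog"];
const exampleLinksWithContext = exampleLinks.map(
(url) => `url: ${url}, title: Example, description: Example page`,
);
performRanking_F0(exampleLinksWithContext, exampleLinks, "find pricing information")
.then((ranked) => console.log(ranked.map((r) => `${r.link}: ${r.score.toFixed(3)}`)));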
|
293 apps/api/src/lib/extract/fire-0/reranker-f0.ts Normal file
@ -0,0 +1,293 @@
|
||||
import { MapDocument, URLTrace } from "../../../controllers/v1/types";
|
||||
import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
|
||||
import { logger } from "../../logger";
|
||||
import { CohereClient } from "cohere-ai";
|
||||
import { extractConfig } from "../config";
|
||||
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import { performRanking_F0 } from "./ranker-f0";
|
||||
import { buildRerankerSystemPrompt_F0, buildRerankerUserPrompt_F0 } from "./build-prompts-f0";
|
||||
|
||||
const cohere = new CohereClient({
|
||||
token: process.env.COHERE_API_KEY,
|
||||
});
|
||||
|
||||
interface RankingResult {
|
||||
mappedLinks: MapDocument[];
|
||||
linksAndScores: {
|
||||
link: string;
|
||||
linkWithContext: string;
|
||||
score: number;
|
||||
originalIndex: number;
|
||||
}[];
|
||||
}
|
||||
|
||||
export async function rerankDocuments_FO(
|
||||
documents: (string | Record<string, string>)[],
|
||||
query: string,
|
||||
topN = 3,
|
||||
model = "rerank-english-v3.0",
|
||||
) {
|
||||
const rerank = await cohere.v2.rerank({
|
||||
documents,
|
||||
query,
|
||||
topN,
|
||||
model,
|
||||
returnDocuments: true,
|
||||
});
|
||||
|
||||
return rerank.results
|
||||
.sort((a, b) => b.relevanceScore - a.relevanceScore)
|
||||
.map((x) => ({
|
||||
document: x.document,
|
||||
index: x.index,
|
||||
relevanceScore: x.relevanceScore,
|
||||
}));
|
||||
}
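
// Usage sketch (hypothetical documents): a thin wrapper over Cohere's rerank API;
// requires COHERE_API_KEY. topN and model default to 3 and "rerank-english-v3.0".
rerankDocuments_FO(
["Pricing starts at $49/month.", "The team was founded in 2019."],
"how much does the product cost",
1,
).then((results) => console.log(results[0]?.relevanceScore));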
|
||||
|
||||
export async function rerankLinks_F0(
|
||||
mappedLinks: MapDocument[],
|
||||
searchQuery: string,
|
||||
urlTraces: URLTrace[],
|
||||
): Promise<MapDocument[]> {
|
||||
// console.log("Going to rerank links");
|
||||
const mappedLinksRerank = mappedLinks.map(
|
||||
(x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||
);
|
||||
|
||||
const linksAndScores = await performRanking_F0(
|
||||
mappedLinksRerank,
|
||||
mappedLinks.map((l) => l.url),
|
||||
searchQuery,
|
||||
);
|
||||
|
||||
// First try with high threshold
|
||||
let filteredLinks = filterAndProcessLinks_F0(
|
||||
mappedLinks,
|
||||
linksAndScores,
|
||||
extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
|
||||
);
|
||||
|
||||
// If we don't have enough high-quality links, try with lower threshold
|
||||
if (filteredLinks.length < extractConfig.RERANKING.MIN_REQUIRED_LINKS) {
|
||||
logger.info(
|
||||
`Only found ${filteredLinks.length} links with score > ${extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE}. Trying lower threshold...`,
|
||||
);
|
||||
filteredLinks = filterAndProcessLinks_F0(
|
||||
mappedLinks,
|
||||
linksAndScores,
|
||||
extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
|
||||
);
|
||||
|
||||
if (filteredLinks.length === 0) {
|
||||
// If still no results, take top N results regardless of score
|
||||
logger.warn(
|
||||
`No links found with score > ${extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE}. Taking top ${extractConfig.RERANKING.MIN_REQUIRED_LINKS} results.`,
|
||||
);
|
||||
filteredLinks = linksAndScores
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, extractConfig.RERANKING.MIN_REQUIRED_LINKS)
|
||||
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||
.filter(
|
||||
(x): x is MapDocument =>
|
||||
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Update URL traces with relevance scores and mark filtered out URLs
|
||||
linksAndScores.forEach((score) => {
|
||||
const trace = urlTraces.find((t) => t.url === score.link);
|
||||
if (trace) {
|
||||
trace.relevanceScore = score.score;
|
||||
// If URL didn't make it through filtering, mark it as filtered out
|
||||
if (!filteredLinks.some((link) => link.url === score.link)) {
|
||||
trace.warning = `Relevance score ${score.score} below threshold`;
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const rankedLinks = filteredLinks.slice(
|
||||
0,
|
||||
extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE,
|
||||
);
|
||||
|
||||
// Mark URLs that will be used in completion
|
||||
rankedLinks.forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.usedInCompletion = true;
|
||||
}
|
||||
});
|
||||
|
||||
// Mark URLs that were dropped due to ranking limit
|
||||
filteredLinks
|
||||
.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE)
|
||||
.forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.warning = "Excluded due to ranking limit";
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
});
|
||||
|
||||
// console.log("Reranked links: ", rankedLinks.length);
|
||||
|
||||
return rankedLinks;
|
||||
}
|
||||
|
||||
function filterAndProcessLinks_F0(
|
||||
mappedLinks: MapDocument[],
|
||||
linksAndScores: {
|
||||
link: string;
|
||||
linkWithContext: string;
|
||||
score: number;
|
||||
originalIndex: number;
|
||||
}[],
|
||||
threshold: number,
|
||||
): MapDocument[] {
|
||||
return linksAndScores
|
||||
.filter((x) => x.score > threshold)
|
||||
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||
.filter(
|
||||
(x): x is MapDocument =>
|
||||
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
|
||||
);
|
||||
}
|
||||
|
||||
export type RerankerResult = {
|
||||
mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[];
|
||||
tokensUsed: number;
|
||||
};
|
||||
|
||||
export type RerankerOptions = {
|
||||
links: MapDocument[];
|
||||
searchQuery: string;
|
||||
urlTraces: URLTrace[];
|
||||
};
|
||||
|
||||
export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise<RerankerResult> {
|
||||
const { links, searchQuery, urlTraces } = options;
|
||||
const chunkSize = 100;
|
||||
const chunks: MapDocument[][] = [];
|
||||
const TIMEOUT_MS = 20000;
|
||||
const MAX_RETRIES = 2;
|
||||
let totalTokensUsed = 0;
|
||||
|
||||
// Split links into chunks of chunkSize (100) links each
|
||||
for (let i = 0; i < links.length; i += chunkSize) {
|
||||
chunks.push(links.slice(i, i + chunkSize));
|
||||
}
|
||||
|
||||
// console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`);
|
||||
|
||||
const schema = {
|
||||
type: "object",
|
||||
properties: {
|
||||
relevantLinks: {
|
||||
type: "array",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
url: { type: "string" },
|
||||
relevanceScore: { type: "number" },
|
||||
reason: { type: "string", description: "The reason why you chose the score for this link given the intent." },
|
||||
},
|
||||
required: ["url", "relevanceScore", "reason"],
|
||||
},
|
||||
},
|
||||
},
|
||||
required: ["relevantLinks"],
|
||||
};
|
||||
|
||||
const results = await Promise.all(
|
||||
chunks.map(async (chunk, chunkIndex) => {
|
||||
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
|
||||
|
||||
const linksContent = chunk
|
||||
.map(
|
||||
(link) =>
|
||||
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ""}${link.description ? `\nDescription: ${link.description}` : ""}`,
|
||||
)
|
||||
.join("\n\n");
|
||||
|
||||
for (let retry = 0; retry <= MAX_RETRIES; retry++) {
|
||||
try {
|
||||
const timeoutPromise = new Promise<null>((resolve) => {
|
||||
setTimeout(() => resolve(null), TIMEOUT_MS);
|
||||
});
|
||||
|
||||
// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
|
||||
const completionPromise = generateCompletions({
|
||||
logger: logger.child({
|
||||
method: "rerankLinksWithLLM",
|
||||
chunk: chunkIndex + 1,
|
||||
retry,
|
||||
}),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: buildRerankerSystemPrompt_F0(),
|
||||
prompt: buildRerankerUserPrompt_F0(searchQuery),
|
||||
schema: schema,
|
||||
},
|
||||
markdown: linksContent,
|
||||
isExtractEndpoint: true
|
||||
});
|
||||
|
||||
const completion = await Promise.race([
|
||||
completionPromise,
|
||||
timeoutPromise,
|
||||
]);
|
||||
|
||||
if (!completion) {
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!completion.extract?.relevantLinks) {
|
||||
// console.warn(`Chunk ${chunkIndex + 1}: No relevant links found in completion response`);
|
||||
return [];
|
||||
}
|
||||
|
||||
totalTokensUsed += completion.numTokens || 0;
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`);
|
||||
return completion.extract.relevantLinks;
|
||||
} catch (error) {
|
||||
console.warn(
|
||||
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
|
||||
error,
|
||||
);
|
||||
if (retry === MAX_RETRIES) {
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
}
|
||||
return [];
|
||||
}),
|
||||
);
|
||||
|
||||
// console.log(`Processed ${results.length} chunks`);
|
||||
|
||||
// Flatten results and sort by relevance score
|
||||
const flattenedResults = results
|
||||
.flat()
|
||||
.sort((a, b) => b.relevanceScore - a.relevanceScore);
|
||||
// console.log(`Total relevant links found: ${flattenedResults.length}`);
|
||||
|
||||
// Map back to MapDocument format, keeping only relevant links
|
||||
const relevantLinks = flattenedResults
|
||||
.map((result) => {
|
||||
const link = links.find((link) => link.url === result.url);
|
||||
if (link) {
|
||||
return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason };
|
||||
}
|
||||
return undefined;
|
||||
})
|
||||
.filter((link): link is NonNullable<typeof link> => link !== undefined);
|
||||
|
||||
return {
|
||||
mapDocument: relevantLinks,
|
||||
tokensUsed: totalTokensUsed,
|
||||
};
|
||||
}
|
250 apps/api/src/lib/extract/fire-0/url-processor-f0.ts Normal file
@ -0,0 +1,250 @@
|
||||
import { MapDocument, URLTrace } from "../../../controllers/v1/types";
|
||||
import { getMapResults } from "../../../controllers/v1/map";
|
||||
import { removeDuplicateUrls } from "../../validateUrl";
|
||||
import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
|
||||
import { buildPreRerankPrompt, buildRefrasedPrompt } from "../build-prompts";
|
||||
import { rerankLinksWithLLM_F0 } from "./reranker-f0";
|
||||
import { extractConfig } from "../config";
|
||||
import type { Logger } from "winston";
|
||||
import { generateText } from "ai";
|
||||
import { getModel } from "../../generic-ai";
|
||||
|
||||
export async function generateBasicCompletion_FO(prompt: string) {
|
||||
const { text } = await generateText({
|
||||
model: getModel("gpt-4o"),
|
||||
prompt: prompt,
|
||||
temperature: 0
|
||||
});
|
||||
return text;
|
||||
}
|
||||
interface ProcessUrlOptions {
|
||||
url: string;
|
||||
prompt?: string;
|
||||
schema?: any;
|
||||
teamId: string;
|
||||
allowExternalLinks?: boolean;
|
||||
origin?: string;
|
||||
limit?: number;
|
||||
includeSubdomains?: boolean;
|
||||
}
|
||||
|
||||
export async function processUrl_F0(
|
||||
options: ProcessUrlOptions,
|
||||
urlTraces: URLTrace[],
|
||||
updateExtractCallback: (links: string[]) => void,
|
||||
logger: Logger,
|
||||
): Promise<string[]> {
|
||||
const trace: URLTrace = {
|
||||
url: options.url,
|
||||
status: "mapped",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
};
|
||||
urlTraces.push(trace);
|
||||
|
||||
if (!options.url.includes("/*") && !options.allowExternalLinks) {
|
||||
if (!isUrlBlocked(options.url)) {
|
||||
trace.usedInCompletion = true;
|
||||
return [options.url];
|
||||
}
|
||||
logger.warn("URL is blocked");
|
||||
trace.status = "error";
|
||||
trace.error = "URL is blocked";
|
||||
trace.usedInCompletion = false;
|
||||
return [];
|
||||
}
|
||||
|
||||
const baseUrl = options.url.replace("/*", "");
|
||||
let urlWithoutWww = baseUrl.replace("www.", "");
|
||||
|
||||
let searchQuery = options.prompt;
|
||||
if (options.prompt) {
|
||||
searchQuery =
|
||||
(
|
||||
await generateBasicCompletion_FO(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||
)
|
||||
)
|
||||
?.replace('"', "")
|
||||
.replace("/", "") ?? options.prompt;
|
||||
}
|
||||
|
||||
try {
|
||||
logger.debug("Running map...", {
|
||||
search: searchQuery,
|
||||
});
|
||||
const mapResults = await getMapResults({
|
||||
url: baseUrl,
|
||||
search: searchQuery,
|
||||
teamId: options.teamId,
|
||||
allowExternalLinks: options.allowExternalLinks,
|
||||
origin: options.origin,
|
||||
limit: options.limit,
|
||||
ignoreSitemap: false,
|
||||
includeMetadata: true,
|
||||
includeSubdomains: options.includeSubdomains,
|
||||
});
|
||||
|
||||
let mappedLinks = mapResults.mapResults as MapDocument[];
|
||||
let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
|
||||
let uniqueUrls = removeDuplicateUrls(allUrls);
|
||||
logger.debug("Map finished.", {
|
||||
linkCount: allUrls.length,
|
||||
uniqueLinkCount: uniqueUrls.length,
|
||||
});
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
urlTraces.push({
|
||||
url: discoveredUrl,
|
||||
status: "mapped",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
usedInCompletion: false,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// retry if only one url is returned
|
||||
if (uniqueUrls.length <= 1) {
|
||||
logger.debug("Running map... (pass 2)");
|
||||
const retryMapResults = await getMapResults({
|
||||
url: baseUrl,
|
||||
teamId: options.teamId,
|
||||
allowExternalLinks: options.allowExternalLinks,
|
||||
origin: options.origin,
|
||||
limit: options.limit,
|
||||
ignoreSitemap: false,
|
||||
includeMetadata: true,
|
||||
includeSubdomains: options.includeSubdomains,
|
||||
});
|
||||
|
||||
mappedLinks = retryMapResults.mapResults as MapDocument[];
|
||||
allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
|
||||
uniqueUrls = removeDuplicateUrls(allUrls);
|
||||
logger.debug("Map finished. (pass 2)", {
|
||||
linkCount: allUrls.length,
|
||||
uniqueLinkCount: uniqueUrls.length,
|
||||
});
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
urlTraces.push({
|
||||
url: discoveredUrl,
|
||||
status: "mapped",
|
||||
warning: "Broader search. Not limiting map results to prompt.",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
usedInCompletion: false,
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
urlTraces.push({
|
||||
url: discoveredUrl,
|
||||
status: "mapped",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
usedInCompletion: false,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
const existingUrls = new Set(mappedLinks.map((m) => m.url));
|
||||
const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
|
||||
|
||||
mappedLinks = [
|
||||
...mappedLinks,
|
||||
...newUrls.map((url) => ({ url, title: "", description: "" })),
|
||||
];
|
||||
|
||||
if (mappedLinks.length === 0) {
|
||||
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
||||
}
|
||||
|
||||
// Limit initial set of links (1000)
|
||||
mappedLinks = mappedLinks.slice(
|
||||
0,
|
||||
extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
|
||||
);
|
||||
|
||||
updateExtractCallback(mappedLinks.map((x) => x.url));
|
||||
|
||||
let rephrasedPrompt = options.prompt ?? searchQuery;
|
||||
try {
|
||||
rephrasedPrompt =
|
||||
(await generateBasicCompletion_FO(
|
||||
buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
|
||||
)) ??
|
||||
"Extract the data according to the schema: " +
|
||||
JSON.stringify(options.schema, null, 2);
|
||||
} catch (error) {
|
||||
console.error("Error generating search query from schema:", error);
|
||||
rephrasedPrompt =
|
||||
"Extract the data according to the schema: " +
|
||||
JSON.stringify(options.schema, null, 2) +
|
||||
" " +
|
||||
options?.prompt; // Fallback: schema description plus the original prompt
|
||||
}
|
||||
|
||||
// "mapped-links.txt",
|
||||
// mappedLinks,
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
|
||||
logger.info("Generated rephrased prompt.", {
|
||||
rephrasedPrompt,
|
||||
});
|
||||
|
||||
logger.info("Reranking pass 1 (threshold 0.8)...");
|
||||
const rerankerResult = await rerankLinksWithLLM_F0({
|
||||
links: mappedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces,
|
||||
});
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
let tokensUsed = rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 1)", {
|
||||
linkCount: mappedLinks.length,
|
||||
});
|
||||
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (mappedLinks.length > 100) {
|
||||
logger.info("Reranking (pass 2)...");
|
||||
const rerankerResult = await rerankLinksWithLLM_F0({
|
||||
links: mappedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces,
|
||||
});
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
tokensUsed += rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 2)", {
|
||||
linkCount: mappedLinks.length,
|
||||
});
|
||||
}
|
||||
|
||||
// dumpToFile(
|
||||
// "llm-links.txt",
|
||||
// mappedLinks,
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
// Remove title and description from mappedLinks
|
||||
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
|
||||
return mappedLinks.map((x) => x.url);
|
||||
} catch (error) {
|
||||
trace.status = "error";
|
||||
trace.error = error.message;
|
||||
trace.usedInCompletion = false;
|
||||
return [];
|
||||
}
|
||||
}
|
61 apps/api/src/lib/extract/fire-0/usage/llm-cost-f0.ts Normal file
@ -0,0 +1,61 @@
|
||||
import { TokenUsage } from "../../../../controllers/v1/types";
|
||||
import { logger } from "../../../../lib/logger";
|
||||
import { modelPrices } from "../../usage/model-prices";
|
||||
|
||||
interface ModelPricing {
|
||||
input_cost_per_token?: number;
|
||||
output_cost_per_token?: number;
|
||||
input_cost_per_request?: number;
|
||||
mode: string;
|
||||
}
|
||||
const charactersPerToken = 4;
|
||||
const baseTokenCost = 300;
|
||||
|
||||
export function calculateFinalResultCost_F0(data: any): number {
|
||||
return Math.floor(
|
||||
JSON.stringify(data).length / charactersPerToken + baseTokenCost,
|
||||
);
|
||||
}
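
// Worked example (hypothetical payload): the heuristic treats every 4 characters
// of the stringified result as one token and adds a 300-token base cost.
// JSON.stringify({ text: "x".repeat(389) }).length === 400
// calculateFinalResultCost_F0({ text: "x".repeat(389) }) // -> floor(400 / 4 + 300) = 400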
|
||||
|
||||
export function estimateTotalCost_F0(tokenUsage: TokenUsage[]): number {
|
||||
return tokenUsage.reduce((total, usage) => {
|
||||
return total + estimateCost_F0(usage);
|
||||
}, 0);
|
||||
}
|
||||
|
||||
export function estimateCost_F0(tokenUsage: TokenUsage): number {
|
||||
let totalCost = 0;
|
||||
try {
|
||||
let model = tokenUsage.model ?? (process.env.MODEL_NAME || "gpt-4o-mini");
|
||||
const pricing = modelPrices[model] as ModelPricing;
|
||||
|
||||
if (!pricing) {
|
||||
logger.error(`No pricing information found for model: ${model}`);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (pricing.mode !== "chat") {
|
||||
logger.error(`Model ${model} is not a chat model`);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Add per-request cost if applicable (Only Perplexity supports this)
|
||||
if (pricing.input_cost_per_request) {
|
||||
totalCost += pricing.input_cost_per_request;
|
||||
}
|
||||
|
||||
// Add token-based costs
|
||||
if (pricing.input_cost_per_token) {
|
||||
totalCost += tokenUsage.promptTokens * pricing.input_cost_per_token;
|
||||
}
|
||||
|
||||
if (pricing.output_cost_per_token) {
|
||||
totalCost += tokenUsage.completionTokens * pricing.output_cost_per_token;
|
||||
}
|
||||
|
||||
return Number(totalCost.toFixed(7));
|
||||
} catch (error) {
|
||||
logger.error(`Error estimating cost: ${error}`);
|
||||
return totalCost;
|
||||
}
|
||||
}
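
// Usage sketch (hypothetical token counts): prices come from modelPrices; unknown
// or non-chat models are priced at 0.
// estimateCost_F0({ model: "gpt-4o-mini", promptTokens: 1000, completionTokens: 200 })
// // -> 1000 * input_cost_per_token + 200 * output_cost_per_token, rounded to 7 decimals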
|
@ -145,6 +145,11 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): {
|
||||
`Expected an array at objArray[${key}], but found:`,
|
||||
objArray[key],
|
||||
);
|
||||
|
||||
// create an array if it doesn't exist
|
||||
if (objArray[key] === undefined) {
|
||||
objArray[key] = [];
|
||||
}
|
||||
return objArray;
|
||||
}
|
||||
}
|
||||
|
@ -91,12 +91,23 @@ export function transformArrayToObject(
|
||||
// Iterate over each item in the arrayData
|
||||
arrayData.forEach((item) => {
|
||||
let currentItem = item;
|
||||
// Skip null items
|
||||
if (currentItem === null) {
|
||||
return;
|
||||
}
|
||||
arrayKeyParts.forEach((part) => {
|
||||
if (currentItem[part]) {
|
||||
if (currentItem && currentItem[part]) {
|
||||
currentItem = currentItem[part];
|
||||
} else {
|
||||
currentItem = null;
|
||||
}
|
||||
});
|
||||
|
||||
// Skip if we couldn't find the nested path
|
||||
if (currentItem === null) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Copy non-array properties from the parent object
|
||||
for (const key in parentSchema.properties) {
|
||||
if (
|
||||
@ -108,8 +119,8 @@ export function transformArrayToObject(
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the currentItem[arrayKey] is an array before mapping
|
||||
if (Array.isArray(currentItem[arrayKey])) {
|
||||
// Ensure that the currentItem[arrayKey] exists and is an array before mapping
|
||||
if (currentItem && currentItem[arrayKey] && Array.isArray(currentItem[arrayKey])) {
|
||||
currentItem[arrayKey].forEach((subItem: any) => {
|
||||
if (
|
||||
typeof subItem === "object" &&
|
||||
@ -138,14 +149,20 @@ export function transformArrayToObject(
|
||||
} else {
|
||||
console.warn(
|
||||
`Expected an array at ${arrayKey}, but found:`,
|
||||
currentItem[arrayKey],
|
||||
currentItem ? currentItem[arrayKey] : 'undefined'
|
||||
);
|
||||
|
||||
// create an array if it doesn't exist
|
||||
if (currentLevel[arrayKey] === undefined) {
|
||||
currentLevel[arrayKey] = [];
|
||||
}
|
||||
}
|
||||
|
||||
// Handle merging of array properties
|
||||
for (const key in parentSchema.properties) {
|
||||
if (
|
||||
parentSchema.properties[key].type === "array" &&
|
||||
currentItem &&
|
||||
Array.isArray(currentItem[key])
|
||||
) {
|
||||
if (!currentLevel[key]) {
|
||||
|
@ -9,6 +9,11 @@ import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExt
|
||||
import { buildRerankerUserPrompt } from "./build-prompts";
|
||||
import { buildRerankerSystemPrompt } from "./build-prompts";
|
||||
import { dumpToFile } from "./helpers/dump-to-file";
|
||||
import { getModel } from "../generic-ai";
|
||||
import fs from "fs/promises";
|
||||
|
||||
const THRESHOLD_FOR_SINGLEPAGE = 0.6;
|
||||
const THRESHOLD_FOR_MULTIENTITY = 0.45;
|
||||
|
||||
const cohere = new CohereClient({
|
||||
token: process.env.COHERE_API_KEY,
|
||||
@ -161,22 +166,42 @@ function filterAndProcessLinks(
|
||||
export type RerankerResult = {
|
||||
mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[];
|
||||
tokensUsed: number;
|
||||
cost: number;
|
||||
};
|
||||
|
||||
export type RerankerOptions = {
|
||||
links: MapDocument[];
|
||||
searchQuery: string;
|
||||
urlTraces: URLTrace[];
|
||||
isMultiEntity: boolean;
|
||||
reasoning: string;
|
||||
multiEntityKeys: string[];
|
||||
keyIndicators: string[];
|
||||
};
|
||||
|
||||
export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
|
||||
const { links, searchQuery, urlTraces } = options;
|
||||
const chunkSize = 100;
|
||||
export async function rerankLinksWithLLM(
|
||||
options: RerankerOptions,
|
||||
): Promise<RerankerResult> {
|
||||
const {
|
||||
links,
|
||||
searchQuery,
|
||||
urlTraces,
|
||||
isMultiEntity,
|
||||
reasoning,
|
||||
multiEntityKeys,
|
||||
keyIndicators,
|
||||
} = options;
|
||||
const chunkSize = 5000;
|
||||
const chunks: MapDocument[][] = [];
|
||||
const TIMEOUT_MS = 20000;
|
||||
const TIMEOUT_MS = 60000;
|
||||
const MAX_RETRIES = 2;
|
||||
let totalTokensUsed = 0;
|
||||
|
||||
// await fs.writeFile(
|
||||
// `logs/links-${crypto.randomUUID()}.txt`,
|
||||
// JSON.stringify(links, null, 2),
|
||||
// );
|
||||
|
||||
// Split links into chunks of chunkSize links each
|
||||
for (let i = 0; i < links.length; i += chunkSize) {
|
||||
chunks.push(links.slice(i, i + chunkSize));
|
||||
@ -194,7 +219,11 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
|
||||
properties: {
|
||||
url: { type: "string" },
|
||||
relevanceScore: { type: "number" },
|
||||
reason: { type: "string", description: "The reason why you chose the score for this link given the intent." },
|
||||
reason: {
|
||||
type: "string",
|
||||
description:
|
||||
"The reason why you chose the score for this link given the intent.",
|
||||
},
|
||||
},
|
||||
required: ["url", "relevanceScore", "reason"],
|
||||
},
|
||||
@ -203,6 +232,8 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
|
||||
required: ["relevantLinks"],
|
||||
};
|
||||
|
||||
let totalCost = 0;
|
||||
|
||||
const results = await Promise.all(
|
||||
chunks.map(async (chunk, chunkIndex) => {
|
||||
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
|
||||
@ -214,33 +245,91 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
|
||||
)
|
||||
.join("\n\n");
|
||||
|
||||
// fs.writeFile(
|
||||
// `logs/links-content-${crypto.randomUUID()}.txt`,
|
||||
// linksContent,
|
||||
// );
|
||||
|
||||
for (let retry = 0; retry <= MAX_RETRIES; retry++) {
|
||||
try {
|
||||
const timeoutPromise = new Promise<null>((resolve) => {
|
||||
setTimeout(() => resolve(null), TIMEOUT_MS);
|
||||
});
|
||||
|
||||
// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
|
||||
const completionPromise = generateCompletions({
|
||||
logger: logger.child({
|
||||
method: "rerankLinksWithLLM",
|
||||
chunk: chunkIndex + 1,
|
||||
retry,
|
||||
}),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: buildRerankerSystemPrompt(),
|
||||
prompt: buildRerankerUserPrompt(searchQuery),
|
||||
schema: schema,
|
||||
},
|
||||
markdown: linksContent,
|
||||
isExtractEndpoint: true
|
||||
});
|
||||
const systemPrompt = `You are analyzing URLs for ${isMultiEntity ? "collecting multiple items" : "specific information"}.
|
||||
The user's query is: ${searchQuery}
|
||||
${
|
||||
isMultiEntity
|
||||
? `IMPORTANT: This is a multi-entity extraction task looking for ${multiEntityKeys.join(", ")}.
|
||||
Score URLs higher if they contain ANY instance of the target entities.
|
||||
Key indicators to look for: ${keyIndicators.join(", ")}`
|
||||
: `IMPORTANT: This is a specific information task.
|
||||
Score URLs based on precision and relevance to answering the query.`
|
||||
}
|
||||
|
||||
Scoring guidelines:
|
||||
${
|
||||
isMultiEntity
|
||||
? `
|
||||
- 1.0: Contains ANY instance of target entities, even just one. Give this score if page has any relevant entity. If you are not sure if this page is relevant or not, give it a score of 1.0
|
||||
- 0.8: Contains entity but may be incomplete information
|
||||
- 0.6: Mentions entity type but no clear instance
|
||||
- 0.4: Only tangentially related to entity type
|
||||
- Below 0.4: No mention of relevant entities, or duplicates
|
||||
|
||||
Reason: ${reasoning}
|
||||
`
|
||||
: `
|
||||
- 1.0: Contains direct, authoritative answer to query. Give this score if unsure about relevance. If you are not sure if this page is relevant or not, give it a score of 1.0
|
||||
- 0.8: Contains information that directly helps answer the query
|
||||
- 0.6: Contains related information that partially answers query
|
||||
- Below 0.6: Information too general or not focused on query
|
||||
`
|
||||
}`;
|
||||
|
||||
const completion = await Promise.race([
|
||||
completionPromise,
|
||||
timeoutPromise,
|
||||
]);
|
||||
// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
|
||||
// const gemini = getGemini();
|
||||
// const model = getGemini()
|
||||
let completion: any;
|
||||
try {
|
||||
const completionPromise = generateCompletions({
|
||||
model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
|
||||
retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
|
||||
logger: logger.child({
|
||||
method: "rerankLinksWithLLM",
|
||||
chunk: chunkIndex + 1,
|
||||
retry,
|
||||
}),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: systemPrompt,
|
||||
prompt: buildRerankerUserPrompt(searchQuery),
|
||||
schema: schema,
|
||||
// temperature: isMultiEntity ? 0.5 : 0.3,
|
||||
},
|
||||
// providerOptions: {
|
||||
// anthropic: {
|
||||
// thinking: { type: 'enabled', budgetTokens: 12000 },
|
||||
// tool_choice: "auto",
|
||||
// },
|
||||
// },
|
||||
markdown: linksContent,
|
||||
isExtractEndpoint: true,
|
||||
});
|
||||
|
||||
completion = await completionPromise;
|
||||
totalCost += completion.cost;
|
||||
} catch (error) {
|
||||
console.warn(
|
||||
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
|
||||
error,
|
||||
);
|
||||
}
|
||||
|
||||
// await fs.writeFile(
|
||||
// `logs/reranker-${crypto.randomUUID()}.json`,
|
||||
// JSON.stringify(completion, null, 2),
|
||||
// );
|
||||
|
||||
if (!completion) {
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
|
||||
@ -278,19 +367,48 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise<Rera
|
||||
.sort((a, b) => b.relevanceScore - a.relevanceScore);
|
||||
// console.log(`Total relevant links found: ${flattenedResults.length}`);
|
||||
|
||||
// Map back to MapDocument format, keeping only relevant links
|
||||
// Map back to MapDocument format, keeping only links above the relevance threshold
|
||||
const relevantLinks = flattenedResults
|
||||
.map((result) => {
|
||||
const link = links.find((link) => link.url === result.url);
|
||||
if (link) {
|
||||
return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason };
|
||||
if (
|
||||
result.relevanceScore >
|
||||
(isMultiEntity ? THRESHOLD_FOR_MULTIENTITY : THRESHOLD_FOR_SINGLEPAGE)
|
||||
) {
|
||||
const link = links.find((link) => link.url === result.url);
|
||||
if (link) {
|
||||
return {
|
||||
...link,
|
||||
relevanceScore: result.relevanceScore
|
||||
? parseFloat(result.relevanceScore)
|
||||
: 0,
|
||||
reason: result.reason,
|
||||
};
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
})
|
||||
.filter((link): link is NonNullable<typeof link> => link !== undefined);
|
||||
|
||||
// Add debug logging for testing
|
||||
// fs.writeFile(
|
||||
// `logs/reranker-aaa-${crypto.randomUUID()}.json`,
|
||||
// JSON.stringify(
|
||||
// {
|
||||
// totalResults: relevantLinks.length,
|
||||
// scores: relevantLinks.map((l) => ({
|
||||
// url: l.url,
|
||||
// score: l.relevanceScore,
|
||||
// reason: l.reason,
|
||||
// })),
|
||||
// },
|
||||
// null,
|
||||
// 2,
|
||||
// ),
|
||||
// );
|
||||
|
||||
return {
|
||||
mapDocument: relevantLinks,
|
||||
tokensUsed: totalTokensUsed,
|
||||
cost: totalCost,
|
||||
};
|
||||
}
|
||||
|
@ -8,14 +8,42 @@ import { extractConfig } from "./config";
|
||||
import type { Logger } from "winston";
|
||||
import { generateText } from "ai";
|
||||
import { getModel } from "../generic-ai";
|
||||
import { calculateCost } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import type { CostTracking } from "./extraction-service";
|
||||
|
||||
export async function generateBasicCompletion(prompt: string) {
|
||||
const { text } = await generateText({
|
||||
model: getModel("gpt-4o"),
|
||||
prompt: prompt,
|
||||
temperature: 0
|
||||
});
|
||||
return text;
|
||||
export async function generateBasicCompletion(prompt: string): Promise<{ text: string, cost: number } | null> {
|
||||
try {
|
||||
const result = await generateText({
|
||||
model: getModel("gpt-4o", "openai"),
|
||||
prompt: prompt,
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
}
|
||||
});
|
||||
return { text: result.text, cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) };
|
||||
} catch (error) {
|
||||
console.error("Error generating basic completion:", error);
|
||||
if (error?.type == "rate_limit_error") {
|
||||
try {
|
||||
const result = await generateText({
|
||||
model: getModel("gpt-4o-mini", "openai"),
|
||||
prompt: prompt,
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
}
|
||||
});
|
||||
return { text: result.text, cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) };
|
||||
} catch (fallbackError) {
|
||||
console.error("Error generating basic completion with fallback model:", fallbackError);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
interface ProcessUrlOptions {
|
||||
url: string;
|
||||
@ -26,6 +54,11 @@ interface ProcessUrlOptions {
|
||||
origin?: string;
|
||||
limit?: number;
|
||||
includeSubdomains?: boolean;
|
||||
log?: any;
|
||||
isMultiEntity: boolean;
|
||||
reasoning: string;
|
||||
multiEntityKeys: string[];
|
||||
keyIndicators: string[];
|
||||
}
|
||||
|
||||
export async function processUrl(
|
||||
@ -33,6 +66,7 @@ export async function processUrl(
|
||||
urlTraces: URLTrace[],
|
||||
updateExtractCallback: (links: string[]) => void,
|
||||
logger: Logger,
|
||||
costTracking: CostTracking,
|
||||
): Promise<string[]> {
|
||||
const trace: URLTrace = {
|
||||
url: options.url,
|
||||
@ -60,14 +94,16 @@ export async function processUrl(
|
||||
|
||||
let searchQuery = options.prompt;
|
||||
if (options.prompt) {
|
||||
searchQuery =
|
||||
(
|
||||
await generateBasicCompletion(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||
)
|
||||
)
|
||||
?.replace('"', "")
|
||||
.replace("/", "") ?? options.prompt;
|
||||
const res = await generateBasicCompletion(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||
);
|
||||
|
||||
if (res) {
|
||||
searchQuery = res.text.replace('"', "").replace("/", "") ?? options.prompt;
|
||||
costTracking.otherCallCount++;
|
||||
costTracking.otherCost += res.cost;
|
||||
costTracking.totalCost += res.cost;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
@ -93,6 +129,7 @@ export async function processUrl(
|
||||
linkCount: allUrls.length,
|
||||
uniqueLinkCount: uniqueUrls.length,
|
||||
});
|
||||
options.log["uniqueUrlsLength-1"] = uniqueUrls.length;
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
@ -146,6 +183,8 @@ export async function processUrl(
|
||||
});
|
||||
}
|
||||
|
||||
options.log["uniqueUrlsLength-2"] = uniqueUrls.length;
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
@ -182,12 +221,20 @@ export async function processUrl(
|
||||
|
||||
let rephrasedPrompt = options.prompt ?? searchQuery;
|
||||
try {
|
||||
rephrasedPrompt =
|
||||
(await generateBasicCompletion(
|
||||
buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
|
||||
)) ??
|
||||
"Extract the data according to the schema: " +
|
||||
const res = await generateBasicCompletion(
|
||||
buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
|
||||
);
|
||||
|
||||
if (res) {
|
||||
rephrasedPrompt = res.text;
|
||||
costTracking.otherCallCount++;
|
||||
costTracking.otherCost += res.cost;
|
||||
costTracking.totalCost += res.cost;
|
||||
} else {
|
||||
rephrasedPrompt =
|
||||
"Extract the data according to the schema: " +
|
||||
JSON.stringify(options.schema, null, 2);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error generating search query from schema:", error);
|
||||
rephrasedPrompt =
|
||||
@ -211,13 +258,20 @@ export async function processUrl(
|
||||
links: mappedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces,
|
||||
isMultiEntity: options.isMultiEntity,
|
||||
reasoning: options.reasoning,
|
||||
multiEntityKeys: options.multiEntityKeys,
|
||||
keyIndicators: options.keyIndicators,
|
||||
});
|
||||
costTracking.otherCallCount++;
|
||||
costTracking.otherCost += rerankerResult.cost;
|
||||
costTracking.totalCost += rerankerResult.cost;
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
let tokensUsed = rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 1)", {
|
||||
linkCount: mappedLinks.length,
|
||||
});
|
||||
|
||||
options.log["rerankerResult-1"] = mappedLinks.length;
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (mappedLinks.length > 100) {
|
||||
logger.info("Reranking (pass 2)...");
|
||||
@ -225,13 +279,21 @@ export async function processUrl(
|
||||
links: mappedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces,
|
||||
isMultiEntity: options.isMultiEntity,
|
||||
reasoning: options.reasoning,
|
||||
multiEntityKeys: options.multiEntityKeys,
|
||||
keyIndicators: options.keyIndicators,
|
||||
});
|
||||
costTracking.otherCallCount++;
|
||||
costTracking.otherCost += rerankerResult.cost;
|
||||
costTracking.totalCost += rerankerResult.cost;
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
tokensUsed += rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 2)", {
|
||||
linkCount: mappedLinks.length,
|
||||
});
|
||||
}
|
||||
options.log["rerankerResult-2"] = mappedLinks.length;
|
||||
|
||||
// dumpToFile(
|
||||
// "llm-links.txt",
|
||||
|
@ -1,17 +1,62 @@
|
||||
import { createOpenAI } from '@ai-sdk/openai';
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { createOllama } from "ollama-ai-provider";
|
||||
import { anthropic } from "@ai-sdk/anthropic";
|
||||
import { groq } from "@ai-sdk/groq";
|
||||
import { google } from "@ai-sdk/google";
|
||||
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
||||
import { fireworks } from "@ai-sdk/fireworks";
|
||||
import { deepinfra } from "@ai-sdk/deepinfra";
|
||||
import { createVertex } from "@ai-sdk/google-vertex";
|
||||
|
||||
const modelAdapter = process.env.OLLAMA_BASE_URL ? createOllama({
|
||||
type Provider =
|
||||
| "openai"
|
||||
| "ollama"
|
||||
| "anthropic"
|
||||
| "groq"
|
||||
| "google"
|
||||
| "openrouter"
|
||||
| "fireworks"
|
||||
| "deepinfra"
|
||||
| "vertex";
|
||||
const defaultProvider: Provider = process.env.OLLAMA_BASE_URL
|
||||
? "ollama"
|
||||
: "openai";
|
||||
|
||||
const providerList: Record<Provider, any> = {
|
||||
openai, //OPENAI_API_KEY
|
||||
ollama: createOllama({
|
||||
baseURL: process.env.OLLAMA_BASE_URL,
|
||||
}) : createOpenAI({
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
baseURL: process.env.OPENAI_BASE_URL,
|
||||
});
|
||||
}),
|
||||
anthropic, //ANTHROPIC_API_KEY
|
||||
groq, //GROQ_API_KEY
|
||||
google, //GOOGLE_GENERATIVE_AI_API_KEY
|
||||
openrouter: createOpenRouter({
|
||||
apiKey: process.env.OPENROUTER_API_KEY,
|
||||
}),
|
||||
fireworks, //FIREWORKS_API_KEY
|
||||
deepinfra, //DEEPINFRA_API_KEY
|
||||
vertex: createVertex({
|
||||
project: "firecrawl",
|
||||
location: "us-central1",
|
||||
googleAuthOptions: process.env.VERTEX_CREDENTIALS ? {
|
||||
credentials: JSON.parse(atob(process.env.VERTEX_CREDENTIALS)),
|
||||
} : {
|
||||
keyFile: "./gke-key.json",
|
||||
},
|
||||
}),
|
||||
};
|
||||
|
||||
export function getModel(name: string) {
|
||||
return process.env.MODEL_NAME ? modelAdapter(process.env.MODEL_NAME) : modelAdapter(name);
|
||||
export function getModel(name: string, provider: Provider = defaultProvider) {
|
||||
return process.env.MODEL_NAME
|
||||
? providerList[provider](process.env.MODEL_NAME)
|
||||
: providerList[provider](name);
|
||||
}
|
||||
|
||||
export function getEmbeddingModel(name: string) {
|
||||
return process.env.MODEL_EMBEDDING_NAME ? modelAdapter.embedding(process.env.MODEL_EMBEDDING_NAME) : modelAdapter.embedding(name);
|
||||
export function getEmbeddingModel(
|
||||
name: string,
|
||||
provider: Provider = defaultProvider,
|
||||
) {
|
||||
return process.env.MODEL_EMBEDDING_NAME
|
||||
? providerList[provider].embedding(process.env.MODEL_EMBEDDING_NAME)
|
||||
: providerList[provider].embedding(name);
|
||||
}
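
// Usage sketch (hypothetical model names): the provider defaults to OpenAI unless
// OLLAMA_BASE_URL is set, and MODEL_NAME / MODEL_EMBEDDING_NAME override the
// requested model name while the provider argument is still honored.
// getModel("gpt-4o-mini");                             // default provider
// getModel("gemini-2.5-pro-preview-03-25", "vertex");  // explicit provider
// getEmbeddingModel("text-embedding-3-small");         // embedding model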
|
||||
|
@ -6,6 +6,7 @@ import { crawlStatusController } from "../controllers/v1/crawl-status";
|
||||
import { mapController } from "../controllers/v1/map";
|
||||
import {
|
||||
ErrorResponse,
|
||||
isAgentExtractModelValid,
|
||||
RequestWithACUC,
|
||||
RequestWithAuth,
|
||||
RequestWithMaybeAuth,
|
||||
@ -93,6 +94,14 @@ export function authMiddleware(
|
||||
): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
if (rateLimiterMode === RateLimiterMode.Extract && isAgentExtractModelValid((req.body as any)?.agent?.model)) {
|
||||
rateLimiterMode = RateLimiterMode.ExtractAgentPreview;
|
||||
}
|
||||
|
||||
if (rateLimiterMode === RateLimiterMode.Scrape && isAgentExtractModelValid((req.body as any)?.agent?.model)) {
|
||||
rateLimiterMode = RateLimiterMode.ScrapeAgentPreview;
|
||||
}
|
||||
|
||||
const auth = await authenticateUser(req, res, rateLimiterMode);
|
||||
|
||||
if (!auth.success) {
|
||||
|
347 apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts Normal file
@ -0,0 +1,347 @@
|
||||
import { Logger } from "winston";
|
||||
import { z } from "zod";
|
||||
import {
|
||||
generateCompletions,
|
||||
GenerateCompletionsOptions,
|
||||
generateSchemaFromPrompt,
|
||||
} from "../transformers/llmExtract";
|
||||
import { smartScrape } from "./smartScrape";
|
||||
import { parseMarkdown } from "../../../lib/html-to-markdown";
|
||||
import { getModel } from "../../../lib/generic-ai";
|
||||
import { TokenUsage } from "../../../controllers/v1/types";
|
||||
import type { SmartScrapeResult } from "./smartScrape";
|
||||
|
||||
const commonSmartScrapeProperties = {
|
||||
shouldUseSmartscrape: {
|
||||
type: "boolean",
|
||||
description:
"Set to `true` if any of the extractedData is null and you think the information can be found by performing user-like interactions (e.g., clicking buttons or accordions to reveal hidden text, logging in, filling inputs). SmartScrape can perform these actions to access the data.",
|
||||
},
|
||||
// Note: extractedData is added dynamically in prepareSmartScrapeSchema
|
||||
};
|
||||
|
||||
// Define common properties for reasoning and prompt
|
||||
const commonReasoningPromptProperties = {
|
||||
smartscrape_reasoning: {
|
||||
type: ["string", "null"],
|
||||
// Using the more detailed multi-step description as the common one
|
||||
description:
|
||||
"Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
|
||||
},
|
||||
smartscrape_prompt: {
|
||||
type: ["string", "null"],
|
||||
description: `A clear, outcome-focused prompt describing what information to find on the page.
|
||||
Example: "Find the product specifications in the expandable section" rather than "Click the button to reveal product specs".
|
||||
Used by the smart scraping agent to determine what actions to take.
|
||||
Don't mention anything about extraction; SmartScrape just returns page content.`,
|
||||
},
|
||||
};
|
||||
|
||||
// Schema for single-step SmartScrape interaction
|
||||
const smartScrapeWrapperSchemaDefinition = {
|
||||
type: "object",
|
||||
properties: {
|
||||
...commonSmartScrapeProperties, // Include shared base properties
|
||||
...commonReasoningPromptProperties, // Include shared reasoning/prompt properties
|
||||
// extractedData will be added dynamically
|
||||
},
|
||||
additionalProperties: false,
|
||||
required: ["extractedData", "shouldUseSmartscrape"],
|
||||
};
|
||||
|
||||
// Schema for multi-step SmartScrape interaction
|
||||
const multiSmartScrapeWrapperSchemaDefinition = {
|
||||
type: "object",
|
||||
properties: {
|
||||
...commonSmartScrapeProperties, // Include shared base properties
|
||||
smartScrapePages: {
|
||||
type: "array",
|
||||
description:
"Make one entry per page that SmartScrape should run on, regardless of how many actions that page requires.",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
page_index: {
|
||||
// Specific to items within the array
|
||||
type: "number",
|
||||
description: "The index of the page in the SmartScrape process.",
|
||||
},
|
||||
...commonReasoningPromptProperties, // Include shared reasoning/prompt properties here too
|
||||
},
|
||||
// required: ["page_index", "smartscrape_reasoning", "smartscrape_prompt"], // If needed per step
|
||||
// additionalProperties: false,
|
||||
},
|
||||
},
|
||||
// extractedData will be added dynamically
|
||||
},
|
||||
additionalProperties: false,
|
||||
required: ["extractedData", "shouldUseSmartscrape"],
|
||||
};
|
||||
|
||||
//TODO: go over and check
|
||||
// should add null to all types
|
||||
// type:string should be type:["string","null"]
|
||||
export function makeSchemaNullable(schema: any): any {
|
||||
if (typeof schema !== "object" || schema === null) {
|
||||
return schema; // Base case: not an object/array or is null
|
||||
}
|
||||
|
||||
if (Array.isArray(schema)) {
|
||||
return schema.map(makeSchemaNullable); // Recurse for array items
|
||||
}
|
||||
|
||||
// Process object properties
|
||||
const newSchema: { [key: string]: any } = {};
|
||||
let isObject = false; // Flag to track if this level is an object type
|
||||
|
||||
for (const key in schema) {
|
||||
if (key === "additionalProperties") {
|
||||
continue; // Skip existing additionalProperties, we'll set it later if needed
|
||||
}
|
||||
|
||||
if (key === "type") {
|
||||
const currentType = schema[key];
|
||||
let finalType: string | string[];
|
||||
|
||||
if (typeof currentType === "string") {
|
||||
if (currentType === "object") isObject = true;
|
||||
finalType =
|
||||
currentType === "null" ? currentType : [currentType, "null"];
|
||||
} else if (Array.isArray(currentType)) {
|
||||
if (currentType.includes("object")) isObject = true;
|
||||
finalType = currentType.includes("null")
|
||||
? currentType
|
||||
: [...currentType, "null"];
|
||||
} else {
|
||||
finalType = currentType; // Handle unexpected types?
|
||||
}
|
||||
newSchema[key] = finalType;
|
||||
} else if (typeof schema[key] === "object" && schema[key] !== null) {
|
||||
// Recurse for nested objects (properties, items, definitions, etc.)
|
||||
newSchema[key] = makeSchemaNullable(schema[key]);
|
||||
if (key === "properties") {
|
||||
// Having a 'properties' key strongly implies an object type
|
||||
isObject = true;
|
||||
}
|
||||
} else {
|
||||
// Copy other properties directly (like required, description, etc.)
|
||||
newSchema[key] = schema[key];
|
||||
}
|
||||
}
|
||||
|
||||
// **Crucial Fix:** If this schema represents an object type, add additionalProperties: false
|
||||
if (isObject) {
|
||||
// Ensure 'properties' exists if 'type' was 'object' but 'properties' wasn't defined
|
||||
if (!newSchema.properties) {
|
||||
newSchema.properties = {};
|
||||
}
|
||||
newSchema.additionalProperties = false;
|
||||
}
|
||||
|
||||
return newSchema;
|
||||
}
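
// Usage sketch (hypothetical schema): every declared type gains "null" and each
// object level gets additionalProperties: false.
const exampleNullableSchema = makeSchemaNullable({
type: "object",
properties: { price: { type: "number" } },
});
// => { type: ["object", "null"],
//      properties: { price: { type: ["number", "null"] } },
//      additionalProperties: false }
console.log(exampleNullableSchema);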
|
||||
|
||||
/**
|
||||
* Wraps the original schema with SmartScrape fields if an original schema exists.
|
||||
*
|
||||
* @param originalSchema The user-provided schema (JSON Schema object or Zod schema).
|
||||
* @param logger Winston logger instance.
|
||||
* @returns An object containing the schema to use for the LLM call and whether wrapping occurred.
|
||||
*/
|
||||
export function prepareSmartScrapeSchema(
|
||||
originalSchema: any | z.ZodTypeAny | undefined,
|
||||
logger: Logger,
|
||||
isSingleUrl: boolean,
|
||||
) {
|
||||
// Pass the user's schema through unchanged (the nullable/additionalProperties
// transformation from makeSchemaNullable is not applied here).
const nullableAndStrictSchema = originalSchema;
|
||||
|
||||
let smartScrapeWrapSchema;
if (isSingleUrl) {
smartScrapeWrapSchema = smartScrapeWrapperSchemaDefinition;
} else {
smartScrapeWrapSchema = multiSmartScrapeWrapperSchemaDefinition;
}

const wrappedSchema = {
...smartScrapeWrapSchema, // Uses the wrapper defined above
properties: {
extractedData: nullableAndStrictSchema, // Nest the user's schema
...smartScrapeWrapSchema.properties, // Add the SmartScrape fields
|
||||
},
|
||||
// required is inherited from smartScrapeWrapperSchemaDefinition
|
||||
// additionalProperties:false is inherited from smartScrapeWrapperSchemaDefinition for the top level
|
||||
};
|
||||
|
||||
logger.info("Wrapping original schema with SmartScrape fields.", {
|
||||
// Limit logging potentially large schemas
|
||||
wrappedSchemaKeys: Object.keys(wrappedSchema.properties),
|
||||
});
|
||||
return { schemaToUse: wrappedSchema };
|
||||
}
|
||||
|
||||
export async function extractData({
|
||||
extractOptions,
|
||||
urls,
|
||||
useAgent,
|
||||
}: {
|
||||
extractOptions: GenerateCompletionsOptions;
|
||||
urls: string[];
|
||||
useAgent: boolean;
|
||||
}): Promise<{
|
||||
extractedDataArray: any[];
|
||||
warning: any;
|
||||
smartScrapeCallCount: number;
|
||||
otherCallCount: number;
|
||||
smartScrapeCost: number;
|
||||
otherCost: number;
|
||||
costLimitExceededTokenUsage: number | null;
|
||||
}> {
|
||||
let schema = extractOptions.options.schema;
|
||||
const logger = extractOptions.logger;
|
||||
const isSingleUrl = urls.length === 1;
|
||||
let smartScrapeCost = 0;
|
||||
let otherCost = 0;
|
||||
let smartScrapeCallCount = 0;
|
||||
let otherCallCount = 0;
|
||||
let costLimitExceededTokenUsage: number | null = null;
|
||||
// TODO: remove the "required" fields here!! it breaks o3-mini
|
||||
|
||||
if (!schema && extractOptions.options.prompt) {
|
||||
logger.info("Generating schema from prompt");
|
||||
const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt);
|
||||
otherCallCount++;
|
||||
otherCost += genRes.cost;
|
||||
schema = genRes.extract;
|
||||
}
|
||||
|
||||
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
|
||||
const extractOptionsNewSchema = {
|
||||
...extractOptions,
|
||||
options: { ...extractOptions.options, schema: schemaToUse },
|
||||
};
|
||||
// console.log("schema", schema);
|
||||
// console.log("schemaToUse", schemaToUse);
|
||||
|
||||
let extract: any,
|
||||
warning: string | undefined,
|
||||
totalUsage: TokenUsage | undefined;
|
||||
|
||||
// checks if using smartScrape is needed for this case
|
||||
try {
|
||||
const {
|
||||
extract: e,
|
||||
warning: w,
|
||||
totalUsage: t,
|
||||
cost: c,
|
||||
} = await generateCompletions({
|
||||
...extractOptionsNewSchema,
|
||||
model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
|
||||
retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
|
||||
});
|
||||
extract = e;
|
||||
warning = w;
|
||||
totalUsage = t;
|
||||
otherCost += c;
|
||||
otherCallCount++;
|
||||
} catch (error) {
|
||||
logger.error(
|
||||
"failed during extractSmartScrape.ts:generateCompletions",
|
||||
error,
|
||||
);
|
||||
// console.log("failed during extractSmartScrape.ts:generateCompletions", error);
|
||||
}
|
||||
|
||||
let extractedData = extract?.extractedData;
|
||||
|
||||
// console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape);
|
||||
// console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
|
||||
// console.log("smartscrape_prompt", extract?.smartscrape_prompt);
|
||||
try {
|
||||
console.log("=========================================");
|
||||
console.log(
|
||||
"useAgent:",
|
||||
useAgent,
|
||||
"shouldUseSmartscrape:",
|
||||
extract?.shouldUseSmartscrape,
|
||||
);
|
||||
console.log("url:", urls);
|
||||
console.log("prompt:", extract?.smartscrape_prompt);
|
||||
console.log("=========================================");
|
||||
|
||||
if (useAgent && extract?.shouldUseSmartscrape) {
|
||||
let smartscrapeResults: SmartScrapeResult[];
|
||||
if (isSingleUrl) {
|
||||
smartscrapeResults = [
|
||||
await smartScrape(urls[0], extract?.smartscrape_prompt),
|
||||
];
|
||||
smartScrapeCost += smartscrapeResults[0].tokenUsage;
|
||||
smartScrapeCallCount++;
|
||||
} else {
|
||||
const pages = extract?.smartscrapePages;
|
||||
//do it async promiseall instead
|
||||
smartscrapeResults = await Promise.all(
|
||||
pages.map(async (page) => {
|
||||
return await smartScrape(
|
||||
urls[page.page_index],
|
||||
page.smartscrape_prompt,
|
||||
);
|
||||
}),
|
||||
);
|
||||
smartScrapeCost += smartscrapeResults.reduce(
|
||||
(acc, result) => acc + result.tokenUsage,
|
||||
0,
|
||||
);
|
||||
smartScrapeCallCount += pages.length;
|
||||
}
|
||||
// console.log("smartscrapeResults", smartscrapeResults);
|
||||
|
||||
const scrapedPages = smartscrapeResults.map(
|
||||
(result) => result.scrapedPages,
|
||||
);
|
||||
// console.log("scrapedPages", scrapedPages);
|
||||
const htmls = scrapedPages.flat().map((page) => page.html);
|
||||
// console.log("htmls", htmls);
|
||||
const markdowns = await Promise.all(
|
||||
htmls.map(async (html) => await parseMarkdown(html)),
|
||||
);
|
||||
// console.log("markdowns", markdowns);
|
||||
extractedData = await Promise.all(
|
||||
markdowns.map(async (markdown) => {
|
||||
const newExtractOptions = {
|
||||
...extractOptions,
|
||||
markdown: markdown,
|
||||
};
|
||||
const { extract, warning, totalUsage, model, cost } =
|
||||
await generateCompletions(newExtractOptions);
|
||||
otherCost += cost;
|
||||
otherCallCount++;
|
||||
return extract;
|
||||
}),
|
||||
);
|
||||
|
||||
// console.log("markdowns", markdowns);
|
||||
// extractedData = smartscrapeResult;
|
||||
} else {
|
||||
extractedData = [extractedData];
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(">>>>>>>extractSmartScrape.ts error>>>>>\n", error);
|
||||
if (error instanceof Error && error.message === "Cost limit exceeded") {
|
||||
costLimitExceededTokenUsage = (error as any).cause.tokenUsage;
|
||||
warning = "Smart scrape cost limit exceeded." + (warning ? " " + warning : "")
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
extractedDataArray: extractedData,
|
||||
warning: warning,
|
||||
smartScrapeCallCount: smartScrapeCallCount,
|
||||
otherCallCount: otherCallCount,
|
||||
smartScrapeCost: smartScrapeCost,
|
||||
otherCost: otherCost,
|
||||
costLimitExceededTokenUsage: costLimitExceededTokenUsage,
|
||||
};
|
||||
}
|
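Illustrative sketch (not part of the diff): how a caller might invoke extractData, mirroring the wiring performLLMExtract gets later in this patch; the options object and URL are placeholders.

// Hypothetical usage sketch -- mirrors how performLLMExtract wires this up later in the patch.
const { extractedDataArray, warning } = await extractData({
  extractOptions: generationOptions, // a GenerateCompletionsOptions built by the caller
  urls: ["https://example.com/pricing"], // placeholder URL
  useAgent: true, // only true when a valid FIRE-1 agent model was requested
});
const finalExtract = extractedDataArray[extractedDataArray.length - 1];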
apps/api/src/scraper/scrapeURL/lib/smartScrape.ts (new file, 164 lines)
@ -0,0 +1,164 @@
import { z } from "zod";
import { logger } from "../../../lib/logger";
import { robustFetch } from "./fetch";
import fs from "fs/promises";
import { configDotenv } from "dotenv";

configDotenv();

// Define schemas outside the function scope
const tokenUsageDetailSchema = z.object({
  input_tokens: z.number().int(),
  output_tokens: z.number().int(),
  total_cost: z.number().nullable(), // Allows number or null
});

// Schema for an individual scraped page object
const scrapedPageSchema = z.object({
  html: z.string(),
  reason: z.string(),
  page: z.union([z.string(), z.number()]),
});

// Main schema for the structure returned by the smart-scrape endpoint
const smartScrapeResultSchema = z.object({
  sessionId: z.string(),
  success: z.boolean(),
  scrapedPages: z.array(scrapedPageSchema),
  tokenUsage: z.number(),

  // z.record(
  //   z.string(), // Key is the model name (string)
  //   tokenUsageDetailSchema, // Value matches the detail schema
  // ),
});

// Infer the TypeScript type from the Zod schema
export type SmartScrapeResult = z.infer<typeof smartScrapeResultSchema>;

/**
 * Sends a POST request to the internal /smart-scrape endpoint to extract
 * structured data from a URL based on a prompt.
 *
 * @param url The URL of the page to scrape.
 * @param prompt The prompt guiding the data extraction.
 * @returns A promise that resolves to an object matching the SmartScrapeResult type.
 * @throws Throws an error if the request fails or the response is invalid.
 */
export async function smartScrape(
  url: string,
  prompt: string,
  sessionId?: string,
): Promise<SmartScrapeResult> {
  try {
    logger.info("Initiating smart scrape request", { url, prompt });

    // Pass the schema type as a generic parameter to robustFetch
    const response = await robustFetch<typeof smartScrapeResultSchema>({
      url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
      method: "POST",
      body: {
        url,
        prompt,
        userProvidedId: sessionId ?? undefined,
        models: {
          thinkingModel: {
            model: "gemini-2.5-pro-preview-03-25",
            provider: "vertex",
            supportTools: true,
            toolChoice: "required",
            cost: {
              input: 1.3,
              output: 5,
            },
          },
          toolModel: {
            model: "gemini-2.0-flash",
            provider: "google",
          },
        },
      },
      schema: smartScrapeResultSchema, // Pass the schema instance for validation
      logger,
      mock: null, // Keep mock null if not mocking
    });

    // Check if the response indicates a 500 error
    // Use type assertion to handle the error response structure
    const errorResponse = response as unknown as {
      success: boolean;
      error?: string;
      details?: string;
    };

    if (
      errorResponse &&
      errorResponse.success === false &&
      errorResponse.error
    ) {
      if (errorResponse.error === "Cost limit exceeded") {
        throw new Error("Cost limit exceeded", {
          cause: { tokenUsage: (errorResponse as any).tokenUsage },
        });
      }

      logger.error("Smart scrape returned error response", {
        url,
        prompt,
        error: errorResponse.error,
        details: errorResponse.details || "No details provided",
      });
      throw new Error(
        `Smart scrape failed: ${errorResponse.error}${errorResponse.details ? ` - ${errorResponse.details}` : ""}`,
      );
    }

    logger.info("Smart scrape successful", {
      url,
      prompt,
      sessionId: response.sessionId,
    });

    logger.info("Smart scrape cost $" + response.tokenUsage);

    return response; // The response type now matches SmartScrapeResult
  } catch (error) {
    // Safely extract error information without circular references
    const errorInfo = {
      message: error instanceof Error ? error.message : String(error),
      name: error instanceof Error ? error.name : "Unknown",
      stack: error instanceof Error ? error.stack : undefined,
      // Extract cause safely if it exists
      cause:
        error instanceof Error && error.cause
          ? error.cause instanceof Error
            ? {
                message: error.cause.message,
                name: error.cause.name,
                stack: error.cause.stack,
              }
            : typeof error.cause === "object"
              ? {
                  ...Object.fromEntries(
                    Object.entries(error.cause).filter(
                      ([_, v]) => v !== null && typeof v !== "object",
                    ),
                  ),
                  error:
                    (error.cause as any)?.error?.message ||
                    (error.cause as any)?.error,
                }
              : String(error.cause)
          : undefined,
    };

    logger.error("Smart scrape request failed", {
      url,
      prompt,
      error: JSON.stringify(errorInfo),
    });

    // Rethrowing the error to be handled by the caller
    throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
  }
}
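Illustrative sketch (not part of the diff): calling smartScrape directly and handling the cost-limit error the same way extractData and performAgent do; the URL, prompt, and configured SMART_SCRAPE_API_URL are placeholders.

// Hypothetical usage sketch, assuming SMART_SCRAPE_API_URL points at a running smart-scrape service.
try {
  const result = await smartScrape(
    "https://example.com/docs", // placeholder URL
    "Open the changelog page and capture its HTML", // placeholder prompt
  );
  const lastPage = result.scrapedPages[result.scrapedPages.length - 1];
  console.log("session:", result.sessionId, "cost:", result.tokenUsage, "html bytes:", lastPage.html.length);
} catch (error) {
  if (error instanceof Error && error.message === "Cost limit exceeded") {
    // Surface a warning instead of failing, as performAgent does below.
  } else {
    throw error;
  }
}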
apps/api/src/scraper/scrapeURL/transformers/agent.ts (new file, 65 lines)
@ -0,0 +1,65 @@
import {
  Document,
} from "../../../controllers/v1/types";
import { Meta } from "..";
import { logger } from "../../../lib/logger";
import { parseMarkdown } from "../../../lib/html-to-markdown";
import { smartScrape, SmartScrapeResult } from "../lib/smartScrape";


export async function performAgent(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (meta.options.agent?.prompt) {
    const url: string | undefined = document.url || document.metadata.sourceURL;

    if (!url) {
      logger.error("document.url or document.metadata.sourceURL is undefined -- this is unexpected");
      // throw new Error("document.url or document.metadata.sourceURL is undefined -- this is unexpected");
      return document;
    }

    const prompt = meta.options.agent?.prompt ?? undefined;
    const sessionId = meta.options.agent?.sessionId ?? undefined;

    let smartscrapeResults: SmartScrapeResult;
    try {
      smartscrapeResults = await smartScrape(url, prompt, sessionId);
    } catch (error) {
      if (error instanceof Error && error.message === "Cost limit exceeded") {
        logger.error("Cost limit exceeded", { error });
        document.warning = "Smart scrape cost limit exceeded." + (document.warning ? " " + document.warning : "");
        return document;
      } else {
        throw error;
      }
    }

    const html = smartscrapeResults.scrapedPages[smartscrapeResults.scrapedPages.length - 1].html;

    if (meta.options.formats.includes("markdown")) {
      const markdown = await parseMarkdown(html);
      document.markdown = markdown;
    }
    if (meta.options.formats.includes("html")) {
      document.html = html;
    }

    if (document.metadata.costTracking) {
      document.metadata.costTracking.smartScrapeCallCount++;
      document.metadata.costTracking.smartScrapeCost = document.metadata.costTracking.smartScrapeCost + smartscrapeResults.tokenUsage;
      document.metadata.costTracking.totalCost = document.metadata.costTracking.totalCost + smartscrapeResults.tokenUsage;
    } else {
      document.metadata.costTracking = {
        smartScrapeCallCount: 1,
        smartScrapeCost: smartscrapeResults.tokenUsage,
        otherCallCount: 0,
        otherCost: 0,
        totalCost: smartscrapeResults.tokenUsage,
      };
    }
  }

  return document;
}
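Illustrative sketch (not part of the diff): the scrape options shape that makes performAgent run once it is registered in the transformer stack below; field names follow the AgentOptions interface added to the SDKs in this patch, and all values are placeholders.

// Hypothetical request sketch -- values are placeholders.
const scrapeOptions = {
  formats: ["markdown", "html"],
  agent: {
    model: "FIRE-1",
    prompt: "Navigate to the pricing page and capture it", // placeholder prompt
    sessionId: "livecast-session-123", // optional, placeholder
  },
};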
@ -6,9 +6,9 @@ import gitDiff from 'git-diff';
|
||||
import parseDiff from 'parse-diff';
|
||||
import { generateCompletions } from "./llmExtract";
|
||||
|
||||
async function extractDataWithSchema(content: string, meta: Meta): Promise<any> {
|
||||
async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> {
|
||||
try {
|
||||
const { extract } = await generateCompletions({
|
||||
const { extract, cost } = await generateCompletions({
|
||||
logger: meta.logger.child({
|
||||
method: "extractDataWithSchema/generateCompletions",
|
||||
}),
|
||||
@ -20,7 +20,7 @@ async function extractDataWithSchema(content: string, meta: Meta): Promise<any>
|
||||
},
|
||||
markdown: content
|
||||
});
|
||||
return extract;
|
||||
return { extract, cost };
|
||||
} catch (error) {
|
||||
meta.logger.error("Error extracting data with schema", { error });
|
||||
return null;
|
||||
@ -144,7 +144,20 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
|
||||
await extractDataWithSchema(currentMarkdown, meta) : null;
|
||||
|
||||
if (previousData && currentData) {
|
||||
document.changeTracking.json = compareExtractedData(previousData, currentData);
|
||||
document.changeTracking.json = compareExtractedData(previousData.extract, currentData.extract);
|
||||
|
||||
if (document.metadata.costTracking) {
|
||||
document.metadata.costTracking.otherCallCount += 2;
|
||||
document.metadata.costTracking.otherCost = document.metadata.costTracking.otherCost + previousData.cost + currentData.cost;
|
||||
} else {
|
||||
document.metadata.costTracking = {
|
||||
smartScrapeCallCount: 0,
|
||||
smartScrapeCost: 0,
|
||||
otherCallCount: 2,
|
||||
otherCost: previousData.cost + currentData.cost,
|
||||
totalCost: previousData.cost + currentData.cost
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const { extract } = await generateCompletions({
|
||||
logger: meta.logger.child({
|
||||
|
@ -8,6 +8,8 @@ import { performLLMExtract } from "./llmExtract";
|
||||
import { uploadScreenshot } from "./uploadScreenshot";
|
||||
import { removeBase64Images } from "./removeBase64Images";
|
||||
import { saveToCache } from "./cache";
|
||||
import { performAgent } from "./agent";
|
||||
|
||||
import { deriveDiff } from "./diff";
|
||||
export type Transformer = (
|
||||
meta: Meta,
|
||||
@ -193,6 +195,7 @@ export const transformerStack: Transformer[] = [
|
||||
deriveMetadataFromRawHTML,
|
||||
uploadScreenshot,
|
||||
performLLMExtract,
|
||||
performAgent,
|
||||
deriveDiff,
|
||||
coerceFieldsToFormats,
|
||||
removeBase64Images,
|
||||
|
@ -3,16 +3,36 @@ import { TiktokenModel } from "@dqbd/tiktoken";
|
||||
import {
|
||||
Document,
|
||||
ExtractOptions,
|
||||
isAgentExtractModelValid,
|
||||
TokenUsage,
|
||||
} from "../../../controllers/v1/types";
|
||||
import { Logger } from "winston";
|
||||
import { EngineResultsTracker, Meta } from "..";
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { modelPrices } from "../../../lib/extract/usage/model-prices";
|
||||
import { generateObject, generateText, LanguageModel } from 'ai';
|
||||
import { jsonSchema } from 'ai';
|
||||
import {
|
||||
generateObject,
|
||||
generateText,
|
||||
LanguageModel,
|
||||
NoObjectGeneratedError,
|
||||
} from "ai";
|
||||
import { jsonSchema } from "ai";
|
||||
import { getModel } from "../../../lib/generic-ai";
|
||||
import { z } from "zod";
|
||||
import fs from "fs/promises";
|
||||
import Ajv from "ajv";
|
||||
import { extractData } from "../lib/extractSmartScrape";
|
||||
|
||||
// TODO: fix this, it's horrible
|
||||
type LanguageModelV1ProviderMetadata = {
|
||||
anthropic?: {
|
||||
thinking?: {
|
||||
type: "enabled" | "disabled";
|
||||
budgetTokens?: number;
|
||||
};
|
||||
tool_choice?: "auto" | "none" | "required";
|
||||
};
|
||||
};
|
||||
|
||||
// Get max tokens from model prices
|
||||
const getModelLimits = (model: string) => {
|
||||
@ -74,7 +94,10 @@ function normalizeSchema(x: any): any {
|
||||
return {
|
||||
...x,
|
||||
properties: Object.fromEntries(
|
||||
Object.entries(x.properties || {}).map(([k, v]) => [k, normalizeSchema(v)]),
|
||||
Object.entries(x.properties || {}).map(([k, v]) => [
|
||||
k,
|
||||
normalizeSchema(v),
|
||||
]),
|
||||
),
|
||||
required: Object.keys(x.properties || {}),
|
||||
additionalProperties: false,
|
||||
@ -89,21 +112,24 @@ function normalizeSchema(x: any): any {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
interface TrimResult {
|
||||
text: string;
|
||||
numTokens: number;
|
||||
warning?: string;
|
||||
}
|
||||
|
||||
export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
|
||||
export function trimToTokenLimit(
|
||||
text: string,
|
||||
maxTokens: number,
|
||||
modelId: string = "gpt-4o",
|
||||
previousWarning?: string,
|
||||
): TrimResult {
|
||||
try {
|
||||
const encoder = encoding_for_model(modelId as TiktokenModel);
|
||||
try {
|
||||
const tokens = encoder.encode(text);
|
||||
const numTokens = tokens.length;
|
||||
|
||||
|
||||
if (numTokens <= maxTokens) {
|
||||
return { text, numTokens };
|
||||
}
|
||||
@ -111,7 +137,7 @@ export function trimToTokenLimit(text: string, maxTokens: number, modelId: strin
|
||||
const modifier = 3;
|
||||
// Start with 3 chars per token estimation
|
||||
let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);
|
||||
|
||||
|
||||
// Keep trimming until we're under the token limit
|
||||
while (true) {
|
||||
const currentTokens = encoder.encode(currentText);
|
||||
@ -120,14 +146,18 @@ export function trimToTokenLimit(text: string, maxTokens: number, modelId: strin
|
||||
return {
|
||||
text: currentText,
|
||||
numTokens: currentTokens.length,
|
||||
warning: previousWarning ? `${warning} ${previousWarning}` : warning
|
||||
warning: previousWarning
|
||||
? `${warning} ${previousWarning}`
|
||||
: warning,
|
||||
};
|
||||
}
|
||||
const overflow = currentTokens.length * modifier - maxTokens - 1;
|
||||
// If still over limit, remove another chunk
|
||||
currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
|
||||
currentText = currentText.slice(
|
||||
0,
|
||||
Math.floor(currentText.length - overflow),
|
||||
);
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
throw e;
|
||||
} finally {
|
||||
@ -138,88 +168,203 @@ export function trimToTokenLimit(text: string, maxTokens: number, modelId: strin
|
||||
const estimatedCharsPerToken = 2.8;
|
||||
const safeLength = maxTokens * estimatedCharsPerToken;
|
||||
const trimmedText = text.slice(0, Math.floor(safeLength));
|
||||
|
||||
|
||||
const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
|
||||
|
||||
|
||||
return {
|
||||
text: trimmedText,
|
||||
numTokens: maxTokens, // We assume we hit the max in this fallback case
|
||||
warning: previousWarning ? `${warning} ${previousWarning}` : warning
|
||||
warning: previousWarning ? `${warning} ${previousWarning}` : warning,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function generateCompletions({
|
||||
logger,
|
||||
options,
|
||||
markdown,
|
||||
previousWarning,
|
||||
isExtractEndpoint,
|
||||
model = getModel("gpt-4o-mini"),
|
||||
mode = "object",
|
||||
}: {
|
||||
model?: LanguageModel;
|
||||
export function calculateCost(
  model: string,
  inputTokens: number,
  outputTokens: number,
) {
  const modelCosts = {
    "openai/o3-mini": { input_cost: 1.1, output_cost: 4.4 },
    "google/gemini-2.0-flash-001": { input_cost: 0.15, output_cost: 0.6 },
    "deepseek/deepseek-r1": { input_cost: 0.55, output_cost: 2.19 },
    "google/gemini-2.0-flash-thinking-exp:free": {
      input_cost: 0.55,
      output_cost: 2.19,
    },
  };
  let modelCost = modelCosts[model] || { input_cost: 0, output_cost: 0 };
  // gemini-2.5-pro-exp-03-25 pricing
  if (
    model === "gemini-2.5-pro-exp-03-25" ||
    model === "gemini-2.5-pro-preview-03-25"
  ) {
    let inputCost = 0;
    let outputCost = 0;
    if (inputTokens <= 200000) {
      inputCost = 1.25;
    } else {
      inputCost = 2.5;
    }
    if (outputTokens <= 200000) {
      outputCost = 10.0;
    } else {
      outputCost = 15.0;
    }
    modelCost = { input_cost: inputCost, output_cost: outputCost };
  }
  const totalCost =
    (inputTokens * modelCost.input_cost +
      outputTokens * modelCost.output_cost) /
    1_000_000;

  return totalCost;
}

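Illustrative sketch (not part of the diff): what calculateCost returns for the tiered Gemini 2.5 Pro pricing above, using made-up token counts.

// Hypothetical usage sketch with made-up token counts.
const cost = calculateCost("gemini-2.5-pro-preview-03-25", 120_000, 4_000);
// (120_000 * 1.25 + 4_000 * 10.0) / 1_000_000 = 0.15 + 0.04 = 0.19 USD
console.log(cost.toFixed(2)); // "0.19"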
export type GenerateCompletionsOptions = {
|
||||
model?: LanguageModel;
|
||||
logger: Logger;
|
||||
options: ExtractOptions;
|
||||
markdown?: string;
|
||||
previousWarning?: string;
|
||||
isExtractEndpoint?: boolean;
|
||||
mode?: "object" | "no-object";
|
||||
}): Promise<{
|
||||
providerOptions?: LanguageModelV1ProviderMetadata;
|
||||
retryModel?: LanguageModel;
|
||||
};
|
||||
export async function generateCompletions({
|
||||
logger,
|
||||
options,
|
||||
markdown,
|
||||
previousWarning,
|
||||
isExtractEndpoint,
|
||||
model = getModel("gpt-4o-mini", "openai"),
|
||||
mode = "object",
|
||||
providerOptions,
|
||||
retryModel = getModel("claude-3-5-sonnet-20240620", "anthropic"),
|
||||
}: GenerateCompletionsOptions): Promise<{
|
||||
extract: any;
|
||||
numTokens: number;
|
||||
warning: string | undefined;
|
||||
totalUsage: TokenUsage;
|
||||
model: string;
|
||||
cost: number;
|
||||
}> {
|
||||
let extract: any;
|
||||
let warning: string | undefined;
|
||||
let currentModel = model;
|
||||
let lastError: Error | null = null;
|
||||
|
||||
if (markdown === undefined) {
|
||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||
}
|
||||
|
||||
const { maxInputTokens, maxOutputTokens } = getModelLimits(model.modelId);
|
||||
const { maxInputTokens, maxOutputTokens } = getModelLimits(
|
||||
currentModel.modelId,
|
||||
);
|
||||
// Calculate 80% of max input tokens (for content)
|
||||
const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
|
||||
|
||||
// Use the new trimming function
|
||||
const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit(
|
||||
markdown,
|
||||
maxTokensSafe,
|
||||
model.modelId,
|
||||
previousWarning
|
||||
);
|
||||
const {
|
||||
text: trimmedMarkdown,
|
||||
numTokens,
|
||||
warning: trimWarning,
|
||||
} = trimToTokenLimit(markdown, maxTokensSafe, model.modelId, previousWarning);
|
||||
|
||||
markdown = trimmedMarkdown;
|
||||
warning = trimWarning;
|
||||
// WE USE BIG MODELS NOW
|
||||
// markdown = trimmedMarkdown;
|
||||
// warning = trimWarning;
|
||||
|
||||
try {
|
||||
const prompt = options.prompt !== undefined
|
||||
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
|
||||
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
|
||||
const prompt =
|
||||
options.prompt !== undefined
|
||||
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
|
||||
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
|
||||
|
||||
if (mode === "no-object") {
|
||||
const result = await generateText({
|
||||
model: model,
|
||||
prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
|
||||
temperature: options.temperature ?? 0,
|
||||
system: options.systemPrompt,
|
||||
});
|
||||
try {
|
||||
const result = await generateText({
|
||||
model: currentModel,
|
||||
prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
|
||||
system: options.systemPrompt,
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
extract = result.text;
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
totalUsage: {
|
||||
promptTokens: numTokens,
|
||||
completionTokens: result.usage?.completionTokens ?? 0,
|
||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
||||
},
|
||||
model: model.modelId,
|
||||
};
|
||||
extract = result.text;
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
totalUsage: {
|
||||
promptTokens: numTokens,
|
||||
completionTokens: result.usage?.completionTokens ?? 0,
|
||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
||||
},
|
||||
model: currentModel.modelId,
|
||||
cost: calculateCost(
|
||||
currentModel.modelId,
|
||||
numTokens,
|
||||
result.usage?.completionTokens ?? 0,
|
||||
),
|
||||
};
|
||||
} catch (error) {
|
||||
lastError = error as Error;
|
||||
if (
|
||||
error.message?.includes("Quota exceeded") ||
|
||||
error.message?.includes("You exceeded your current quota") ||
|
||||
error.message?.includes("rate limit")
|
||||
) {
|
||||
logger.warn("Quota exceeded, retrying with fallback model", {
|
||||
error: lastError.message,
|
||||
});
|
||||
currentModel = retryModel;
|
||||
try {
|
||||
const result = await generateText({
|
||||
model: currentModel,
|
||||
prompt: options.prompt + (markdown ? `\n\nData:${markdown}` : ""),
|
||||
system: options.systemPrompt,
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
extract = result.text;
|
||||
|
||||
return {
|
||||
extract,
|
||||
warning,
|
||||
numTokens,
|
||||
totalUsage: {
|
||||
promptTokens: numTokens,
|
||||
completionTokens: result.usage?.completionTokens ?? 0,
|
||||
totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
|
||||
},
|
||||
model: currentModel.modelId,
|
||||
cost: calculateCost(
|
||||
currentModel.modelId,
|
||||
numTokens,
|
||||
result.usage?.completionTokens ?? 0,
|
||||
),
|
||||
};
|
||||
} catch (retryError) {
|
||||
lastError = retryError as Error;
|
||||
logger.error("Failed with fallback model", {
|
||||
originalError: lastError.message,
|
||||
model: currentModel.modelId,
|
||||
});
|
||||
throw lastError;
|
||||
}
|
||||
}
|
||||
throw lastError;
|
||||
}
|
||||
}
|
||||
|
||||
let schema = options.schema;
|
||||
@ -276,32 +421,114 @@ export async function generateCompletions({
|
||||
} catch (_) {}
|
||||
}
|
||||
|
||||
const { text: fixedText } = await generateText({
|
||||
model: model,
|
||||
prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
|
||||
system: "You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON."
|
||||
});
|
||||
return fixedText;
|
||||
}
|
||||
try {
|
||||
const { text: fixedText } = await generateText({
|
||||
model: currentModel,
|
||||
prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
|
||||
system:
|
||||
"You are a JSON repair expert. Your only job is to fix malformed JSON and return valid JSON that matches the original structure and intent as closely as possible. Do not include any explanation or commentary - only return the fixed JSON. Do not return it in a Markdown code block, just plain JSON.",
|
||||
providerOptions: {
|
||||
anthropic: {
|
||||
thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
},
|
||||
},
|
||||
});
|
||||
return fixedText;
|
||||
} catch (repairError) {
|
||||
lastError = repairError as Error;
|
||||
logger.error("Failed to repair JSON", { error: lastError.message });
|
||||
throw lastError;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
const generateObjectConfig = {
|
||||
model: model,
|
||||
model: currentModel,
|
||||
prompt: prompt,
|
||||
temperature: options.temperature ?? 0,
|
||||
providerOptions: providerOptions || undefined,
|
||||
system: options.systemPrompt,
|
||||
...(schema && { schema: schema instanceof z.ZodType ? schema : jsonSchema(schema) }),
|
||||
...(!schema && { output: 'no-schema' as const }),
|
||||
...(schema && {
|
||||
schema: schema instanceof z.ZodType ? schema : jsonSchema(schema),
|
||||
}),
|
||||
...(!schema && { output: "no-schema" as const }),
|
||||
...repairConfig,
|
||||
...(!schema && {
|
||||
onError: (error: Error) => {
|
||||
lastError = error;
|
||||
console.error(error);
|
||||
}
|
||||
})
|
||||
},
|
||||
}),
|
||||
} satisfies Parameters<typeof generateObject>[0];
|
||||
|
||||
const result = await generateObject(generateObjectConfig);
|
||||
extract = result.object;
|
||||
// const now = new Date().getTime();
|
||||
// await fs.writeFile(
|
||||
// `logs/generateObjectConfig-${now}.json`,
|
||||
// JSON.stringify(generateObjectConfig, null, 2),
|
||||
// );
|
||||
|
||||
let result: { object: any; usage: TokenUsage } | undefined;
|
||||
try {
|
||||
result = await generateObject(generateObjectConfig);
|
||||
} catch (error) {
|
||||
lastError = error as Error;
|
||||
if (
|
||||
error.message?.includes("Quota exceeded") ||
|
||||
error.message?.includes("You exceeded your current quota") ||
|
||||
error.message?.includes("rate limit")
|
||||
) {
|
||||
logger.warn("Quota exceeded, retrying with fallback model", {
|
||||
error: lastError.message,
|
||||
});
|
||||
currentModel = retryModel;
|
||||
try {
|
||||
const retryConfig = {
|
||||
...generateObjectConfig,
|
||||
model: currentModel,
|
||||
};
|
||||
result = await generateObject(retryConfig);
|
||||
} catch (retryError) {
|
||||
lastError = retryError as Error;
|
||||
logger.error("Failed with fallback model", {
|
||||
originalError: lastError.message,
|
||||
model: currentModel.modelId,
|
||||
});
|
||||
throw lastError;
|
||||
}
|
||||
} else if (NoObjectGeneratedError.isInstance(error)) {
|
||||
console.log("No object generated", error);
|
||||
if (
|
||||
error.text &&
|
||||
error.text.startsWith("```json") &&
|
||||
error?.text.endsWith("```")
|
||||
) {
|
||||
try {
|
||||
extract = JSON.parse(
|
||||
error.text.slice("```json".length, -"```".length).trim(),
|
||||
);
|
||||
result = {
|
||||
object: extract,
|
||||
usage: {
|
||||
promptTokens: error.usage?.promptTokens ?? 0,
|
||||
completionTokens: error.usage?.completionTokens ?? 0,
|
||||
totalTokens: error.usage?.totalTokens ?? 0,
|
||||
},
|
||||
};
|
||||
} catch (parseError) {
|
||||
lastError = parseError as Error;
|
||||
logger.error("Failed to parse JSON from error text", {
|
||||
error: lastError.message,
|
||||
});
|
||||
throw lastError;
|
||||
}
|
||||
} else {
|
||||
throw lastError;
|
||||
}
|
||||
} else {
|
||||
throw lastError;
|
||||
}
|
||||
}
|
||||
|
||||
extract = result?.object;
|
||||
|
||||
// If the users actually wants the items object, they can specify it as 'required' in the schema
|
||||
// otherwise, we just return the items array
|
||||
@ -326,13 +553,20 @@ export async function generateCompletions({
|
||||
completionTokens,
|
||||
totalTokens: promptTokens + completionTokens,
|
||||
},
|
||||
model: model.modelId,
|
||||
model: currentModel.modelId,
|
||||
cost: calculateCost(currentModel.modelId, promptTokens, completionTokens),
|
||||
};
|
||||
} catch (error) {
|
||||
if (error.message?.includes('refused')) {
|
||||
lastError = error as Error;
|
||||
if (error.message?.includes("refused")) {
|
||||
throw new LLMRefusalError(error.message);
|
||||
}
|
||||
throw error;
|
||||
logger.error("LLM extraction failed", {
|
||||
error: lastError.message,
|
||||
model: currentModel.modelId,
|
||||
mode,
|
||||
});
|
||||
throw lastError;
|
||||
}
|
||||
}
|
||||
|
||||
@ -341,22 +575,139 @@ export async function performLLMExtract(
|
||||
document: Document,
|
||||
): Promise<Document> {
|
||||
if (meta.options.formats.includes("extract")) {
|
||||
meta.internalOptions.abort?.throwIfAborted();
|
||||
const { extract, warning } = await generateCompletions({
|
||||
// const originalOptions = meta.options.extract!;
|
||||
|
||||
// let generationOptions = { ...originalOptions }; // Start with original options
|
||||
|
||||
const generationOptions: GenerateCompletionsOptions = {
|
||||
logger: meta.logger.child({
|
||||
method: "performLLMExtract/generateCompletions",
|
||||
}),
|
||||
options: meta.options.extract!,
|
||||
markdown: document.markdown,
|
||||
previousWarning: document.warning
|
||||
});
|
||||
previousWarning: document.warning,
|
||||
// ... existing model and provider options ...
|
||||
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
||||
// model: getModel("o3-mini", "openai"),
|
||||
// model: getModel("qwen-qwq-32b", "groq"),
|
||||
// model: getModel("gemini-2.0-flash", "google"),
|
||||
// model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
|
||||
model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
|
||||
retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
|
||||
};
|
||||
|
||||
if (meta.options.formats.includes("json")) {
|
||||
document.json = extract;
|
||||
} else {
|
||||
document.extract = extract;
|
||||
const { extractedDataArray, warning, smartScrapeCost, otherCost, costLimitExceededTokenUsage } =
|
||||
await extractData({
|
||||
extractOptions: generationOptions,
|
||||
urls: [meta.url],
|
||||
useAgent: isAgentExtractModelValid(meta.options.extract?.agent?.model),
|
||||
});
|
||||
|
||||
if (warning) {
|
||||
document.warning = warning + (document.warning ? " " + document.warning : "");
|
||||
}
|
||||
document.warning = warning;
|
||||
|
||||
if (document.metadata.costTracking) {
|
||||
document.metadata.costTracking.smartScrapeCallCount++;
|
||||
document.metadata.costTracking.smartScrapeCost += smartScrapeCost;
|
||||
document.metadata.costTracking.otherCallCount++;
|
||||
document.metadata.costTracking.otherCost += otherCost;
|
||||
document.metadata.costTracking.totalCost += smartScrapeCost + otherCost;
|
||||
if (costLimitExceededTokenUsage) {
|
||||
document.metadata.costTracking.costLimitExceededTokenUsage = costLimitExceededTokenUsage;
|
||||
}
|
||||
} else {
|
||||
document.metadata.costTracking = {
|
||||
smartScrapeCallCount: 1,
|
||||
smartScrapeCost: smartScrapeCost,
|
||||
otherCallCount: 1,
|
||||
otherCost: otherCost,
|
||||
totalCost: smartScrapeCost + otherCost,
|
||||
};
|
||||
}
|
||||
|
||||
// IMPORTANT: here it only get's the last page!!!
|
||||
const extractedData =
|
||||
extractedDataArray[extractedDataArray.length - 1] ?? undefined;
|
||||
|
||||
// // Prepare the schema, potentially wrapping it
|
||||
// const { schemaToUse, schemaWasWrapped } = prepareSmartScrapeSchema(
|
||||
// originalOptions.schema,
|
||||
// meta.logger,
|
||||
// );
|
||||
|
||||
// // Update generationOptions with the potentially wrapped schema
|
||||
// generationOptions.schema = schemaToUse;
|
||||
|
||||
// meta.internalOptions.abort?.throwIfAborted();
|
||||
// const {
|
||||
// extract: rawExtract,
|
||||
// warning,
|
||||
// totalUsage,
|
||||
// model,
|
||||
// } = await generateCompletions({
|
||||
// logger: meta.logger.child({
|
||||
// method: "performLLMExtract/generateCompletions",
|
||||
// }),
|
||||
// options: generationOptions, // Use the potentially modified options
|
||||
// markdown: document.markdown,
|
||||
// previousWarning: document.warning,
|
||||
// // ... existing model and provider options ...
|
||||
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
||||
// providerOptions: {
|
||||
// anthropic: {
|
||||
// thinking: { type: "enabled", budgetTokens: 12000 },
|
||||
// },
|
||||
// },
|
||||
// });
|
||||
|
||||
// // Log token usage
|
||||
// meta.logger.info("LLM extraction token usage", {
|
||||
// model: model,
|
||||
// promptTokens: totalUsage.promptTokens,
|
||||
// completionTokens: totalUsage.completionTokens,
|
||||
// totalTokens: totalUsage.totalTokens,
|
||||
// });
|
||||
|
||||
// // Process the result to extract data and SmartScrape decision
|
||||
// const {
|
||||
// extractedData,
|
||||
// shouldUseSmartscrape,
|
||||
// smartscrape_reasoning,
|
||||
// smartscrape_prompt,
|
||||
// } = processSmartScrapeResult(rawExtract, schemaWasWrapped, meta.logger);
|
||||
|
||||
// // Log the SmartScrape decision if applicable
|
||||
// if (schemaWasWrapped) {
|
||||
// meta.logger.info("SmartScrape decision processing result", {
|
||||
// shouldUseSmartscrape,
|
||||
// smartscrape_reasoning,
|
||||
// // Don't log the full prompt potentially
|
||||
// smartscrape_prompt_present: !!smartscrape_prompt,
|
||||
// extractedDataIsPresent:
|
||||
// extractedData !== undefined && extractedData !== null,
|
||||
// });
|
||||
|
||||
// // TODO: Implement logic to ACTUALLY trigger SmartScrape based on the result
|
||||
// // For example:
|
||||
// // if (shouldUseSmartscrape && smartscrape_prompt) {
|
||||
// // meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
|
||||
// // // Call the smartScrape function (which needs to be implemented/imported)
|
||||
// // // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
|
||||
// // // Process/merge smartScrapedDocs with extractedData
|
||||
// // // ... potentially update finalExtract ...
|
||||
// // } else {
|
||||
// // meta.logger.info("SmartScrape not required based on LLM output.");
|
||||
// // }
|
||||
// }
|
||||
|
||||
// Assign the final extracted data
|
||||
if (meta.options.formats.includes("json")) {
|
||||
document.json = extractedData;
|
||||
} else {
|
||||
document.extract = extractedData;
|
||||
}
|
||||
// document.warning = warning;
|
||||
}
|
||||
|
||||
return document;
|
||||
@ -366,7 +717,7 @@ export function removeDefaultProperty(schema: any): any {
|
||||
if (typeof schema !== "object" || schema === null) return schema;
|
||||
|
||||
const rest = { ...schema };
|
||||
|
||||
|
||||
// unsupported global keys
|
||||
delete rest.default;
|
||||
|
||||
@ -408,18 +759,22 @@ export function removeDefaultProperty(schema: any): any {
|
||||
return rest;
|
||||
}
|
||||
|
||||
export async function generateSchemaFromPrompt(prompt: string): Promise<any> {
|
||||
const model = getModel("gpt-4o");
|
||||
export async function generateSchemaFromPrompt(
|
||||
prompt: string,
|
||||
): Promise<{ extract: any; cost: number }> {
|
||||
const model = getModel("gpt-4o", "openai");
|
||||
const retryModel = getModel("gpt-4o-mini", "openai");
|
||||
const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
|
||||
let lastError: Error | null = null;
|
||||
|
||||
for (const temp of temperatures) {
|
||||
try {
|
||||
const { extract } = await generateCompletions({
|
||||
const { extract, cost } = await generateCompletions({
|
||||
logger: logger.child({
|
||||
method: "generateSchemaFromPrompt/generateCompletions",
|
||||
}),
|
||||
model: model,
|
||||
model,
|
||||
retryModel,
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: `You are a schema generator for a web scraping system. Generate a JSON schema based on the user's prompt.
|
||||
@ -448,13 +803,12 @@ DO NOT USE FORMATS.
|
||||
Keep it simple. Don't create too many properties, just the ones that are needed. Don't invent properties.
|
||||
Return a valid JSON schema object with properties that would capture the information requested in the prompt.`,
|
||||
prompt: `Generate a JSON schema for extracting the following information: ${prompt}`,
|
||||
temperature: temp
|
||||
// temperature: temp,
|
||||
},
|
||||
markdown: prompt
|
||||
markdown: prompt,
|
||||
});
|
||||
|
||||
return extract;
|
||||
|
||||
return { extract, cost };
|
||||
} catch (error) {
|
||||
lastError = error as Error;
|
||||
logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
|
||||
|
apps/api/src/services/agentLivecastWS.ts (new file, 56 lines)
@ -0,0 +1,56 @@
import { configDotenv } from 'dotenv';
import { logger } from '../lib/logger';
import type { Request } from 'express';
import WSWebSocket from 'ws';
configDotenv();

/**
 * Attaches WebSocket proxying logic to the Express application
 * This function should be called after creating the Express app but before starting the server
 */
export function attachWsProxy(app: any) {
  logger.info('Attaching WebSocket proxy to Express app');

  // Make sure express-ws is properly initialized
  if (!app.ws) {
    logger.error('Express app does not have WebSocket support. Make sure express-ws is properly initialized.');
    return;
  }

  // Define the WebSocket route
  app.ws('/agent-livecast', (clientWs: WSWebSocket, req: Request) => {
    try {
      console.log(req.url);
      const url = new URL(req.url ?? '', 'http://placeholder/');
      const sessionIdParam = url.searchParams.get('userProvidedId') || '';

      const workerWsUrl = `${process.env.FIRE_ENGINE_BETA_URL?.replace('http', 'ws')}?userProvidedId=${sessionIdParam}`;
      console.log(workerWsUrl);
      const wsWorker = new WebSocket(workerWsUrl);

      wsWorker.onopen = () => {
        // clientWs is your user's browser socket
        // wsWorker is the worker's socket

        // Forward messages from the user -> worker
        clientWs.on('message', (dataFromClient) => {
          wsWorker.send(dataFromClient as unknown as string);
        });

        // Forward messages from the worker -> user
        wsWorker.onmessage = (event) => {
          clientWs.send(event.data);
        };

        // Close events
        clientWs.on('close', () => wsWorker.close());
        wsWorker.onclose = () => clientWs.close();
      };
    } catch (error) {
      console.error('Error in wsProxy upgrade:', error);
      clientWs.close();
    }
  });

  logger.info('WebSocket proxy successfully attached to Express app');
}
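Illustrative sketch (not part of the diff): a browser-side client for the /agent-livecast route added above; the host and session id are placeholders, and the query parameter name comes from attachWsProxy.

// Hypothetical browser-side sketch.
const sessionId = "livecast-session-123"; // matches the agent.sessionId sent with the scrape
const ws = new WebSocket(`wss://api.example.com/agent-livecast?userProvidedId=${sessionId}`);
ws.onmessage = (event) => {
  console.log("agent frame:", event.data); // frames proxied back from the FIRE-1 worker
};
ws.onclose = () => console.log("livecast finished");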
@ -102,6 +102,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
|
||||
crawl_id: job.crawl_id,
|
||||
tokens_billed: job.tokens_billed,
|
||||
is_migrated: true,
|
||||
cost_tracking: job.cost_tracking,
|
||||
};
|
||||
|
||||
// Send job to external server
|
||||
@ -181,6 +182,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
|
||||
num_tokens: job.num_tokens,
|
||||
retry: job.retry,
|
||||
tokens_billed: job.tokens_billed,
|
||||
cost_tracking: job.cost_tracking,
|
||||
},
|
||||
};
|
||||
if (job.mode !== "single_urls") {
|
||||
|
@ -61,7 +61,10 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||
import { indexPage } from "../lib/extract/index/pinecone";
|
||||
import { Document } from "../controllers/v1/types";
|
||||
import { performExtraction } from "../lib/extract/extraction-service";
|
||||
import {
|
||||
ExtractResult,
|
||||
performExtraction,
|
||||
} from "../lib/extract/extraction-service";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
|
||||
import { saveExtract, updateExtract } from "../lib/extract/extract-redis";
|
||||
@ -71,6 +74,7 @@ import { updateDeepResearch } from "../lib/deep-research/deep-research-redis";
|
||||
import { performDeepResearch } from "../lib/deep-research/deep-research-service";
|
||||
import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
|
||||
import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
|
||||
import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
|
||||
|
||||
configDotenv();
|
||||
|
||||
@ -100,19 +104,35 @@ const runningJobs: Set<string> = new Set();
|
||||
|
||||
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
||||
if (await finishCrawlPre(job.data.crawl_id)) {
|
||||
if (job.data.crawlerOptions && !await redisConnection.exists("crawl:" + job.data.crawl_id + ":invisible_urls")) {
|
||||
await redisConnection.set("crawl:" + job.data.crawl_id + ":invisible_urls", "done", "EX", 60 * 60 * 24);
|
||||
if (
|
||||
job.data.crawlerOptions &&
|
||||
!(await redisConnection.exists(
|
||||
"crawl:" + job.data.crawl_id + ":invisible_urls",
|
||||
))
|
||||
) {
|
||||
await redisConnection.set(
|
||||
"crawl:" + job.data.crawl_id + ":invisible_urls",
|
||||
"done",
|
||||
"EX",
|
||||
60 * 60 * 24,
|
||||
);
|
||||
|
||||
const sc = (await getCrawl(job.data.crawl_id))!;
|
||||
|
||||
const visitedUrls = new Set(await redisConnection.smembers(
|
||||
"crawl:" + job.data.crawl_id + ":visited_unique",
|
||||
));
|
||||
const visitedUrls = new Set(
|
||||
await redisConnection.smembers(
|
||||
"crawl:" + job.data.crawl_id + ":visited_unique",
|
||||
),
|
||||
);
|
||||
|
||||
const lastUrls: string[] = ((await supabase_service.rpc("diff_get_last_crawl_urls", {
|
||||
i_team_id: job.data.team_id,
|
||||
i_url: sc.originUrl!,
|
||||
})).data ?? []).map(x => x.url);
|
||||
const lastUrls: string[] = (
|
||||
(
|
||||
await supabase_service.rpc("diff_get_last_crawl_urls", {
|
||||
i_team_id: job.data.team_id,
|
||||
i_url: sc.originUrl!,
|
||||
})
|
||||
).data ?? []
|
||||
).map((x) => x.url);
|
||||
|
||||
const lastUrlsSet = new Set(lastUrls);
|
||||
|
||||
@ -124,14 +144,24 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
||||
);
|
||||
|
||||
const univistedUrls = crawler.filterLinks(
|
||||
Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)),
|
||||
Array.from(lastUrlsSet).filter((x) => !visitedUrls.has(x)),
|
||||
Infinity,
|
||||
sc.crawlerOptions.maxDepth ?? 10,
|
||||
);
|
||||
|
||||
const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id));
|
||||
|
||||
console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount);
|
||||
const addableJobCount =
|
||||
sc.crawlerOptions.limit === undefined
|
||||
? Infinity
|
||||
: sc.crawlerOptions.limit -
|
||||
(await getDoneJobsOrderedLength(job.data.crawl_id));
|
||||
|
||||
console.log(
|
||||
sc.originUrl!,
|
||||
univistedUrls,
|
||||
visitedUrls,
|
||||
lastUrls,
|
||||
addableJobCount,
|
||||
);
|
||||
|
||||
if (univistedUrls.length !== 0 && addableJobCount > 0) {
|
||||
const jobs = univistedUrls.slice(0, addableJobCount).map((url) => {
|
||||
@ -401,13 +431,29 @@ const processExtractJobInternal = async (
|
||||
}, jobLockExtendInterval);
|
||||
|
||||
try {
|
||||
const result = await performExtraction(job.data.extractId, {
|
||||
let result: ExtractResult | null = null;
|
||||
|
||||
// const model = job.data.request.agent?.model
|
||||
// if (job.data.request.agent && model && model.toLowerCase().includes("fire-1")) {
|
||||
// result = await performExtraction(job.data.extractId, {
|
||||
// request: job.data.request,
|
||||
// teamId: job.data.teamId,
|
||||
// subId: job.data.subId,
|
||||
// });
|
||||
// } else {
|
||||
// result = await performExtraction_F0(job.data.extractId, {
|
||||
// request: job.data.request,
|
||||
// teamId: job.data.teamId,
|
||||
// subId: job.data.subId,
|
||||
// });
|
||||
// }
|
||||
result = await performExtraction_F0(job.data.extractId, {
|
||||
request: job.data.request,
|
||||
teamId: job.data.teamId,
|
||||
subId: job.data.subId,
|
||||
});
|
||||
|
||||
if (result.success) {
|
||||
if (result && result.success) {
|
||||
// Move job to completed state in Redis
|
||||
await job.moveToCompleted(result, token, false);
|
||||
return result;
|
||||
@ -418,7 +464,7 @@ const processExtractJobInternal = async (
|
||||
await updateExtract(job.data.extractId, {
|
||||
status: "failed",
|
||||
error:
|
||||
result.error ??
|
||||
result?.error ??
|
||||
"Unknown error, please contact help@firecrawl.com. Extract id: " +
|
||||
job.data.extractId,
|
||||
});
|
||||
@ -481,7 +527,10 @@ const processDeepResearchJobInternal = async (
|
||||
}, jobLockExtendInterval);
|
||||
|
||||
try {
|
||||
console.log("[Deep Research] Starting deep research: ", job.data.researchId);
|
||||
console.log(
|
||||
"[Deep Research] Starting deep research: ",
|
||||
job.data.researchId,
|
||||
);
|
||||
const result = await performDeepResearch({
|
||||
researchId: job.data.researchId,
|
||||
teamId: job.data.teamId,
|
||||
@ -494,9 +543,9 @@ const processDeepResearchJobInternal = async (
|
||||
systemPrompt: job.data.request.systemPrompt,
|
||||
formats: job.data.request.formats,
|
||||
jsonOptions: job.data.request.jsonOptions,
|
||||
});
|
||||
|
||||
if(result.success) {
|
||||
});
|
||||
|
||||
if (result.success) {
|
||||
// Move job to completed state in Redis and update research status
|
||||
await job.moveToCompleted(result, token, false);
|
||||
return result;
|
||||
@ -544,7 +593,7 @@ const processGenerateLlmsTxtJobInternal = async (
|
||||
) => {
|
||||
const logger = _logger.child({
|
||||
module: "generate-llmstxt-worker",
|
||||
method: "processJobInternal",
|
||||
method: "processJobInternal",
|
||||
jobId: job.id,
|
||||
generateId: job.data.generateId,
|
||||
teamId: job.data?.teamId ?? undefined,
|
||||
@ -574,7 +623,9 @@ const processGenerateLlmsTxtJobInternal = async (
|
||||
});
|
||||
return result;
|
||||
} else {
|
||||
const error = new Error("LLMs text generation failed without specific error");
|
||||
const error = new Error(
|
||||
"LLMs text generation failed without specific error",
|
||||
);
|
||||
await job.moveToFailed(error, token, false);
|
||||
await updateGeneratedLlmsTxt(job.data.generateId, {
|
||||
status: "failed",
|
||||
@ -598,7 +649,7 @@ const processGenerateLlmsTxtJobInternal = async (
|
||||
}
|
||||
|
||||
await updateGeneratedLlmsTxt(job.data.generateId, {
|
||||
status: "failed",
|
||||
status: "failed",
|
||||
error: error.message || "Unknown error occurred",
|
||||
});
|
||||
|
||||
@ -685,7 +736,11 @@ const workerFun = async (
|
||||
// we are 1 under the limit, assuming the job insertion logic never over-inserts. - MG
|
||||
const nextJob = await takeConcurrencyLimitedJob(job.data.team_id);
|
||||
if (nextJob !== null) {
|
||||
await pushConcurrencyLimitActiveJob(job.data.team_id, nextJob.id, 60 * 1000); // 60s initial timeout
|
||||
await pushConcurrencyLimitActiveJob(
|
||||
job.data.team_id,
|
||||
nextJob.id,
|
||||
60 * 1000,
|
||||
); // 60s initial timeout
|
||||
|
||||
await queue.add(
|
||||
nextJob.id,
|
||||
@ -1002,7 +1057,9 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
}
|
||||
|
||||
if (job.data.concurrencyLimited) {
|
||||
doc.warning = "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan." + (doc.warning ? " " + doc.warning : "");
|
||||
doc.warning =
|
||||
"This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan." +
|
||||
(doc.warning ? " " + doc.warning : "");
|
||||
}
|
||||
|
||||
const data = {
|
||||
@ -1061,7 +1118,9 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
// If this would be done for non-crossdomain redirects, but also for e.g.
|
||||
// redirecting / -> /introduction (like our docs site does), it would
|
||||
// break crawling the entire site without allowBackwardsCrawling - mogery
|
||||
const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL);
|
||||
const isHostnameDifferent =
|
||||
normalizeUrlOnlyHostname(doc.metadata.url) !==
|
||||
normalizeUrlOnlyHostname(doc.metadata.sourceURL);
|
||||
if (job.data.isCrawlSourceScrape && isHostnameDifferent) {
|
||||
// TODO: re-fetch sitemap for redirect target domain
|
||||
sc.originUrl = doc.metadata.url;
|
||||
@ -1172,7 +1231,8 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
internalOptions: sc.internalOptions,
|
||||
crawlerOptions: {
|
||||
...sc.crawlerOptions,
|
||||
currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
|
||||
currentDiscoveryDepth:
|
||||
(job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
|
||||
},
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
@ -1199,14 +1259,27 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
}
|
||||
|
||||
// Only run check after adding new jobs for discovery - mogery
|
||||
if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
|
||||
throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
|
||||
if (
|
||||
job.data.isCrawlSourceScrape &&
|
||||
crawler.filterLinks(
|
||||
[doc.metadata.url ?? doc.metadata.sourceURL!],
|
||||
1,
|
||||
sc.crawlerOptions?.maxDepth ?? 10,
|
||||
).length === 0
|
||||
) {
|
||||
throw new Error(
|
||||
"Source URL is not allowed by includePaths/excludePaths rules",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await finishCrawlIfNeeded(job, sc);
|
||||
} else {
|
||||
const cost_tracking = doc?.metadata?.costTracking;
|
||||
|
||||
delete doc.metadata.costTracking;
|
||||
|
||||
await logJob({
|
||||
job_id: job.id,
|
||||
success: true,
|
||||
@ -1220,6 +1293,7 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
scrapeOptions: job.data.scrapeOptions,
|
||||
origin: job.data.origin,
|
||||
num_tokens: 0, // TODO: fix
|
||||
cost_tracking,
|
||||
});
|
||||
|
||||
indexJob(job, doc);
|
||||
@ -1230,16 +1304,25 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
if (job.data.scrapeOptions.extract) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
if (job.data.scrapeOptions.agent?.model?.toLowerCase() === "fire-1") {
|
||||
creditsToBeBilled = 150;
|
||||
}
|
||||
|
||||
if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! && process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
if (
|
||||
job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! &&
|
||||
process.env.USE_DB_AUTHENTICATION === "true"
|
||||
) {
|
||||
try {
|
||||
const billingJobId = uuidv4();
|
||||
logger.debug(`Adding billing job to queue for team ${job.data.team_id}`, {
|
||||
billingJobId,
|
||||
credits: creditsToBeBilled,
|
||||
is_extract: false,
|
||||
});
|
||||
|
||||
logger.debug(
|
||||
`Adding billing job to queue for team ${job.data.team_id}`,
|
||||
{
|
||||
billingJobId,
|
||||
credits: creditsToBeBilled,
|
||||
is_extract: false,
|
||||
},
|
||||
);
|
||||
|
||||
// Add directly to the billing queue - the billing worker will handle the rest
|
||||
await getBillingQueue().add(
|
||||
"bill_team",
|
||||
@ -1249,12 +1332,12 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
credits: creditsToBeBilled,
|
||||
is_extract: false,
|
||||
timestamp: new Date().toISOString(),
|
||||
originating_job_id: job.id
|
||||
originating_job_id: job.id,
|
||||
},
|
||||
{
|
||||
jobId: billingJobId,
|
||||
priority: 10,
|
||||
}
|
||||
},
|
||||
);
|
||||
} catch (error) {
|
||||
logger.error(
|
||||
|
@ -22,35 +22,6 @@ export const testSuiteRateLimiter = new RateLimiterRedis({
|
||||
duration: 60, // Duration in seconds
|
||||
});
|
||||
|
||||
// TODO: PUT OVERRIDES FOR THESE INTO THE DB - mogery
|
||||
const testSuiteTokens = [
|
||||
"a01ccae",
|
||||
"6254cf9",
|
||||
"0f96e673",
|
||||
"23befa1b",
|
||||
"69141c4",
|
||||
"48f9a97",
|
||||
"5dc70ad",
|
||||
"e5e60e5",
|
||||
"65181ba",
|
||||
"77c85b7",
|
||||
"8567275",
|
||||
"6c46abb",
|
||||
"cb0ff78",
|
||||
"fd769b2",
|
||||
// "4c2638d",
|
||||
"cbb3462", // don't remove (s-ai)
|
||||
"824abcd", // don't remove (s-ai)
|
||||
"0966288",
|
||||
"226556f",
|
||||
"0a18c9e", // gh
|
||||
];
|
||||
|
||||
// TODO: PUT OVERRIDES FOR THESE INTO THE DB - mogery
|
||||
// const manual_growth = ["22a07b64-cbfe-4924-9273-e3f01709cdf2"];
|
||||
// const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6", "9661a311-3d75-45d2-bb70-71004d995873"];
|
||||
// const manual_etier2c = ["77545e01-9cec-4fa9-8356-883fc66ac13e", "778c62c4-306f-4039-b372-eb20174760c0"];
|
||||
|
||||
const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = {
|
||||
crawl: 15,
|
||||
scrape: 100,
|
||||
@ -60,6 +31,8 @@ const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = {
|
||||
preview: 25,
|
||||
extractStatus: 25000,
|
||||
crawlStatus: 25000,
|
||||
extractAgentPreview: 1,
|
||||
scrapeAgentPreview: 5,
|
||||
};
|
||||
|
||||
export function getRateLimiter(
|
||||
@ -68,10 +41,6 @@ export function getRateLimiter(
|
||||
): RateLimiterRedis {
|
||||
return createRateLimiter(
|
||||
`${mode}`,
|
||||
(rate_limits ?? fallbackRateLimits)[mode] ?? 500,
|
||||
(rate_limits?.[mode] ?? fallbackRateLimits?.[mode] ?? 500),
|
||||
);
|
||||
}
|
||||
|
||||
export function isTestSuiteToken(token: string): boolean {
|
||||
return testSuiteTokens.some((testToken) => token.includes(testToken));
|
||||
}
|
||||
|
@ -7,6 +7,7 @@ import {
} from "./controllers/v1/types";
import { ExtractorOptions, Document } from "./lib/entities";
import { InternalOptions } from "./scraper/scrapeURL";
import type { CostTracking } from "./lib/extract/extraction-service";

type Mode = "crawl" | "single_urls" | "sitemap";

@ -90,6 +91,7 @@ export interface FirecrawlJob {
crawl_id?: string;
tokens_billed?: number;
sources?: Record<string, string[]>;
cost_tracking?: CostTracking;
}

export interface FirecrawlScrapeResponse {
@ -132,11 +134,13 @@ export enum RateLimiterMode {
Crawl = "crawl",
CrawlStatus = "crawlStatus",
Scrape = "scrape",
ScrapeAgentPreview = "scrapeAgentPreview",
Preview = "preview",
Search = "search",
Map = "map",
Extract = "extract",
ExtractStatus = "extractStatus",
ExtractAgentPreview = "extractAgentPreview",
}

export type AuthResponse =
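The two new enum members line up with the extractAgentPreview and scrapeAgentPreview entries added to fallbackRateLimits, so without a per-team override the agent preview paths allow 1 extract and 5 scrape requests per window. A small sketch of that lookup, with the Redis-backed createRateLimiter wiring omitted:

// Sketch: resolves a RateLimiterMode string against the fallback table from the rate limiter diff.
const agentPreviewFallbacks: Partial<Record<string, number>> = {
  extractAgentPreview: 1,
  scrapeAgentPreview: 5,
};

function resolveLimit(mode: string, teamLimits?: Partial<Record<string, number>>): number {
  return teamLimits?.[mode] ?? agentPreviewFallbacks[mode] ?? 500;
}

// resolveLimit("scrapeAgentPreview")  -> 5
// resolveLimit("extractAgentPreview") -> 1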
@ -3,23 +3,26 @@
"rootDir": "./src",
"lib": ["ES2022", "DOM"],

// or higher
"target": "ES2022",

"module": "commonjs",
"module": "NodeNext",
"esModuleInterop": true,
"sourceMap": true,
"outDir": "./dist/src",
"moduleResolution": "node",
"moduleResolution": "NodeNext",
"baseUrl": ".",
"strictNullChecks": true,

"paths": {
"*": ["node_modules/*", "src/types/*"],
},

"inlineSources": true,
"inlineSources": true
},
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
"include": [
"src/",
"src/**/*",
"services/db/supabase.ts",
"utils/utils.ts",
"services/db/supabaseEmbeddings.ts",
"utils/EventEmmitter.ts",
"src/services/queue-service.ts"
]
}
apps/js-sdk/firecrawl/pnpm-lock.yaml (generated, 3611 lines, new file): diff suppressed because it is too large.
@ -167,6 +167,7 @@ export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchem
modes?: ("json" | "git-diff")[];
}
actions?: ActionsSchema;
agent?: AgentOptions;
}

export interface ActionsResult {
@ -296,6 +297,21 @@ export interface MapResponse {
 * Parameters for extracting information from URLs.
 * Defines options for extracting information from URLs.
 */
export interface AgentOptions {
model?: string;
prompt?: string;
sessionId?: string;
}

/**
 * Parameters for extracting information from URLs.
 * Defines options for extracting information from URLs.
 */
export interface AgentOptionsExtract {
model?: string;
sessionId?: string;
}

export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
prompt?: string;
schema?: LLMSchema | object;
@ -306,6 +322,7 @@ export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
origin?: string;
showSources?: boolean;
scrapeOptions?: CrawlScrapeOptions;
agent?: AgentOptionsExtract;
}

/**
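With these type additions, an SDK caller can attach the FIRE-1 agent to either a scrape or an extract. A hedged usage sketch follows; the scrapeUrl/extract method names are taken from the JS SDK, while the URLs, prompts, and wildcard pattern are illustrative:

// Sketch: values are illustrative; error handling omitted.
import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

  // Scrape with the FIRE-1 agent steering the browser session.
  const scraped = await app.scrapeUrl("https://example.com/pricing", {
    formats: ["markdown"],
    agent: { model: "FIRE-1", prompt: "Open the enterprise pricing tab before scraping" },
  });

  // Extract with the agent; AgentOptionsExtract carries no per-request prompt.
  const extracted = await app.extract(["https://example.com/*"], {
    prompt: "List every pricing plan and its monthly cost",
    agent: { model: "FIRE-1" },
  });

  console.log(scraped, extracted);
}

main();

The split between AgentOptions and AgentOptionsExtract mirrors the diff above: the extract variant accepts a model and sessionId but no prompt of its own, since the extraction prompt is supplied at the top level of ExtractParams.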
@ -107,6 +107,7 @@ class FirecrawlApp:
# Just for backwards compatibility
enableWebSearch: Optional[bool] = False
show_sources: Optional[bool] = False
agent: Optional[Dict[str, Any]] = None


@ -183,8 +184,12 @@ class FirecrawlApp:

# Include any other params directly at the top level of scrape_params
for key, value in params.items():
if key not in ['jsonOptions', 'changeTrackingOptions']:
if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']:
scrape_params[key] = value

agent = params.get('agent')
if agent:
scrape_params['agent'] = agent


endpoint = f'/v1/scrape'
@ -706,6 +711,9 @@ class FirecrawlApp:
request_data['systemPrompt'] = params['system_prompt']
elif params.get('systemPrompt'): # Check legacy field name
request_data['systemPrompt'] = params['systemPrompt']

if params.get('agent'):
request_data['agent'] = params['agent']

try:
# Send the initial extract request