diff --git a/apps/api/package.json b/apps/api/package.json index 0ece960a..bac13e79 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -94,7 +94,7 @@ "moment": "^2.29.4", "mongoose": "^8.4.4", "natural": "^7.0.7", - "openai": "^4.52.2", + "openai": "^4.57.0", "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 727c4ed4..2762a84c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -124,7 +124,7 @@ importers: version: 0.0.28 langchain: specifier: ^0.2.8 - version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -147,8 +147,8 @@ importers: specifier: ^7.0.7 version: 7.0.7(socks@2.8.3) openai: - specifier: ^4.52.2 - version: 4.52.2 + specifier: ^4.57.0 + version: 4.57.0(zod@3.23.8) pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -3733,9 +3733,14 @@ packages: openai@3.3.0: resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==} - openai@4.52.2: - resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==} + openai@4.57.0: + resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==} hasBin: true + peerDependencies: + zod: ^3.23.8 + peerDependenciesMeta: + zod: + optional: true openapi-types@12.1.3: resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} @@ -5319,13 +5324,13 @@ snapshots: '@js-sdsl/ordered-map@4.4.2': {} - '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)': + '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': dependencies: ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.12 - langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) ml-distance: 4.0.1 mustache: 4.2.0 p-queue: 6.6.2 @@ -5337,20 +5342,20 @@ snapshots: - langchain - openai - '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': + '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) js-tiktoken: 1.0.12 - openai: 4.52.2 + openai: 4.57.0(zod@3.23.8) zod: 3.23.8 zod-to-json-schema: 3.23.1(zod@3.23.8) transitivePeerDependencies: - encoding - langchain - '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)': + '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) js-tiktoken: 1.0.12 transitivePeerDependencies: - langchain @@ -8487,17 +8492,17 @@ snapshots: kleur@3.0.3: {} - langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): + langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) - '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) - '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) + '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) binary-extensions: 2.3.0 js-tiktoken: 1.0.12 js-yaml: 4.1.0 jsonpointer: 5.0.1 langchainhub: 0.0.11 - langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) + langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -8524,7 +8529,7 @@ snapshots: langchainhub@0.0.11: {} - langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2): + langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)): dependencies: '@types/uuid': 9.0.8 commander: 10.0.1 @@ -8533,9 +8538,9 @@ snapshots: p-retry: 4.6.2 uuid: 9.0.1 optionalDependencies: - '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) - langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) - openai: 4.52.2 + '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) + langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) + openai: 4.57.0(zod@3.23.8) languagedetect@2.0.0: {} @@ -8928,16 +8933,19 @@ snapshots: transitivePeerDependencies: - debug - openai@4.52.2: + openai@4.57.0(zod@3.23.8): dependencies: '@types/node': 18.19.39 '@types/node-fetch': 2.6.11 + '@types/qs': 6.9.15 abort-controller: 3.0.0 agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 node-fetch: 2.7.0 - web-streams-polyfill: 3.3.3 + qs: 6.12.2 + optionalDependencies: + zod: 3.23.8 transitivePeerDependencies: - encoding diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 32d60845..7774823a 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -44,7 +44,8 @@ const strictMessage = "Unrecognized key in body -- please review the v1 API docu export const extractOptions = z.object({ mode: z.enum(["llm"]).default("llm"), schema: z.any().optional(), - prompt: z.string().default("Based on the information on the page, extract the information from the schema.") + systemPrompt: z.string().default("Based on the information on the page, extract the information from the schema."), + prompt: z.string().optional() }).strict(strictMessage); export type ExtractOptions = z.infer; @@ -316,6 +317,7 @@ export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions { mode: x.mode ? "llm-extraction" : "markdown", extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.", extractionSchema: x.schema, + userPrompt: x.prompt ?? "", }; } diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index eaad3ecd..a5d0dbaf 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -15,7 +15,8 @@ export async function generateCompletions( // const schema = zodToJsonSchema(options.schema) const schema = extractionOptions.extractionSchema; - const prompt = extractionOptions.extractionPrompt; + const systemPrompt = extractionOptions.extractionPrompt; + const prompt = extractionOptions.userPrompt; const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider @@ -31,9 +32,11 @@ export async function generateCompletions( document: document, schema: schema, prompt: prompt, + systemPrompt: systemPrompt, mode: mode, }); // Validate the JSON output against the schema using AJV + if(schema){ const validate = ajv.compile(schema); if (!validate(completionResult.llm_extraction)) { //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. @@ -41,7 +44,8 @@ export async function generateCompletions( `JSON parsing error(s): ${validate.errors ?.map((err) => err.message) .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.` - ); + ); + } } return completionResult; diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 01b45137..6fbbbc3a 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -16,7 +16,6 @@ function prepareOpenAIDoc( document: Document, mode: "markdown" | "raw-html" ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null { - let markdown = document.markdown; let extractionTarget = document.markdown; @@ -25,7 +24,6 @@ function prepareOpenAIDoc( extractionTarget = document.rawHtml; } - // Check if the markdown content exists in the document if (!extractionTarget) { return null; @@ -34,33 +32,32 @@ function prepareOpenAIDoc( // ); } - - - // count number of tokens const numTokens = numTokensFromString(extractionTarget, "gpt-4"); if (numTokens > maxTokens) { // trim the document to the maximum number of tokens, tokens != characters - extractionTarget = extractionTarget.slice(0, (maxTokens * modifier)); + extractionTarget = extractionTarget.slice(0, maxTokens * modifier); } return [[{ type: "text", text: extractionTarget }], numTokens]; } export async function generateOpenAICompletions({ client, - model = process.env.MODEL_NAME || "gpt-4o", + model = process.env.MODEL_NAME || "gpt-4o-mini", document, schema, //TODO - add zod dynamic type checking - prompt = defaultPrompt, + systemPrompt = defaultPrompt, + prompt, temperature, - mode + mode, }: { client: OpenAI; model?: string; document: Document; schema: any; // This should be replaced with a proper Zod schema type when available prompt?: string; + systemPrompt?: string; temperature?: number; mode: "markdown" | "raw-html"; }): Promise { @@ -70,44 +67,90 @@ export async function generateOpenAICompletions({ if (preparedDoc === null) { return { ...document, - warning: "LLM extraction was not performed since the document's content is empty or missing.", + warning: + "LLM extraction was not performed since the document's content is empty or missing.", }; } const [content, numTokens] = preparedDoc; - const completion = await openai.chat.completions.create({ - model, - messages: [ - { - role: "system", - content: prompt, - }, - { role: "user", content }, - ], - tools: [ - { - type: "function", - function: { - name: "extract_content", - description: "Extracts the content from the given webpage(s)", - parameters: schema, + let completion; + let llmExtraction; + if (prompt && !schema) { + // If prompt is defined, ask OpenAI to generate a schema based on the prompt + // const schemaCompletion = await openai.chat.completions.create({ + // model, + // messages: [ + // { + // role: "system", + // content: "You are a helpful assistant that generates JSON schemas based on user prompts.", + // }, + // { + // role: "user", + // content: `Generate a JSON schema compatible with openai function calling based on this prompt: ${prompt}`, + // }, + // ], + // temperature: 0, + // response_format: { type: "json_object" }, + // }); + + // console.log(schemaCompletion.choices[0].message.content); + + // const generatedSchema = JSON.parse(schemaCompletion.choices[0].message.content); + console.log(prompt); + const jsonCompletion = await openai.chat.completions.create({ + model, + messages: [ + { + role: "system", + content: systemPrompt, }, - }, - ], - tool_choice: { "type": "function", "function": {"name": "extract_content"}}, - temperature, - }); + { role: "user", content }, + { role: "user", content: `Transform the above content into structured json output based on the following user request: ${prompt}` }, + ], + response_format: { type: "json_object" }, + temperature, + }); - const c = completion.choices[0].message.tool_calls[0].function.arguments; + console.log(jsonCompletion.choices[0].message.content); - // Extract the LLM extraction content from the completion response - const llmExtraction = JSON.parse(c); + llmExtraction = JSON.parse(jsonCompletion.choices[0].message.content.trim()); + console.log(llmExtraction); + } else { + completion = await openai.chat.completions.create({ + model, + messages: [ + { + role: "system", + content: systemPrompt, + }, + { role: "user", content }, + ], + tools: [ + { + type: "function", + function: { + name: "extract_content", + description: "Extracts the content from the given webpage(s)", + parameters: schema, + }, + }, + ], + tool_choice: { type: "function", function: { name: "extract_content" } }, + temperature, + }); + const c = completion.choices[0].message.tool_calls[0].function.arguments; + + // Extract the LLM extraction content from the completion response + llmExtraction = JSON.parse(c); + } // Return the document with the LLM extraction content added return { ...document, llm_extraction: llmExtraction, - warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined, + warning: + numTokens > maxTokens + ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` + : undefined, }; } - diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 0eb5464b..dfd17c63 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -36,6 +36,7 @@ export type ExtractorOptions = { mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html"; extractionPrompt?: string; extractionSchema?: Record; + userPrompt?: string; } export type SearchOptions = {