From 2ca22659d3359c185cc196b56c5ec40a1f7b2ca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 11 Nov 2024 21:07:37 +0100 Subject: [PATCH] fix(scrapeURL/llmExtract): fix schema-less LLM extract --- apps/api/src/controllers/v1/types.ts | 2 +- .../src/scraper/scrapeURL/transformers/llmExtract.ts | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index f072ca0b..28dbb48f 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -53,7 +53,7 @@ const strictMessage = "Unrecognized key in body -- please review the v1 API docu export const extractOptions = z.object({ mode: z.enum(["llm"]).default("llm"), schema: z.any().optional(), - systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."), + systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."), prompt: z.string().optional() }).strict(strictMessage); diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 22e2649b..69a92197 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -144,6 +144,16 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt } document.extract = jsonCompletion.choices[0].message.parsed; + + if (document.extract === null && jsonCompletion.choices[0].message.content !== null) { + try { + document.extract = JSON.parse(jsonCompletion.choices[0].message.content); + } catch (e) { + logger.error("Failed to parse returned JSON, no schema specified.", { error: e }); + throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object."); + } + } + if (options.schema && options.schema.type === "array") { document.extract = document.extract?.items; }