From 904c904971913f08516a181dc7ffeb67ad2a3bae Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 13 Nov 2024 18:06:20 -0300
Subject: [PATCH] wip

---
 .../src/__tests__/e2e_extract/index.test.ts | 151 ++++++++++
 apps/api/src/controllers/v1/extract.ts      | 176 ++++++++------
 apps/api/src/controllers/v1/types.ts        |   3 +-
 apps/api/src/lib/extract/completions.ts     | 217 +++++++++---------
 .../scrapeURL/transformers/llmExtract.ts    |  46 ++--
 apps/api/src/search/fireEngine.ts           |   1 -
 6 files changed, 397 insertions(+), 197 deletions(-)
 create mode 100644 apps/api/src/__tests__/e2e_extract/index.test.ts

diff --git a/apps/api/src/__tests__/e2e_extract/index.test.ts b/apps/api/src/__tests__/e2e_extract/index.test.ts
new file mode 100644
index 00000000..39416bd7
--- /dev/null
+++ b/apps/api/src/__tests__/e2e_extract/index.test.ts
@@ -0,0 +1,151 @@
+import request from "supertest";
+import dotenv from "dotenv";
+import {
+  FirecrawlCrawlResponse,
+  FirecrawlCrawlStatusResponse,
+  FirecrawlScrapeResponse,
+} from "../../types";
+
+dotenv.config();
+const TEST_URL = "http://127.0.0.1:3002";
+
+describe("E2E Tests for Extract API Routes", () => {
+  describe("POST /v1/extract", () => {
+    it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
+      const response = await request(TEST_URL)
+        .post("/v1/extract")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          urls: ["https://firecrawl.dev"],
+          prompt: "Who are the authors of the blog posts?",
+          schema: {
+            type: "object",
+            properties: { authors: { type: "array", items: { type: "string" } } },
+          },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("authors");
+
+      let gotItRight = 0;
+      for (const author of response.body.data?.authors) {
+        if (author.includes("Caleb Peffer")) gotItRight++;
+        if (author.includes("Gergő Móricz")) gotItRight++;
+        if (author.includes("Eric Ciarla")) gotItRight++;
+        if (author.includes("Nicolas Camara")) gotItRight++;
+      }
+
+      expect(gotItRight).toBeGreaterThan(3);
+    }, 60000);
+
+    it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
+      const response = await request(TEST_URL)
+        .post("/v1/extract")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          urls: ["mendable.ai"],
+          prompt: "Who are the founders of the company?",
+          allowExternalLinks: true,
+          schema: {
+            type: "object",
+            properties: { founders: { type: "array", items: { type: "string" } } },
+          },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("founders");
+
+      let gotItRight = 0;
+      for (const founder of response.body.data?.founders) {
+        if (founder.includes("Caleb")) gotItRight++;
+        if (founder.includes("Eric")) gotItRight++;
+        if (founder.includes("Nicolas")) gotItRight++;
+      }
+
+      expect(gotItRight).toBe(3);
+    }, 60000);
+
+    it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
+      const response = await request(TEST_URL)
+        .post("/v1/extract")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          urls: ["https://firecrawl.dev"],
+          prompt: "What are they hiring for?",
+          allowExternalLinks: true,
+          schema: {
+            type: "array",
+            items: {
+              type: "string"
+            }
+          },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      console.log(response.body.data);
+
+      let gotItRight = 0;
+      for (const hiring of response.body.data?.items) {
+        if (hiring.includes("Developer Relations Specialist")) gotItRight++;
+        if (hiring.includes("Web Automation Engineer")) gotItRight++;
+        if (hiring.includes("Developer Experience Engineer")) gotItRight++;
+        if (hiring.includes("Developer Support Engineer")) gotItRight++;
+        if (hiring.includes("Dev Ops Engineer")) gotItRight++;
+        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
+      }
+
+      expect(gotItRight).toBeGreaterThan(5);
+    }, 60000);
+
+    it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
+      const response = await request(TEST_URL)
+        .post("/v1/extract")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          urls: ["fivetran.com"],
+          prompt: "Does Fivetran have PCI DSS compliance?",
+          allowExternalLinks: true,
+          schema: {
+            type: "object",
+            properties: {
+              pciDssCompliance: { type: "boolean" }
+            }
+          },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data?.pciDssCompliance).toBe(true);
+    }, 60000);
+
+    it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
+      const response = await request(TEST_URL)
+        .post("/v1/extract")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          urls: ["fivetran.com"],
+          prompt: "What are the Azure Data Connectors they offer?",
+          schema: {
+            type: "array",
+            items: {
+              type: "object",
+              properties: {
+                connector: { type: "string" },
+                description: { type: "string" },
+                supportsCaptureDelete: { type: "boolean" }
+              }
+            }
+          }
+        })
+
+      console.log(response.body);
+      // expect(response.statusCode).toBe(200);
+      // expect(response.body).toHaveProperty("data");
+      // expect(response.body.data?.pciDssCompliance).toBe(true);
+    }, 60000);
+  });
+});
diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts
index d3c06c34..178213d0 100644
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@@ -26,22 +26,24 @@ import { waitForJob } from "../../services/queue-jobs";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { PlanType } from "../../types";
 import { getJobPriority } from "../../lib/job-priority";
-import { generateFinalExtraction } from "../../lib/extract/completions";
+import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);

 const MAX_EXTRACT_LIMIT = 100;
-const MAX_RANKING_LIMIT = 3;
+const MAX_RANKING_LIMIT = 5;
+const SCORE_THRESHOLD = 0.75;

 export async function extractController(
   req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
-  res: Response //ExtractResponse>
+  res: Response
 ) {
   req.body = extractRequestSchema.parse(req.body);
   const id = crypto.randomUUID();
-  let links: string[] = req.body.urls;
+  let links: string[]; //= req.body.urls;

   const sc: StoredCrawl = {
     originUrl: req.body.urls[0],
@@ -59,10 +61,14 @@ export async function extractController(
   const crawler = crawlToCrawler(id, sc);

   let urlWithoutWww = req.body.urls[0].replace("www.", "");
+  console.log("urlWithoutWww", urlWithoutWww);
-  let mapUrl = req.body.prompt
-    ? `"${req.body.prompt}" site:${urlWithoutWww}`
-    : `site:${req.body.urls[0]}`;
+  const allowExternalLinks = req.body.allowExternalLinks ?? false;
+
+  let mapUrl = req.body.prompt && allowExternalLinks
+    ? `${req.body.prompt} ${urlWithoutWww}`
+    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
+    : `site:${urlWithoutWww}`;

   const resultsPerPage = 100;
   const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
@@ -84,82 +90,103 @@
   };

   pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-  allResults = await Promise.all(pagePromises);
+  allResults = (await Promise.all(pagePromises)).flat();
+
+  // console.log("allResults", allResults);
+  // if allResults is empty, return an error
+  if (allResults.length === 0) {
+    return res.status(400).json({
+      success: false,
+      error: "No results found",
+    });
+  }

   await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
   }

   // console.log("allResults", allResults);

   // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
-    ...(cachedResult ? [] : pagePromises),
-  ]);
+  // const [sitemap, ...searchResults] = await Promise.all([
+  //   req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
+  //   ...(cachedResult ? [] : pagePromises),
+  // ]);

-  if (!cachedResult) {
-    allResults = searchResults;
-  }
+  // if (!cachedResult) {
+  //   allResults = searchResults;
+  // }

-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
+  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
+  console.log("links", links);
+  // if (sitemap !== null) {
+  //   sitemap.forEach((x) => {
+  //     links.push(x.url);
+  //   });
+  // }

-  let mapResults = allResults
-    .flat()
-    .filter((result) => result !== null && result !== undefined);
+  // let mapResults = allResults
+  //   .flat()
+  //   .filter((result) => result !== null && result !== undefined);

-  const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
-  if (mapResults.length > minumumCutoff) {
-    mapResults = mapResults.slice(0, minumumCutoff);
-  }
+  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
+  // if (mapResults.length > minumumCutoff) {
+  //   mapResults = mapResults.slice(0, minumumCutoff);
+  // }

-  if (mapResults.length > 0) {
-    if (req.body.prompt) {
-      // Ensure all map results are first, maintaining their order
-      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
-        ...links,
-      ];
-    } else {
-      mapResults.map((x) => {
-        links.push(x.url);
-      });
-    }
-  }
+  // if (mapResults.length > 0) {
+  //   if (req.body.prompt) {
+  //     // Ensure all map results are first, maintaining their order
+  //     links = [
+  //       mapResults[0].url,
+  //       ...mapResults.slice(1).map((x) => x.url),
+  //       ...links,
+  //     ];
+  //   } else {
+  //     mapResults.map((x) => {
+  //       links.push(x.url);
+  //     });
+  //   }
+  // }
+
+  // console.log("mapResults", mapResults);

   // console.log("links", links);

   let linksAndScores: { link: string; score: number }[] = [];
   // Perform cosine similarity between the search query and the list of links
   if (req.body.prompt) {
-    const searchQuery = req.body.prompt.toLowerCase();
+    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
     linksAndScores = await performRanking(links, searchQuery);
   }
+  console.log("linksAndScores", linksAndScores);
+
+  links = linksAndScores
+    .filter(x => x.score > SCORE_THRESHOLD)
+    .map(x => x.link.split("url: ")[1].split(",")[0])
+    .filter(x => !isUrlBlocked(x))
+
+  console.log("links:", links.length);
+
+  // should we use some sort of llm to determine the best links?

   // console.log("linksAndScores", linksAndScores);
-  links = links
-    .map((x) => {
-      try {
-        return checkAndUpdateURLForMap(x).url.trim();
-      } catch (_) {
-        return null;
-      }
-    })
-    .filter((x) => x !== null) as string[];
+  // links = links
+  //   .map((x) => {
+  //     try {
+  //       return checkAndUpdateURLForMap(x).url.trim();
+  //     } catch (_) {
+  //       return null;
+  //     }
+  //   })
+  //   .filter((x) => x !== null) as string[];

   // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
+  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));

   // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
-  }
+  // if (!req.body.includeSubdomains) {
+  //   links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
+  // }

   // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
+  // links = removeDuplicateUrls(links);

   // get top N links
   links = links.slice(0, MAX_RANKING_LIMIT);
@@ -170,7 +197,7 @@ for (const url of links) {
     const origin = req.body.origin || "api";
-    const timeout = req.body.timeout;
+    const timeout = req.body.timeout ?? 30000;
     const jobId = crypto.randomUUID();

     const startTime = new Date().getTime();
@@ -196,7 +223,7 @@
       jobPriority
     );

-    const totalWait = 60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
+    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);

     let doc: Document;
     try {
@@ -234,18 +261,20 @@
     docs.push(doc);
   }
+  console.log(docs)

-  // console.log("docs", docs);
+  const completions = await generateOpenAICompletions(
+    logger.child({ method: "extractController/generateOpenAICompletions" }),
+    {
+      mode: "llm",
+      systemPrompt: "Only use the provided content to answer the question.",
+      prompt: mapUrl,
+      schema: req.body.schema,
+    },
+    docs.map(x => x.markdown).join('\n')
+  );

-  // {"message":"Missing required parameter: 'response_format.json_schema.schema'.","type":"invalid_request_error","param":"response_format.json_schema.schema","code":"missing_required_parameter"},"code":"missing_required_parameter","param":"response_format.json_schema.schema","type":"invalid_request_error"}
-  const completions = await generateFinalExtraction({
-    pagesContent: docs.map(x => x.markdown).join('\n'),
-    systemPrompt: '',
-    prompt: req.body.prompt,
-    schema: req.body.schema,
-  });
-
-  // console.log("completions", completions);
+  console.log("completions", completions);

   // if(req.body.extract && req.body.formats.includes("extract")) {
   //   creditsToBeBilled = 5;
@@ -315,9 +344,18 @@
   //   scrape_id: result.scrape_id
   // };

+  console.log("completions.extract", completions.extract);
+
+  let data: any;
+  try {
+    data = JSON.parse(completions.extract);
+  } catch (e) {
+    data = completions.extract;
+  }
+
   return res.status(200).json({
     success: true,
-    data: completions.content, // includeMetadata ? mapResults : linksToReturn,
+    data: data, // includeMetadata ? mapResults : linksToReturn,
     scrape_id: id, //origin?.includes("website") ? id : undefined,
   });
 }
\ No newline at end of file
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 02523bb7..f9048fd6 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -161,6 +161,7 @@ export const extractV1Options = z.object({
   limit: z.number().int().positive().finite().safe().optional(),
   ignoreSitemap: z.boolean().default(false),
   includeSubdomains: z.boolean().default(true),
+  allowExternalLinks: z.boolean().default(false),
   origin: z.string().optional().default("api"),
   timeout: z.number().int().positive().finite().safe().default(60000),
 }).strict(strictMessage)
@@ -353,7 +354,7 @@ export type ExtractResponse =
   | {
       success: true;
       warning?: string;
-      data: Document;
+      data: z.infer;
       scrape_id?: string;
     };

diff --git a/apps/api/src/lib/extract/completions.ts b/apps/api/src/lib/extract/completions.ts
index c02b3e31..34a5a215 100644
--- a/apps/api/src/lib/extract/completions.ts
+++ b/apps/api/src/lib/extract/completions.ts
@@ -1,121 +1,124 @@
-import OpenAI from "openai";
-import { encoding_for_model } from "@dqbd/tiktoken";
-import { TiktokenModel } from "@dqbd/tiktoken";
-import { ExtractOptions } from "../../controllers/v1/types";
-import { Document } from "../entities";
-import { z } from "zod";
+// use llmExtract.ts instead

-const maxTokens = 32000;
-const modifier = 4;
+// import OpenAI from "openai";
+// import { encoding_for_model } from "@dqbd/tiktoken";
+// import { TiktokenModel } from "@dqbd/tiktoken";
+// import { ExtractOptions } from "../../controllers/v1/types";
+// import { Document } from "../entities";
+// import { z } from "zod";

-export class LLMRefusalError extends Error {
-  constructor(refusal: string) {
-    super("LLM refused to extract the website's content");
-    this.name = "LLMRefusalError";
-  }
-}
+// const maxTokens = 32000;
+// const modifier = 4;

-interface GenerateCompletionsParams {
-  systemPrompt?: string;
-  prompt?: string;
-  schema?: any;
-  pagesContent: string;
-}
+// export class LLMRefusalError extends Error {
+//   constructor(refusal: string) {
+//     super("LLM refused to extract the website's content");
+//     this.name = "LLMRefusalError";
+//   }
+// }

-export async function generateBasicCompletion(prompt: string) {
-  const openai = new OpenAI();
-  const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
+// interface GenerateCompletionsParams {
+//   systemPrompt?: string;
+//   prompt?: string;
+//   schema?: any;
+//   pagesContent: string;
+// }

-  const completion = await openai.chat.completions.create({
-    model,
-    messages: [{ role: "user", content: prompt }],
-  });
+// export async function generateBasicCompletion(prompt: string) {
+//   const openai = new OpenAI();
+//   const model: TiktokenModel =
+//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  return completion.choices[0].message.content;
-}
+//   const completion = await openai.chat.completions.create({
+//     model,
+//     messages: [{ role: "user", content: prompt }],
+//   });

-export async function generateFinalExtraction({
-  pagesContent,
-  systemPrompt,
-  prompt,
-  schema,
-}: GenerateCompletionsParams): Promise<{
-  content: string;
-  metadata: { numTokens: number; warning: string };
-}> {
-  const openai = new OpenAI();
-  const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
+//   return completion.choices[0].message.content;
+// }

-  let extractionContent = pagesContent;
-  let numTokens = 0;
-  let warning = "";
+// export async function generateFinalExtraction({
+//   pagesContent,
+//   systemPrompt,
+//   prompt,
+//   schema,
+// }: GenerateCompletionsParams): Promise<{
+//   content: string;
+//   metadata: { numTokens: number; warning: string };
+// }> {
+//   const openai = new OpenAI();
+//   const model: TiktokenModel =
+//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  const encoder = encoding_for_model(model);
-  try {
-    const tokens = encoder.encode(extractionContent);
-    numTokens = tokens.length;
-  } catch (error) {
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
-    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
-  } finally {
-    encoder.free();
-  }
+//   let extractionContent = pagesContent;
+//   let numTokens = 0;
+//   let warning = "";

-  if (numTokens > maxTokens) {
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
-    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
-  }
+//   const encoder = encoding_for_model(model);
+//   try {
+//     const tokens = encoder.encode(extractionContent);
+//     numTokens = tokens.length;
+//   } catch (error) {
+//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
+//     warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+//   } finally {
+//     encoder.free();
+//   }

-  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
-    schema = {
-      type: "object",
-      properties: {
-        items: schema,
-      },
-      required: ["items"],
-      additionalProperties: false,
-    };
-  } else if (schema) {
-    schema.additionalProperties = false;
-    schema.required = Object.keys(schema.properties);
-  }
+//   if (numTokens > maxTokens) {
+//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
+//     warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+//   }

-  const jsonCompletion = await openai.beta.chat.completions.parse({
-    model,
-    messages: [
-      { role: "system", content: systemPrompt ?? "" },
-      { role: "user", content: [{ type: "text", text: extractionContent }] },
-      {
-        role: "user",
-        content: prompt
-          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
-          : "Transform the above content into structured JSON output.",
-      },
-    ],
-    response_format: schema
-      ? {
-          type: "json_schema",
-          json_schema: {
-            name: "websiteContent",
-            schema: schema,
-            strict: true,
-          },
-        }
-      : { type: "json_object" },
-  });
+//   if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
+//     schema = {
+//       type: "object",
+//       properties: {
+//         items: schema,
+//       },
+//       required: ["items"],
+//       additionalProperties: false,
+//     };
+//   } else if (schema) {
+//     schema.additionalProperties = false;
+//     schema.required = Object.keys(schema.properties);
+//   }

-  if (jsonCompletion.choices[0].message.refusal !== null) {
-    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
-  }
+//   const jsonCompletion = await openai.beta.chat.completions.parse({
+//     temperature: 0,
+//     model,
+//     messages: [
+//       { role: "system", content: systemPrompt ?? "" },
+//       { role: "user", content: [{ type: "text", text: extractionContent }] },
+//       {
+//         role: "user",
+//         content: prompt
+//           ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
+//           : "Transform the above content into structured JSON output.",
+//       },
+//     ],
+//     response_format: schema
+//       ? {
+//           type: "json_schema",
+//           json_schema: {
+//             name: "websiteContent",
+//             schema: schema,
+//             strict: true,
+//           },
+//         }
+//       : { type: "json_object" },
+//   });

-  const extraction = jsonCompletion.choices[0].message.parsed;
-  return {
-    content: extraction ?? "",
-    metadata: {
-      numTokens,
-      warning,
-    },
-  };
-}
+//   if (jsonCompletion.choices[0].message.refusal !== null) {
+//     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
+//   }
+
+//   const extraction = jsonCompletion.choices[0].message.parsed;
+//   return {
+//     content: extraction ?? "",
"", +// metadata: { +// numTokens, +// warning, +// }, +// }; +// } diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 69a92197..1a5abd66 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -58,32 +58,33 @@ function normalizeSchema(x: any): any { } } -async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise { +export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, warning: string | undefined }> { + let extract: any; + let warning: string | undefined; + const openai = new OpenAI(); const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; - if (document.markdown === undefined) { + if (markdown === undefined) { throw new Error("document.markdown is undefined -- this is unexpected"); } - let extractionContent = document.markdown; - // count number of tokens let numTokens = 0; const encoder = encoding_for_model(model as TiktokenModel); try { // Encode the message into tokens - const tokens = encoder.encode(extractionContent); + const tokens = encoder.encode(markdown); // Return the number of tokens numTokens = tokens.length; } catch (error) { - logger.warn("Calculating num tokens of string failed", { error, extractionContent }); + logger.warn("Calculating num tokens of string failed", { error, markdown }); - extractionContent = extractionContent.slice(0, maxTokens * modifier); + markdown = markdown.slice(0, maxTokens * modifier); - const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; - document.warning = document.warning === undefined ? warning : " " + warning; + let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; + warning = previousWarning === undefined ? w : w + " " + previousWarning; } finally { // Free the encoder resources after use encoder.free(); @@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt if (numTokens > maxTokens) { // trim the document to the maximum number of tokens, tokens != characters - extractionContent = extractionContent.slice(0, maxTokens * modifier); + markdown = markdown.slice(0, maxTokens * modifier); - const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; - document.warning = document.warning === undefined ? warning : " " + warning; + const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; + warning = previousWarning === undefined ? 
   }

   let schema = options.schema;
@@ -120,7 +121,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     },
     {
       role: "user",
-      content: [{ type: "text", text: extractionContent }],
+      content: [{ type: "text", text: markdown }],
     },
     {
       role: "user",
@@ -143,11 +144,11 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
   }

-  document.extract = jsonCompletion.choices[0].message.parsed;
+  extract = jsonCompletion.choices[0].message.parsed;

-  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
+  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
     try {
-      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
+      extract = JSON.parse(jsonCompletion.choices[0].message.content);
     } catch (e) {
       logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
       throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
@@ -155,14 +156,21 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     }
   }

   if (options.schema && options.schema.type === "array") {
-    document.extract = document.extract?.items;
+    extract = extract?.items;
   }

-  return document;
+  return { extract, warning };
 }

 export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
   if (meta.options.formats.includes("extract")) {
-    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
+    const { extract, warning } = await generateOpenAICompletions(
+      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
+      meta.options.extract!,
+      document.markdown,
+      document.warning,
+    );
+    document.extract = extract;
+    document.warning = warning;
   }

   return document;
diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts
index 09a58e4e..0b82478e 100644
--- a/apps/api/src/search/fireEngine.ts
+++ b/apps/api/src/search/fireEngine.ts
@@ -37,7 +37,6 @@ export async function fireEngineMap(
     );
     return [];
   }
-  console.log("process.env.FIRE_ENGINE_BETA_URL", process.env.FIRE_ENGINE_BETA_URL);

   const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
     method: "POST",