From 5ab86b8b43afa79dd733eafeada4b03705805778 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 21 Feb 2025 16:44:42 -0300
Subject: [PATCH] (fix/token-slicer) Fixes extract token limit issues (#1236)

* Nick: fixes extract token limit errors

* Update llmExtract.ts

* Update llmExtract.ts
---
 .../src/lib/deep-research/research-manager.ts |  13 +-
 .../scrapeURL/transformers/llmExtract.test.ts | 253 +++++++++++++-----
 .../scrapeURL/transformers/llmExtract.ts      | 123 +++++----
 3 files changed, 259 insertions(+), 130 deletions(-)

diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts
index 1b3b0626..61d5bd34 100644
--- a/apps/api/src/lib/deep-research/research-manager.ts
+++ b/apps/api/src/lib/deep-research/research-manager.ts
@@ -5,8 +5,7 @@ import {
   DeepResearchSource,
   updateDeepResearch,
 } from "./deep-research-redis";
-import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
-import { truncateText } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
 
 interface AnalysisResult {
   gaps: string[];
@@ -178,7 +177,7 @@ export class ResearchLLMService {
         },
       },
       prompt: `Generate a list of 3-5 search queries to deeply research this topic: "${topic}"
-        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${truncateText(findings.map((f) => `- ${f.text}`).join("\n"), 10000)}` : ""}
+        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${trimToTokenLimit(findings.map((f) => `- ${f.text}`).join("\n"), 10000).text}` : ""}
 
         Each query should be specific and focused on a particular aspect.
         Build upon previous findings when available.
@@ -225,7 +224,7 @@ export class ResearchLLMService {
            },
          },
        },
-        prompt: truncateText(
+        prompt: trimToTokenLimit(
          `You are researching: ${currentTopic}
          You have ${timeRemainingMinutes} minutes remaining to complete the research but you don't need to use all of it.
          Current findings: ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
@@ -234,7 +233,7 @@ export class ResearchLLMService {
          Important: If less than 1 minute remains, set shouldContinue to false to allow time for final synthesis.
          If I have enough information, set shouldContinue to false.`,
          120000,
-        ),
+        ).text,
       },
       markdown: "",
     });
@@ -266,7 +265,7 @@ export class ResearchLLMService {
          report: { type: "string" },
        },
      },
-      prompt: truncateText(
+      prompt: trimToTokenLimit(
        `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
       Research data:
@@ -281,7 +280,7 @@ export class ResearchLLMService {
       - Cite sources
       - Use bullet points and lists where appropriate for readability`,
       100000,
-      ),
+      ).text,
     },
     markdown: "",
   });
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
index a3f3e04c..60da5923 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
@@ -1,5 +1,5 @@
 import { removeDefaultProperty } from "./llmExtract";
-import { truncateText } from "./llmExtract";
+import { trimToTokenLimit } from "./llmExtract";
 import { encoding_for_model } from "@dqbd/tiktoken";
 
 jest.mock("@dqbd/tiktoken", () => ({
@@ -46,10 +46,13 @@ describe("removeDefaultProperty", () => {
   });
 });
 
-describe("truncateText", () => {
+
+describe("trimToTokenLimit", () => {
   const mockEncode = jest.fn();
+  const mockFree = jest.fn();
   const mockEncoder = {
     encode: mockEncode,
+    free: mockFree,
   };
 
   beforeEach(() => {
@@ -57,84 +60,214 @@
     (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
   });
 
-  it("should return the original text if it's within token limit", () => {
-    const text = "This is a short text";
+  it("should return original text if within token limit", () => {
+    const text = "This is a test text";
     mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
 
-    const result = truncateText(text, 10);
-    expect(result).toBe(text);
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text,
+      numTokens: 5,
+      warning: undefined
+    });
     expect(mockEncode).toHaveBeenCalledWith(text);
+    expect(mockFree).toHaveBeenCalled();
   });
 
-  it("should truncate text that exceeds token limit", () => {
-    const text = "This is a longer text that needs truncation";
-    mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens
+  it("should trim text and return warning when exceeding token limit", () => {
+    const text = "This is a longer text that needs to be trimmed";
+    mockEncode
+      .mockReturnValueOnce(new Array(20)) // First call for full text
+      .mockReturnValueOnce(new Array(8)); // Second call for trimmed text
 
-    const result = truncateText(text, 10);
-    expect(result.length).toBeLessThan(text.length);
-    expect(mockEncode).toHaveBeenCalled();
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(8);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
   });
 
-  it("should handle empty string", () => {
-    const text = "";
-    mockEncode.mockReturnValue([]);
+  it("should append previous warning if provided", () => {
+    const text = "This is a test text that is too long";
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(15))
+      .mockReturnValueOnce(new Array(8));
 
-    const result = truncateText(text, 10);
-    expect(result).toBe("");
-    expect(mockEncode).toHaveBeenCalledWith("");
+    const result = trimToTokenLimit(text, 10, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
   });
 
-  it("should use character-based fallback when encoder throws error", () => {
-    const text = "This is some text";
+  it("should use fallback approach when encoder throws error", () => {
+    const text = "This is some text to test fallback";
     mockEncode.mockImplementation(() => {
       throw new Error("Encoder error");
     });
 
-    const result = truncateText(text, 5);
-    // With modifier of 3, should truncate to approximately 15 characters
-    expect(result.length).toBeLessThanOrEqual(15);
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars per token
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
   });
 
-  it("should handle very short max token limits", () => {
+  it("should handle empty text", () => {
+    const text = "";
+    mockEncode.mockReturnValue([]);
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text: "",
+      numTokens: 0,
+      warning: undefined
+    });
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (128k)", () => {
+    const text = "A".repeat(384000); // Assuming ~3 chars per token, this would be ~128k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(130000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(127000)); // Second check shows it's within limit after trim
+
+    const result = trimToTokenLimit(text, 128000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(127000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (512k) with 32k context window", () => {
+    const text = "A".repeat(1536000); // Assuming ~3 chars per token, this would be ~512k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(520000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(32000)); // Second check shows it's within context limit after trim
+
+    const result = trimToTokenLimit(text, 32000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(32000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should preserve text when under token limit", () => {
     const text = "Short text";
+    mockEncode.mockReturnValue(new Array(5)); // 5 tokens
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(5);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should append new warning to previous warning", () => {
+    const text = "A".repeat(300);
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle encoder initialization failure gracefully", () => {
+    const text = "Sample text";
+    (encoding_for_model as jest.Mock).mockImplementationOnce(() => {
+      throw new Error("Encoder initialization failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).not.toHaveBeenCalled();
+  });
+
+  it("should handle encoding errors during trimming", () => {
+    const text = "Sample text";
+    mockEncode.mockImplementation(() => {
+      throw new Error("Encoding failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle very small token limits", () => {
+    const text = "This is a test sentence that should be trimmed significantly";
+    mockEncode
+      .mockReturnValueOnce(new Array(20))
+      .mockReturnValueOnce(new Array(3));
+
+    const result = trimToTokenLimit(text, 3, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(3);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle unicode characters", () => {
+    const text = "Hello 👋 World 🌍";
+    mockEncode
+      .mockReturnValueOnce(new Array(8))
+      .mockReturnValueOnce(new Array(4));
+
+    const result = trimToTokenLimit(text, 4, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(4);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle multiple trimming iterations", () => {
+    const text = "A".repeat(1000);
+    mockEncode
+      .mockReturnValueOnce(new Array(300))
+      .mockReturnValueOnce(new Array(200))
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(50);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(4);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle exact token limit match", () => {
+    const text = "Exact token limit text";
     mockEncode.mockReturnValue(new Array(10));
 
-    const result = truncateText(text, 1);
-    expect(result.length).toBeLessThan(text.length);
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
   });
 
-  it("should handle zero max tokens", () => {
-    const text = "Some text";
-    mockEncode.mockReturnValue(new Array(2));
-
-    const result = truncateText(text, 0);
-    expect(result).toBe("");
-  });
-
-  it("should handle extremely large text exceeding model context", () => {
-    // Create a very large text (e.g., 100,000 characters)
-    const text = "a".repeat(100000);
-
-    // First call: simulate 25000 tokens
-    mockEncode.mockReturnValueOnce(new Array(25000));
-    // Subsequent calls: simulate gradually decreasing token counts
-    // This simulates the iterative truncation process
-    mockEncode
-      .mockReturnValueOnce(new Array(20000))
-      .mockReturnValueOnce(new Array(15000))
-      .mockReturnValueOnce(new Array(12000))
-      .mockReturnValueOnce(new Array(9000));
-
-    const result = truncateText(text, 10000); // Common model context limit
-
-    // The result should be significantly shorter but not empty
-    expect(result.length).toBeLessThan(text.length);
-    expect(result.length).toBeGreaterThan(0);
-    // Given our new conservative approach, we should have a substantial amount of text
-    expect(result.length).toBeGreaterThan(30000); // At least 30% of original
-    expect(mockEncode).toHaveBeenCalled();
-
-    // Log the actual length for verification
-    console.log("Result length:", result.length, "characters");
-  });
+
 });
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 9b31cb80..29bf4f6f 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -89,34 +89,63 @@ function normalizeSchema(x: any): any {
   }
 }
 
-export function truncateText(text: string, maxTokens: number): string {
-  const modifier = 3; // Estimate: 1 token ≈ 3-4 characters for safety
+
+
+interface TrimResult {
+  text: string;
+  numTokens: number;
+  warning?: string;
+}
+
+export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
   try {
-    const encoder = encoding_for_model("gpt-4o");
-    // Continuously trim the text until its token count is within the limit.
-    while (true) {
+    const encoder = encoding_for_model(modelId as TiktokenModel);
+    try {
       const tokens = encoder.encode(text);
-      if (tokens.length <= maxTokens) {
-        return text;
+      const numTokens = tokens.length;
+
+      if (numTokens <= maxTokens) {
+        return { text, numTokens };
       }
-      // Calculate a new length using a more conservative approach
-      // Instead of scaling the entire text, we'll remove a smaller portion
-      const ratio = maxTokens / tokens.length;
-      const newLength = Math.max(
-        Math.ceil(text.length * ratio),
-        Math.floor(text.length * 0.8) // Never remove more than 20% at once
-      );
-      if (newLength <= 0) {
-        return "";
+
+      const modifier = 3;
+      // Start with 3 chars per token estimation
+      let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);
+
+      // Keep trimming until we're under the token limit
+      while (true) {
+        const currentTokens = encoder.encode(currentText);
+        if (currentTokens.length <= maxTokens) {
+          const warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+          return {
+            text: currentText,
+            numTokens: currentTokens.length,
+            warning: previousWarning ? `${warning} ${previousWarning}` : warning
+          };
+        }
+        const overflow = currentTokens.length * modifier - maxTokens - 1;
+        // If still over limit, remove another chunk
+        currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
       }
-      text = text.slice(0, newLength);
+
+    } catch (e) {
+      throw e;
+    } finally {
+      encoder.free();
     }
   } catch (error) {
-    // Fallback using character-based estimation.
-    if (text.length <= maxTokens * modifier) {
-      return text;
-    }
-    return text.slice(0, maxTokens * modifier);
+    // Fallback to a more conservative character-based approach
+    const estimatedCharsPerToken = 2.8;
+    const safeLength = maxTokens * estimatedCharsPerToken;
+    const trimmedText = text.slice(0, Math.floor(safeLength));
+
+    const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+
+    return {
+      text: trimmedText,
+      numTokens: maxTokens, // We assume we hit the max in this fallback case
+      warning: previousWarning ? `${warning} ${previousWarning}` : warning
+    };
   }
 }
 
@@ -149,51 +178,19 @@ export async function generateCompletions({
   }
 
   const { maxInputTokens, maxOutputTokens } = getModelLimits(model.modelId);
-
-  // Ratio of 4 was way too high, now 3.5.
- const modifier = 3.5; // tokens to characters ratio // Calculate 80% of max input tokens (for content) const maxTokensSafe = Math.floor(maxInputTokens * 0.8); - // count number of tokens - let numTokens = 0; - try { - // Encode the message into tokens - const encoder = encoding_for_model(model.modelId as TiktokenModel); - - try { - const tokens = encoder.encode(markdown); - numTokens = tokens.length; - } catch (e) { - throw e; - } finally { - // Free the encoder resources after use - encoder.free(); - } - } catch (error) { - logger.warn("Calculating num tokens of string failed", { error }); + // Use the new trimming function + const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit( + markdown, + maxTokensSafe, + model.modelId, + previousWarning + ); - markdown = markdown.slice(0, maxTokensSafe * modifier); - - let w = - "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + - maxTokensSafe + - ") we support."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } - - if (numTokens > maxTokensSafe) { - // trim the document to the maximum number of tokens, tokens != characters - markdown = markdown.slice(0, maxTokensSafe * modifier); - - const w = - "The extraction content would have used more tokens (" + - numTokens + - ") than the maximum we allow (" + - maxTokensSafe + - "). -- the input has been automatically trimmed."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } + markdown = trimmedMarkdown; + warning = trimWarning; let schema = options.schema; // Normalize the bad json schema users write (mogery)
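
Usage note: trimToTokenLimit replaces truncateText and returns a TrimResult
({ text, numTokens, warning }) rather than a bare string, so call sites must
read .text and should propagate warning, as the research-manager changes above
do. A minimal sketch of the intended call pattern, assuming a caller shaped
like the deep-research code (the findings array below is illustrative; the
10,000-token budget and "gpt-4o" default come from this patch, and the import
path is abbreviated as in the test file):

    import { trimToTokenLimit } from "./llmExtract";

    // Illustrative findings; the real caller builds these from scraped sources.
    const findings = [
      { text: "Finding one", source: "https://example.com/a" },
      { text: "Finding two", source: "https://example.com/b" },
    ];

    // Trim the findings block to a 10,000-token budget, counted with the
    // model's tokenizer (falls back to a ~2.8 chars/token estimate on error).
    const { text, numTokens, warning } = trimToTokenLimit(
      findings.map((f) => `- ${f.text}`).join("\n"),
      10000,
      "gpt-4o",
    );

    // `text` now fits the budget; `warning` is set only when trimming (or the
    // character-based fallback) occurred, and should be surfaced to the user.
    const prompt = `Based on these findings, generate more specific queries:\n${text}`;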