From 5ab86b8b43afa79dd733eafeada4b03705805778 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 21 Feb 2025 16:44:42 -0300
Subject: [PATCH] (fix/token-slicer) Fixes extract token limit issues (#1236)

* Nick: fixes extract token limit errors

* Update llmExtract.ts

* Update llmExtract.ts
---
 .../src/lib/deep-research/research-manager.ts |  13 +-
 .../scrapeURL/transformers/llmExtract.test.ts | 253 +++++++++++++-----
 .../scrapeURL/transformers/llmExtract.ts      | 123 +++++----
 3 files changed, 259 insertions(+), 130 deletions(-)

diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts
index 1b3b0626..61d5bd34 100644
--- a/apps/api/src/lib/deep-research/research-manager.ts
+++ b/apps/api/src/lib/deep-research/research-manager.ts
@@ -5,8 +5,7 @@ import {
   DeepResearchSource,
   updateDeepResearch,
 } from "./deep-research-redis";
-import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
-import { truncateText } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
 
 interface AnalysisResult {
   gaps: string[];
@@ -178,7 +177,7 @@ export class ResearchLLMService {
         },
       },
       prompt: `Generate a list of 3-5 search queries to deeply research this topic: "${topic}"
-        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${truncateText(findings.map((f) => `- ${f.text}`).join("\n"), 10000)}` : ""}
+        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${trimToTokenLimit(findings.map((f) => `- ${f.text}`).join("\n"), 10000).text}` : ""}
 
         Each query should be specific and focused on a particular aspect.
         Build upon previous findings when available.
@@ -225,7 +224,7 @@ export class ResearchLLMService {
            },
          },
        },
-        prompt: truncateText(
+        prompt: trimToTokenLimit(
          `You are researching: ${currentTopic}
          You have ${timeRemainingMinutes} minutes remaining to complete the research but you don't need to use all of it.
          Current findings: ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
@@ -234,7 +233,7 @@ export class ResearchLLMService {
          Important: If less than 1 minute remains, set shouldContinue to false to allow time for final synthesis.
          If I have enough information, set shouldContinue to false.`,
          120000,
-        ),
+        ).text,
       },
       markdown: "",
     });
@@ -266,7 +265,7 @@ export class ResearchLLMService {
          report: { type: "string" },
        },
      },
-      prompt: truncateText(
+      prompt: trimToTokenLimit(
        `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
       Research data:
@@ -281,7 +280,7 @@ export class ResearchLLMService {
       - Cite sources
       - Use bullet points and lists where appropriate for readability`,
       100000,
-      ),
+      ).text,
     },
     markdown: "",
   });
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
index a3f3e04c..60da5923 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
@@ -1,5 +1,5 @@
 import { removeDefaultProperty } from "./llmExtract";
-import { truncateText } from "./llmExtract";
+import { trimToTokenLimit } from "./llmExtract";
 import { encoding_for_model } from "@dqbd/tiktoken";
 
 jest.mock("@dqbd/tiktoken", () => ({
@@ -46,10 +46,13 @@ describe("removeDefaultProperty", () => {
   });
 });
 
-describe("truncateText", () => {
+
+describe("trimToTokenLimit", () => {
   const mockEncode = jest.fn();
+  const mockFree = jest.fn();
   const mockEncoder = {
     encode: mockEncode,
+    free: mockFree,
   };
 
   beforeEach(() => {
@@ -57,84 +60,214 @@
     (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
   });
 
-  it("should return the original text if it's within token limit", () => {
-    const text = "This is a short text";
+  it("should return original text if within token limit", () => {
+    const text = "This is a test text";
     mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
 
-    const result = truncateText(text, 10);
-    expect(result).toBe(text);
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text,
+      numTokens: 5,
+      warning: undefined
+    });
     expect(mockEncode).toHaveBeenCalledWith(text);
+    expect(mockFree).toHaveBeenCalled();
   });
 
-  it("should truncate text that exceeds token limit", () => {
-    const text = "This is a longer text that needs truncation";
-    mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens
+  it("should trim text and return warning when exceeding token limit", () => {
+    const text = "This is a longer text that needs to be trimmed";
+    mockEncode
+      .mockReturnValueOnce(new Array(20)) // First call for full text
+      .mockReturnValueOnce(new Array(8)); // Second call for trimmed text
 
-    const result = truncateText(text, 10);
-    expect(result.length).toBeLessThan(text.length);
-    expect(mockEncode).toHaveBeenCalled();
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(8);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
   });
 
-  it("should handle empty string", () => {
-    const text = "";
-    mockEncode.mockReturnValue([]);
+  it("should append previous warning if provided", () => {
+    const text = "This is a test text that is too long";
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(15))
+      .mockReturnValueOnce(new Array(8));
 
-    const result = truncateText(text, 10);
-    expect(result).toBe("");
-    expect(mockEncode).toHaveBeenCalledWith("");
+    const result = trimToTokenLimit(text, 10, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
   });
 
-  it("should use character-based fallback when encoder throws error", () => {
-    const text = "This is some text";
+  it("should use fallback approach when encoder throws error", () => {
+    const text = "This is some text to test fallback";
     mockEncode.mockImplementation(() => {
       throw new Error("Encoder error");
     });
 
-    const result = truncateText(text, 5);
-    // With modifier of 3, should truncate to approximately 15 characters
-    expect(result.length).toBeLessThanOrEqual(15);
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars per token
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
   });
 
-  it("should handle very short max token limits", () => {
+  it("should handle empty text", () => {
+    const text = "";
+    mockEncode.mockReturnValue([]);
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text: "",
+      numTokens: 0,
+      warning: undefined
+    });
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (128k)", () => {
+    const text = "A".repeat(384000); // Assuming ~3 chars per token, this would be ~128k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(130000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(127000)); // Second check shows it's within limit after trim
+
+    const result = trimToTokenLimit(text, 128000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(127000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (512k) with 32k context window", () => {
+    const text = "A".repeat(1536000); // Assuming ~3 chars per token, this would be ~512k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(520000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(32000)); // Second check shows it's within context limit after trim
+
+    const result = trimToTokenLimit(text, 32000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(32000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should preserve text when under token limit", () => {
     const text = "Short text";
+    mockEncode.mockReturnValue(new Array(5)); // 5 tokens
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(5);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should append new warning to previous warning", () => {
+    const text = "A".repeat(300);
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle encoder initialization failure gracefully", () => {
+    const text = "Sample text";
+    (encoding_for_model as jest.Mock).mockImplementationOnce(() => {
+      throw new Error("Encoder initialization failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).not.toHaveBeenCalled();
+  });
+
+  it("should handle encoding errors during trimming", () => {
+    const text = "Sample text";
+    mockEncode.mockImplementation(() => {
+      throw new Error("Encoding failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle very small token limits", () => {
+    const text = "This is a test sentence that should be trimmed significantly";
+    mockEncode
+      .mockReturnValueOnce(new Array(20))
+      .mockReturnValueOnce(new Array(3));
+
+    const result = trimToTokenLimit(text, 3, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(3);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle unicode characters", () => {
+    const text = "Hello 👋 World 🌍";
+    mockEncode
+      .mockReturnValueOnce(new Array(8))
+      .mockReturnValueOnce(new Array(4));
+
+    const result = trimToTokenLimit(text, 4, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(4);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle multiple trimming iterations", () => {
+    const text = "A".repeat(1000);
+    mockEncode
+      .mockReturnValueOnce(new Array(300))
+      .mockReturnValueOnce(new Array(200))
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(50);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(4);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle exact token limit match", () => {
+    const text = "Exact token limit text";
     mockEncode.mockReturnValue(new Array(10));
 
-    const result = truncateText(text, 1);
-    expect(result.length).toBeLessThan(text.length);
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
   });
 
-  it("should handle zero max tokens", () => {
-    const text = "Some text";
-    mockEncode.mockReturnValue(new Array(2));
-
-    const result = truncateText(text, 0);
-    expect(result).toBe("");
-  });
-
-  it("should handle extremely large text exceeding model context", () => {
-    // Create a very large text (e.g., 100,000 characters)
-    const text = "a".repeat(100000);
-
-    // First call: simulate 25000 tokens
-    mockEncode.mockReturnValueOnce(new Array(25000));
-    // Subsequent calls: simulate gradually decreasing token counts
-    // This simulates the iterative truncation process
-    mockEncode
-      .mockReturnValueOnce(new Array(20000))
-      .mockReturnValueOnce(new Array(15000))
-      .mockReturnValueOnce(new Array(12000))
-      .mockReturnValueOnce(new Array(9000));
-
-    const result = truncateText(text, 10000); // Common model context limit
-
-    // The result should be significantly shorter but not empty
-    expect(result.length).toBeLessThan(text.length);
-    expect(result.length).toBeGreaterThan(0);
-    // Given our new conservative approach, we should have a substantial amount of text
-    expect(result.length).toBeGreaterThan(30000); // At least 30% of original
-    expect(mockEncode).toHaveBeenCalled();
-
-    // Log the actual length for verification
-    console.log("Result length:", result.length, "characters");
-  });
+
 });
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 9b31cb80..29bf4f6f 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -89,34 +89,63 @@ function normalizeSchema(x: any): any {
   }
 }
 
-export function truncateText(text: string, maxTokens: number): string {
-  const modifier = 3; // Estimate: 1 token ≈ 3-4 characters for safety
+
+
+interface TrimResult {
+  text: string;
+  numTokens: number;
+  warning?: string;
+}
+
+export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
   try {
-    const encoder = encoding_for_model("gpt-4o");
-    // Continuously trim the text until its token count is within the limit.
-    while (true) {
+    const encoder = encoding_for_model(modelId as TiktokenModel);
+    try {
       const tokens = encoder.encode(text);
-      if (tokens.length <= maxTokens) {
-        return text;
+      const numTokens = tokens.length;
+
+      if (numTokens <= maxTokens) {
+        return { text, numTokens };
       }
-      // Calculate a new length using a more conservative approach
-      // Instead of scaling the entire text, we'll remove a smaller portion
-      const ratio = maxTokens / tokens.length;
-      const newLength = Math.max(
-        Math.ceil(text.length * ratio),
-        Math.floor(text.length * 0.8) // Never remove more than 20% at once
-      );
-      if (newLength <= 0) {
-        return "";
+
+      const modifier = 3;
+      // Start with 3 chars per token estimation
+      let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);
+
+      // Keep trimming until we're under the token limit
+      while (true) {
+        const currentTokens = encoder.encode(currentText);
+        if (currentTokens.length <= maxTokens) {
+          const warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+          return {
+            text: currentText,
+            numTokens: currentTokens.length,
+            warning: previousWarning ? `${warning} ${previousWarning}` : warning
+          };
+        }
+        const overflow = currentTokens.length * modifier - maxTokens - 1;
+        // If still over limit, remove another chunk
+        currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
       }
-      text = text.slice(0, newLength);
+
+    } catch (e) {
+      throw e;
+    } finally {
+      encoder.free();
     }
   } catch (error) {
-    // Fallback using character-based estimation.
-    if (text.length <= maxTokens * modifier) {
-      return text;
-    }
-    return text.slice(0, maxTokens * modifier);
+    // Fallback to a more conservative character-based approach
+    const estimatedCharsPerToken = 2.8;
+    const safeLength = maxTokens * estimatedCharsPerToken;
+    const trimmedText = text.slice(0, Math.floor(safeLength));
+
+    const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+
+    return {
+      text: trimmedText,
+      numTokens: maxTokens, // We assume we hit the max in this fallback case
+      warning: previousWarning ? `${warning} ${previousWarning}` : warning
+    };
   }
 }
 
@@ -149,51 +178,19 @@ export async function generateCompletions({
   }
 
   const { maxInputTokens, maxOutputTokens } = getModelLimits(model.modelId);
-
-  // Ratio of 4 was way too high, now 3.5.
- const modifier = 3.5; // tokens to characters ratio // Calculate 80% of max input tokens (for content) const maxTokensSafe = Math.floor(maxInputTokens * 0.8); - // count number of tokens - let numTokens = 0; - try { - // Encode the message into tokens - const encoder = encoding_for_model(model.modelId as TiktokenModel); - - try { - const tokens = encoder.encode(markdown); - numTokens = tokens.length; - } catch (e) { - throw e; - } finally { - // Free the encoder resources after use - encoder.free(); - } - } catch (error) { - logger.warn("Calculating num tokens of string failed", { error }); + // Use the new trimming function + const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit( + markdown, + maxTokensSafe, + model.modelId, + previousWarning + ); - markdown = markdown.slice(0, maxTokensSafe * modifier); - - let w = - "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + - maxTokensSafe + - ") we support."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } - - if (numTokens > maxTokensSafe) { - // trim the document to the maximum number of tokens, tokens != characters - markdown = markdown.slice(0, maxTokensSafe * modifier); - - const w = - "The extraction content would have used more tokens (" + - numTokens + - ") than the maximum we allow (" + - maxTokensSafe + - "). -- the input has been automatically trimmed."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } + markdown = trimmedMarkdown; + warning = trimWarning; let schema = options.schema; // Normalize the bad json schema users write (mogery)
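
Usage note: trimToTokenLimit replaces truncateText and returns a TrimResult
({ text, numTokens, warning }) rather than a bare string, so call sites must
read .text and should propagate warning, as the research-manager changes above
do. A minimal sketch of the intended call pattern, assuming a caller shaped
like the deep-research code (the findings array below is illustrative; the
10,000-token budget and "gpt-4o" default come from this patch, and the import
path is abbreviated as in the test file):

    import { trimToTokenLimit } from "./llmExtract";

    // Illustrative findings; the real caller builds these from scraped sources.
    const findings = [
      { text: "Finding one", source: "https://example.com/a" },
      { text: "Finding two", source: "https://example.com/b" },
    ];

    // Trim the findings block to a 10,000-token budget, counted with the
    // model's tokenizer (falls back to a ~2.8 chars/token estimate on error).
    const { text, numTokens, warning } = trimToTokenLimit(
      findings.map((f) => `- ${f.text}`).join("\n"),
      10000,
      "gpt-4o",
    );

    // `text` now fits the budget; `warning` is set only when trimming (or the
    // character-based fallback) occurred, and should be surfaced to the user.
    const prompt = `Based on these findings, generate more specific queries:\n${text}`;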