(fix/token-slicer) Fixes extract token limit issues (#1236)

* Nick: fixes extract token limit errors

* Update llmExtract.ts

* Update llmExtract.ts
Nicolas 2025-02-21 16:44:42 -03:00 committed by GitHub
parent 76e1f29ae8
commit 5ab86b8b43
3 changed files with 259 additions and 130 deletions
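
At a glance, the change replaces the string-returning truncateText helper with trimToTokenLimit, which also reports how many tokens the input used and whether it had to be cut. A minimal usage sketch, based on the new signature in llmExtract.ts below (the input string and the 10,000-token budget are invented for illustration; the import path depends on the caller's location):

```typescript
import { trimToTokenLimit } from "./scraper/scrapeURL/transformers/llmExtract"; // adjust path to caller

// Hypothetical oversized prompt body.
const longMarkdown = "finding ".repeat(100_000);

// Trim to at most 10,000 tokens, counted with the gpt-4o tokenizer (the default modelId).
const { text, numTokens, warning } = trimToTokenLimit(longMarkdown, 10_000, "gpt-4o");

console.log(numTokens); // always <= 10000
if (warning) {
  console.warn(warning); // "...the input has been automatically trimmed."
}
```

Callers that only need the shortened string read `.text` off the result, which is what the deep-research prompts below do.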

View File

@@ -5,8 +5,7 @@ import {
   DeepResearchSource,
   updateDeepResearch,
 } from "./deep-research-redis";
-import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
-import { truncateText } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
 
 interface AnalysisResult {
   gaps: string[];
@@ -178,7 +177,7 @@ export class ResearchLLMService {
           },
         },
         prompt: `Generate a list of 3-5 search queries to deeply research this topic: "${topic}"
-        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${truncateText(findings.map((f) => `- ${f.text}`).join("\n"), 10000)}` : ""}
+        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${trimToTokenLimit(findings.map((f) => `- ${f.text}`).join("\n"), 10000).text}` : ""}
         Each query should be specific and focused on a particular aspect.
         Build upon previous findings when available.
@@ -225,7 +224,7 @@ export class ResearchLLMService {
           },
         },
       },
-      prompt: truncateText(
+      prompt: trimToTokenLimit(
        `You are researching: ${currentTopic}
        You have ${timeRemainingMinutes} minutes remaining to complete the research but you don't need to use all of it.
        Current findings: ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
@@ -234,7 +233,7 @@ export class ResearchLLMService {
        Important: If less than 1 minute remains, set shouldContinue to false to allow time for final synthesis.
        If I have enough information, set shouldContinue to false.`,
        120000,
-      ),
+      ).text,
     },
     markdown: "",
   });
@@ -266,7 +265,7 @@ export class ResearchLLMService {
           report: { type: "string" },
         },
       },
-      prompt: truncateText(
+      prompt: trimToTokenLimit(
        `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
 
        Research data:
@@ -281,7 +280,7 @@ export class ResearchLLMService {
        - Cite sources
        - Use bullet points and lists where appropriate for readability`,
        100000,
-      ),
+      ).text,
     },
     markdown: "",
   });

View File

@@ -1,5 +1,5 @@
 import { removeDefaultProperty } from "./llmExtract";
-import { truncateText } from "./llmExtract";
+import { trimToTokenLimit } from "./llmExtract";
 import { encoding_for_model } from "@dqbd/tiktoken";
 
 jest.mock("@dqbd/tiktoken", () => ({
@@ -46,10 +46,13 @@ describe("removeDefaultProperty", () => {
   });
 });
 
-describe("truncateText", () => {
+describe("trimToTokenLimit", () => {
   const mockEncode = jest.fn();
+  const mockFree = jest.fn();
   const mockEncoder = {
     encode: mockEncode,
+    free: mockFree,
   };
 
   beforeEach(() => {
@@ -57,84 +60,214 @@ describe("truncateText", () => {
     (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
   });
 
-  it("should return the original text if it's within token limit", () => {
-    const text = "This is a short text";
-    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
-
-    const result = truncateText(text, 10);
-
-    expect(result).toBe(text);
-    expect(mockEncode).toHaveBeenCalledWith(text);
-  });
-
-  it("should truncate text that exceeds token limit", () => {
-    const text = "This is a longer text that needs truncation";
-    mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens
-
-    const result = truncateText(text, 10);
-
-    expect(result.length).toBeLessThan(text.length);
-    expect(mockEncode).toHaveBeenCalled();
-  });
-
-  it("should handle empty string", () => {
-    const text = "";
-    mockEncode.mockReturnValue([]);
-
-    const result = truncateText(text, 10);
-
-    expect(result).toBe("");
-    expect(mockEncode).toHaveBeenCalledWith("");
-  });
-
-  it("should use character-based fallback when encoder throws error", () => {
-    const text = "This is some text";
-    mockEncode.mockImplementation(() => {
-      throw new Error("Encoder error");
-    });
-
-    const result = truncateText(text, 5);
-
-    // With modifier of 3, should truncate to approximately 15 characters
-    expect(result.length).toBeLessThanOrEqual(15);
-  });
-
-  it("should handle very short max token limits", () => {
-    const text = "Short text";
-    mockEncode.mockReturnValue(new Array(10));
-
-    const result = truncateText(text, 1);
-
-    expect(result.length).toBeLessThan(text.length);
-  });
-
-  it("should handle zero max tokens", () => {
-    const text = "Some text";
-    mockEncode.mockReturnValue(new Array(2));
-
-    const result = truncateText(text, 0);
-
-    expect(result).toBe("");
-  });
-
-  it("should handle extremely large text exceeding model context", () => {
-    // Create a very large text (e.g., 100,000 characters)
-    const text = "a".repeat(100000);
-
-    // First call: simulate 25000 tokens
-    mockEncode.mockReturnValueOnce(new Array(25000));
-
-    // Subsequent calls: simulate gradually decreasing token counts
-    // This simulates the iterative truncation process
-    mockEncode
-      .mockReturnValueOnce(new Array(20000))
-      .mockReturnValueOnce(new Array(15000))
-      .mockReturnValueOnce(new Array(12000))
-      .mockReturnValueOnce(new Array(9000));
-
-    const result = truncateText(text, 10000); // Common model context limit
-
-    // The result should be significantly shorter but not empty
-    expect(result.length).toBeLessThan(text.length);
-    expect(result.length).toBeGreaterThan(0);
-    // Given our new conservative approach, we should have a substantial amount of text
-    expect(result.length).toBeGreaterThan(30000); // At least 30% of original
-
-    expect(mockEncode).toHaveBeenCalled();
-
-    // Log the actual length for verification
-    console.log("Result length:", result.length, "characters");
-  });
+  it("should return original text if within token limit", () => {
+    const text = "This is a test text";
+    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text,
+      numTokens: 5,
+      warning: undefined
+    });
+    expect(mockEncode).toHaveBeenCalledWith(text);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should trim text and return warning when exceeding token limit", () => {
+    const text = "This is a longer text that needs to be trimmed";
+    mockEncode
+      .mockReturnValueOnce(new Array(20)) // First call for full text
+      .mockReturnValueOnce(new Array(8)); // Second call for trimmed text
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(8);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should append previous warning if provided", () => {
+    const text = "This is a test text that is too long";
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(15))
+      .mockReturnValueOnce(new Array(8));
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
+  });
+
+  it("should use fallback approach when encoder throws error", () => {
+    const text = "This is some text to test fallback";
+    mockEncode.mockImplementation(() => {
+      throw new Error("Encoder error");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars per token
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+  });
+
+  it("should handle empty text", () => {
+    const text = "";
+    mockEncode.mockReturnValue([]);
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text: "",
+      numTokens: 0,
+      warning: undefined
+    });
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (128k)", () => {
+    const text = "A".repeat(384000); // Assuming ~3 chars per token, this would be ~128k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(130000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(127000)); // Second check shows it's within limit after trim
+
+    const result = trimToTokenLimit(text, 128000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(127000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (512k) with 32k context window", () => {
+    const text = "A".repeat(1536000); // Assuming ~3 chars per token, this would be ~512k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(520000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(32000)); // Second check shows it's within context limit after trim
+
+    const result = trimToTokenLimit(text, 32000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(32000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should preserve text when under token limit", () => {
+    const text = "Short text";
+    mockEncode.mockReturnValue(new Array(5)); // 5 tokens
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(5);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should append new warning to previous warning", () => {
+    const text = "A".repeat(300);
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle encoder initialization failure gracefully", () => {
+    const text = "Sample text";
+    (encoding_for_model as jest.Mock).mockImplementationOnce(() => {
+      throw new Error("Encoder initialization failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).not.toHaveBeenCalled();
+  });
+
+  it("should handle encoding errors during trimming", () => {
+    const text = "Sample text";
+    mockEncode.mockImplementation(() => {
+      throw new Error("Encoding failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle very small token limits", () => {
+    const text = "This is a test sentence that should be trimmed significantly";
+    mockEncode
+      .mockReturnValueOnce(new Array(20))
+      .mockReturnValueOnce(new Array(3));
+
+    const result = trimToTokenLimit(text, 3, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(3);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle unicode characters", () => {
+    const text = "Hello 👋 World 🌍";
+    mockEncode
+      .mockReturnValueOnce(new Array(8))
+      .mockReturnValueOnce(new Array(4));
+
+    const result = trimToTokenLimit(text, 4, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(4);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle multiple trimming iterations", () => {
+    const text = "A".repeat(1000);
+    mockEncode
+      .mockReturnValueOnce(new Array(300))
+      .mockReturnValueOnce(new Array(200))
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(50);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(4);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle exact token limit match", () => {
+    const text = "Exact token limit text";
+    mockEncode.mockReturnValue(new Array(10));
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
+  });
 });

View File

@@ -89,34 +89,63 @@ function normalizeSchema(x: any): any {
   }
 }
 
-export function truncateText(text: string, maxTokens: number): string {
-  const modifier = 3; // Estimate: 1 token ≈ 3-4 characters for safety
-  try {
-    const encoder = encoding_for_model("gpt-4o");
-    // Continuously trim the text until its token count is within the limit.
-    while (true) {
-      const tokens = encoder.encode(text);
-      if (tokens.length <= maxTokens) {
-        return text;
-      }
-      // Calculate a new length using a more conservative approach
-      // Instead of scaling the entire text, we'll remove a smaller portion
-      const ratio = maxTokens / tokens.length;
-      const newLength = Math.max(
-        Math.ceil(text.length * ratio),
-        Math.floor(text.length * 0.8) // Never remove more than 20% at once
-      );
-      if (newLength <= 0) {
-        return "";
-      }
-      text = text.slice(0, newLength);
-    }
-  } catch (error) {
-    // Fallback using character-based estimation.
-    if (text.length <= maxTokens * modifier) {
-      return text;
-    }
-    return text.slice(0, maxTokens * modifier);
-  }
-}
+interface TrimResult {
+  text: string;
+  numTokens: number;
+  warning?: string;
+}
+
+export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
+  try {
+    const encoder = encoding_for_model(modelId as TiktokenModel);
+    try {
+      const tokens = encoder.encode(text);
+      const numTokens = tokens.length;
+
+      if (numTokens <= maxTokens) {
+        return { text, numTokens };
+      }
+
+      const modifier = 3;
+      // Start with 3 chars per token estimation
+      let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);
+
+      // Keep trimming until we're under the token limit
+      while (true) {
+        const currentTokens = encoder.encode(currentText);
+        if (currentTokens.length <= maxTokens) {
+          const warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+          return {
+            text: currentText,
+            numTokens: currentTokens.length,
+            warning: previousWarning ? `${warning} ${previousWarning}` : warning
+          };
+        }
+        const overflow = currentTokens.length * modifier - maxTokens - 1;
+        // If still over limit, remove another chunk
+        currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
+      }
+    } catch (e) {
+      throw e;
+    } finally {
+      encoder.free();
+    }
+  } catch (error) {
+    // Fallback to a more conservative character-based approach
+    const estimatedCharsPerToken = 2.8;
+    const safeLength = maxTokens * estimatedCharsPerToken;
+    const trimmedText = text.slice(0, Math.floor(safeLength));
+
+    const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+    return {
+      text: trimmedText,
+      numTokens: maxTokens, // We assume we hit the max in this fallback case
+      warning: previousWarning ? `${warning} ${previousWarning}` : warning
+    };
+  }
+}
@@ -149,51 +178,19 @@ export async function generateCompletions({
   }
 
   const { maxInputTokens, maxOutputTokens } = getModelLimits(model.modelId);
-  // Ratio of 4 was way too high, now 3.5.
-  const modifier = 3.5; // tokens to characters ratio
 
   // Calculate 80% of max input tokens (for content)
   const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
 
-  // count number of tokens
-  let numTokens = 0;
-  try {
-    // Encode the message into tokens
-    const encoder = encoding_for_model(model.modelId as TiktokenModel);
-
-    try {
-      const tokens = encoder.encode(markdown);
-      numTokens = tokens.length;
-    } catch (e) {
-      throw e;
-    } finally {
-      // Free the encoder resources after use
-      encoder.free();
-    }
-  } catch (error) {
-    logger.warn("Calculating num tokens of string failed", { error });
-
-    markdown = markdown.slice(0, maxTokensSafe * modifier);
-
-    let w =
-      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
-      maxTokensSafe +
-      ") we support.";
-    warning = previousWarning === undefined ? w : w + " " + previousWarning;
-  }
-
-  if (numTokens > maxTokensSafe) {
-    // trim the document to the maximum number of tokens, tokens != characters
-    markdown = markdown.slice(0, maxTokensSafe * modifier);
-
-    const w =
-      "The extraction content would have used more tokens (" +
-      numTokens +
-      ") than the maximum we allow (" +
-      maxTokensSafe +
-      "). -- the input has been automatically trimmed.";
-    warning = previousWarning === undefined ? w : w + " " + previousWarning;
-  }
+  // Use the new trimming function
+  const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit(
+    markdown,
+    maxTokensSafe,
+    model.modelId,
+    previousWarning
+  );
+
+  markdown = trimmedMarkdown;
+  warning = trimWarning;
 
   let schema = options.schema;
   // Normalize the bad json schema users write (mogery)
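
One behavior worth calling out from the hunks above: both the normal path and the fallback thread previousWarning through, so the new trim warning is prepended to any earlier warning rather than replacing it. A small illustrative snippet (values invented, same import caveat as above):

```typescript
import { trimToTokenLimit } from "./scraper/scrapeURL/transformers/llmExtract"; // adjust path to caller

// First trim produces a warning because the text is over budget.
const first = trimToTokenLimit("some long text... ".repeat(10_000), 1_000, "gpt-4o");

// Handing that warning back into a later call keeps it: the new trim
// warning comes first and the earlier message is appended after it.
const second = trimToTokenLimit(first.text, 500, "gpt-4o", first.warning);

console.log(second.warning);
// "The extraction content would have used more tokens (...) than the maximum we allow (500).
//  -- the input has been automatically trimmed. <first warning text>"
```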