(fix/token-slicer) Fixes extract token limit issues (#1236)

* Nick: fixes extract token limit errors

* Update llmExtract.ts

* Update llmExtract.ts
Nicolas 2025-02-21 16:44:42 -03:00 committed by GitHub
parent 76e1f29ae8
commit 5ab86b8b43
3 changed files with 259 additions and 130 deletions
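
At a glance, the change replaces the string-returning truncateText helper with trimToTokenLimit, which also reports how many tokens the input used and whether it had to be cut. A minimal usage sketch, based on the new signature in llmExtract.ts below (the input string and the 10,000-token budget are invented for illustration; the import path depends on the caller's location):

```typescript
import { trimToTokenLimit } from "./scraper/scrapeURL/transformers/llmExtract"; // adjust path to caller

// Hypothetical oversized prompt body.
const longMarkdown = "finding ".repeat(100_000);

// Trim to at most 10,000 tokens, counted with the gpt-4o tokenizer (the default modelId).
const { text, numTokens, warning } = trimToTokenLimit(longMarkdown, 10_000, "gpt-4o");

console.log(numTokens); // always <= 10000
if (warning) {
  console.warn(warning); // "...the input has been automatically trimmed."
}
```

Callers that only need the shortened string read `.text` off the result, which is what the deep-research prompts below do.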

View File

@@ -5,8 +5,7 @@ import {
   DeepResearchSource,
   updateDeepResearch,
 } from "./deep-research-redis";
-import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
-import { truncateText } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
 
 interface AnalysisResult {
   gaps: string[];
@@ -178,7 +177,7 @@ export class ResearchLLMService {
           },
         },
         prompt: `Generate a list of 3-5 search queries to deeply research this topic: "${topic}"
-        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${truncateText(findings.map((f) => `- ${f.text}`).join("\n"), 10000)}` : ""}
+        ${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${trimToTokenLimit(findings.map((f) => `- ${f.text}`).join("\n"), 10000).text}` : ""}
         Each query should be specific and focused on a particular aspect.
         Build upon previous findings when available.
@@ -225,7 +224,7 @@ export class ResearchLLMService {
           },
         },
       },
-      prompt: truncateText(
+      prompt: trimToTokenLimit(
        `You are researching: ${currentTopic}
        You have ${timeRemainingMinutes} minutes remaining to complete the research but you don't need to use all of it.
        Current findings: ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
@@ -234,7 +233,7 @@ export class ResearchLLMService {
        Important: If less than 1 minute remains, set shouldContinue to false to allow time for final synthesis.
        If I have enough information, set shouldContinue to false.`,
        120000,
-      ),
+      ).text,
     },
     markdown: "",
   });
@@ -266,7 +265,7 @@ export class ResearchLLMService {
           report: { type: "string" },
         },
       },
-      prompt: truncateText(
+      prompt: trimToTokenLimit(
        `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
 
        Research data:
@@ -281,7 +280,7 @@ export class ResearchLLMService {
        - Cite sources
        - Use bullet points and lists where appropriate for readability`,
        100000,
-      ),
+      ).text,
     },
     markdown: "",
   });

View File

@@ -1,5 +1,5 @@
 import { removeDefaultProperty } from "./llmExtract";
-import { truncateText } from "./llmExtract";
+import { trimToTokenLimit } from "./llmExtract";
 import { encoding_for_model } from "@dqbd/tiktoken";
 
 jest.mock("@dqbd/tiktoken", () => ({
@@ -46,10 +46,13 @@ describe("removeDefaultProperty", () => {
   });
 });
 
-describe("truncateText", () => {
+describe("trimToTokenLimit", () => {
   const mockEncode = jest.fn();
+  const mockFree = jest.fn();
   const mockEncoder = {
     encode: mockEncode,
+    free: mockFree,
   };
 
   beforeEach(() => {
@@ -57,84 +60,214 @@ describe("truncateText", () => {
     (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
   });
 
-  it("should return the original text if it's within token limit", () => {
-    const text = "This is a short text";
-    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
-
-    const result = truncateText(text, 10);
-
-    expect(result).toBe(text);
-    expect(mockEncode).toHaveBeenCalledWith(text);
-  });
-
-  it("should truncate text that exceeds token limit", () => {
-    const text = "This is a longer text that needs truncation";
-    mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens
-
-    const result = truncateText(text, 10);
-
-    expect(result.length).toBeLessThan(text.length);
-    expect(mockEncode).toHaveBeenCalled();
-  });
-
-  it("should handle empty string", () => {
-    const text = "";
-    mockEncode.mockReturnValue([]);
-
-    const result = truncateText(text, 10);
-
-    expect(result).toBe("");
-    expect(mockEncode).toHaveBeenCalledWith("");
-  });
-
-  it("should use character-based fallback when encoder throws error", () => {
-    const text = "This is some text";
-    mockEncode.mockImplementation(() => {
-      throw new Error("Encoder error");
-    });
-
-    const result = truncateText(text, 5);
-
-    // With modifier of 3, should truncate to approximately 15 characters
-    expect(result.length).toBeLessThanOrEqual(15);
-  });
-
-  it("should handle very short max token limits", () => {
-    const text = "Short text";
-    mockEncode.mockReturnValue(new Array(10));
-
-    const result = truncateText(text, 1);
-
-    expect(result.length).toBeLessThan(text.length);
-  });
-
-  it("should handle zero max tokens", () => {
-    const text = "Some text";
-    mockEncode.mockReturnValue(new Array(2));
-
-    const result = truncateText(text, 0);
-
-    expect(result).toBe("");
-  });
-
-  it("should handle extremely large text exceeding model context", () => {
-    // Create a very large text (e.g., 100,000 characters)
-    const text = "a".repeat(100000);
-
-    // First call: simulate 25000 tokens
-    mockEncode.mockReturnValueOnce(new Array(25000));
-
-    // Subsequent calls: simulate gradually decreasing token counts
-    // This simulates the iterative truncation process
-    mockEncode
-      .mockReturnValueOnce(new Array(20000))
-      .mockReturnValueOnce(new Array(15000))
-      .mockReturnValueOnce(new Array(12000))
-      .mockReturnValueOnce(new Array(9000));
-
-    const result = truncateText(text, 10000); // Common model context limit
-
-    // The result should be significantly shorter but not empty
-    expect(result.length).toBeLessThan(text.length);
-    expect(result.length).toBeGreaterThan(0);
-    // Given our new conservative approach, we should have a substantial amount of text
-    expect(result.length).toBeGreaterThan(30000); // At least 30% of original
-
-    expect(mockEncode).toHaveBeenCalled();
-
-    // Log the actual length for verification
-    console.log("Result length:", result.length, "characters");
-  });
+  it("should return original text if within token limit", () => {
+    const text = "This is a test text";
+    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text,
+      numTokens: 5,
+      warning: undefined
+    });
+    expect(mockEncode).toHaveBeenCalledWith(text);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should trim text and return warning when exceeding token limit", () => {
+    const text = "This is a longer text that needs to be trimmed";
+    mockEncode
+      .mockReturnValueOnce(new Array(20)) // First call for full text
+      .mockReturnValueOnce(new Array(8)); // Second call for trimmed text
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(8);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should append previous warning if provided", () => {
+    const text = "This is a test text that is too long";
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(15))
+      .mockReturnValueOnce(new Array(8));
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
+  });
+
+  it("should use fallback approach when encoder throws error", () => {
+    const text = "This is some text to test fallback";
+    mockEncode.mockImplementation(() => {
+      throw new Error("Encoder error");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars per token
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+  });
+
+  it("should handle empty text", () => {
+    const text = "";
+    mockEncode.mockReturnValue([]);
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result).toEqual({
+      text: "",
+      numTokens: 0,
+      warning: undefined
+    });
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (128k)", () => {
+    const text = "A".repeat(384000); // Assuming ~3 chars per token, this would be ~128k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(130000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(127000)); // Second check shows it's within limit after trim
+
+    const result = trimToTokenLimit(text, 128000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(127000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle large token limits (512k) with 32k context window", () => {
+    const text = "A".repeat(1536000); // Assuming ~3 chars per token, this would be ~512k tokens
+    mockEncode
+      .mockReturnValueOnce(new Array(520000)) // First check shows it's too long
+      .mockReturnValueOnce(new Array(32000)); // Second check shows it's within context limit after trim
+
+    const result = trimToTokenLimit(text, 32000, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(32000);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(2);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should preserve text when under token limit", () => {
+    const text = "Short text";
+    mockEncode.mockReturnValue(new Array(5)); // 5 tokens
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(5);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should append new warning to previous warning", () => {
+    const text = "A".repeat(300);
+    const previousWarning = "Previous warning message";
+    mockEncode
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o", previousWarning);
+
+    expect(result.warning).toContain("automatically trimmed");
+    expect(result.warning).toContain(previousWarning);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle encoder initialization failure gracefully", () => {
+    const text = "Sample text";
+    (encoding_for_model as jest.Mock).mockImplementationOnce(() => {
+      throw new Error("Encoder initialization failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).not.toHaveBeenCalled();
+  });
+
+  it("should handle encoding errors during trimming", () => {
+    const text = "Sample text";
+    mockEncode.mockImplementation(() => {
+      throw new Error("Encoding failed");
+    });
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text.length).toBeLessThanOrEqual(30);
+    expect(result.warning).toContain("Failed to derive number of LLM tokens");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle very small token limits", () => {
+    const text = "This is a test sentence that should be trimmed significantly";
+    mockEncode
+      .mockReturnValueOnce(new Array(20))
+      .mockReturnValueOnce(new Array(3));
+
+    const result = trimToTokenLimit(text, 3, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(3);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle unicode characters", () => {
+    const text = "Hello 👋 World 🌍";
+    mockEncode
+      .mockReturnValueOnce(new Array(8))
+      .mockReturnValueOnce(new Array(4));
+
+    const result = trimToTokenLimit(text, 4, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(4);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle multiple trimming iterations", () => {
+    const text = "A".repeat(1000);
+    mockEncode
+      .mockReturnValueOnce(new Array(300))
+      .mockReturnValueOnce(new Array(200))
+      .mockReturnValueOnce(new Array(100))
+      .mockReturnValueOnce(new Array(50));
+
+    const result = trimToTokenLimit(text, 50, "gpt-4o");
+
+    expect(result.text.length).toBeLessThan(text.length);
+    expect(result.numTokens).toBe(50);
+    expect(result.warning).toContain("automatically trimmed");
+    expect(mockEncode).toHaveBeenCalledTimes(4);
+    expect(mockFree).toHaveBeenCalled();
+  });
+
+  it("should handle exact token limit match", () => {
+    const text = "Exact token limit text";
+    mockEncode.mockReturnValue(new Array(10));
+
+    const result = trimToTokenLimit(text, 10, "gpt-4o");
+
+    expect(result.text).toBe(text);
+    expect(result.numTokens).toBe(10);
+    expect(result.warning).toBeUndefined();
+    expect(mockFree).toHaveBeenCalled();
+  });
 });

View File

@@ -89,34 +89,63 @@ function normalizeSchema(x: any): any {
   }
 }
 
-export function truncateText(text: string, maxTokens: number): string {
-  const modifier = 3; // Estimate: 1 token ≈ 3-4 characters for safety
-  try {
-    const encoder = encoding_for_model("gpt-4o");
-    // Continuously trim the text until its token count is within the limit.
-    while (true) {
-      const tokens = encoder.encode(text);
-      if (tokens.length <= maxTokens) {
-        return text;
-      }
-      // Calculate a new length using a more conservative approach
-      // Instead of scaling the entire text, we'll remove a smaller portion
-      const ratio = maxTokens / tokens.length;
-      const newLength = Math.max(
-        Math.ceil(text.length * ratio),
-        Math.floor(text.length * 0.8) // Never remove more than 20% at once
-      );
-      if (newLength <= 0) {
-        return "";
-      }
-      text = text.slice(0, newLength);
-    }
-  } catch (error) {
-    // Fallback using character-based estimation.
-    if (text.length <= maxTokens * modifier) {
-      return text;
-    }
-    return text.slice(0, maxTokens * modifier);
-  }
-}
+interface TrimResult {
+  text: string;
+  numTokens: number;
+  warning?: string;
+}
+
+export function trimToTokenLimit(text: string, maxTokens: number, modelId: string="gpt-4o", previousWarning?: string): TrimResult {
+  try {
+    const encoder = encoding_for_model(modelId as TiktokenModel);
+    try {
+      const tokens = encoder.encode(text);
+      const numTokens = tokens.length;
+
+      if (numTokens <= maxTokens) {
+        return { text, numTokens };
+      }
+
+      const modifier = 3;
+      // Start with 3 chars per token estimation
+      let currentText = text.slice(0, Math.floor(maxTokens * modifier) - 1);
+
+      // Keep trimming until we're under the token limit
+      while (true) {
+        const currentTokens = encoder.encode(currentText);
+        if (currentTokens.length <= maxTokens) {
+          const warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+          return {
+            text: currentText,
+            numTokens: currentTokens.length,
+            warning: previousWarning ? `${warning} ${previousWarning}` : warning
+          };
+        }
+        const overflow = currentTokens.length * modifier - maxTokens - 1;
+        // If still over limit, remove another chunk
+        currentText = currentText.slice(0, Math.floor(currentText.length - overflow));
+      }
+    } catch (e) {
+      throw e;
+    } finally {
+      encoder.free();
+    }
+  } catch (error) {
+    // Fallback to a more conservative character-based approach
+    const estimatedCharsPerToken = 2.8;
+    const safeLength = maxTokens * estimatedCharsPerToken;
+    const trimmedText = text.slice(0, Math.floor(safeLength));
+
+    const warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+    return {
+      text: trimmedText,
+      numTokens: maxTokens, // We assume we hit the max in this fallback case
+      warning: previousWarning ? `${warning} ${previousWarning}` : warning
+    };
+  }
+}
@@ -149,51 +178,19 @@ export async function generateCompletions({
   }
 
   const { maxInputTokens, maxOutputTokens } = getModelLimits(model.modelId);
-  // Ratio of 4 was way too high, now 3.5.
-  const modifier = 3.5; // tokens to characters ratio
 
   // Calculate 80% of max input tokens (for content)
   const maxTokensSafe = Math.floor(maxInputTokens * 0.8);
 
-  // count number of tokens
-  let numTokens = 0;
-  try {
-    // Encode the message into tokens
-    const encoder = encoding_for_model(model.modelId as TiktokenModel);
-
-    try {
-      const tokens = encoder.encode(markdown);
-      numTokens = tokens.length;
-    } catch (e) {
-      throw e;
-    } finally {
-      // Free the encoder resources after use
-      encoder.free();
-    }
-  } catch (error) {
-    logger.warn("Calculating num tokens of string failed", { error });
-
-    markdown = markdown.slice(0, maxTokensSafe * modifier);
-
-    let w =
-      "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
-      maxTokensSafe +
-      ") we support.";
-    warning = previousWarning === undefined ? w : w + " " + previousWarning;
-  }
-
-  if (numTokens > maxTokensSafe) {
-    // trim the document to the maximum number of tokens, tokens != characters
-    markdown = markdown.slice(0, maxTokensSafe * modifier);
-
-    const w =
-      "The extraction content would have used more tokens (" +
-      numTokens +
-      ") than the maximum we allow (" +
-      maxTokensSafe +
-      "). -- the input has been automatically trimmed.";
-    warning = previousWarning === undefined ? w : w + " " + previousWarning;
-  }
+  // Use the new trimming function
+  const { text: trimmedMarkdown, numTokens, warning: trimWarning } = trimToTokenLimit(
+    markdown,
+    maxTokensSafe,
+    model.modelId,
+    previousWarning
+  );
+
+  markdown = trimmedMarkdown;
+  warning = trimWarning;
 
   let schema = options.schema;
   // Normalize the bad json schema users write (mogery)
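
One behavior worth calling out from the hunks above: both the normal path and the fallback thread previousWarning through, so the new trim warning is prepended to any earlier warning rather than replacing it. A small illustrative snippet (values invented, same import caveat as above):

```typescript
import { trimToTokenLimit } from "./scraper/scrapeURL/transformers/llmExtract"; // adjust path to caller

// First trim produces a warning because the text is over budget.
const first = trimToTokenLimit("some long text... ".repeat(10_000), 1_000, "gpt-4o");

// Handing that warning back into a later call keeps it: the new trim
// warning comes first and the earlier message is appended after it.
const second = trimToTokenLimit(first.text, 500, "gpt-4o", first.warning);

console.log(second.warning);
// "The extraction content would have used more tokens (...) than the maximum we allow (500).
//  -- the input has been automatically trimmed. <first warning text>"
```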