From 5a1886936cdd4f780ab2c55d17d66c40f7e60b65 Mon Sep 17 00:00:00 2001
From: Eric Ciarla <43451761+ericciarla@users.noreply.github.com>
Date: Mon, 3 Mar 2025 16:37:33 -0500
Subject: [PATCH] Truncate llmstxt cache based on maxurls limit & improve
 maxurls handling (#1285)

* init

* Update generate-llmstxt-service.ts
---
 apps/api/requests.http                        | 12 +++---
 .../generate-llmstxt-service.ts               | 42 +++++++++++++++----
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/apps/api/requests.http b/apps/api/requests.http
index 6308738a..a3997371 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -1,6 +1,6 @@
 # Pick your baseUrl here:
 @baseUrl = http://localhost:3002
-# @baseUrl = https://api.firecrawl.dev
+#@baseUrl = https://api.firecrawl.dev
 
 ### Scrape Website
 # @name scrape
@@ -102,21 +102,21 @@ DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
 ### Generate LLMs TXT
-# @name llmsTxt
+# @name generateLlmsTxt
 POST {{baseUrl}}/v1/llmstxt HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 
 {
   "url": "https://firecrawl.dev",
-  "maxUrls": 2,
+  "maxUrls": 1,
   "showFullText": false
 }
 
 
 ### Check Generate LLMs TXT Status
-@llmsTxtId = {{llmsTxt.response.body.$.id}}
-# @name llmsTxtStatus
-GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
+@generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}}
+# @name generateLlmsTxtStatus
+GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
index ada4a426..c9bd9c0c 100644
--- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
@@ -41,10 +41,30 @@ function limitPages(fullText: string, maxPages: number): string {
   return limitedPages.join("");
 }
 
+// Trims llms.txt index text to its first '#' header line plus at most maxEntries '- [' entry lines; all other lines are dropped.
+function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
+  // Work line by line; only '\n' is treated as a line separator
+  const lines = llmstxt.split('\n');
+  
+  // Locate the first line starting with '#'; if there is none, the input is returned unchanged
+  const headerIndex = lines.findIndex(line => line.startsWith('#'));
+  if (headerIndex === -1) return llmstxt;
+  
+  // Keep the header line and collect every line starting with '- [' (scanned over the whole text, not just after the header)
+  const header = lines[headerIndex];
+  const entries = lines.filter(line => line.startsWith('- ['));
+  
+  // Keep the first maxEntries entries. NOTE(review): assumes maxEntries >= 0; slice() would trim from the end for a negative value
+  const limitedEntries = entries.slice(0, maxEntries);
+  
+  // Rebuild as: header, one blank line, then the kept entries joined by '\n' (no trailing newline)
+  return `${header}\n\n${limitedEntries.join('\n')}`;
+}
+
 export async function performGenerateLlmsTxt(
   options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, plan, url, maxUrls, showFullText, subId } =
+  const { generationId, teamId, plan, url, maxUrls = 100, showFullText, subId } =
     options;
   const startTime = Date.now();
   const logger = _logger.child({
@@ -55,19 +75,25 @@ export async function performGenerateLlmsTxt(
   });
 
   try {
+    // Enforce max URL limit
+    const effectiveMaxUrls = Math.min(maxUrls, 5000);
+
     // Check cache first
-    const cachedResult = await getLlmsTextFromCache(url, maxUrls);
+    const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
     if (cachedResult) {
       logger.info("Found cached LLMs text", { url });
 
       // Limit pages and remove separators before returning
-      const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
+      const limitedFullText = limitPages(cachedResult.llmstxt_full, effectiveMaxUrls);
       const cleanFullText = removePageSeparators(limitedFullText);
+      
+      // Limit llmstxt entries to match maxUrls
+      const limitedLlmsTxt = limitLlmsTxtEntries(cachedResult.llmstxt, effectiveMaxUrls);
 
       // Update final result with cached text
       await updateGeneratedLlmsTxt(generationId, {
         status: "completed",
-        generatedText: cachedResult.llmstxt,
+        generatedText: limitedLlmsTxt,
         fullText: cleanFullText,
         showFullText: showFullText,
       });
@@ -75,7 +101,7 @@ export async function performGenerateLlmsTxt(
       return {
         success: true,
         data: {
-          generatedText: cachedResult.llmstxt,
+          generatedText: limitedLlmsTxt,
           fullText: cleanFullText,
           showFullText: showFullText,
         },
@@ -88,7 +114,7 @@ export async function performGenerateLlmsTxt(
       url,
       teamId,
       plan,
-      limit: maxUrls,
+      limit: effectiveMaxUrls,
       includeSubdomains: false,
       ignoreSitemap: false,
       includeMetadata: true,
@@ -177,10 +203,10 @@ export async function performGenerateLlmsTxt(
     }
 
     // After successful generation, save to cache
-    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
+    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);
 
     // Limit pages and remove separators before final update
-    const limitedFullText = limitPages(llmsFulltxt, maxUrls);
+    const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);
     const cleanFullText = removePageSeparators(limitedFullText);
 
     // Update final result with both generated text and full text