Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285)

* init * Update generate-llmstxt-service.ts
2025-08-12 04:18:59 +08:00 · 2025-03-03 16:37:33 -05:00 · 2025-03-03 16:37:33 -05:00 · 5a1886936c
commit 5a1886936c
parent 1beadf39f0
2 changed files with 40 additions and 14 deletions
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@ -1,6 +1,6 @@
 # Pick your baseUrl here:
@baseUrl = http://localhost:3002
-# @baseUrl = https://api.firecrawl.dev
+#@baseUrl = https://api.firecrawl.dev

 ### Scrape Website
 # @name scrape
@ -102,21 +102,21 @@ DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}

 ### Generate LLMs TXT
-# @name llmsTxt
+# @name generateLlmsTxt
 POST {{baseUrl}}/v1/llmstxt HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json

 {
  "url": "https://firecrawl.dev",
-  "maxUrls": 2,
+  "maxUrls": 1,
  "showFullText": false
 }


 ### Check Generate LLMs TXT Status
-@llmsTxtId = {{llmsTxt.response.body.$.id}}
-# @name llmsTxtStatus
-GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
+@generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}}
+# @name generateLlmsTxtStatus
+GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}

--- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
@ -41,10 +41,30 @@ function limitPages(fullText: string, maxPages: number): string {
  return limitedPages.join("");
 }

+// Helper: reduce an llms.txt string to its first # header line plus at most maxEntries link-entry lines
+function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
+  // Split on newline characters so each line can be classified individually
+  const lines = llmstxt.split('\n');
+  
+  // Locate the first line that starts with #; if there is none, the input is returned unchanged
+  const headerIndex = lines.findIndex(line => line.startsWith('#'));
+  if (headerIndex === -1) return llmstxt;
+  
+  // Entries are all lines that start with - [ anywhere in the text, not only those after the header
+  const header = lines[headerIndex];
+  const entries = lines.filter(line => line.startsWith('- ['));
+  
+  // Keep the first maxEntries entries (slice semantics: a negative value trims from the end instead)
+  const limitedEntries = entries.slice(0, maxEntries);
+  
+  // Rebuild as header, blank line, entries; all other lines and any trailing newline are not preserved
+  return `${header}\n\n${limitedEntries.join('\n')}`;
+}
+
 export async function performGenerateLlmsTxt(
  options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, plan, url, maxUrls, showFullText, subId } =
+  const { generationId, teamId, plan, url, maxUrls = 100, showFullText, subId } =
    options;
  const startTime = Date.now();
  const logger = _logger.child({
@ -55,19 +75,25 @@ export async function performGenerateLlmsTxt(
  });

  try {
+    // Enforce max URL limit
+    const effectiveMaxUrls = Math.min(maxUrls, 5000);
+
    // Check cache first
-    const cachedResult = await getLlmsTextFromCache(url, maxUrls);
+    const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
    if (cachedResult) {
      logger.info("Found cached LLMs text", { url });

      // Limit pages and remove separators before returning
-      const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
+      const limitedFullText = limitPages(cachedResult.llmstxt_full, effectiveMaxUrls);
      const cleanFullText = removePageSeparators(limitedFullText);
+      
+      // Limit llmstxt entries to match maxUrls
+      const limitedLlmsTxt = limitLlmsTxtEntries(cachedResult.llmstxt, effectiveMaxUrls);

      // Update final result with cached text
      await updateGeneratedLlmsTxt(generationId, {
        status: "completed",
-        generatedText: cachedResult.llmstxt,
+        generatedText: limitedLlmsTxt,
        fullText: cleanFullText,
        showFullText: showFullText,
      });
@ -75,7 +101,7 @@ export async function performGenerateLlmsTxt(
      return {
        success: true,
        data: {
-          generatedText: cachedResult.llmstxt,
+          generatedText: limitedLlmsTxt,
          fullText: cleanFullText,
          showFullText: showFullText,
        },
@ -88,7 +114,7 @@ export async function performGenerateLlmsTxt(
      url,
      teamId,
      plan,
-      limit: maxUrls,
+      limit: effectiveMaxUrls,
      includeSubdomains: false,
      ignoreSitemap: false,
      includeMetadata: true,
@ -177,10 +203,10 @@ export async function performGenerateLlmsTxt(
    }

    // After successful generation, save to cache
-    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
+    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);

    // Limit pages and remove separators before final update
-    const limitedFullText = limitPages(llmsFulltxt, maxUrls);
+    const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);
    const cleanFullText = removePageSeparators(limitedFullText);

    // Update final result with both generated text and full text