From 5a1886936cdd4f780ab2c55d17d66c40f7e60b65 Mon Sep 17 00:00:00 2001 From: Eric Ciarla <43451761+ericciarla@users.noreply.github.com> Date: Mon, 3 Mar 2025 16:37:33 -0500 Subject: [PATCH] Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285) * init * Update generate-llmstxt-service.ts --- apps/api/requests.http | 12 +++--- .../generate-llmstxt-service.ts | 42 +++++++++++++++---- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 6308738a..a3997371 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,6 +1,6 @@ # Pick your baseUrl here: @baseUrl = http://localhost:3002 -# @baseUrl = https://api.firecrawl.dev +#@baseUrl = https://api.firecrawl.dev ### Scrape Website # @name scrape @@ -102,21 +102,21 @@ DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} ### Generate LLMs TXT -# @name llmsTxt +# @name generateLlmsTxt POST {{baseUrl}}/v1/llmstxt HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { "url": "https://firecrawl.dev", - "maxUrls": 2, + "maxUrls": 1, "showFullText": false } ### Check Generate LLMs TXT Status -@llmsTxtId = {{llmsTxt.response.body.$.id}} -# @name llmsTxtStatus -GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1 +@generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}} +# @name generateLlmsTxtStatus +GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts index ada4a426..c9bd9c0c 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts @@ -41,10 +41,30 @@ function limitPages(fullText: string, maxPages: number): string { return limitedPages.join(""); } +// Helper function to limit llmstxt entries +function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string { + // Split by newlines + const lines = llmstxt.split('\n'); + + // Find the header line (starts with #) + const headerIndex = lines.findIndex(line => line.startsWith('#')); + if (headerIndex === -1) return llmstxt; + + // Get the header and the entries + const header = lines[headerIndex]; + const entries = lines.filter(line => line.startsWith('- [')); + + // Take only the requested number of entries + const limitedEntries = entries.slice(0, maxEntries); + + // Reconstruct the text + return `${header}\n\n${limitedEntries.join('\n')}`; +} + export async function performGenerateLlmsTxt( options: GenerateLLMsTextServiceOptions, ) { - const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = + const { generationId, teamId, plan, url, maxUrls = 100, showFullText, subId } = options; const startTime = Date.now(); const logger = _logger.child({ @@ -55,19 +75,25 @@ export async function performGenerateLlmsTxt( }); try { + // Enforce max URL limit + const effectiveMaxUrls = Math.min(maxUrls, 5000); + // Check cache first - const cachedResult = await getLlmsTextFromCache(url, maxUrls); + const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls); if (cachedResult) { logger.info("Found cached LLMs text", { url }); // Limit pages and remove separators before returning - const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls); + const limitedFullText = limitPages(cachedResult.llmstxt_full, effectiveMaxUrls); const cleanFullText = removePageSeparators(limitedFullText); + + // Limit llmstxt entries to match maxUrls + const limitedLlmsTxt = limitLlmsTxtEntries(cachedResult.llmstxt, effectiveMaxUrls); // Update final result with cached text await updateGeneratedLlmsTxt(generationId, { status: "completed", - generatedText: cachedResult.llmstxt, + generatedText: limitedLlmsTxt, fullText: cleanFullText, showFullText: showFullText, }); @@ -75,7 +101,7 @@ export async function performGenerateLlmsTxt( return { success: true, data: { - generatedText: cachedResult.llmstxt, + generatedText: limitedLlmsTxt, fullText: cleanFullText, showFullText: showFullText, }, @@ -88,7 +114,7 @@ export async function performGenerateLlmsTxt( url, teamId, plan, - limit: maxUrls, + limit: effectiveMaxUrls, includeSubdomains: false, ignoreSitemap: false, includeMetadata: true, @@ -177,10 +203,10 @@ export async function performGenerateLlmsTxt( } // After successful generation, save to cache - await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls); + await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls); // Limit pages and remove separators before final update - const limitedFullText = limitPages(llmsFulltxt, maxUrls); + const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls); const cleanFullText = removePageSeparators(limitedFullText); // Update final result with both generated text and full text