Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285)

* init

* Update generate-llmstxt-service.ts
This commit is contained in:
Eric Ciarla 2025-03-03 16:37:33 -05:00 committed by GitHub
parent 1beadf39f0
commit 5a1886936c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 40 additions and 14 deletions

View File

@ -1,6 +1,6 @@
# Pick your baseUrl here:
@baseUrl = http://localhost:3002
# @baseUrl = https://api.firecrawl.dev
#@baseUrl = https://api.firecrawl.dev
### Scrape Website
# @name scrape
@ -102,21 +102,21 @@ DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Generate LLMs TXT
# @name llmsTxt
# @name generateLlmsTxt
POST {{baseUrl}}/v1/llmstxt HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"url": "https://firecrawl.dev",
"maxUrls": 2,
"maxUrls": 1,
"showFullText": false
}
### Check Generate LLMs TXT Status
@llmsTxtId = {{llmsTxt.response.body.$.id}}
# @name llmsTxtStatus
GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
@generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}}
# @name generateLlmsTxtStatus
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}

View File

@ -41,10 +41,30 @@ function limitPages(fullText: string, maxPages: number): string {
return limitedPages.join("");
}
/**
 * Limits an llms.txt document to at most `maxEntries` link entries.
 *
 * The first line that starts with `#` is used as the header, and every line
 * that starts with `- [` is treated as an entry. The result is rebuilt as the
 * header, one blank line, and the first `maxEntries` entries joined by
 * newlines. Lines that are neither the header nor an entry are not carried
 * over into the result.
 *
 * @param llmstxt - The llms.txt text to truncate.
 * @param maxEntries - Maximum number of entries to keep; negative values are
 *   treated as zero.
 * @returns The truncated text, or `llmstxt` unchanged when no header line is
 *   found.
 */
function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
  const lines = llmstxt.split('\n');

  // Without a header line there is nothing to rebuild around; return as-is.
  const header = lines.find(line => line.startsWith('#'));
  if (header === undefined) return llmstxt;

  const entries = lines.filter(line => line.startsWith('- ['));

  // Clamp to zero: a negative end index makes slice() count from the END of
  // the array, which would keep "all but the last N" entries instead of none.
  const limit = Math.max(0, maxEntries);
  const limitedEntries = entries.slice(0, limit);

  return `${header}\n\n${limitedEntries.join('\n')}`;
}
export async function performGenerateLlmsTxt(
options: GenerateLLMsTextServiceOptions,
) {
const { generationId, teamId, plan, url, maxUrls, showFullText, subId } =
const { generationId, teamId, plan, url, maxUrls = 100, showFullText, subId } =
options;
const startTime = Date.now();
const logger = _logger.child({
@ -55,19 +75,25 @@ export async function performGenerateLlmsTxt(
});
try {
// Enforce max URL limit
const effectiveMaxUrls = Math.min(maxUrls, 5000);
// Check cache first
const cachedResult = await getLlmsTextFromCache(url, maxUrls);
const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
if (cachedResult) {
logger.info("Found cached LLMs text", { url });
// Limit pages and remove separators before returning
const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
const limitedFullText = limitPages(cachedResult.llmstxt_full, effectiveMaxUrls);
const cleanFullText = removePageSeparators(limitedFullText);
// Limit llmstxt entries to match maxUrls
const limitedLlmsTxt = limitLlmsTxtEntries(cachedResult.llmstxt, effectiveMaxUrls);
// Update final result with cached text
await updateGeneratedLlmsTxt(generationId, {
status: "completed",
generatedText: cachedResult.llmstxt,
generatedText: limitedLlmsTxt,
fullText: cleanFullText,
showFullText: showFullText,
});
@ -75,7 +101,7 @@ export async function performGenerateLlmsTxt(
return {
success: true,
data: {
generatedText: cachedResult.llmstxt,
generatedText: limitedLlmsTxt,
fullText: cleanFullText,
showFullText: showFullText,
},
@ -88,7 +114,7 @@ export async function performGenerateLlmsTxt(
url,
teamId,
plan,
limit: maxUrls,
limit: effectiveMaxUrls,
includeSubdomains: false,
ignoreSitemap: false,
includeMetadata: true,
@ -177,10 +203,10 @@ export async function performGenerateLlmsTxt(
}
// After successful generation, save to cache
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);
// Limit pages and remove separators before final update
const limitedFullText = limitPages(llmsFulltxt, maxUrls);
const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);
const cleanFullText = removePageSeparators(limitedFullText);
// Update final result with both generated text and full text