Nick: llmstxt improvements

Nicolas 2025-02-19 16:09:46 -03:00
parent d4cf2269ed
commit acf1e60608
2 changed files with 68 additions and 21 deletions

View File

@@ -28,6 +28,19 @@ const DescriptionSchema = z.object({
title: z.string(),
});
// Helper function to remove page separators
function removePageSeparators(text: string): string {
return text.replace(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g, '');
}
// Helper function to limit pages in full text
function limitPages(fullText: string, maxPages: number): string {
const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
// Element 0 is the header; pages start at index 1, so keep the header plus the first maxPages pages
const limitedPages = pages.slice(0, maxPages + 1);
return limitedPages.join('');
}
export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
const openai = new OpenAI();
const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
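A minimal sketch of how the two new helpers compose, assuming each crawled page in the full text is prefixed with a <|firecrawl-page-N-lllmstxt|> marker as in the generation loop below (the sample string and variable names here are hypothetical):
// Hypothetical full text: a header followed by two marked pages.
const sample =
  "# example.com llms-full.txt\n\n" +
  "<|firecrawl-page-1-lllmstxt|>\n## Page One\nBody one\n\n" +
  "<|firecrawl-page-2-lllmstxt|>\n## Page Two\nBody two\n\n";
// Keep the header plus the first page, then strip the remaining markers.
const firstPageOnly = removePageSeparators(limitPages(sample, 1));
// firstPageOnly === "# example.com llms-full.txt\n\n## Page One\nBody one\n\n"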
@@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
if (cachedResult) {
logger.info("Found cached LLMs text", { url });
// Limit pages and remove separators before returning
const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
const cleanFullText = removePageSeparators(limitedFullText);
// Update final result with cached text
await updateGeneratedLlmsTxt(generationId, {
status: "completed",
generatedText: cachedResult.llmstxt,
fullText: cachedResult.llmstxt_full,
fullText: cleanFullText,
showFullText: showFullText,
});
return {
success: true,
data: {
generatedText: cachedResult.llmstxt,
fullText: cachedResult.llmstxt_full,
fullText: cleanFullText,
showFullText: showFullText,
},
};
@@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
if (!result) continue;
llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
}
// Update progress after each batch
await updateGeneratedLlmsTxt(generationId, {
status: "processing",
generatedText: llmstxt,
fullText: llmsFulltxt,
fullText: removePageSeparators(llmsFulltxt),
});
}
// After successful generation, save to cache
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
// Limit pages and remove separators before final update
const limitedFullText = limitPages(llmsFulltxt, maxUrls);
const cleanFullText = removePageSeparators(limitedFullText);
// Update final result with both generated text and full text
await updateGeneratedLlmsTxt(generationId, {
status: "completed",
generatedText: llmstxt,
fullText: llmsFulltxt,
fullText: cleanFullText,
showFullText: showFullText,
});
@@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
success: true,
data: {
generatedText: llmstxt,
fullText: llmsFulltxt,
fullText: cleanFullText,
showFullText: showFullText,
},
};

View File

@@ -33,6 +33,14 @@ export async function getLlmsTextFromCache(
return null;
}
// Check if data is older than 1 week
const oneWeekAgo = new Date();
oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
if (!data || new Date(data.updated_at) < oneWeekAgo) {
return null;
}
return data;
} catch (error) {
logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
@@ -53,14 +61,15 @@ export async function saveLlmsTextToCache(
const originUrl = normalizeUrlOnlyHostname(url);
try {
// First check if there's an existing entry with fewer URLs
// First check if there's an existing entry
const { data: existingData } = await supabase_service
.from("llm_texts")
.select("*")
.eq("origin_url", originUrl)
.single();
// Always update the entry for the origin URL
if (existingData) {
// Update existing entry
const { error } = await supabase_service
.from("llm_texts")
.update({
@ -72,9 +81,27 @@ export async function saveLlmsTextToCache(
.eq("origin_url", originUrl);
if (error) {
logger.error("Error saving LLMs text to cache", { error, originUrl });
logger.error("Error updating LLMs text in cache", { error, originUrl });
} else {
logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls });
}
} else {
// Insert new entry
const { error } = await supabase_service
.from("llm_texts")
.insert({
origin_url: originUrl,
llmstxt,
llmstxt_full,
max_urls: maxUrls,
updated_at: new Date().toISOString(),
});
if (error) {
logger.error("Error inserting LLMs text to cache", { error, originUrl });
} else {
logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls });
}
}
} catch (error) {
logger.error("Failed to save LLMs text to cache", { error, originUrl });