Nick: llmstxt improvements

This commit is contained in:
Nicolas 2025-02-19 16:09:46 -03:00
parent d4cf2269ed
commit acf1e60608
2 changed files with 68 additions and 21 deletions

View File

@ -28,6 +28,19 @@ const DescriptionSchema = z.object({
title: z.string(), title: z.string(),
}); });
// Strips every page-separator marker (`<|firecrawl-page-N-lllmstxt|>` followed
// by a newline) from the given text, leaving the remaining content untouched.
// A marker that is not immediately followed by "\n" is left in place.
function removePageSeparators(text: string): string {
  const separatorPattern = /<\|firecrawl-page-\d+-lllmstxt\|>\n/g;
  // Cutting the text at each marker and gluing the pieces back together
  // drops the markers themselves.
  const segments = text.split(separatorPattern);
  return segments.join('');
}
// Helper function to limit pages in full text.
//
// Splits `fullText` on the page-separator markers
// (`<|firecrawl-page-N-lllmstxt|>` followed by a newline) and keeps the text
// before the first marker plus at most `maxPages` pages after it.
//
// Note: the kept pieces are concatenated WITHOUT their markers, so the
// returned text no longer contains separators for the retained pages.
//
// `maxPages` is clamped to a non-negative integer: negative or NaN values
// keep zero pages (only the leading text), fractional values are truncated,
// and Infinity keeps every page.
function limitPages(fullText: string, maxPages: number): string {
  const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
  // Without clamping, a negative count would turn into a negative `slice`
  // end index and silently drop pages from the END of the document.
  const pageCount = Math.max(0, Math.trunc(maxPages) || 0);
  // Element 0 is whatever precedes the first marker, so it is always kept;
  // pages occupy indices 1..n.
  const limitedPages = pages.slice(0, pageCount + 1);
  return limitedPages.join('');
}
export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) { export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
const openai = new OpenAI(); const openai = new OpenAI();
const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options; const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
if (cachedResult) { if (cachedResult) {
logger.info("Found cached LLMs text", { url }); logger.info("Found cached LLMs text", { url });
// Limit pages and remove separators before returning
const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
const cleanFullText = removePageSeparators(limitedFullText);
// Update final result with cached text // Update final result with cached text
await updateGeneratedLlmsTxt(generationId, { await updateGeneratedLlmsTxt(generationId, {
status: "completed", status: "completed",
generatedText: cachedResult.llmstxt, generatedText: cachedResult.llmstxt,
fullText: cachedResult.llmstxt_full, fullText: cleanFullText,
showFullText: showFullText, showFullText: showFullText,
}); });
return { return {
success: true, success: true,
data: { data: {
generatedText: cachedResult.llmstxt, generatedText: cachedResult.llmstxt,
fullText: cachedResult.llmstxt_full, fullText: cleanFullText,
showFullText: showFullText, showFullText: showFullText,
}, },
}; };
@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
if (!result) continue; if (!result) continue;
llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`; llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`; llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
} }
// Update progress after each batch // Update progress after each batch
await updateGeneratedLlmsTxt(generationId, { await updateGeneratedLlmsTxt(generationId, {
status: "processing", status: "processing",
generatedText: llmstxt, generatedText: llmstxt,
fullText: llmsFulltxt, fullText: removePageSeparators(llmsFulltxt),
}); });
} }
// After successful generation, save to cache // After successful generation, save to cache
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls); await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
// Limit pages and remove separators before final update
const limitedFullText = limitPages(llmsFulltxt, maxUrls);
const cleanFullText = removePageSeparators(limitedFullText);
// Update final result with both generated text and full text // Update final result with both generated text and full text
await updateGeneratedLlmsTxt(generationId, { await updateGeneratedLlmsTxt(generationId, {
status: "completed", status: "completed",
generatedText: llmstxt, generatedText: llmstxt,
fullText: llmsFulltxt, fullText: cleanFullText,
showFullText: showFullText, showFullText: showFullText,
}); });
@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
success: true, success: true,
data: { data: {
generatedText: llmstxt, generatedText: llmstxt,
fullText: llmsFulltxt, fullText: cleanFullText,
showFullText: showFullText, showFullText: showFullText,
}, },
}; };

View File

@ -33,6 +33,14 @@ export async function getLlmsTextFromCache(
return null; return null;
} }
// Check if data is older than 1 week
const oneWeekAgo = new Date();
oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
if (!data || new Date(data.updated_at) < oneWeekAgo) {
return null;
}
return data; return data;
} catch (error) { } catch (error) {
logger.error("Failed to fetch LLMs text from cache", { error, originUrl }); logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
@ -53,14 +61,15 @@ export async function saveLlmsTextToCache(
const originUrl = normalizeUrlOnlyHostname(url); const originUrl = normalizeUrlOnlyHostname(url);
try { try {
// First check if there's an existing entry with fewer URLs // First check if there's an existing entry
const { data: existingData } = await supabase_service const { data: existingData } = await supabase_service
.from("llm_texts") .from("llm_texts")
.select("*") .select("*")
.eq("origin_url", originUrl) .eq("origin_url", originUrl)
.single(); .single();
// Always update the entry for the origin URL if (existingData) {
// Update existing entry
const { error } = await supabase_service const { error } = await supabase_service
.from("llm_texts") .from("llm_texts")
.update({ .update({
@ -72,9 +81,27 @@ export async function saveLlmsTextToCache(
.eq("origin_url", originUrl); .eq("origin_url", originUrl);
if (error) { if (error) {
logger.error("Error saving LLMs text to cache", { error, originUrl }); logger.error("Error updating LLMs text in cache", { error, originUrl });
} else { } else {
logger.debug("Successfully cached LLMs text", { originUrl, maxUrls }); logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls });
}
} else {
// Insert new entry
const { error } = await supabase_service
.from("llm_texts")
.insert({
origin_url: originUrl,
llmstxt,
llmstxt_full,
max_urls: maxUrls,
updated_at: new Date().toISOString(),
});
if (error) {
logger.error("Error inserting LLMs text to cache", { error, originUrl });
} else {
logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls });
}
} }
} catch (error) { } catch (error) {
logger.error("Failed to save LLMs text to cache", { error, originUrl }); logger.error("Failed to save LLMs text to cache", { error, originUrl });