diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts index 55c06124..bf28a00f 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts @@ -28,6 +28,19 @@ const DescriptionSchema = z.object({ title: z.string(), }); +// Helper function to remove page separators +function removePageSeparators(text: string): string { + return text.replace(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g, ''); +} + +// Helper function to limit pages in full text +function limitPages(fullText: string, maxPages: number): string { + const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/); + // pages[0] is the header (text before the first separator); keep it plus up to maxPages page chunks + const limitedPages = pages.slice(0, maxPages + 1); + return limitedPages.join(''); +} + export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) { const openai = new OpenAI(); const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options; @@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt if (cachedResult) { logger.info("Found cached LLMs text", { url }); + // Limit pages and remove separators before returning + const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls); + const cleanFullText = removePageSeparators(limitedFullText); + // Update final result with cached text await updateGeneratedLlmsTxt(generationId, { status: "completed", generatedText: cachedResult.llmstxt, - fullText: cachedResult.llmstxt_full, + fullText: cleanFullText, showFullText: showFullText, }); - return { success: true, data: { generatedText: cachedResult.llmstxt, - fullText: cachedResult.llmstxt_full, + fullText: cleanFullText, showFullText: showFullText, }, }; @@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt if (!result) continue; 
llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`; - llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`; + llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`; } // Update progress after each batch await updateGeneratedLlmsTxt(generationId, { status: "processing", generatedText: llmstxt, - fullText: llmsFulltxt, + fullText: removePageSeparators(llmsFulltxt), }); } // After successful generation, save to cache await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls); + // Limit pages and remove separators before final update + const limitedFullText = limitPages(llmsFulltxt, maxUrls); + const cleanFullText = removePageSeparators(limitedFullText); + // Update final result with both generated text and full text await updateGeneratedLlmsTxt(generationId, { status: "completed", generatedText: llmstxt, - fullText: llmsFulltxt, + fullText: cleanFullText, showFullText: showFullText, }); @@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt success: true, data: { generatedText: llmstxt, - fullText: llmsFulltxt, + fullText: cleanFullText, showFullText: showFullText, }, }; diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts index 73be813b..4120170a 100644 --- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts +++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts @@ -33,6 +33,14 @@ export async function getLlmsTextFromCache( return null; } + // Check if data is older than 1 week + const oneWeekAgo = new Date(); + oneWeekAgo.setDate(oneWeekAgo.getDate() - 7); + + if (!data || new Date(data.updated_at) < oneWeekAgo) { + return null; + } + return data; } catch (error) { logger.error("Failed to fetch LLMs text from cache", { error, originUrl }); @@ -53,28 +61,47 @@ export async function saveLlmsTextToCache( 
const originUrl = normalizeUrlOnlyHostname(url); try { - // First check if there's an existing entry with fewer URLs + // First check if there's an existing entry const { data: existingData } = await supabase_service .from("llm_texts") .select("*") .eq("origin_url", originUrl) .single(); - // Always update the entry for the origin URL - const { error } = await supabase_service - .from("llm_texts") - .update({ - llmstxt, - llmstxt_full, - max_urls: maxUrls, - updated_at: new Date().toISOString(), - }) - .eq("origin_url", originUrl); + if (existingData) { + // Update existing entry + const { error } = await supabase_service + .from("llm_texts") + .update({ + llmstxt, + llmstxt_full, + max_urls: maxUrls, + updated_at: new Date().toISOString(), + }) + .eq("origin_url", originUrl); - if (error) { - logger.error("Error saving LLMs text to cache", { error, originUrl }); + if (error) { + logger.error("Error updating LLMs text in cache", { error, originUrl }); + } else { + logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls }); + } } else { - logger.debug("Successfully cached LLMs text", { originUrl, maxUrls }); + // Insert new entry + const { error } = await supabase_service + .from("llm_texts") + .insert({ + origin_url: originUrl, + llmstxt, + llmstxt_full, + max_urls: maxUrls, + updated_at: new Date().toISOString(), + }); + + if (error) { + logger.error("Error inserting LLMs text to cache", { error, originUrl }); + } else { + logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls }); + } } } catch (error) { logger.error("Failed to save LLMs text to cache", { error, originUrl });