mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 17:59:00 +08:00
Nick: llmstxt improvements
parent d4cf2269ed
commit acf1e60608
The diff touches two files: the llms.txt generation service and its Supabase cache helpers. In the generation service, two helpers are added to manage the new page-separator tokens:

```diff
@@ -28,6 +28,19 @@ const DescriptionSchema = z.object({
   title: z.string(),
 });
 
+// Helper function to remove page separators
+function removePageSeparators(text: string): string {
+  return text.replace(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g, '');
+}
+
+// Helper function to limit pages in full text
+function limitPages(fullText: string, maxPages: number): string {
+  const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
+  // First element is the header, so we start from index 1
+  const limitedPages = pages.slice(0, maxPages + 1);
+  return limitedPages.join('');
+}
+
 export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
   const openai = new OpenAI();
   const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
```
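To make the two helpers concrete, here is a minimal sketch; the sample text and page count are assumptions for illustration, not from the commit. Note that `limitPages` already discards the separator tokens, since `String.prototype.split` drops matched delimiters when the regex has no capture groups, so the later `removePageSeparators` pass is defensive rather than strictly required:

```ts
// Sketch only: sample input is assumed, not taken from the commit.
const sample =
  "# example.com llms-full.txt\n\n" +
  "<|firecrawl-page-1-lllmstxt|>\n## Page One\nBody one\n\n" +
  "<|firecrawl-page-2-lllmstxt|>\n## Page Two\nBody two\n\n";

const limited = limitPages(sample, 1);       // keeps the header plus the first page
const clean = removePageSeparators(limited); // no separators left here, but safe to call

console.log(clean);
// "# example.com llms-full.txt\n\n## Page One\nBody one\n\n"
```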
```diff
@@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
 
+    // Limit pages and remove separators before returning
+    const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
+    const cleanFullText = removePageSeparators(limitedFullText);
+
     // Update final result with cached text
     await updateGeneratedLlmsTxt(generationId, {
       status: "completed",
       generatedText: cachedResult.llmstxt,
-      fullText: cachedResult.llmstxt_full,
+      fullText: cleanFullText,
       showFullText: showFullText,
     });
 
 
     return {
       success: true,
       data: {
         generatedText: cachedResult.llmstxt,
-        fullText: cachedResult.llmstxt_full,
+        fullText: cleanFullText,
         showFullText: showFullText,
       },
     };
```
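The practical effect on the cache-hit path: `llmstxt_full` is stored with its separator tokens intact, so one cached entry can serve requests for fewer pages by trimming at read time. A sketch, with the page count assumed:

```ts
// Sketch: a cache entry generated with more pages than requested is trimmed
// on read; maxUrls = 3 here is an assumed value for illustration.
const cleanFullText = removePageSeparators(
  limitPages(cachedResult.llmstxt_full, 3), // header + at most 3 pages
);
```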
```diff
@@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
       if (!result) continue;
 
       llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
-      llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
+      llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
     }
 
     // Update progress after each batch
     await updateGeneratedLlmsTxt(generationId, {
       status: "processing",
       generatedText: llmstxt,
-      fullText: llmsFulltxt,
+      fullText: removePageSeparators(llmsFulltxt),
     });
   }
 
   // After successful generation, save to cache
   await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
 
+  // Limit pages and remove separators before final update
+  const limitedFullText = limitPages(llmsFulltxt, maxUrls);
+  const cleanFullText = removePageSeparators(limitedFullText);
+
   // Update final result with both generated text and full text
   await updateGeneratedLlmsTxt(generationId, {
     status: "completed",
     generatedText: llmstxt,
-    fullText: llmsFulltxt,
+    fullText: cleanFullText,
     showFullText: showFullText,
   });
 
```
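Two details in this hunk are worth spelling out. The token is spelled `lllmstxt` (three l's) both where it is emitted and in the regexes that strip it, so writer and readers stay in sync; and the page number combines the batch offset `i` with the page's position inside the current batch. A small sketch verifying the writer/reader agreement (the `token` helper is mine, not the commit's):

```ts
// Sketch: confirm the emitted token matches the separator regex used by the
// helpers above. The token() helper below is hypothetical, for illustration.
const SEPARATOR = /<\|firecrawl-page-\d+-lllmstxt\|>\n/;
const token = (n: number) => `<|firecrawl-page-${n}-lllmstxt|>\n`;

console.log(SEPARATOR.test(token(7))); // true: writer and readers agree
```

One hedged caveat: `batchResults.indexOf(result)` relies on reference identity, which holds here because `result` is drawn from `batchResults` itself.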
```diff
@@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
       success: true,
       data: {
         generatedText: llmstxt,
-        fullText: llmsFulltxt,
+        fullText: cleanFullText,
         showFullText: showFullText,
       },
     };
```
In the Supabase cache helpers, the read path now treats entries older than one week as stale:

```diff
@@ -33,6 +33,14 @@ export async function getLlmsTextFromCache(
       return null;
     }
 
+    // Check if data is older than 1 week
+    const oneWeekAgo = new Date();
+    oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
+
+    if (!data || new Date(data.updated_at) < oneWeekAgo) {
+      return null;
+    }
+
     return data;
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
```
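The staleness rule reads a little more clearly as a standalone predicate; a sketch, where the helper name is mine rather than the commit's:

```ts
// Sketch: the diff's one-week expiry as a reusable predicate.
// isStale is a hypothetical name, not part of the commit.
function isStale(updatedAt: string, maxAgeDays = 7): boolean {
  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - maxAgeDays);
  return new Date(updatedAt) < cutoff;
}

// Usage mirroring the diff: missing rows and week-old rows are both misses.
// if (!data || isStale(data.updated_at)) return null;
const eightDaysAgo = new Date(Date.now() - 8 * 24 * 60 * 60 * 1000);
console.log(isStale(eightDaysAgo.toISOString())); // true
```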
The save path, which previously always issued an UPDATE, now branches between updating an existing row and inserting a new one:

```diff
@@ -53,28 +61,47 @@ export async function saveLlmsTextToCache(
   const originUrl = normalizeUrlOnlyHostname(url);
 
   try {
-    // First check if there's an existing entry with fewer URLs
+    // First check if there's an existing entry
     const { data: existingData } = await supabase_service
       .from("llm_texts")
       .select("*")
       .eq("origin_url", originUrl)
       .single();
 
-    // Always update the entry for the origin URL
-    const { error } = await supabase_service
-      .from("llm_texts")
-      .update({
-        llmstxt,
-        llmstxt_full,
-        max_urls: maxUrls,
-        updated_at: new Date().toISOString(),
-      })
-      .eq("origin_url", originUrl);
+    if (existingData) {
+      // Update existing entry
+      const { error } = await supabase_service
+        .from("llm_texts")
+        .update({
+          llmstxt,
+          llmstxt_full,
+          max_urls: maxUrls,
+          updated_at: new Date().toISOString(),
+        })
+        .eq("origin_url", originUrl);
 
-    if (error) {
-      logger.error("Error saving LLMs text to cache", { error, originUrl });
+      if (error) {
+        logger.error("Error updating LLMs text in cache", { error, originUrl });
+      } else {
+        logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls });
+      }
     } else {
-      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
+      // Insert new entry
+      const { error } = await supabase_service
+        .from("llm_texts")
+        .insert({
+          origin_url: originUrl,
+          llmstxt,
+          llmstxt_full,
+          max_urls: maxUrls,
+          updated_at: new Date().toISOString(),
+        });
+
+      if (error) {
+        logger.error("Error inserting LLMs text to cache", { error, originUrl });
+      } else {
+        logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls });
+      }
     }
   } catch (error) {
     logger.error("Failed to save LLMs text to cache", { error, originUrl });
```
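A design note, offered tentatively: the select-then-branch pattern above leaves a small read/write race between concurrent generations of the same origin. If `origin_url` carries a unique constraint, supabase-js can express the same save as a single atomic upsert. A sketch, not the commit's code:

```ts
// Sketch: equivalent save as one upsert, assuming a UNIQUE constraint on
// llm_texts.origin_url. Not part of the commit.
const { error } = await supabase_service
  .from("llm_texts")
  .upsert(
    {
      origin_url: originUrl,
      llmstxt,
      llmstxt_full,
      max_urls: maxUrls,
      updated_at: new Date().toISOString(),
    },
    { onConflict: "origin_url" },
  );

if (error) {
  logger.error("Error upserting LLMs text to cache", { error, originUrl });
}
```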