mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 05:56:00 +08:00
Nick: llmstxt improvements
parent d4cf2269ed
commit acf1e60608
@@ -28,6 +28,19 @@ const DescriptionSchema = z.object({
   title: z.string(),
 });
 
+// Helper function to remove page separators
+function removePageSeparators(text: string): string {
+  return text.replace(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g, '');
+}
+
+// Helper function to limit pages in full text
+function limitPages(fullText: string, maxPages: number): string {
+  const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
+  // First element is the header, so we start from index 1
+  const limitedPages = pages.slice(0, maxPages + 1);
+  return limitedPages.join('');
+}
+
 export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
   const openai = new OpenAI();
   const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
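A minimal sketch of how the two new helpers compose, assuming removePageSeparators and limitPages are in scope as defined above; the three-page document below is invented for illustration:

    // Invented input: a header plus three pages tagged with the separator token.
    const fullText =
      "# example.com llms-full.txt\n\n" +
      "<|firecrawl-page-1-lllmstxt|>\n## Page One\n...\n\n" +
      "<|firecrawl-page-2-lllmstxt|>\n## Page Two\n...\n\n" +
      "<|firecrawl-page-3-lllmstxt|>\n## Page Three\n...\n\n";

    // split() consumes the separators, so this keeps the header (element 0)
    // plus the first two pages and drops "## Page Three".
    const limited = limitPages(fullText, 2);

    // After split-based limiting no separators remain, so this second pass is
    // a no-op here; it matters for text that never went through limitPages.
    const clean = removePageSeparators(limited);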
@@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
 
+    // Limit pages and remove separators before returning
+    const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
+    const cleanFullText = removePageSeparators(limitedFullText);
+
     // Update final result with cached text
     await updateGeneratedLlmsTxt(generationId, {
       status: "completed",
       generatedText: cachedResult.llmstxt,
-      fullText: cachedResult.llmstxt_full,
+      fullText: cleanFullText,
       showFullText: showFullText,
     });
 
     return {
       success: true,
       data: {
         generatedText: cachedResult.llmstxt,
-        fullText: cachedResult.llmstxt_full,
+        fullText: cleanFullText,
         showFullText: showFullText,
       },
     };
@@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
       if (!result) continue;
 
       llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
-      llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
+      llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
     }
 
     // Update progress after each batch
     await updateGeneratedLlmsTxt(generationId, {
       status: "processing",
       generatedText: llmstxt,
-      fullText: llmsFulltxt,
+      fullText: removePageSeparators(llmsFulltxt),
     });
   }
 
   // After successful generation, save to cache
   await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
 
+  // Limit pages and remove separators before final update
+  const limitedFullText = limitPages(llmsFulltxt, maxUrls);
+  const cleanFullText = removePageSeparators(limitedFullText);
+
   // Update final result with both generated text and full text
   await updateGeneratedLlmsTxt(generationId, {
     status: "completed",
     generatedText: llmstxt,
-    fullText: llmsFulltxt,
+    fullText: cleanFullText,
     showFullText: showFullText,
   });
 
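The separator token embeds a 1-based global page number, and its spelling ("lllmstxt", with three l's) matches the regexes in the helpers above, so limiting and stripping both find it. A minimal sketch of the numbering, assuming i is the global offset of the first document in the current batch, as the surrounding loop suggests:

    // Hypothetical batch of two results whose first element sits at offset i = 2.
    const i = 2;
    const batchResults = [{ title: "A" }, { title: "B" }];
    for (const result of batchResults) {
      // indexOf gives the position within the batch, so pages number 3, then 4.
      const page = i + batchResults.indexOf(result) + 1;
      console.log(`<|firecrawl-page-${page}-lllmstxt|>`);
    }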
@@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
     success: true,
     data: {
       generatedText: llmstxt,
-      fullText: llmsFulltxt,
+      fullText: cleanFullText,
       showFullText: showFullText,
     },
   };
@@ -33,6 +33,14 @@ export async function getLlmsTextFromCache(
       return null;
     }
 
+    // Check if data is older than 1 week
+    const oneWeekAgo = new Date();
+    oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
+
+    if (!data || new Date(data.updated_at) < oneWeekAgo) {
+      return null;
+    }
+
     return data;
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
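getLlmsTextFromCache now treats anything older than seven days as a miss. The same comparison as a standalone sketch; the row shape is assumed from the fields used in the diff:

    // Assumed shape of a cache row; only updated_at matters for freshness.
    interface LlmTextRow {
      updated_at: string;
    }

    function isFresh(row: LlmTextRow | null): boolean {
      if (!row) return false;
      const oneWeekAgo = new Date();
      oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
      // Rows at or past the cutoff count as fresh; older ones force a regenerate.
      return new Date(row.updated_at) >= oneWeekAgo;
    }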
@@ -53,14 +61,15 @@ export async function saveLlmsTextToCache(
   const originUrl = normalizeUrlOnlyHostname(url);
 
   try {
-    // First check if there's an existing entry with fewer URLs
+    // First check if there's an existing entry
     const { data: existingData } = await supabase_service
       .from("llm_texts")
       .select("*")
       .eq("origin_url", originUrl)
       .single();
 
-    // Always update the entry for the origin URL
+    if (existingData) {
+      // Update existing entry
     const { error } = await supabase_service
       .from("llm_texts")
       .update({
@@ -72,9 +81,27 @@ export async function saveLlmsTextToCache(
       .eq("origin_url", originUrl);
 
     if (error) {
-      logger.error("Error saving LLMs text to cache", { error, originUrl });
+      logger.error("Error updating LLMs text in cache", { error, originUrl });
     } else {
-      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
+      logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls });
+      }
+    } else {
+      // Insert new entry
+      const { error } = await supabase_service
+        .from("llm_texts")
+        .insert({
+          origin_url: originUrl,
+          llmstxt,
+          llmstxt_full,
+          max_urls: maxUrls,
+          updated_at: new Date().toISOString(),
+        });
+
+      if (error) {
+        logger.error("Error inserting LLMs text to cache", { error, originUrl });
+      } else {
+        logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls });
+      }
     }
   } catch (error) {
     logger.error("Failed to save LLMs text to cache", { error, originUrl });
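The explicit select-then-update/insert keeps the cache logic independent of any database constraint. If llm_texts had a unique index on origin_url, a single supabase-js upsert could collapse both branches; a sketch of that alternative, assuming such a constraint exists (it is not shown in this commit):

    // Alternative under an assumed unique constraint on origin_url.
    const { error } = await supabase_service
      .from("llm_texts")
      .upsert(
        {
          origin_url: originUrl,
          llmstxt,
          llmstxt_full,
          max_urls: maxUrls,
          updated_at: new Date().toISOString(),
        },
        { onConflict: "origin_url" },
      );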