mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 05:56:00 +08:00
Nick: llmstxt improvements
parent d4cf2269ed
commit acf1e60608
@@ -28,6 +28,19 @@ const DescriptionSchema = z.object({
   title: z.string(),
 });
 
+// Helper function to remove page separators
+function removePageSeparators(text: string): string {
+  return text.replace(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g, '');
+}
+
+// Helper function to limit pages in full text
+function limitPages(fullText: string, maxPages: number): string {
+  const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
+  // First element is the header, so we start from index 1
+  const limitedPages = pages.slice(0, maxPages + 1);
+  return limitedPages.join('');
+}
+
 export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
   const openai = new OpenAI();
   const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
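A minimal sketch of how the two new helpers compose, assuming removePageSeparators and limitPages are in scope as defined above; the three-page document below is invented for illustration:

    // Invented input: a header plus three pages tagged with the separator token.
    const fullText =
      "# example.com llms-full.txt\n\n" +
      "<|firecrawl-page-1-lllmstxt|>\n## Page One\n...\n\n" +
      "<|firecrawl-page-2-lllmstxt|>\n## Page Two\n...\n\n" +
      "<|firecrawl-page-3-lllmstxt|>\n## Page Three\n...\n\n";

    // split() consumes the separators, so this keeps the header (element 0)
    // plus the first two pages and drops "## Page Three".
    const limited = limitPages(fullText, 2);

    // After split-based limiting no separators remain, so this second pass is
    // a no-op here; it matters for text that never went through limitPages.
    const clean = removePageSeparators(limited);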
@@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
 
+    // Limit pages and remove separators before returning
+    const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
+    const cleanFullText = removePageSeparators(limitedFullText);
+
     // Update final result with cached text
     await updateGeneratedLlmsTxt(generationId, {
       status: "completed",
       generatedText: cachedResult.llmstxt,
-      fullText: cachedResult.llmstxt_full,
+      fullText: cleanFullText,
       showFullText: showFullText,
     });
 
     return {
       success: true,
       data: {
         generatedText: cachedResult.llmstxt,
-        fullText: cachedResult.llmstxt_full,
+        fullText: cleanFullText,
         showFullText: showFullText,
       },
     };
@@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
       if (!result) continue;
 
       llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
-      llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
+      llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
     }
 
     // Update progress after each batch
     await updateGeneratedLlmsTxt(generationId, {
       status: "processing",
       generatedText: llmstxt,
-      fullText: llmsFulltxt,
+      fullText: removePageSeparators(llmsFulltxt),
     });
   }
 
   // After successful generation, save to cache
   await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
 
+  // Limit pages and remove separators before final update
+  const limitedFullText = limitPages(llmsFulltxt, maxUrls);
+  const cleanFullText = removePageSeparators(limitedFullText);
+
   // Update final result with both generated text and full text
   await updateGeneratedLlmsTxt(generationId, {
     status: "completed",
     generatedText: llmstxt,
-    fullText: llmsFulltxt,
+    fullText: cleanFullText,
     showFullText: showFullText,
   });
 
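The separator token embeds a 1-based global page number, and its spelling ("lllmstxt", with three l's) matches the regexes in the helpers above, so limiting and stripping both find it. A minimal sketch of the numbering, assuming i is the global offset of the first document in the current batch, as the surrounding loop suggests:

    // Hypothetical batch of two results whose first element sits at offset i = 2.
    const i = 2;
    const batchResults = [{ title: "A" }, { title: "B" }];
    for (const result of batchResults) {
      // indexOf gives the position within the batch, so pages number 3, then 4.
      const page = i + batchResults.indexOf(result) + 1;
      console.log(`<|firecrawl-page-${page}-lllmstxt|>`);
    }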
@@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
     success: true,
     data: {
       generatedText: llmstxt,
-      fullText: llmsFulltxt,
+      fullText: cleanFullText,
       showFullText: showFullText,
     },
   };
@@ -33,6 +33,14 @@ export async function getLlmsTextFromCache(
       return null;
     }
 
+    // Check if data is older than 1 week
+    const oneWeekAgo = new Date();
+    oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
+
+    if (!data || new Date(data.updated_at) < oneWeekAgo) {
+      return null;
+    }
+
     return data;
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
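getLlmsTextFromCache now treats anything older than seven days as a miss. The same comparison as a standalone sketch; the row shape is assumed from the fields used in the diff:

    // Assumed shape of a cache row; only updated_at matters for freshness.
    interface LlmTextRow {
      updated_at: string;
    }

    function isFresh(row: LlmTextRow | null): boolean {
      if (!row) return false;
      const oneWeekAgo = new Date();
      oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
      // Rows at or past the cutoff count as fresh; older ones force a regenerate.
      return new Date(row.updated_at) >= oneWeekAgo;
    }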
@@ -53,14 +61,15 @@ export async function saveLlmsTextToCache(
   const originUrl = normalizeUrlOnlyHostname(url);
 
   try {
-    // First check if there's an existing entry with fewer URLs
+    // First check if there's an existing entry
     const { data: existingData } = await supabase_service
       .from("llm_texts")
       .select("*")
       .eq("origin_url", originUrl)
       .single();
 
-    // Always update the entry for the origin URL
+    if (existingData) {
+      // Update existing entry
     const { error } = await supabase_service
       .from("llm_texts")
       .update({
@@ -72,9 +81,27 @@ export async function saveLlmsTextToCache(
       .eq("origin_url", originUrl);
 
     if (error) {
-      logger.error("Error saving LLMs text to cache", { error, originUrl });
+      logger.error("Error updating LLMs text in cache", { error, originUrl });
     } else {
-      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
+      logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls });
+      }
+    } else {
+      // Insert new entry
+      const { error } = await supabase_service
+        .from("llm_texts")
+        .insert({
+          origin_url: originUrl,
+          llmstxt,
+          llmstxt_full,
+          max_urls: maxUrls,
+          updated_at: new Date().toISOString(),
+        });
+
+      if (error) {
+        logger.error("Error inserting LLMs text to cache", { error, originUrl });
+      } else {
+        logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls });
+      }
     }
   } catch (error) {
     logger.error("Failed to save LLMs text to cache", { error, originUrl });
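The explicit select-then-update/insert keeps the cache logic independent of any database constraint. If llm_texts had a unique index on origin_url, a single supabase-js upsert could collapse both branches; a sketch of that alternative, assuming such a constraint exists (it is not shown in this commit):

    // Alternative under an assumed unique constraint on origin_url.
    const { error } = await supabase_service
      .from("llm_texts")
      .upsert(
        {
          origin_url: originUrl,
          llmstxt,
          llmstxt_full,
          max_urls: maxUrls,
          updated_at: new Date().toISOString(),
        },
        { onConflict: "origin_url" },
      );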