mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 17:59:00 +08:00
Nick: llmstxt improvements
parent d4cf2269ed
commit acf1e60608
The diff touches two files: the llms.txt generation service and its Supabase cache helpers. In the generation service, two helpers are added to manage the new page-separator tokens:

```diff
@@ -28,6 +28,19 @@ const DescriptionSchema = z.object({
   title: z.string(),
 });
 
+// Helper function to remove page separators
+function removePageSeparators(text: string): string {
+  return text.replace(/<\|firecrawl-page-\d+-lllmstxt\|>\n/g, '');
+}
+
+// Helper function to limit pages in full text
+function limitPages(fullText: string, maxPages: number): string {
+  const pages = fullText.split(/<\|firecrawl-page-\d+-lllmstxt\|>\n/);
+  // First element is the header, so we start from index 1
+  const limitedPages = pages.slice(0, maxPages + 1);
+  return limitedPages.join('');
+}
+
 export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
   const openai = new OpenAI();
   const { generationId, teamId, plan, url, maxUrls, showFullText, subId } = options;
```
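To make the two helpers concrete, here is a minimal sketch; the sample text and page count are assumptions for illustration, not from the commit. Note that `limitPages` already discards the separator tokens, since `String.prototype.split` drops matched delimiters when the regex has no capture groups, so the later `removePageSeparators` pass is defensive rather than strictly required:

```ts
// Sketch only: sample input is assumed, not taken from the commit.
const sample =
  "# example.com llms-full.txt\n\n" +
  "<|firecrawl-page-1-lllmstxt|>\n## Page One\nBody one\n\n" +
  "<|firecrawl-page-2-lllmstxt|>\n## Page Two\nBody two\n\n";

const limited = limitPages(sample, 1);       // keeps the header plus the first page
const clean = removePageSeparators(limited); // no separators left here, but safe to call

console.log(clean);
// "# example.com llms-full.txt\n\n## Page One\nBody one\n\n"
```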
```diff
@@ -45,20 +58,23 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
 
+    // Limit pages and remove separators before returning
+    const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
+    const cleanFullText = removePageSeparators(limitedFullText);
+
     // Update final result with cached text
     await updateGeneratedLlmsTxt(generationId, {
       status: "completed",
       generatedText: cachedResult.llmstxt,
-      fullText: cachedResult.llmstxt_full,
+      fullText: cleanFullText,
       showFullText: showFullText,
     });
 
 
     return {
       success: true,
       data: {
         generatedText: cachedResult.llmstxt,
-        fullText: cachedResult.llmstxt_full,
+        fullText: cleanFullText,
         showFullText: showFullText,
       },
     };
```
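The practical effect on the cache-hit path: `llmstxt_full` is stored with its separator tokens intact, so one cached entry can serve requests for fewer pages by trimming at read time. A sketch, with the page count assumed:

```ts
// Sketch: a cache entry generated with more pages than requested is trimmed
// on read; maxUrls = 3 here is an assumed value for illustration.
const cleanFullText = removePageSeparators(
  limitPages(cachedResult.llmstxt_full, 3), // header + at most 3 pages
);
```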
```diff
@@ -144,25 +160,29 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
       if (!result) continue;
 
       llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
-      llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
+      llmsFulltxt += `<|firecrawl-page-${i + batchResults.indexOf(result) + 1}-lllmstxt|>\n## ${result.title}\n${result.markdown}\n\n`;
     }
 
     // Update progress after each batch
     await updateGeneratedLlmsTxt(generationId, {
       status: "processing",
       generatedText: llmstxt,
-      fullText: llmsFulltxt,
+      fullText: removePageSeparators(llmsFulltxt),
     });
   }
 
   // After successful generation, save to cache
   await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
 
+  // Limit pages and remove separators before final update
+  const limitedFullText = limitPages(llmsFulltxt, maxUrls);
+  const cleanFullText = removePageSeparators(limitedFullText);
+
   // Update final result with both generated text and full text
   await updateGeneratedLlmsTxt(generationId, {
     status: "completed",
     generatedText: llmstxt,
-    fullText: llmsFulltxt,
+    fullText: cleanFullText,
     showFullText: showFullText,
   });
 
```
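Two details in this hunk are worth spelling out. The token is spelled `lllmstxt` (three l's) both where it is emitted and in the regexes that strip it, so writer and readers stay in sync; and the page number combines the batch offset `i` with the page's position inside the current batch. A small sketch verifying the writer/reader agreement (the `token` helper is mine, not the commit's):

```ts
// Sketch: confirm the emitted token matches the separator regex used by the
// helpers above. The token() helper below is hypothetical, for illustration.
const SEPARATOR = /<\|firecrawl-page-\d+-lllmstxt\|>\n/;
const token = (n: number) => `<|firecrawl-page-${n}-lllmstxt|>\n`;

console.log(SEPARATOR.test(token(7))); // true: writer and readers agree
```

One hedged caveat: `batchResults.indexOf(result)` relies on reference identity, which holds here because `result` is drawn from `batchResults` itself.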
```diff
@@ -197,7 +217,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
       success: true,
       data: {
         generatedText: llmstxt,
-        fullText: llmsFulltxt,
+        fullText: cleanFullText,
         showFullText: showFullText,
       },
     };
```
In the Supabase cache helpers, the read path now treats entries older than one week as stale:

```diff
@@ -33,6 +33,14 @@ export async function getLlmsTextFromCache(
       return null;
     }
 
+    // Check if data is older than 1 week
+    const oneWeekAgo = new Date();
+    oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
+
+    if (!data || new Date(data.updated_at) < oneWeekAgo) {
+      return null;
+    }
+
     return data;
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
```
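The staleness rule reads a little more clearly as a standalone predicate; a sketch, where the helper name is mine rather than the commit's:

```ts
// Sketch: the diff's one-week expiry as a reusable predicate.
// isStale is a hypothetical name, not part of the commit.
function isStale(updatedAt: string, maxAgeDays = 7): boolean {
  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - maxAgeDays);
  return new Date(updatedAt) < cutoff;
}

// Usage mirroring the diff: missing rows and week-old rows are both misses.
// if (!data || isStale(data.updated_at)) return null;
const eightDaysAgo = new Date(Date.now() - 8 * 24 * 60 * 60 * 1000);
console.log(isStale(eightDaysAgo.toISOString())); // true
```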
The save path, which previously always issued an UPDATE, now branches between updating an existing row and inserting a new one:

```diff
@@ -53,28 +61,47 @@ export async function saveLlmsTextToCache(
   const originUrl = normalizeUrlOnlyHostname(url);
 
   try {
-    // First check if there's an existing entry with fewer URLs
+    // First check if there's an existing entry
     const { data: existingData } = await supabase_service
       .from("llm_texts")
       .select("*")
       .eq("origin_url", originUrl)
       .single();
 
-    // Always update the entry for the origin URL
-    const { error } = await supabase_service
-      .from("llm_texts")
-      .update({
-        llmstxt,
-        llmstxt_full,
-        max_urls: maxUrls,
-        updated_at: new Date().toISOString(),
-      })
-      .eq("origin_url", originUrl);
+    if (existingData) {
+      // Update existing entry
+      const { error } = await supabase_service
+        .from("llm_texts")
+        .update({
+          llmstxt,
+          llmstxt_full,
+          max_urls: maxUrls,
+          updated_at: new Date().toISOString(),
+        })
+        .eq("origin_url", originUrl);
 
-    if (error) {
-      logger.error("Error saving LLMs text to cache", { error, originUrl });
+      if (error) {
+        logger.error("Error updating LLMs text in cache", { error, originUrl });
+      } else {
+        logger.debug("Successfully updated cached LLMs text", { originUrl, maxUrls });
+      }
     } else {
-      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
+      // Insert new entry
+      const { error } = await supabase_service
+        .from("llm_texts")
+        .insert({
+          origin_url: originUrl,
+          llmstxt,
+          llmstxt_full,
+          max_urls: maxUrls,
+          updated_at: new Date().toISOString(),
+        });
+
+      if (error) {
+        logger.error("Error inserting LLMs text to cache", { error, originUrl });
+      } else {
+        logger.debug("Successfully inserted new cached LLMs text", { originUrl, maxUrls });
+      }
     }
   } catch (error) {
     logger.error("Failed to save LLMs text to cache", { error, originUrl });
```
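A design note, offered tentatively: the select-then-branch pattern above leaves a small read/write race between concurrent generations of the same origin. If `origin_url` carries a unique constraint, supabase-js can express the same save as a single atomic upsert. A sketch, not the commit's code:

```ts
// Sketch: equivalent save as one upsert, assuming a UNIQUE constraint on
// llm_texts.origin_url. Not part of the commit.
const { error } = await supabase_service
  .from("llm_texts")
  .upsert(
    {
      origin_url: originUrl,
      llmstxt,
      llmstxt_full,
      max_urls: maxUrls,
      updated_at: new Date().toISOString(),
    },
    { onConflict: "origin_url" },
  );

if (error) {
  logger.error("Error upserting LLMs text to cache", { error, originUrl });
}
```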