mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 04:18:59 +08:00
Truncate llmstxt cache based on maxurls limit & improve maxurls handling (#1285)
* init * Update generate-llmstxt-service.ts
This commit is contained in:
parent
1beadf39f0
commit
5a1886936c
@ -102,21 +102,21 @@ DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
|
|||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
### Generate LLMs TXT
|
### Generate LLMs TXT
|
||||||
# @name llmsTxt
|
# @name generateLlmsTxt
|
||||||
POST {{baseUrl}}/v1/llmstxt HTTP/1.1
|
POST {{baseUrl}}/v1/llmstxt HTTP/1.1
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
"url": "https://firecrawl.dev",
|
"url": "https://firecrawl.dev",
|
||||||
"maxUrls": 2,
|
"maxUrls": 1,
|
||||||
"showFullText": false
|
"showFullText": false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
### Check Generate LLMs TXT Status
|
### Check Generate LLMs TXT Status
|
||||||
@llmsTxtId = {{llmsTxt.response.body.$.id}}
|
@generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}}
|
||||||
# @name llmsTxtStatus
|
# @name generateLlmsTxtStatus
|
||||||
GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
|
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
|
@ -41,10 +41,30 @@ function limitPages(fullText: string, maxPages: number): string {
|
|||||||
return limitedPages.join("");
|
return limitedPages.join("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Truncates an llms.txt document to at most `maxEntries` link entries.
 *
 * The document is expected to contain a header line (the first line starting
 * with `#`) and any number of entry lines (lines starting with `- [`). The
 * result is the header, a blank line, and the first `maxEntries` entries
 * joined with `\n`; every other line of the input is discarded.
 *
 * @param llmstxt - The llms.txt text to truncate. LF and CRLF line endings
 *   are both accepted; the rebuilt text always uses LF.
 * @param maxEntries - Maximum number of entries to keep. Negative values are
 *   treated as 0 so they never trim entries from the end of the list.
 * @returns The truncated text, or `llmstxt` unchanged when it has no header
 *   line.
 */
function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
  // Accept CRLF as well as LF so a stray '\r' never sticks to the header or
  // to an entry in the rebuilt text.
  const lines = llmstxt.split(/\r?\n/);

  // The header is the first line that starts with '#'.
  const header = lines.find((line) => line.startsWith('#'));
  if (header === undefined) return llmstxt;

  // Entries are the markdown list items that begin with a link.
  const entries = lines.filter((line) => line.startsWith('- ['));

  // Array.prototype.slice interprets a negative end as an offset from the
  // end of the array, so clamp to 0 to make "keep none" mean exactly that.
  const limit = Math.max(0, maxEntries);
  const limitedEntries = entries.slice(0, limit);

  // Reconstruct the text: header, blank line, then the kept entries.
  return `${header}\n\n${limitedEntries.join('\n')}`;
}
|
||||||
|
|
||||||
export async function performGenerateLlmsTxt(
|
export async function performGenerateLlmsTxt(
|
||||||
options: GenerateLLMsTextServiceOptions,
|
options: GenerateLLMsTextServiceOptions,
|
||||||
) {
|
) {
|
||||||
const { generationId, teamId, plan, url, maxUrls, showFullText, subId } =
|
const { generationId, teamId, plan, url, maxUrls = 100, showFullText, subId } =
|
||||||
options;
|
options;
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
const logger = _logger.child({
|
const logger = _logger.child({
|
||||||
@ -55,19 +75,25 @@ export async function performGenerateLlmsTxt(
|
|||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
// Enforce max URL limit
|
||||||
|
const effectiveMaxUrls = Math.min(maxUrls, 5000);
|
||||||
|
|
||||||
// Check cache first
|
// Check cache first
|
||||||
const cachedResult = await getLlmsTextFromCache(url, maxUrls);
|
const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
|
||||||
if (cachedResult) {
|
if (cachedResult) {
|
||||||
logger.info("Found cached LLMs text", { url });
|
logger.info("Found cached LLMs text", { url });
|
||||||
|
|
||||||
// Limit pages and remove separators before returning
|
// Limit pages and remove separators before returning
|
||||||
const limitedFullText = limitPages(cachedResult.llmstxt_full, maxUrls);
|
const limitedFullText = limitPages(cachedResult.llmstxt_full, effectiveMaxUrls);
|
||||||
const cleanFullText = removePageSeparators(limitedFullText);
|
const cleanFullText = removePageSeparators(limitedFullText);
|
||||||
|
|
||||||
|
// Limit llmstxt entries to match maxUrls
|
||||||
|
const limitedLlmsTxt = limitLlmsTxtEntries(cachedResult.llmstxt, effectiveMaxUrls);
|
||||||
|
|
||||||
// Update final result with cached text
|
// Update final result with cached text
|
||||||
await updateGeneratedLlmsTxt(generationId, {
|
await updateGeneratedLlmsTxt(generationId, {
|
||||||
status: "completed",
|
status: "completed",
|
||||||
generatedText: cachedResult.llmstxt,
|
generatedText: limitedLlmsTxt,
|
||||||
fullText: cleanFullText,
|
fullText: cleanFullText,
|
||||||
showFullText: showFullText,
|
showFullText: showFullText,
|
||||||
});
|
});
|
||||||
@ -75,7 +101,7 @@ export async function performGenerateLlmsTxt(
|
|||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: {
|
data: {
|
||||||
generatedText: cachedResult.llmstxt,
|
generatedText: limitedLlmsTxt,
|
||||||
fullText: cleanFullText,
|
fullText: cleanFullText,
|
||||||
showFullText: showFullText,
|
showFullText: showFullText,
|
||||||
},
|
},
|
||||||
@ -88,7 +114,7 @@ export async function performGenerateLlmsTxt(
|
|||||||
url,
|
url,
|
||||||
teamId,
|
teamId,
|
||||||
plan,
|
plan,
|
||||||
limit: maxUrls,
|
limit: effectiveMaxUrls,
|
||||||
includeSubdomains: false,
|
includeSubdomains: false,
|
||||||
ignoreSitemap: false,
|
ignoreSitemap: false,
|
||||||
includeMetadata: true,
|
includeMetadata: true,
|
||||||
@ -177,10 +203,10 @@ export async function performGenerateLlmsTxt(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// After successful generation, save to cache
|
// After successful generation, save to cache
|
||||||
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
|
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, effectiveMaxUrls);
|
||||||
|
|
||||||
// Limit pages and remove separators before final update
|
// Limit pages and remove separators before final update
|
||||||
const limitedFullText = limitPages(llmsFulltxt, maxUrls);
|
const limitedFullText = limitPages(llmsFulltxt, effectiveMaxUrls);
|
||||||
const cleanFullText = removePageSeparators(limitedFullText);
|
const cleanFullText = removePageSeparators(limitedFullText);
|
||||||
|
|
||||||
// Update final result with both generated text and full text
|
// Update final result with both generated text and full text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user