Update generate-llmstxt-service.ts

This commit is contained in:
Nicolas 2025-02-19 15:50:59 -03:00
parent f5de803a9d
commit d4cf2269ed

View File

@@ -53,6 +53,7 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
      showFullText: showFullText,
    });

    return {
      success: true,
      data: {
@@ -86,62 +87,72 @@ export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOpt
let llmsFulltxt = `# ${url} llms-full.txt\n\n`; let llmsFulltxt = `# ${url} llms-full.txt\n\n`;
// Scrape each URL // Process URLs in batches of 10
for (const url of urls) { for (let i = 0; i < urls.length; i += 10) {
_logger.debug(`Scraping URL: ${url}`); const batch = urls.slice(i, i + 10);
const document = await scrapeDocument(
{
url,
teamId,
plan,
origin: url,
timeout: 30000,
isSingleUrl: true,
},
[],
logger,
{ onlyMainContent: true }
);
if (!document) { const batchResults = await Promise.all(batch.map(async (url) => {
logger.error(`Failed to scrape URL ${url}`); _logger.debug(`Scraping URL: ${url}`);
continue; try {
} const document = await scrapeDocument(
{
url,
teamId,
plan,
origin: url,
timeout: 30000,
isSingleUrl: true,
},
[],
logger,
{ onlyMainContent: true }
);
// Process scraped result if (!document || !document.markdown) {
if (!document.markdown) continue; logger.error(`Failed to scrape URL ${url}`);
return null;
_logger.debug(`Generating description for ${document.metadata?.url}`);
const completion = await openai.beta.chat.completions.parse({
model: "gpt-4o-mini",
messages: [
{
role: "user",
content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`
} }
],
response_format: zodResponseFormat(DescriptionSchema, "description")
});
try { _logger.debug(`Generating description for ${document.metadata?.url}`);
const parsedResponse = completion.choices[0].message.parsed;
const description = parsedResponse!.description;
const title = parsedResponse!.title;
llmstxt += `- [${title}](${document.metadata?.url}): ${description}\n`; const completion = await openai.beta.chat.completions.parse({
llmsFulltxt += `## ${title}\n${document.markdown}\n\n`; model: "gpt-4o-mini",
messages: [
{
role: "user",
content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`
}
],
response_format: zodResponseFormat(DescriptionSchema, "description")
});
// Update progress with both generated text and full text const parsedResponse = completion.choices[0].message.parsed;
await updateGeneratedLlmsTxt(generationId, { return {
status: "processing", title: parsedResponse!.title,
generatedText: llmstxt, description: parsedResponse!.description,
fullText: llmsFulltxt, url: document.metadata?.url,
}); markdown: document.markdown
} catch (error) { };
logger.error(`Failed to parse LLM response for ${document.metadata?.url}`, { error }); } catch (error) {
continue; logger.error(`Failed to process URL ${url}`, { error });
return null;
}
}));
// Process successful results from batch
for (const result of batchResults) {
if (!result) continue;
llmstxt += `- [${result.title}](${result.url}): ${result.description}\n`;
llmsFulltxt += `## ${result.title}\n${result.markdown}\n\n`;
} }
// Update progress after each batch
await updateGeneratedLlmsTxt(generationId, {
status: "processing",
generatedText: llmstxt,
fullText: llmsFulltxt,
});
} }
  // After successful generation, save to cache