Add llmstxt generator endpoint (#1201)
* Nick:
* Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)"

  This reverts commit 586a10f40d354a038afc2b67809f20a7a829f8cb.

* Update deep-research-service.ts
* Nick:
* init
* part 2
* Update generate-llmstxt-service.ts
* Fix queue
* Update queue-worker.ts
* Almost there
* Final touches
* Update requests.http
* final touches
* Update requests.http
* Improve logging
* Change endpoint to /llmstxt
* Update queue-worker.ts
* Update generate-llmstxt-service.ts
* Nick: cache
* Update index.ts
* Update firecrawl.py
* Update package.json

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in:
parent
e373fab5c1
commit
d984b50400
apps/api/requests.http
@@ -100,3 +100,23 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}

###

DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}

### Generate LLMs TXT
# @name llmsTxt
POST {{baseUrl}}/v1/llmstxt HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
    "url": "https://firecrawl.dev",
    "maxUrls": 2,
    "showFullText": false
}

### Check Generate LLMs TXT Status
@llmsTxtId = {{llmsTxt.response.body.$.id}}
# @name llmsTxtStatus
GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
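The same flow as the requests above, expressed as a raw-HTTP sketch in TypeScript (Node 18+ fetch; the base URL and API key are placeholders, not values from this commit):

// Start an llms.txt generation job, then poll its status until it settles.
const baseUrl = "https://api.firecrawl.dev";      // placeholder deployment
const headers = {
  Authorization: "Bearer fc-YOUR-API-KEY",        // placeholder key
  "content-type": "application/json",
};

const started = await fetch(`${baseUrl}/v1/llmstxt`, {
  method: "POST",
  headers,
  body: JSON.stringify({ url: "https://firecrawl.dev", maxUrls: 2, showFullText: false }),
}).then((r) => r.json());                         // -> { success: true, id: "<generationId>" }

let status: any;
do {
  await new Promise((r) => setTimeout(r, 2000));  // poll every 2s, like the SDKs below
  status = await fetch(`${baseUrl}/v1/llmstxt/${started.id}`, { headers }).then((r) => r.json());
} while (status.status === "processing");

console.log(status.status, status.data?.llmstxt);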
apps/api/src/controllers/v1/deep-research-status.ts
@@ -21,7 +21,7 @@ export async function deepResearchStatusController(

  let data: any = null;

-  if (research.status === "completed") {
+  if (research.status === "completed" && process.env.USE_DB_AUTHENTICATION === "true") {
    const jobData = await supabaseGetJobsById([req.params.jobId]);
    if (jobData && jobData.length > 0) {
      data = jobData[0].docs[0];
apps/api/src/controllers/v1/generate-llmstxt-status.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import { getGeneratedLlmsTxt, getGeneratedLlmsTxtExpiry } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { supabaseGetJobsById } from "../../lib/supabase-jobs";

export async function generateLLMsTextStatusController(
  req: RequestWithAuth<{ jobId: string }, any, any>,
  res: Response,
) {
  const generation = await getGeneratedLlmsTxt(req.params.jobId);
  const showFullText = generation?.showFullText ?? false;

  if (!generation) {
    return res.status(404).json({
      success: false,
      error: "llmsTxt generation job not found",
    });
  }

  let data: any = null;

  if (showFullText) {
    data = {
      llmstxt: generation.generatedText,
      llmsfulltxt: generation.fullText,
    };
  } else {
    data = {
      llmstxt: generation.generatedText,
    };
  }

  return res.status(200).json({
    success: generation.status === "failed" ? false : true,
    data: data,
    status: generation.status,
    error: generation?.error ?? undefined,
    expiresAt: (await getGeneratedLlmsTxtExpiry(req.params.jobId)).toISOString(),
  });
}
apps/api/src/controllers/v1/generate-llmstxt.ts (new file, 89 lines)
@@ -0,0 +1,89 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { z } from "zod";

export const generateLLMsTextRequestSchema = z.object({
  url: z.string().url().describe('The URL to generate text from'),
  maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
  showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
  __experimental_stream: z.boolean().optional(),
});

export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;

export type GenerateLLMsTextResponse = {
  success: boolean;
  id: string;
};

/**
 * Initiates a text generation job based on the provided URL.
 * @param req - The request object containing authentication and generation parameters.
 * @param res - The response object to send the generation job ID.
 * @returns A promise that resolves when the generation job is queued.
 */
export async function generateLLMsTextController(
  req: RequestWithAuth<{}, GenerateLLMsTextResponse, GenerateLLMsTextRequest>,
  res: Response<GenerateLLMsTextResponse>,
) {
  req.body = generateLLMsTextRequestSchema.parse(req.body);

  const generationId = crypto.randomUUID();
  const jobData = {
    request: req.body,
    teamId: req.auth.team_id,
    plan: req.auth.plan,
    subId: req.acuc?.sub_id,
    generationId,
  };

  await saveGeneratedLlmsTxt(generationId, {
    id: generationId,
    team_id: req.auth.team_id,
    plan: req.auth.plan!, // Add non-null assertion since plan is required
    createdAt: Date.now(),
    status: "processing",
    url: req.body.url,
    maxUrls: req.body.maxUrls,
    showFullText: req.body.showFullText,
    generatedText: "",
    fullText: "",
  });

  if (Sentry.isInitialized()) {
    const size = JSON.stringify(jobData).length;
    await Sentry.startSpan(
      {
        name: "Add LLMstxt generation job",
        op: "queue.publish",
        attributes: {
          "messaging.message.id": generationId,
          "messaging.destination.name": getGenerateLlmsTxtQueue().name,
          "messaging.message.body.size": size,
        },
      },
      async (span) => {
        await getGenerateLlmsTxtQueue().add(
          generationId,
          {
            ...jobData,
            sentry: {
              trace: Sentry.spanToTraceHeader(span),
              baggage: Sentry.spanToBaggageHeader(span),
              size,
            },
          },
          { jobId: generationId },
        );
      },
    );
  } else {
    await getGenerateLlmsTxtQueue().add(generationId, jobData, {
      jobId: generationId,
    });
  }

  return res.status(200).json({
    success: true,
    id: generationId,
  });
}
apps/api/src/index.ts
@@ -8,6 +8,7 @@ import {
  getExtractQueue,
  getScrapeQueue,
  getIndexQueue,
  getGenerateLlmsTxtQueue,
  getDeepResearchQueue,
} from "./services/queue-service";
import { v0Router } from "./routes/v0";
@@ -55,6 +56,7 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
    new BullAdapter(getScrapeQueue()),
    new BullAdapter(getExtractQueue()),
    new BullAdapter(getIndexQueue()),
    new BullAdapter(getGenerateLlmsTxtQueue()),
    new BullAdapter(getDeepResearchQueue()),
  ],
  serverAdapter: serverAdapter,
apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts (new file, 70 lines)
@@ -0,0 +1,70 @@
import { redisConnection } from "../../services/queue-service";
import { logger as _logger } from "../logger";

export interface GenerationData {
  id: string;
  team_id: string;
  plan: string;
  createdAt: number;
  status: "processing" | "completed" | "failed";
  url: string;
  maxUrls: number;
  showFullText: boolean;
  generatedText: string;
  fullText: string;
  error?: string;
}

// TTL of 24 hours
const GENERATION_TTL = 24 * 60 * 60;

export async function saveGeneratedLlmsTxt(id: string, data: GenerationData): Promise<void> {
  _logger.debug("Saving llmstxt generation " + id + " to Redis...");
  await redisConnection.set("generation:" + id, JSON.stringify(data));
  await redisConnection.expire("generation:" + id, GENERATION_TTL);
}

export async function getGeneratedLlmsTxt(id: string): Promise<GenerationData | null> {
  const x = await redisConnection.get("generation:" + id);
  return x ? JSON.parse(x) : null;
}

export async function updateGeneratedLlmsTxt(
  id: string,
  data: Partial<GenerationData>,
): Promise<void> {
  const current = await getGeneratedLlmsTxt(id);
  if (!current) return;

  const updatedGeneration = {
    ...current,
    ...data,
  };

  await redisConnection.set("generation:" + id, JSON.stringify(updatedGeneration));
  await redisConnection.expire("generation:" + id, GENERATION_TTL);
}

export async function getGeneratedLlmsTxtExpiry(id: string): Promise<Date> {
  const d = new Date();
  const ttl = await redisConnection.pttl("generation:" + id);
  d.setMilliseconds(d.getMilliseconds() + ttl);
  d.setMilliseconds(0);
  return d;
}

// Convenience method for status updates
export async function updateGeneratedLlmsTxtStatus(
  id: string,
  status: "processing" | "completed" | "failed",
  generatedText?: string,
  fullText?: string,
  error?: string,
): Promise<void> {
  const updates: Partial<GenerationData> = { status };
  if (generatedText !== undefined) updates.generatedText = generatedText;
  if (fullText !== undefined) updates.fullText = fullText;
  if (error !== undefined) updates.error = error;

  await updateGeneratedLlmsTxt(id, updates);
}
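A short sketch of how these helpers compose over a job's lifetime (the team and plan values are illustrative; every write refreshes the 24-hour TTL):

import { randomUUID } from "node:crypto";
import {
  saveGeneratedLlmsTxt,
  updateGeneratedLlmsTxtStatus,
  getGeneratedLlmsTxt,
  getGeneratedLlmsTxtExpiry,
} from "./generate-llmstxt-redis";

const id = randomUUID();

// Seed the record, as the controller does when it enqueues a job.
await saveGeneratedLlmsTxt(id, {
  id,
  team_id: "team_123",   // illustrative
  plan: "standard",      // illustrative
  createdAt: Date.now(),
  status: "processing",
  url: "https://example.com",
  maxUrls: 10,
  showFullText: false,
  generatedText: "",
  fullText: "",
});

// Later, flip the status and attach results in one call.
await updateGeneratedLlmsTxtStatus(id, "completed", "# llms.txt ...", "# llms-full.txt ...");

console.log((await getGeneratedLlmsTxt(id))?.status);              // "completed"
console.log((await getGeneratedLlmsTxtExpiry(id)).toISOString());  // roughly 24h after the last write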
apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts (new file, 174 lines)
@@ -0,0 +1,174 @@
import { logger as _logger } from "../logger";
import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";
import { getMapResults } from "../../controllers/v1/map";
import { MapResponse, ScrapeResponse, Document } from "../../controllers/v1/types";
import { Response } from "express";
import OpenAI from "openai";
import { zodResponseFormat } from "openai/helpers/zod";
import { z } from "zod";
import { scrapeDocument } from "../extract/document-scraper";
import { PlanType } from "../../types";
import { getLlmsTextFromCache, saveLlmsTextToCache } from "./generate-llmstxt-supabase";

interface GenerateLLMsTextServiceOptions {
  generationId: string;
  teamId: string;
  plan: PlanType;
  url: string;
  maxUrls: number;
  showFullText: boolean;
}

const DescriptionSchema = z.object({
  description: z.string(),
  title: z.string(),
});

export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
  const openai = new OpenAI();
  const { generationId, teamId, plan, url, maxUrls, showFullText } = options;

  const logger = _logger.child({
    module: "generate-llmstxt",
    method: "performGenerateLlmsTxt",
    generationId,
    teamId,
  });

  try {
    // Check cache first
    const cachedResult = await getLlmsTextFromCache(url, maxUrls);
    if (cachedResult) {
      logger.info("Found cached LLMs text", { url });

      // Update final result with cached text
      await updateGeneratedLlmsTxt(generationId, {
        status: "completed",
        generatedText: cachedResult.llmstxt,
        fullText: cachedResult.llmstxt_full,
        showFullText: showFullText,
      });

      return {
        success: true,
        data: {
          generatedText: cachedResult.llmstxt,
          fullText: cachedResult.llmstxt_full,
          showFullText: showFullText,
        },
      };
    }

    // If not in cache, proceed with generation
    // First, get all URLs from the map controller
    const mapResult = await getMapResults({
      url,
      teamId,
      plan,
      limit: maxUrls,
      includeSubdomains: false,
      ignoreSitemap: false,
      includeMetadata: true,
    });

    if (!mapResult || !mapResult.links) {
      throw new Error(`Failed to map URLs`);
    }

    _logger.debug("Mapping URLs", mapResult.links);

    const urls = mapResult.links;
    let llmstxt = `# ${url} llms.txt\n\n`;
    let llmsFulltxt = `# ${url} llms-full.txt\n\n`;

    // Scrape each URL
    for (const url of urls) {
      _logger.debug(`Scraping URL: ${url}`);
      const document = await scrapeDocument(
        {
          url,
          teamId,
          plan,
          origin: url,
          timeout: 30000,
          isSingleUrl: true,
        },
        [],
        logger,
        { onlyMainContent: true },
      );

      if (!document) {
        logger.error(`Failed to scrape URL ${url}`);
        continue;
      }

      // Process scraped result
      if (!document.markdown) continue;

      _logger.debug(`Generating description for ${document.metadata?.url}`);

      const completion = await openai.beta.chat.completions.parse({
        model: "gpt-4o-mini",
        messages: [
          {
            role: "user",
            content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`,
          },
        ],
        response_format: zodResponseFormat(DescriptionSchema, "description"),
      });

      try {
        const parsedResponse = completion.choices[0].message.parsed;
        const description = parsedResponse!.description;
        const title = parsedResponse!.title;

        llmstxt += `- [${title}](${document.metadata?.url}): ${description}\n`;
        llmsFulltxt += `## ${title}\n${document.markdown}\n\n`;

        // Update progress with both generated text and full text
        await updateGeneratedLlmsTxt(generationId, {
          status: "processing",
          generatedText: llmstxt,
          fullText: llmsFulltxt,
        });
      } catch (error) {
        logger.error(`Failed to parse LLM response for ${document.metadata?.url}`, { error });
        continue;
      }
    }

    // After successful generation, save to cache
    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);

    // Update final result with both generated text and full text
    await updateGeneratedLlmsTxt(generationId, {
      status: "completed",
      generatedText: llmstxt,
      fullText: llmsFulltxt,
      showFullText: showFullText,
    });

    return {
      success: true,
      data: {
        generatedText: llmstxt,
        fullText: llmsFulltxt,
        showFullText: showFullText,
      },
    };
  } catch (error: any) {
    logger.error("Generate LLMs text error", { error });

    await updateGeneratedLlmsTxt(generationId, {
      status: "failed",
      error: error.message || "Unknown error occurred",
    });

    throw error;
  }
}
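Given the template strings above, the finished document is plain Markdown. A hypothetical llms.txt for https://firecrawl.dev with maxUrls: 2 would look roughly like:

# https://firecrawl.dev llms.txt

- [Firecrawl Home](https://firecrawl.dev): Turn websites into clean, LLM-ready markdown via one API.
- [Firecrawl Pricing](https://firecrawl.dev/pricing): Plans, credits, and pricing for the Firecrawl API.

The llms-full.txt variant starts with `# https://firecrawl.dev llms-full.txt` and repeats each title as a `## Title` heading followed by that page's full scraped markdown. The titles and descriptions shown here are illustrative, not real model output.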
apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts (new file, 82 lines)
@@ -0,0 +1,82 @@
import { supabase_service } from "../../services/supabase";
import { logger } from "../logger";
import { normalizeUrlOnlyHostname } from "../canonical-url";

interface LlmsTextCache {
  origin_url: string;
  llmstxt: string;
  llmstxt_full: string;
  max_urls: number;
}

export async function getLlmsTextFromCache(
  url: string,
  maxUrls: number,
): Promise<LlmsTextCache | null> {
  if (process.env.USE_DB_AUTHENTICATION !== "true") {
    return null;
  }

  const originUrl = normalizeUrlOnlyHostname(url);

  try {
    const { data, error } = await supabase_service
      .from("llm_texts")
      .select("*")
      .eq("origin_url", originUrl)
      .gte("max_urls", maxUrls) // Changed to gte since we want cached results with more URLs than requested
      .order("updated_at", { ascending: false })
      .limit(1)
      .single();

    if (error) {
      return null;
    }

    return data;
  } catch (error) {
    logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
    return null;
  }
}

export async function saveLlmsTextToCache(
  url: string,
  llmstxt: string,
  llmstxt_full: string,
  maxUrls: number,
): Promise<void> {
  if (process.env.USE_DB_AUTHENTICATION !== "true") {
    return;
  }

  const originUrl = normalizeUrlOnlyHostname(url);

  try {
    // First check if there's an existing entry with fewer URLs
    const { data: existingData } = await supabase_service
      .from("llm_texts")
      .select("*")
      .eq("origin_url", originUrl)
      .single();

    // Always update the entry for the origin URL
    const { error } = await supabase_service
      .from("llm_texts")
      .update({
        llmstxt,
        llmstxt_full,
        max_urls: maxUrls,
        updated_at: new Date().toISOString(),
      })
      .eq("origin_url", originUrl);

    if (error) {
      logger.error("Error saving LLMs text to cache", { error, originUrl });
    } else {
      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
    }
  } catch (error) {
    logger.error("Failed to save LLMs text to cache", { error, originUrl });
  }
}
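One caveat worth noting: supabase-js `.update()` only modifies rows that already exist, so the first generation for a new origin_url is never written to the cache (the `existingData` lookup above is also unused). A minimal sketch of an insert-or-update variant, reusing the module's `supabase_service` import and assuming `llm_texts.origin_url` carries a unique constraint; this is a suggested alternative, not part of the commit:

// Hypothetical drop-in replacement for the .update() call in saveLlmsTextToCache.
// Assumes a UNIQUE constraint on llm_texts.origin_url so upsert can match on it.
const { error } = await supabase_service
  .from("llm_texts")
  .upsert(
    {
      origin_url: originUrl,
      llmstxt,
      llmstxt_full,
      max_urls: maxUrls,
      updated_at: new Date().toISOString(),
    },
    { onConflict: "origin_url" },
  );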
apps/api/src/routes/v1.ts
@@ -29,6 +29,8 @@ import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
import { generateLLMsTextController } from "../controllers/v1/generate-llmstxt";
import { generateLLMsTextStatusController } from "../controllers/v1/generate-llmstxt-status";
import { deepResearchController } from "../controllers/v1/deep-research";
import { deepResearchStatusController } from "../controllers/v1/deep-research-status";

@@ -242,6 +244,18 @@ v1Router.get(
  wrap(extractStatusController),
);

v1Router.post(
  "/llmstxt",
  authMiddleware(RateLimiterMode.Extract),
  wrap(generateLLMsTextController),
);

v1Router.get(
  "/llmstxt/:jobId",
  authMiddleware(RateLimiterMode.ExtractStatus),
  wrap(generateLLMsTextStatusController),
);

v1Router.post(
  "/deep-research",
  authMiddleware(RateLimiterMode.Extract),
apps/api/src/services/queue-service.ts
@@ -7,6 +7,7 @@ let extractQueue: Queue;
let loggingQueue: Queue;
let indexQueue: Queue;
let deepResearchQueue: Queue;
let generateLlmsTxtQueue: Queue;

export const redisConnection = new IORedis(process.env.REDIS_URL!, {
  maxRetriesPerRequest: null,
@@ -16,6 +17,7 @@ export const scrapeQueueName = "{scrapeQueue}";
export const extractQueueName = "{extractQueue}";
export const loggingQueueName = "{loggingQueue}";
export const indexQueueName = "{indexQueue}";
export const generateLlmsTxtQueueName = "{generateLlmsTxtQueue}";
export const deepResearchQueueName = "{deepResearchQueue}";

export function getScrapeQueue() {
@@ -72,6 +74,24 @@ export function getIndexQueue() {
  return indexQueue;
}

export function getGenerateLlmsTxtQueue() {
  if (!generateLlmsTxtQueue) {
    generateLlmsTxtQueue = new Queue(generateLlmsTxtQueueName, {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
          age: 90000, // 25 hours
        },
        removeOnFail: {
          age: 90000, // 25 hours
        },
      },
    });
    logger.info("LLMs TXT generation queue created");
  }
  return generateLlmsTxtQueue;
}

export function getDeepResearchQueue() {
  if (!deepResearchQueue) {
    deepResearchQueue = new Queue(deepResearchQueueName, {
apps/api/src/services/queue-worker.ts
@@ -11,6 +11,7 @@ import {
  extractQueueName,
  deepResearchQueueName,
  getIndexQueue,
  getGenerateLlmsTxtQueue,
} from "./queue-service";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
@@ -69,6 +70,8 @@ import { billTeam } from "./billing/credit_billing";
import { saveCrawlMap } from "./indexing/crawl-maps-index";
import { updateDeepResearch } from "../lib/deep-research/deep-research-redis";
import { performDeepResearch } from "../lib/deep-research/deep-research-service";
import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";

configDotenv();

@@ -446,6 +449,76 @@ const processDeepResearchJobInternal = async (
  }
};

const processGenerateLlmsTxtJobInternal = async (
  token: string,
  job: Job & { id: string },
) => {
  const logger = _logger.child({
    module: "generate-llmstxt-worker",
    method: "processJobInternal",
    jobId: job.id,
    generateId: job.data.generateId,
    teamId: job.data?.teamId ?? undefined,
  });

  const extendLockInterval = setInterval(async () => {
    logger.info(`🔄 Worker extending lock on job ${job.id}`);
    await job.extendLock(token, jobLockExtensionTime);
  }, jobLockExtendInterval);

  try {
    const result = await performGenerateLlmsTxt({
      generationId: job.data.generationId,
      teamId: job.data.teamId,
      plan: job.data.plan,
      url: job.data.request.url,
      maxUrls: job.data.request.maxUrls,
      showFullText: job.data.request.showFullText,
    });

    if (result.success) {
      await job.moveToCompleted(result, token, false);
      await updateGeneratedLlmsTxt(job.data.generateId, {
        status: "completed",
        generatedText: result.data.generatedText,
        fullText: result.data.fullText,
      });
      return result;
    } else {
      const error = new Error("LLMs text generation failed without specific error");
      await job.moveToFailed(error, token, false);
      await updateGeneratedLlmsTxt(job.data.generateId, {
        status: "failed",
        error: error.message,
      });
      return { success: false, error: error.message };
    }
  } catch (error) {
    logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });

    Sentry.captureException(error, {
      data: {
        job: job.id,
      },
    });

    try {
      await job.moveToFailed(error, token, false);
    } catch (e) {
      logger.error("Failed to move job to failed state in Redis", { error });
    }

    await updateGeneratedLlmsTxt(job.data.generateId, {
      status: "failed",
      error: error.message || "Unknown error occurred",
    });

    return { success: false, error: error.message || "Unknown error occurred" };
  } finally {
    clearInterval(extendLockInterval);
  }
};

let isShuttingDown = false;

process.on("SIGINT", () => {
@@ -1170,6 +1243,7 @@ async function processJob(job: Job & { id: string }, token: string) {
    workerFun(getScrapeQueue(), processJobInternal),
    workerFun(getExtractQueue(), processExtractJobInternal),
    workerFun(getDeepResearchQueue(), processDeepResearchJobInternal),
    workerFun(getGenerateLlmsTxtQueue(), processGenerateLlmsTxtJobInternal),
  ]);

  console.log("All workers exited. Waiting for all jobs to finish...");
apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
{
  "name": "@mendable/firecrawl-js",
- "version": "1.18.1",
+ "version": "1.18.2",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
apps/js-sdk/firecrawl/src/index.ts
@@ -413,6 +413,48 @@ export interface DeepResearchStatusResponse {
  summaries: string[];
}

/**
 * Parameters for LLMs.txt generation operations.
 */
export interface GenerateLLMsTextParams {
  /**
   * Maximum number of URLs to process (1-100)
   * @default 10
   */
  maxUrls?: number;
  /**
   * Whether to show the full LLMs-full.txt in the response
   * @default false
   */
  showFullText?: boolean;
  /**
   * Experimental flag for streaming
   */
  __experimental_stream?: boolean;
}

/**
 * Response interface for LLMs.txt generation operations.
 */
export interface GenerateLLMsTextResponse {
  success: boolean;
  id: string;
}

/**
 * Status response interface for LLMs.txt generation operations.
 */
export interface GenerateLLMsTextStatusResponse {
  success: boolean;
  data: {
    llmstxt: string;
    llmsfulltxt?: string;
  };
  status: "processing" | "completed" | "failed";
  error?: string;
  expiresAt: string;
}

/**
 * Main class for interacting with the Firecrawl API.
 * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -1459,6 +1501,118 @@ export default class FirecrawlApp {
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Generates LLMs.txt for a given URL and polls until completion.
   * @param url - The URL to generate LLMs.txt from.
   * @param params - Parameters for the LLMs.txt generation operation.
   * @returns The final generation results.
   */
  async generateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
    try {
      const response = await this.asyncGenerateLLMsText(url, params);

      if (!response.success || 'error' in response) {
        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
      }

      if (!response.id) {
        throw new FirecrawlError(`Failed to start LLMs.txt generation. No job ID returned.`, 500);
      }

      const jobId = response.id;
      let generationStatus;

      while (true) {
        generationStatus = await this.checkGenerateLLMsTextStatus(jobId);

        if ('error' in generationStatus && !generationStatus.success) {
          return generationStatus;
        }

        if (generationStatus.status === "completed") {
          return generationStatus;
        }

        if (generationStatus.status === "failed") {
          throw new FirecrawlError(
            `LLMs.txt generation job ${generationStatus.status}. Error: ${generationStatus.error}`,
            500,
          );
        }

        if (generationStatus.status !== "processing") {
          break;
        }

        await new Promise(resolve => setTimeout(resolve, 2000));
      }

      return { success: false, error: "LLMs.txt generation job terminated unexpectedly" };
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
    }
  }

  /**
   * Initiates a LLMs.txt generation operation without polling.
   * @param url - The URL to generate LLMs.txt from.
   * @param params - Parameters for the LLMs.txt generation operation.
   * @returns The response containing the generation job ID.
   */
  async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.postRequest(
        `${this.apiUrl}/v1/llmstxt`,
        { url, ...params },
        headers,
      );

      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start LLMs.txt generation");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Checks the status of a LLMs.txt generation operation.
   * @param id - The ID of the LLMs.txt generation operation.
   * @returns The current status and results of the generation operation.
   */
  async checkGenerateLLMsTextStatus(id: string): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.getRequest(
        `${this.apiUrl}/v1/llmstxt/${id}`,
        headers,
      );

      if (response.status === 200) {
        return response.data;
      } else if (response.status === 404) {
        throw new FirecrawlError("LLMs.txt generation job not found", 404);
      } else {
        this.handleError(response, "check LLMs.txt generation status");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }
}

interface CrawlWatcherEvents {
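A minimal usage sketch for the new JS SDK surface (the API key is a placeholder):

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

// generateLLMsText() starts the job and polls every 2s until it completes or fails.
const result = await app.generateLLMsText("https://firecrawl.dev", {
  maxUrls: 2,
  showFullText: true,
});

if (result.success && "data" in result) {
  console.log(result.data.llmstxt);      // the generated llms.txt
  console.log(result.data.llmsfulltxt);  // present because showFullText was requested
}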
apps/python-sdk/firecrawl/firecrawl.py
@@ -33,6 +33,14 @@ class SearchParams(pydantic.BaseModel):
    timeout: Optional[int] = 60000
    scrapeOptions: Optional[Dict[str, Any]] = None

class GenerateLLMsTextParams(pydantic.BaseModel):
    """
    Parameters for the LLMs.txt generation operation.
    """
    maxUrls: Optional[int] = 10
    showFullText: Optional[bool] = False
    __experimental_stream: Optional[bool] = None

class FirecrawlApp:
    class SearchResponse(pydantic.BaseModel):
        """
@@ -756,6 +764,123 @@ class FirecrawlApp:
        except Exception as e:
            raise ValueError(str(e), 500)

    def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
        """
        Generate LLMs.txt for a given URL and poll until completion.

        Args:
            url (str): The URL to generate LLMs.txt from.
            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.

        Returns:
            Dict[str, Any]: A dictionary containing the generation results. The structure includes:
                - 'success' (bool): Indicates if the generation was successful.
                - 'status' (str): The final status of the generation job.
                - 'data' (Dict): The generated LLMs.txt data.
                - 'error' (Optional[str]): Error message if the generation failed.
                - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.

        Raises:
            Exception: If the generation job fails or an error occurs during status checks.
        """
        if params is None:
            params = {}

        if isinstance(params, dict):
            generation_params = GenerateLLMsTextParams(**params)
        else:
            generation_params = params

        response = self.async_generate_llms_text(url, generation_params)
        if not response.get('success') or 'id' not in response:
            return response

        job_id = response['id']
        while True:
            status = self.check_generate_llms_text_status(job_id)

            if status['status'] == 'completed':
                return status
            elif status['status'] == 'failed':
                raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
            elif status['status'] != 'processing':
                break

            time.sleep(2)  # Polling interval

        return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}

    def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
        """
        Initiate an asynchronous LLMs.txt generation operation.

        Args:
            url (str): The URL to generate LLMs.txt from.
            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.

        Returns:
            Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes:
                - 'success' (bool): Indicates if the generation initiation was successful.
                - 'id' (str): The unique identifier for the generation job.

        Raises:
            Exception: If the generation job initiation fails.
        """
        if params is None:
            params = {}

        if isinstance(params, dict):
            generation_params = GenerateLLMsTextParams(**params)
        else:
            generation_params = params

        headers = self._prepare_headers()
        json_data = {'url': url, **generation_params.dict(exclude_none=True)}

        try:
            response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
            if response.status_code == 200:
                try:
                    return response.json()
                except:
                    raise Exception('Failed to parse Firecrawl response as JSON.')
            else:
                self._handle_error(response, 'start LLMs.txt generation')
        except Exception as e:
            raise ValueError(str(e))

        return {'success': False, 'error': 'Internal server error'}

    def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]:
        """
        Check the status of a LLMs.txt generation operation.

        Args:
            id (str): The ID of the LLMs.txt generation operation.

        Returns:
            Dict[str, Any]: The current status and results of the generation operation.

        Raises:
            Exception: If the status check fails.
        """
        headers = self._prepare_headers()
        try:
            response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
            if response.status_code == 200:
                try:
                    return response.json()
                except:
                    raise Exception('Failed to parse Firecrawl response as JSON.')
            elif response.status_code == 404:
                raise Exception('LLMs.txt generation job not found')
            else:
                self._handle_error(response, 'check LLMs.txt generation status')
        except Exception as e:
            raise ValueError(str(e))

        return {'success': False, 'error': 'Internal server error'}

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.