Add llmstxt generator endpoint (#1201)

* Nick:

* Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)"

This reverts commit 586a10f40d354a038afc2b67809f20a7a829f8cb.

* Update deep-research-service.ts

* Nick:

* init

* part 2

* Update generate-llmstxt-service.ts

* Fix queue

* Update queue-worker.ts

* Almost there

* Final touches

* Update requests.http

* final touches

* Update requests.http

* Improve logging

* Change endpoint to /llmstxt

* Update queue-worker.ts

* Update generate-llmstxt-service.ts

* Nick: cache

* Update index.ts

* Update firecrawl.py

* Update package.json

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
Eric Ciarla 2025-02-19 12:42:33 -05:00 committed by GitHub
parent e373fab5c1
commit d984b50400
14 changed files with 867 additions and 2 deletions

View File

@@ -100,3 +100,23 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
###
DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Generate LLMs TXT
# @name llmsTxt
POST {{baseUrl}}/v1/llmstxt HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl.dev",
  "maxUrls": 2,
  "showFullText": false
}
### Check Generate LLMs TXT Status
@llmsTxtId = {{llmsTxt.response.body.$.id}}
# @name llmsTxtStatus
GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
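
For clients without a REST-client extension, the same start-then-poll flow can be driven from any HTTP library. A minimal TypeScript sketch, assuming Node 18+ with global fetch; BASE_URL and API_KEY are placeholders, not values from this commit:

// Sketch: start an llms.txt generation job, then poll until it settles.
const BASE_URL = "https://api.firecrawl.dev"; // placeholder
const API_KEY = process.env.TEST_API_KEY ?? ""; // placeholder

async function runLlmsTxtJob(): Promise<void> {
  const start = await fetch(`${BASE_URL}/v1/llmstxt`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({ url: "https://firecrawl.dev", maxUrls: 2, showFullText: false }),
  });
  const { id } = (await start.json()) as { success: boolean; id: string };

  for (;;) {
    const poll = await fetch(`${BASE_URL}/v1/llmstxt/${id}`, {
      headers: { Authorization: `Bearer ${API_KEY}` },
    });
    const body = (await poll.json()) as { status: string; data?: { llmstxt: string } };
    if (body.status !== "processing") {
      console.log(body.status, body.data?.llmstxt);
      return;
    }
    await new Promise((resolve) => setTimeout(resolve, 2000)); // same 2s interval the SDKs use
  }
}

runLlmsTxtJob().catch(console.error);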

View File

@@ -21,7 +21,7 @@ export async function deepResearchStatusController(
let data: any = null;
- if (research.status === "completed") {
+ if (research.status === "completed" && process.env.USE_DB_AUTHENTICATION === "true") {
const jobData = await supabaseGetJobsById([req.params.jobId]);
if (jobData && jobData.length > 0) {
data = jobData[0].docs[0];

View File

@@ -0,0 +1,41 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import { getGeneratedLlmsTxt, getGeneratedLlmsTxtExpiry } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { supabaseGetJobsById } from "../../lib/supabase-jobs";
export async function generateLLMsTextStatusController(
req: RequestWithAuth<{ jobId: string }, any, any>,
res: Response,
) {
const generation = await getGeneratedLlmsTxt(req.params.jobId);
const showFullText = generation?.showFullText ?? false;
if (!generation) {
return res.status(404).json({
success: false,
error: "llmsTxt generation job not found",
});
}
let data: any = null;
if (showFullText) {
data = {
llmstxt: generation.generatedText,
llmsfulltxt: generation.fullText,
};
} else {
data = {
llmstxt: generation.generatedText,
};
}
return res.status(200).json({
success: generation.status !== "failed",
data: data,
status: generation.status,
error: generation?.error ?? undefined,
expiresAt: (await getGeneratedLlmsTxtExpiry(req.params.jobId)).toISOString(),
});
}
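
For reference, a successful poll of this controller produces a body shaped like the following. This is a hand-written illustration, not captured output; llmsfulltxt appears only when the job was started with showFullText: true:

// Illustrative status body for GET /v1/llmstxt/:jobId (values invented).
const exampleStatusBody = {
  success: true,
  status: "completed" as const,
  data: {
    llmstxt: "# https://firecrawl.dev llms.txt\n\n- [Title](https://firecrawl.dev/): short description",
    llmsfulltxt: "# https://firecrawl.dev llms-full.txt\n\n## Title\nfull page markdown", // only with showFullText: true
  },
  expiresAt: "2025-02-20T17:42:33.000Z",
};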

View File

@@ -0,0 +1,89 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { z } from "zod";
export const generateLLMsTextRequestSchema = z.object({
url: z.string().url().describe('The URL to generate text from'),
maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
__experimental_stream: z.boolean().optional(),
});
export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;
export type GenerateLLMsTextResponse = {
success: boolean;
id: string;
};
/**
* Initiates a text generation job based on the provided URL.
* @param req - The request object containing authentication and generation parameters.
* @param res - The response object to send the generation job ID.
* @returns A promise that resolves when the generation job is queued.
*/
export async function generateLLMsTextController(
req: RequestWithAuth<{}, GenerateLLMsTextResponse, GenerateLLMsTextRequest>,
res: Response<GenerateLLMsTextResponse>,
) {
req.body = generateLLMsTextRequestSchema.parse(req.body);
const generationId = crypto.randomUUID();
const jobData = {
request: req.body,
teamId: req.auth.team_id,
plan: req.auth.plan,
subId: req.acuc?.sub_id,
generationId,
};
await saveGeneratedLlmsTxt(generationId, {
id: generationId,
team_id: req.auth.team_id,
plan: req.auth.plan!, // non-null assertion is safe here since plan is required
createdAt: Date.now(),
status: "processing",
url: req.body.url,
maxUrls: req.body.maxUrls,
showFullText: req.body.showFullText,
generatedText: "",
fullText: "",
});
if (Sentry.isInitialized()) {
const size = JSON.stringify(jobData).length;
await Sentry.startSpan(
{
name: "Add LLMstxt generation job",
op: "queue.publish",
attributes: {
"messaging.message.id": generationId,
"messaging.destination.name": getGenerateLlmsTxtQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await getGenerateLlmsTxtQueue().add(generationId, {
...jobData,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
}, { jobId: generationId });
},
);
} else {
await getGenerateLlmsTxtQueue().add(generationId, jobData, {
jobId: generationId,
});
}
return res.status(200).json({
success: true,
id: generationId,
});
}
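
The Zod schema above is what enforces defaults and bounds before anything is queued. A small sketch of its behavior; the import path is an assumption for illustration:

import { generateLLMsTextRequestSchema } from "./generate-llmstxt"; // hypothetical path

// Defaults are filled in when omitted:
const parsed = generateLLMsTextRequestSchema.parse({ url: "https://firecrawl.dev" });
console.log(parsed.maxUrls, parsed.showFullText); // 10 false

// Out-of-range input is rejected before a job is ever created:
const tooMany = generateLLMsTextRequestSchema.safeParse({ url: "https://firecrawl.dev", maxUrls: 500 });
console.log(tooMany.success); // false, maxUrls is capped at 100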

View File

@@ -8,6 +8,7 @@ import {
getExtractQueue,
getScrapeQueue,
getIndexQueue,
getGenerateLlmsTxtQueue,
getDeepResearchQueue,
} from "./services/queue-service";
import { v0Router } from "./routes/v0";
@@ -55,6 +56,7 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
new BullAdapter(getScrapeQueue()),
new BullAdapter(getExtractQueue()),
new BullAdapter(getIndexQueue()),
new BullAdapter(getGenerateLlmsTxtQueue()),
new BullAdapter(getDeepResearchQueue()),
],
serverAdapter: serverAdapter,

View File

@@ -0,0 +1,70 @@
import { redisConnection } from "../../services/queue-service";
import { logger as _logger } from "../logger";
export interface GenerationData {
id: string;
team_id: string;
plan: string;
createdAt: number;
status: "processing" | "completed" | "failed";
url: string;
maxUrls: number;
showFullText: boolean;
generatedText: string;
fullText: string;
error?: string;
}
// TTL of 24 hours
const GENERATION_TTL = 24 * 60 * 60;
export async function saveGeneratedLlmsTxt(id: string, data: GenerationData): Promise<void> {
_logger.debug("Saving llmstxt generation " + id + " to Redis...");
await redisConnection.set("generation:" + id, JSON.stringify(data));
await redisConnection.expire("generation:" + id, GENERATION_TTL);
}
export async function getGeneratedLlmsTxt(id: string): Promise<GenerationData | null> {
const x = await redisConnection.get("generation:" + id);
return x ? JSON.parse(x) : null;
}
export async function updateGeneratedLlmsTxt(
id: string,
data: Partial<GenerationData>,
): Promise<void> {
const current = await getGeneratedLlmsTxt(id);
if (!current) return;
const updatedGeneration = {
...current,
...data
};
await redisConnection.set("generation:" + id, JSON.stringify(updatedGeneration));
await redisConnection.expire("generation:" + id, GENERATION_TTL);
}
export async function getGeneratedLlmsTxtExpiry(id: string): Promise<Date> {
const d = new Date();
const ttl = await redisConnection.pttl("generation:" + id);
d.setMilliseconds(d.getMilliseconds() + ttl);
d.setMilliseconds(0);
return d;
}
// Convenience method for status updates
export async function updateGeneratedLlmsTxtStatus(
id: string,
status: "processing" | "completed" | "failed",
generatedText?: string,
fullText?: string,
error?: string,
): Promise<void> {
const updates: Partial<GenerationData> = { status };
if (generatedText !== undefined) updates.generatedText = generatedText;
if (fullText !== undefined) updates.fullText = fullText;
if (error !== undefined) updates.error = error;
await updateGeneratedLlmsTxt(id, updates);
}
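
Taken together, these helpers give each generation a 24-hour record in Redis, with the TTL reset on every write. A minimal sketch of the lifecycle as the controller and worker drive it; team and plan values are placeholders:

import {
  saveGeneratedLlmsTxt,
  updateGeneratedLlmsTxtStatus,
  getGeneratedLlmsTxt,
  getGeneratedLlmsTxtExpiry,
} from "./generate-llmstxt-redis";

async function lifecycleSketch(): Promise<void> {
  const id = crypto.randomUUID(); // global in Node 19+, as used in the controller

  // 1. The controller seeds a "processing" record; the 24h TTL starts here.
  await saveGeneratedLlmsTxt(id, {
    id, team_id: "team_placeholder", plan: "hobby", createdAt: Date.now(),
    status: "processing", url: "https://firecrawl.dev", maxUrls: 2,
    showFullText: false, generatedText: "", fullText: "",
  });

  // 2. The worker streams partial text in; each update refreshes the TTL.
  await updateGeneratedLlmsTxtStatus(id, "processing", "# partial llms.txt so far");

  // 3. The status controller reads the record and derives expiry from PTTL.
  const record = await getGeneratedLlmsTxt(id);
  const expiresAt = await getGeneratedLlmsTxtExpiry(id);
  console.log(record?.status, expiresAt.toISOString());
}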

View File

@@ -0,0 +1,174 @@
import { logger as _logger } from "../logger";
import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";
import { getMapResults } from "../../controllers/v1/map";
import { MapResponse, ScrapeResponse, Document } from "../../controllers/v1/types";
import { Response } from "express";
import OpenAI from "openai";
import { zodResponseFormat } from "openai/helpers/zod";
import { z } from "zod";
import { scrapeDocument } from "../extract/document-scraper";
import { PlanType } from "../../types";
import { getLlmsTextFromCache, saveLlmsTextToCache } from "./generate-llmstxt-supabase";
interface GenerateLLMsTextServiceOptions {
generationId: string;
teamId: string;
plan: PlanType;
url: string;
maxUrls: number;
showFullText: boolean;
}
const DescriptionSchema = z.object({
description: z.string(),
title: z.string(),
});
export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
const openai = new OpenAI();
const { generationId, teamId, plan, url, maxUrls, showFullText } = options;
const logger = _logger.child({
module: "generate-llmstxt",
method: "performGenerateLlmsTxt",
generationId,
teamId,
});
try {
// Check cache first
const cachedResult = await getLlmsTextFromCache(url, maxUrls);
if (cachedResult) {
logger.info("Found cached LLMs text", { url });
// Update final result with cached text
await updateGeneratedLlmsTxt(generationId, {
status: "completed",
generatedText: cachedResult.llmstxt,
fullText: cachedResult.llmstxt_full,
showFullText: showFullText,
});
return {
success: true,
data: {
generatedText: cachedResult.llmstxt,
fullText: cachedResult.llmstxt_full,
showFullText: showFullText,
},
};
}
// If not in cache, proceed with generation
// First, get all URLs from the map controller
const mapResult = await getMapResults({
url,
teamId,
plan,
limit: maxUrls,
includeSubdomains: false,
ignoreSitemap: false,
includeMetadata: true,
});
if (!mapResult || !mapResult.links) {
throw new Error(`Failed to map URLs`);
}
_logger.debug("Mapping URLs", mapResult.links);
const urls = mapResult.links;
let llmstxt = `# ${url} llms.txt\n\n`;
let llmsFulltxt = `# ${url} llms-full.txt\n\n`;
// Scrape each URL
for (const url of urls) {
logger.debug(`Scraping URL: ${url}`);
const document = await scrapeDocument(
{
url,
teamId,
plan,
origin: url,
timeout: 30000,
isSingleUrl: true,
},
[],
logger,
{ onlyMainContent: true }
);
if (!document) {
logger.error(`Failed to scrape URL ${url}`);
continue;
}
// Process scraped result
if (!document.markdown) continue;
logger.debug(`Generating description for ${document.metadata?.url}`);
const completion = await openai.beta.chat.completions.parse({
model: "gpt-4o-mini",
messages: [
{
role: "user",
content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`
}
],
response_format: zodResponseFormat(DescriptionSchema, "description")
});
try {
const parsedResponse = completion.choices[0].message.parsed;
const description = parsedResponse!.description;
const title = parsedResponse!.title;
llmstxt += `- [${title}](${document.metadata?.url}): ${description}\n`;
llmsFulltxt += `## ${title}\n${document.markdown}\n\n`;
// Update progress with both generated text and full text
await updateGeneratedLlmsTxt(generationId, {
status: "processing",
generatedText: llmstxt,
fullText: llmsFulltxt,
});
} catch (error) {
logger.error(`Failed to parse LLM response for ${document.metadata?.url}`, { error });
continue;
}
}
// After successful generation, save to cache
await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
// Update final result with both generated text and full text
await updateGeneratedLlmsTxt(generationId, {
status: "completed",
generatedText: llmstxt,
fullText: llmsFulltxt,
showFullText: showFullText,
});
return {
success: true,
data: {
generatedText: llmstxt,
fullText: llmsFulltxt,
showFullText: showFullText,
},
};
} catch (error: any) {
logger.error("Generate LLMs text error", { error });
await updateGeneratedLlmsTxt(generationId, {
status: "failed",
error: error.message || "Unknown error occurred",
});
throw error;
}
}
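
For a sense of what the loop accumulates, the two strings end up shaped like this (an invented fragment; real titles and descriptions come from the gpt-4o-mini calls above):

// Illustrative fragments only; contents are made up for the example.
const llmstxtExample =
  "# https://firecrawl.dev llms.txt\n\n" +
  "- [Firecrawl Home](https://firecrawl.dev/): Turn websites into clean, LLM-ready markdown data\n" +
  "- [Firecrawl Blog](https://firecrawl.dev/blog): Guides and updates on scraping for AI apps\n";

const llmsFulltxtExample =
  "# https://firecrawl.dev llms-full.txt\n\n" +
  "## Firecrawl Home\n(full page markdown here)\n\n";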

View File

@@ -0,0 +1,82 @@
import { supabase_service } from "../../services/supabase";
import { logger } from "../logger";
import { normalizeUrlOnlyHostname } from "../canonical-url";
interface LlmsTextCache {
origin_url: string;
llmstxt: string;
llmstxt_full: string;
max_urls: number;
}
export async function getLlmsTextFromCache(
url: string,
maxUrls: number,
): Promise<LlmsTextCache | null> {
if (process.env.USE_DB_AUTHENTICATION !== "true") {
return null;
}
const originUrl = normalizeUrlOnlyHostname(url);
try {
const { data, error } = await supabase_service
.from("llm_texts")
.select("*")
.eq("origin_url", originUrl)
.gte("max_urls", maxUrls) // Changed to gte since we want cached results with more URLs than requested
.order("updated_at", { ascending: false })
.limit(1)
.single();
if (error) {
return null;
}
return data;
} catch (error) {
logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
return null;
}
}
export async function saveLlmsTextToCache(
url: string,
llmstxt: string,
llmstxt_full: string,
maxUrls: number,
): Promise<void> {
if (process.env.USE_DB_AUTHENTICATION !== "true") {
return;
}
const originUrl = normalizeUrlOnlyHostname(url);
try {
// Update the single cache entry for this origin URL
const { error } = await supabase_service
.from("llm_texts")
.update({
llmstxt,
llmstxt_full,
max_urls: maxUrls,
updated_at: new Date().toISOString(),
})
.eq("origin_url", originUrl);
if (error) {
logger.error("Error saving LLMs text to cache", { error, originUrl });
} else {
logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
}
} catch (error) {
logger.error("Failed to save LLMs text to cache", { error, originUrl });
}
}
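
The gte filter on max_urls means one generous cache entry can serve smaller later requests for the same hostname. A sketch of the effective behavior, assuming USE_DB_AUTHENTICATION is enabled and a row for this hostname already exists (the write path above is an update, not an insert):

import { getLlmsTextFromCache, saveLlmsTextToCache } from "./generate-llmstxt-supabase";

async function cacheSketch(): Promise<void> {
  // Both URLs below normalize to the same hostname via normalizeUrlOnlyHostname.
  await saveLlmsTextToCache("https://firecrawl.dev/blog", "# llms.txt...", "# llms-full.txt...", 50);

  const hit = await getLlmsTextFromCache("https://firecrawl.dev/pricing", 10); // non-null: 10 <= 50
  const miss = await getLlmsTextFromCache("https://firecrawl.dev", 100);       // null: 100 > 50
  console.log(Boolean(hit), Boolean(miss));
}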

View File

@@ -29,6 +29,8 @@ import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
import { generateLLMsTextController } from "../controllers/v1/generate-llmstxt";
import { generateLLMsTextStatusController } from "../controllers/v1/generate-llmstxt-status";
import { deepResearchController } from "../controllers/v1/deep-research";
import { deepResearchStatusController } from "../controllers/v1/deep-research-status";
@@ -242,6 +244,18 @@ v1Router.get(
wrap(extractStatusController),
);
v1Router.post(
"/llmstxt",
authMiddleware(RateLimiterMode.Extract),
wrap(generateLLMsTextController),
);
v1Router.get(
"/llmstxt/:jobId",
authMiddleware(RateLimiterMode.ExtractStatus),
wrap(generateLLMsTextStatusController),
);
v1Router.post(
"/deep-research",
authMiddleware(RateLimiterMode.Extract),

View File

@@ -7,6 +7,7 @@ let extractQueue: Queue;
let loggingQueue: Queue;
let indexQueue: Queue;
let deepResearchQueue: Queue;
let generateLlmsTxtQueue: Queue;
export const redisConnection = new IORedis(process.env.REDIS_URL!, {
maxRetriesPerRequest: null,
@@ -16,6 +17,7 @@ export const scrapeQueueName = "{scrapeQueue}";
export const extractQueueName = "{extractQueue}";
export const loggingQueueName = "{loggingQueue}";
export const indexQueueName = "{indexQueue}";
export const generateLlmsTxtQueueName = "{generateLlmsTxtQueue}";
export const deepResearchQueueName = "{deepResearchQueue}";
export function getScrapeQueue() {
@@ -72,6 +74,24 @@ export function getIndexQueue() {
return indexQueue;
}
export function getGenerateLlmsTxtQueue() {
if (!generateLlmsTxtQueue) {
generateLlmsTxtQueue = new Queue(generateLlmsTxtQueueName, {
connection: redisConnection,
defaultJobOptions: {
removeOnComplete: {
age: 90000, // 25 hours
},
removeOnFail: {
age: 90000, // 25 hours
},
},
});
logger.info("LLMs TXT generation queue created");
}
return generateLlmsTxtQueue;
}
export function getDeepResearchQueue() {
if (!deepResearchQueue) {
deepResearchQueue = new Queue(deepResearchQueueName, {

View File

@@ -11,6 +11,7 @@ import {
extractQueueName,
deepResearchQueueName,
getIndexQueue,
getGenerateLlmsTxtQueue,
} from "./queue-service";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
@@ -69,6 +70,8 @@ import { billTeam } from "./billing/credit_billing";
import { saveCrawlMap } from "./indexing/crawl-maps-index";
import { updateDeepResearch } from "../lib/deep-research/deep-research-redis";
import { performDeepResearch } from "../lib/deep-research/deep-research-service";
import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
configDotenv();
@@ -446,6 +449,76 @@ const processDeepResearchJobInternal = async (
}
};
const processGenerateLlmsTxtJobInternal = async (
token: string,
job: Job & { id: string },
) => {
const logger = _logger.child({
module: "generate-llmstxt-worker",
method: "processJobInternal",
jobId: job.id,
generationId: job.data.generationId,
teamId: job.data?.teamId ?? undefined,
});
const extendLockInterval = setInterval(async () => {
logger.info(`🔄 Worker extending lock on job ${job.id}`);
await job.extendLock(token, jobLockExtensionTime);
}, jobLockExtendInterval);
try {
const result = await performGenerateLlmsTxt({
generationId: job.data.generationId,
teamId: job.data.teamId,
plan: job.data.plan,
url: job.data.request.url,
maxUrls: job.data.request.maxUrls,
showFullText: job.data.request.showFullText,
});
if (result.success) {
await job.moveToCompleted(result, token, false);
await updateGeneratedLlmsTxt(job.data.generationId, {
status: "completed",
generatedText: result.data.generatedText,
fullText: result.data.fullText,
});
return result;
} else {
const error = new Error("LLMs text generation failed without specific error");
await job.moveToFailed(error, token, false);
await updateGeneratedLlmsTxt(job.data.generationId, {
status: "failed",
error: error.message,
});
return { success: false, error: error.message };
}
} catch (error) {
logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });
Sentry.captureException(error, {
data: {
job: job.id,
},
});
try {
await job.moveToFailed(error, token, false);
} catch (e) {
logger.error("Failed to move job to failed state in Redis", { error });
}
await updateGeneratedLlmsTxt(job.data.generationId, {
status: "failed",
error: error.message || "Unknown error occurred",
});
return { success: false, error: error.message || "Unknown error occurred" };
} finally {
clearInterval(extendLockInterval);
}
};
let isShuttingDown = false;
process.on("SIGINT", () => {
@@ -1170,6 +1243,7 @@ async function processJob(job: Job & { id: string }, token: string) {
workerFun(getScrapeQueue(), processJobInternal),
workerFun(getExtractQueue(), processExtractJobInternal),
workerFun(getDeepResearchQueue(), processDeepResearchJobInternal),
workerFun(getGenerateLlmsTxtQueue(), processGenerateLlmsTxtJobInternal),
]);
console.log("All workers exited. Waiting for all jobs to finish...");

View File

@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.18.1",
"version": "1.18.2",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -413,6 +413,48 @@ export interface DeepResearchStatusResponse {
summaries: string[];
}
/**
* Parameters for LLMs.txt generation operations.
*/
export interface GenerateLLMsTextParams {
/**
* Maximum number of URLs to process (1-100)
* @default 10
*/
maxUrls?: number;
/**
* Whether to show the full LLMs-full.txt in the response
* @default false
*/
showFullText?: boolean;
/**
* Experimental flag for streaming
*/
__experimental_stream?: boolean;
}
/**
* Response interface for LLMs.txt generation operations.
*/
export interface GenerateLLMsTextResponse {
success: boolean;
id: string;
}
/**
* Status response interface for LLMs.txt generation operations.
*/
export interface GenerateLLMsTextStatusResponse {
success: boolean;
data: {
llmstxt: string;
llmsfulltxt?: string;
};
status: "processing" | "completed" | "failed";
error?: string;
expiresAt: string;
}
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
@@ -1459,6 +1501,118 @@ export default class FirecrawlApp {
}
return { success: false, error: "Internal server error." };
}
/**
* Generates LLMs.txt for a given URL and polls until completion.
* @param url - The URL to generate LLMs.txt from.
* @param params - Parameters for the LLMs.txt generation operation.
* @returns The final generation results.
*/
async generateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
try {
const response = await this.asyncGenerateLLMsText(url, params);
if (!response.success || 'error' in response) {
return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
}
if (!response.id) {
throw new FirecrawlError(`Failed to start LLMs.txt generation. No job ID returned.`, 500);
}
const jobId = response.id;
let generationStatus;
while (true) {
generationStatus = await this.checkGenerateLLMsTextStatus(jobId);
if ('error' in generationStatus && !generationStatus.success) {
return generationStatus;
}
if (generationStatus.status === "completed") {
return generationStatus;
}
if (generationStatus.status === "failed") {
throw new FirecrawlError(
`LLMs.txt generation job ${generationStatus.status}. Error: ${generationStatus.error}`,
500
);
}
if (generationStatus.status !== "processing") {
break;
}
await new Promise(resolve => setTimeout(resolve, 2000));
}
return { success: false, error: "LLMs.txt generation job terminated unexpectedly" };
} catch (error: any) {
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
}
}
/**
* Initiates an LLMs.txt generation operation without polling.
* @param url - The URL to generate LLMs.txt from.
* @param params - Parameters for the LLMs.txt generation operation.
* @returns The response containing the generation job ID.
*/
async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
const headers = this.prepareHeaders();
try {
const response: AxiosResponse = await this.postRequest(
`${this.apiUrl}/v1/llmstxt`,
{ url, ...params },
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "start LLMs.txt generation");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
/**
* Checks the status of an LLMs.txt generation operation.
* @param id - The ID of the LLMs.txt generation operation.
* @returns The current status and results of the generation operation.
*/
async checkGenerateLLMsTextStatus(id: string): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/llmstxt/${id}`,
headers
);
if (response.status === 200) {
return response.data;
} else if (response.status === 404) {
throw new FirecrawlError("LLMs.txt generation job not found", 404);
} else {
this.handleError(response, "check LLMs.txt generation status");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
}
interface CrawlWatcherEvents {
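
Typical usage of the three SDK methods added above, as a sketch (ESM with top-level await; the API key is a placeholder):

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" }); // placeholder key

// One-shot: starts the job and polls every 2s until it completes or fails.
const result = await app.generateLLMsText("https://firecrawl.dev", { maxUrls: 2 });
if (result.success && "data" in result) console.log(result.data.llmstxt);

// Or manage polling yourself with the async pair:
const started = await app.asyncGenerateLLMsText("https://firecrawl.dev", { maxUrls: 2 });
if (started.success && "id" in started) {
  const status = await app.checkGenerateLLMsTextStatus(started.id);
  console.log(status);
}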

View File

@@ -33,6 +33,14 @@ class SearchParams(pydantic.BaseModel):
timeout: Optional[int] = 60000
scrapeOptions: Optional[Dict[str, Any]] = None
class GenerateLLMsTextParams(pydantic.BaseModel):
"""
Parameters for the LLMs.txt generation operation.
"""
maxUrls: Optional[int] = 10
showFullText: Optional[bool] = False
__experimental_stream: Optional[bool] = None
class FirecrawlApp:
class SearchResponse(pydantic.BaseModel):
"""
@@ -756,6 +764,123 @@ class FirecrawlApp:
except Exception as e:
raise ValueError(str(e), 500)
def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
"""
Generate LLMs.txt for a given URL and poll until completion.
Args:
url (str): The URL to generate LLMs.txt from.
params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
Returns:
Dict[str, Any]: A dictionary containing the generation results. The structure includes:
- 'success' (bool): Indicates if the generation was successful.
- 'status' (str): The final status of the generation job.
- 'data' (Dict): The generated LLMs.txt data.
- 'error' (Optional[str]): Error message if the generation failed.
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
Raises:
Exception: If the generation job fails or an error occurs during status checks.
"""
if params is None:
params = {}
if isinstance(params, dict):
generation_params = GenerateLLMsTextParams(**params)
else:
generation_params = params
response = self.async_generate_llms_text(url, generation_params)
if not response.get('success') or 'id' not in response:
return response
job_id = response['id']
while True:
status = self.check_generate_llms_text_status(job_id)
if status['status'] == 'completed':
return status
elif status['status'] == 'failed':
raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
elif status['status'] != 'processing':
break
time.sleep(2) # Polling interval
return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
"""
Initiate an asynchronous LLMs.txt generation operation.
Args:
url (str): The URL to generate LLMs.txt from.
params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
Returns:
Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes:
- 'success' (bool): Indicates if the generation initiation was successful.
- 'id' (str): The unique identifier for the generation job.
Raises:
Exception: If the generation job initiation fails.
"""
if params is None:
params = {}
if isinstance(params, dict):
generation_params = GenerateLLMsTextParams(**params)
else:
generation_params = params
headers = self._prepare_headers()
json_data = {'url': url, **generation_params.dict(exclude_none=True)}
try:
response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
if response.status_code == 200:
try:
return response.json()
except:
raise Exception('Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, 'start LLMs.txt generation')
except Exception as e:
raise ValueError(str(e))
return {'success': False, 'error': 'Internal server error'}
def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]:
"""
Check the status of an LLMs.txt generation operation.
Args:
id (str): The ID of the LLMs.txt generation operation.
Returns:
Dict[str, Any]: The current status and results of the generation operation.
Raises:
Exception: If the status check fails.
"""
headers = self._prepare_headers()
try:
response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
if response.status_code == 200:
try:
return response.json()
except:
raise Exception('Failed to parse Firecrawl response as JSON.')
elif response.status_code == 404:
raise Exception('LLMs.txt generation job not found')
else:
self._handle_error(response, 'check LLMs.txt generation status')
except Exception as e:
raise ValueError(str(e))
return {'success': False, 'error': 'Internal server error'}
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.