From d984b50400c8fd9dd154b672d650434a205c717c Mon Sep 17 00:00:00 2001
From: Eric Ciarla <43451761+ericciarla@users.noreply.github.com>
Date: Wed, 19 Feb 2025 12:42:33 -0500
Subject: [PATCH] Add llmstxt generator endpoint (#1201)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Nick:

* Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)"

This reverts commit 586a10f40d354a038afc2b67809f20a7a829f8cb.

* Update deep-research-service.ts

* Nick:

* init

* part 2

* Update generate-llmstxt-service.ts

* Fix queue

* Update queue-worker.ts

* Almost there

* Final touches

* Update requests.http

* final touches

* Update requests.http

* Improve logging

* Change endpoint to /llmstxt

* Update queue-worker.ts

* Update generate-llmstxt-service.ts

* Nick: cache

* Update index.ts

* Update firecrawl.py

* Update package.json

---------

Co-authored-by: Nicolas
Co-authored-by: Gergő Móricz
---
 apps/api/requests.http                        |  20 ++
 .../controllers/v1/deep-research-status.ts    |   2 +-
 .../controllers/v1/generate-llmstxt-status.ts |  41 +++++
 .../src/controllers/v1/generate-llmstxt.ts    |  89 +++++++++
 apps/api/src/index.ts                         |   2 +
 .../generate-llmstxt-redis.ts                 |  70 +++++++
 .../generate-llmstxt-service.ts               | 174 ++++++++++++++++++
 .../generate-llmstxt-supabase.ts              |  82 +++++++++
 apps/api/src/routes/v1.ts                     |  14 ++
 apps/api/src/services/queue-service.ts        |  20 ++
 apps/api/src/services/queue-worker.ts         |  74 ++++++++
 apps/js-sdk/firecrawl/package.json            |   2 +-
 apps/js-sdk/firecrawl/src/index.ts            | 154 ++++++++++++++++
 apps/python-sdk/firecrawl/firecrawl.py        | 125 +++++++++++++
 14 files changed, 867 insertions(+), 2 deletions(-)
 create mode 100644 apps/api/src/controllers/v1/generate-llmstxt-status.ts
 create mode 100644 apps/api/src/controllers/v1/generate-llmstxt.ts
 create mode 100644 apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts
 create mode 100644 apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
 create mode 100644 apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts

diff --git a/apps/api/requests.http b/apps/api/requests.http
index c9495cf8..6308738a 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -100,3 +100,23 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
 ###
 DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
+### Generate LLMs TXT
+# @name llmsTxt
+POST {{baseUrl}}/v1/llmstxt HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl.dev",
+    "maxUrls": 2,
+    "showFullText": false
+}
+
+
+### Check Generate LLMs TXT Status
+@llmsTxtId = {{llmsTxt.response.body.$.id}}
+# @name llmsTxtStatus
+GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
diff --git a/apps/api/src/controllers/v1/deep-research-status.ts b/apps/api/src/controllers/v1/deep-research-status.ts
index ea6e9165..43e99630 100644
--- a/apps/api/src/controllers/v1/deep-research-status.ts
+++ b/apps/api/src/controllers/v1/deep-research-status.ts
@@ -21,7 +21,7 @@ export async function deepResearchStatusController(
 
   let data: any = null;
 
-  if (research.status === "completed") {
+  if (research.status === "completed" && process.env.USE_DB_AUTHENTICATION === "true") {
     const jobData = await supabaseGetJobsById([req.params.jobId]);
     if (jobData && jobData.length > 0) {
       data = jobData[0].docs[0];
diff --git a/apps/api/src/controllers/v1/generate-llmstxt-status.ts b/apps/api/src/controllers/v1/generate-llmstxt-status.ts
new file mode 100644
index 00000000..e8c2ff0c
--- /dev/null
+++ b/apps/api/src/controllers/v1/generate-llmstxt-status.ts
@@ -0,0 +1,41 @@
+import { Response } from "express";
+import { RequestWithAuth } from "./types";
+import { getGeneratedLlmsTxt, getGeneratedLlmsTxtExpiry } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
+import { supabaseGetJobsById } from "../../lib/supabase-jobs";
+
+export async function generateLLMsTextStatusController(
+  req: RequestWithAuth<{ jobId: string }, any, any>,
+  res: Response,
+) {
+  const generation = await getGeneratedLlmsTxt(req.params.jobId);
+  const showFullText = generation?.showFullText ?? false;
+
+  if (!generation) {
+    return res.status(404).json({
+      success: false,
+      error: "llmsTxt generation job not found",
+    });
+  }
+
+  let data: any = null;
+
+  if (showFullText) {
+    data = {
+      llmstxt: generation.generatedText,
+      llmsfulltxt: generation.fullText,
+    };
+  } else {
+    data = {
+      llmstxt: generation.generatedText,
+    };
+  }
+
+  return res.status(200).json({
+    success: generation.status === "failed" ? false : true,
+
+    data: data,
+    status: generation.status,
+    error: generation?.error ?? undefined,
+    expiresAt: (await getGeneratedLlmsTxtExpiry(req.params.jobId)).toISOString(),
+  });
+}
\ No newline at end of file
diff --git a/apps/api/src/controllers/v1/generate-llmstxt.ts b/apps/api/src/controllers/v1/generate-llmstxt.ts
new file mode 100644
index 00000000..e7618f72
--- /dev/null
+++ b/apps/api/src/controllers/v1/generate-llmstxt.ts
@@ -0,0 +1,89 @@
+import { Response } from "express";
+import { RequestWithAuth } from "./types";
+import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
+import * as Sentry from "@sentry/node";
+import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
+import { z } from "zod";
+
+export const generateLLMsTextRequestSchema = z.object({
+  url: z.string().url().describe('The URL to generate text from'),
+  maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
+  showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
+  __experimental_stream: z.boolean().optional(),
+});
+
+export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;
+
+export type GenerateLLMsTextResponse = {
+  success: boolean;
+  id: string;
+};
+
+/**
+ * Initiates a text generation job based on the provided URL.
+ * @param req - The request object containing authentication and generation parameters.
+ * @param res - The response object to send the generation job ID.
+ * @returns A promise that resolves when the generation job is queued.
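+ * The generated text itself is retrieved via GET /v1/llmstxt/:jobId.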
+ */
+export async function generateLLMsTextController(
+  req: RequestWithAuth<{}, GenerateLLMsTextResponse, GenerateLLMsTextRequest>,
+  res: Response,
+) {
+  req.body = generateLLMsTextRequestSchema.parse(req.body);
+
+  const generationId = crypto.randomUUID();
+  const jobData = {
+    request: req.body,
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+    subId: req.acuc?.sub_id,
+    generationId,
+  };
+
+  await saveGeneratedLlmsTxt(generationId, {
+    id: generationId,
+    team_id: req.auth.team_id,
+    plan: req.auth.plan!, // non-null assertion: plan is required here
+    createdAt: Date.now(),
+    status: "processing",
+    url: req.body.url,
+    maxUrls: req.body.maxUrls,
+    showFullText: req.body.showFullText,
+    generatedText: "",
+    fullText: "",
+  });
+
+  if (Sentry.isInitialized()) {
+    const size = JSON.stringify(jobData).length;
+    await Sentry.startSpan(
+      {
+        name: "Add LLMstxt generation job",
+        op: "queue.publish",
+        attributes: {
+          "messaging.message.id": generationId,
+          "messaging.destination.name": getGenerateLlmsTxtQueue().name,
+          "messaging.message.body.size": size,
+        },
+      },
+      async (span) => {
+        await getGenerateLlmsTxtQueue().add(
+          generationId,
+          {
+            ...jobData,
+            sentry: {
+              trace: Sentry.spanToTraceHeader(span),
+              baggage: Sentry.spanToBaggageHeader(span),
+              size,
+            },
+          },
+          { jobId: generationId },
+        );
+      },
+    );
+  } else {
+    await getGenerateLlmsTxtQueue().add(generationId, jobData, {
+      jobId: generationId,
+    });
+  }
+
+  return res.status(200).json({
+    success: true,
+    id: generationId,
+  });
+}
diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index a278be9f..d927d56d 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -8,6 +8,7 @@ import {
   getExtractQueue,
   getScrapeQueue,
   getIndexQueue,
+  getGenerateLlmsTxtQueue,
   getDeepResearchQueue,
 } from "./services/queue-service";
 import { v0Router } from "./routes/v0";
@@ -55,6 +56,7 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
     new BullAdapter(getScrapeQueue()),
     new BullAdapter(getExtractQueue()),
     new BullAdapter(getIndexQueue()),
+    new BullAdapter(getGenerateLlmsTxtQueue()),
     new BullAdapter(getDeepResearchQueue()),
   ],
   serverAdapter: serverAdapter,
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts
new file mode 100644
index 00000000..26aebfcf
--- /dev/null
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts
@@ -0,0 +1,70 @@
+import { redisConnection } from "../../services/queue-service";
+import { logger as _logger } from "../logger";
+
+export interface GenerationData {
+  id: string;
+  team_id: string;
+  plan: string;
+  createdAt: number;
+  status: "processing" | "completed" | "failed";
+  url: string;
+  maxUrls: number;
+  showFullText: boolean;
+  generatedText: string;
+  fullText: string;
+  error?: string;
+}
+
+// TTL of 24 hours
+const GENERATION_TTL = 24 * 60 * 60;
+
+export async function saveGeneratedLlmsTxt(id: string, data: GenerationData): Promise<void> {
+  _logger.debug("Saving llmstxt generation " + id + " to Redis...");
+  await redisConnection.set("generation:" + id, JSON.stringify(data));
+  await redisConnection.expire("generation:" + id, GENERATION_TTL);
+}
+
+export async function getGeneratedLlmsTxt(id: string): Promise<GenerationData | null> {
+  const x = await redisConnection.get("generation:" + id);
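+  // null means either an unknown id or an entry that already hit GENERATION_TTL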
+  return x ? JSON.parse(x) : null;
+}
+
+export async function updateGeneratedLlmsTxt(
+  id: string,
+  data: Partial<GenerationData>,
+): Promise<void> {
+  const current = await getGeneratedLlmsTxt(id);
+  if (!current) return;
+
+  const updatedGeneration = {
+    ...current,
+    ...data
+  };
+
+  await redisConnection.set("generation:" + id, JSON.stringify(updatedGeneration));
+  await redisConnection.expire("generation:" + id, GENERATION_TTL);
+}
+
+export async function getGeneratedLlmsTxtExpiry(id: string): Promise<Date> {
+  const d = new Date();
+  const ttl = await redisConnection.pttl("generation:" + id);
+  d.setMilliseconds(d.getMilliseconds() + ttl);
+  d.setMilliseconds(0);
+  return d;
+}
+
+// Convenience method for status updates
+export async function updateGeneratedLlmsTxtStatus(
+  id: string,
+  status: "processing" | "completed" | "failed",
+  generatedText?: string,
+  fullText?: string,
+  error?: string,
+): Promise<void> {
+  const updates: Partial<GenerationData> = { status };
+  if (generatedText !== undefined) updates.generatedText = generatedText;
+  if (fullText !== undefined) updates.fullText = fullText;
+  if (error !== undefined) updates.error = error;
+
+  await updateGeneratedLlmsTxt(id, updates);
+}
\ No newline at end of file
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
new file mode 100644
index 00000000..410c585f
--- /dev/null
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
@@ -0,0 +1,174 @@
+import { logger as _logger } from "../logger";
+import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";
+import { getMapResults } from "../../controllers/v1/map";
+import { MapResponse, ScrapeResponse, Document } from "../../controllers/v1/types";
+import { Response } from "express";
+import OpenAI from "openai";
+import { zodResponseFormat } from "openai/helpers/zod";
+import { z } from "zod";
+import { scrapeDocument } from "../extract/document-scraper";
+import { PlanType } from "../../types";
+import { getLlmsTextFromCache, saveLlmsTextToCache } from "./generate-llmstxt-supabase";
+
+interface GenerateLLMsTextServiceOptions {
+  generationId: string;
+  teamId: string;
+  plan: PlanType;
+  url: string;
+  maxUrls: number;
+  showFullText: boolean;
+}
+
+const DescriptionSchema = z.object({
+  description: z.string(),
+  title: z.string(),
+});
+
+export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
+  const openai = new OpenAI();
+  const { generationId, teamId, plan, url, maxUrls, showFullText } = options;
+
+  const logger = _logger.child({
+    module: "generate-llmstxt",
+    method: "performGenerateLlmsTxt",
+    generationId,
+    teamId,
+  });
+
+  try {
+    // Check cache first
+    const cachedResult = await getLlmsTextFromCache(url, maxUrls);
+    if (cachedResult) {
+      logger.info("Found cached LLMs text", { url });
+
+      // Update final result with cached text
+      await updateGeneratedLlmsTxt(generationId, {
+        status: "completed",
+        generatedText: cachedResult.llmstxt,
+        fullText: cachedResult.llmstxt_full,
+        showFullText: showFullText,
+      });
+
+      return {
+        success: true,
+        data: {
+          generatedText: cachedResult.llmstxt,
+          fullText: cachedResult.llmstxt_full,
+          showFullText: showFullText,
+        },
+      };
+    }
+
+    // If not in cache, proceed with generation
+    // First, get all URLs from the map controller
+    const mapResult = await getMapResults({
+      url,
+      teamId,
+      plan,
+      limit: maxUrls,
+      includeSubdomains: false,
+      ignoreSitemap: false,
+      includeMetadata: true,
+    });
+
+    if (!mapResult || !mapResult.links) {
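+      // Without mapped links there is nothing to scrape, so fail the job here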
+      throw new Error(`Failed to map URLs`);
+    }
+
+    _logger.debug("Mapping URLs", mapResult.links);
+
+    const urls = mapResult.links;
+    let llmstxt = `# ${url} llms.txt\n\n`;
+    let llmsFulltxt = `# ${url} llms-full.txt\n\n`;
+
+    // Scrape each URL
+    for (const url of urls) {
+      _logger.debug(`Scraping URL: ${url}`);
+      const document = await scrapeDocument(
+        {
+          url,
+          teamId,
+          plan,
+          origin: url,
+          timeout: 30000,
+          isSingleUrl: true,
+        },
+        [],
+        logger,
+        { onlyMainContent: true }
+      );
+
+      if (!document) {
+        logger.error(`Failed to scrape URL ${url}`);
+        continue;
+      }
+
+      // Process scraped result
+      if (!document.markdown) continue;
+
+      _logger.debug(`Generating description for ${document.metadata?.url}`);
+
+      const completion = await openai.beta.chat.completions.parse({
+        model: "gpt-4o-mini",
+        messages: [
+          {
+            role: "user",
+            content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`
+          }
+        ],
+        response_format: zodResponseFormat(DescriptionSchema, "description")
+      });
+
+      try {
+        const parsedResponse = completion.choices[0].message.parsed;
+        const description = parsedResponse!.description;
+        const title = parsedResponse!.title;
+
+        llmstxt += `- [${title}](${document.metadata?.url}): ${description}\n`;
+        llmsFulltxt += `## ${title}\n${document.markdown}\n\n`;
+
+        // Update progress with both generated text and full text
+        await updateGeneratedLlmsTxt(generationId, {
+          status: "processing",
+          generatedText: llmstxt,
+          fullText: llmsFulltxt,
+        });
+      } catch (error) {
+        logger.error(`Failed to parse LLM response for ${document.metadata?.url}`, { error });
+        continue;
+      }
+    }
+
+    // After successful generation, save to cache
+    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);
+
+    // Update final result with both generated text and full text
+    await updateGeneratedLlmsTxt(generationId, {
+      status: "completed",
+      generatedText: llmstxt,
+      fullText: llmsFulltxt,
+      showFullText: showFullText,
+    });
+
+    return {
+      success: true,
+      data: {
+        generatedText: llmstxt,
+        fullText: llmsFulltxt,
+        showFullText: showFullText,
+      },
+    };
+  } catch (error: any) {
+    logger.error("Generate LLMs text error", { error });
+
+    await updateGeneratedLlmsTxt(generationId, {
+      status: "failed",
+      error: error.message || "Unknown error occurred",
+    });
+
+    throw error;
+  }
+}
\ No newline at end of file
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts
new file mode 100644
index 00000000..73be813b
--- /dev/null
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts
@@ -0,0 +1,82 @@
+import { supabase_service } from "../../services/supabase";
+import { logger } from "../logger";
+import { normalizeUrlOnlyHostname } from "../canonical-url";
+
+interface LlmsTextCache {
+  origin_url: string;
+  llmstxt: string;
+  llmstxt_full: string;
+  max_urls: number;
+}
+
+export async function getLlmsTextFromCache(
+  url: string,
+  maxUrls: number,
+): Promise<LlmsTextCache | null> {
+  if (process.env.USE_DB_AUTHENTICATION !== "true") {
+    return null;
+  }
+
+  const originUrl = normalizeUrlOnlyHostname(url);
+
+  try {
+    const { data, error } = await supabase_service
+      .from("llm_texts")
+      .select("*")
+      .eq("origin_url", originUrl)
+      .gte("max_urls", maxUrls) // gte: a cached result built from more URLs than requested still satisfies the request
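+      // Prefer the most recently refreshed entry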
+      .order("updated_at", { ascending: false })
+      .limit(1)
+      .single();
+
+    if (error) {
+      return null;
+    }
+
+    return data;
+  } catch (error) {
+    logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
+    return null;
+  }
+}
+
+export async function saveLlmsTextToCache(
+  url: string,
+  llmstxt: string,
+  llmstxt_full: string,
+  maxUrls: number,
+): Promise<void> {
+  if (process.env.USE_DB_AUTHENTICATION !== "true") {
+    return;
+  }
+
+  const originUrl = normalizeUrlOnlyHostname(url);
+
+  try {
+    // First check if there's an existing entry for this origin
+    const { data: existingData } = await supabase_service
+      .from("llm_texts")
+      .select("*")
+      .eq("origin_url", originUrl)
+      .single();
+
+    // Always update the entry for the origin URL
+    const { error } = await supabase_service
+      .from("llm_texts")
+      .update({
+        llmstxt,
+        llmstxt_full,
+        max_urls: maxUrls,
+        updated_at: new Date().toISOString(),
+      })
+      .eq("origin_url", originUrl);
+
+    if (error) {
+      logger.error("Error saving LLMs text to cache", { error, originUrl });
+    } else {
+      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
+    }
+  } catch (error) {
+    logger.error("Failed to save LLMs text to cache", { error, originUrl });
+  }
+}
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 2a99f8aa..f5041ccb 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -29,6 +29,8 @@ import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
 import { crawlErrorsController } from "../controllers/v1/crawl-errors";
+import { generateLLMsTextController } from "../controllers/v1/generate-llmstxt";
+import { generateLLMsTextStatusController } from "../controllers/v1/generate-llmstxt-status";
 import { deepResearchController } from "../controllers/v1/deep-research";
 import { deepResearchStatusController } from "../controllers/v1/deep-research-status";
 
@@ -242,6 +244,18 @@ v1Router.get(
   wrap(extractStatusController),
 );
 
+v1Router.post(
+  "/llmstxt",
+  authMiddleware(RateLimiterMode.Extract),
+  wrap(generateLLMsTextController),
+);
+
+v1Router.get(
+  "/llmstxt/:jobId",
+  authMiddleware(RateLimiterMode.ExtractStatus),
+  wrap(generateLLMsTextStatusController),
+);
+
 v1Router.post(
   "/deep-research",
   authMiddleware(RateLimiterMode.Extract),
diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts
index 1526ec25..9969c4b2 100644
--- a/apps/api/src/services/queue-service.ts
+++ b/apps/api/src/services/queue-service.ts
@@ -7,6 +7,7 @@ let extractQueue: Queue;
 let loggingQueue: Queue;
 let indexQueue: Queue;
 let deepResearchQueue: Queue;
+let generateLlmsTxtQueue: Queue;
 
 export const redisConnection = new IORedis(process.env.REDIS_URL!, {
   maxRetriesPerRequest: null,
@@ -16,6 +17,7 @@ export const scrapeQueueName = "{scrapeQueue}";
 export const extractQueueName = "{extractQueue}";
 export const loggingQueueName = "{loggingQueue}";
 export const indexQueueName = "{indexQueue}";
+export const generateLlmsTxtQueueName = "{generateLlmsTxtQueue}";
 export const deepResearchQueueName = "{deepResearchQueue}";
 
 export function getScrapeQueue() {
@@ -72,6 +74,24 @@ export function getIndexQueue() {
   return indexQueue;
 }
 
+export function getGenerateLlmsTxtQueue() {
+  if (!generateLlmsTxtQueue) {
+    generateLlmsTxtQueue = new Queue(generateLlmsTxtQueueName, {
+      connection: redisConnection,
+      defaultJobOptions: {
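+        // Jobs are pruned after 25 hours; the generation record in Redis expires after 24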
+        removeOnComplete: {
+          age: 90000, // 25 hours
+        },
+        removeOnFail: {
+          age: 90000, // 25 hours
+        },
+      },
+    });
+    logger.info("LLMs TXT generation queue created");
+  }
+  return generateLlmsTxtQueue;
+}
+
 export function getDeepResearchQueue() {
   if (!deepResearchQueue) {
     deepResearchQueue = new Queue(deepResearchQueueName, {
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 124194bf..2fddbd7c 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -11,6 +11,7 @@ import {
   extractQueueName,
   deepResearchQueueName,
   getIndexQueue,
+  getGenerateLlmsTxtQueue,
 } from "./queue-service";
 import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
@@ -69,6 +70,8 @@ import { billTeam } from "./billing/credit_billing";
 import { saveCrawlMap } from "./indexing/crawl-maps-index";
 import { updateDeepResearch } from "../lib/deep-research/deep-research-redis";
 import { performDeepResearch } from "../lib/deep-research/deep-research-service";
+import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
+import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 
 configDotenv();
 
@@ -446,6 +449,76 @@ const processDeepResearchJobInternal = async (
   }
 };
 
+const processGenerateLlmsTxtJobInternal = async (
+  token: string,
+  job: Job & { id: string },
+) => {
+  const logger = _logger.child({
+    module: "generate-llmstxt-worker",
+    method: "processJobInternal",
+    jobId: job.id,
+    generationId: job.data.generationId,
+    teamId: job.data?.teamId ?? undefined,
+  });
+
+  const extendLockInterval = setInterval(async () => {
+    logger.info(`🔄 Worker extending lock on job ${job.id}`);
+    await job.extendLock(token, jobLockExtensionTime);
+  }, jobLockExtendInterval);
+
+  try {
+    const result = await performGenerateLlmsTxt({
+      generationId: job.data.generationId,
+      teamId: job.data.teamId,
+      plan: job.data.plan,
+      url: job.data.request.url,
+      maxUrls: job.data.request.maxUrls,
+      showFullText: job.data.request.showFullText,
+    });
+
+    if (result.success) {
+      await job.moveToCompleted(result, token, false);
+      await updateGeneratedLlmsTxt(job.data.generationId, {
+        status: "completed",
+        generatedText: result.data.generatedText,
+        fullText: result.data.fullText,
+      });
+      return result;
+    } else {
+      const error = new Error("LLMs text generation failed without specific error");
+      await job.moveToFailed(error, token, false);
+      await updateGeneratedLlmsTxt(job.data.generationId, {
+        status: "failed",
+        error: error.message,
+      });
+      return { success: false, error: error.message };
+    }
+  } catch (error) {
+    logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });
+
+    Sentry.captureException(error, {
+      data: {
+        job: job.id,
+      },
+    });
+
+    try {
+      await job.moveToFailed(error, token, false);
+    } catch (e) {
+      logger.error("Failed to move job to failed state in Redis", { error });
+    }
+
+    await updateGeneratedLlmsTxt(job.data.generationId, {
+      status: "failed",
+      error: error.message || "Unknown error occurred",
+    });
+
+    return { success: false, error: error.message || "Unknown error occurred" };
+  } finally {
+    clearInterval(extendLockInterval);
+  }
+};
+
 let isShuttingDown = false;
 
 process.on("SIGINT", () => {
@@ -1170,6 +1243,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     workerFun(getScrapeQueue(), processJobInternal),
     workerFun(getExtractQueue(), processExtractJobInternal),
     workerFun(getDeepResearchQueue(), processDeepResearchJobInternal),
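+    // Worker loop consuming the new llmstxt generation queue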
+    workerFun(getGenerateLlmsTxtQueue(), processGenerateLlmsTxtJobInternal),
   ]);
 
   console.log("All workers exited. Waiting for all jobs to finish...");
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 23b07d13..75164c4d 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 64f1de34..7601414c 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -413,6 +413,48 @@ export interface DeepResearchStatusResponse {
   summaries: string[];
 }
 
+/**
+ * Parameters for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextParams {
+  /**
+   * Maximum number of URLs to process (1-100)
+   * @default 10
+   */
+  maxUrls?: number;
+  /**
+   * Whether to show the full LLMs-full.txt in the response
+   * @default false
+   */
+  showFullText?: boolean;
+  /**
+   * Experimental flag for streaming
+   */
+  __experimental_stream?: boolean;
+}
+
+/**
+ * Response interface for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextResponse {
+  success: boolean;
+  id: string;
+}
+
+/**
+ * Status response interface for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextStatusResponse {
+  success: boolean;
+  data: {
+    llmstxt: string;
+    llmsfulltxt?: string;
+  };
+  status: "processing" | "completed" | "failed";
+  error?: string;
+  expiresAt: string;
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -1459,6 +1501,118 @@ export default class FirecrawlApp {
     }
     return { success: false, error: "Internal server error." };
   }
+
+  /**
+   * Generates LLMs.txt for a given URL and polls until completion.
+   * @param url - The URL to generate LLMs.txt from.
+   * @param params - Parameters for the LLMs.txt generation operation.
+   * @returns The final generation results.
+   */
+  async generateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
+    try {
+      const response = await this.asyncGenerateLLMsText(url, params);
+
+      if (!response.success || 'error' in response) {
+        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
+      }
+
+      if (!response.id) {
+        throw new FirecrawlError(`Failed to start LLMs.txt generation. No job ID returned.`, 500);
+      }
+
+      const jobId = response.id;
+      let generationStatus;
+
+      while (true) {
+        generationStatus = await this.checkGenerateLLMsTextStatus(jobId);
+
+        if ('error' in generationStatus && !generationStatus.success) {
+          return generationStatus;
+        }
+
+        if (generationStatus.status === "completed") {
+          return generationStatus;
+        }
+
+        if (generationStatus.status === "failed") {
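+          // Terminal failure: surface the job's error as an exception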
+          throw new FirecrawlError(
+            `LLMs.txt generation job ${generationStatus.status}. Error: ${generationStatus.error}`,
+            500
+          );
+        }
+
+        if (generationStatus.status !== "processing") {
+          break;
+        }
+
+        await new Promise(resolve => setTimeout(resolve, 2000));
+      }
+
+      return { success: false, error: "LLMs.txt generation job terminated unexpectedly" };
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
+    }
+  }
+
+  /**
+   * Initiates a LLMs.txt generation operation without polling.
+   * @param url - The URL to generate LLMs.txt from.
+   * @param params - Parameters for the LLMs.txt generation operation.
+   * @returns The response containing the generation job ID.
+   */
+  async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        `${this.apiUrl}/v1/llmstxt`,
+        { url, ...params },
+        headers
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start LLMs.txt generation");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Checks the status of a LLMs.txt generation operation.
+   * @param id - The ID of the LLMs.txt generation operation.
+   * @returns The current status and results of the generation operation.
+   */
+  async checkGenerateLLMsTextStatus(id: string): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        `${this.apiUrl}/v1/llmstxt/${id}`,
+        headers
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else if (response.status === 404) {
+        throw new FirecrawlError("LLMs.txt generation job not found", 404);
+      } else {
+        this.handleError(response, "check LLMs.txt generation status");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
 }
 
 interface CrawlWatcherEvents {
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 736016ac..807ab339 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -33,6 +33,14 @@ class SearchParams(pydantic.BaseModel):
     timeout: Optional[int] = 60000
     scrapeOptions: Optional[Dict[str, Any]] = None
 
+class GenerateLLMsTextParams(pydantic.BaseModel):
+    """
+    Parameters for the LLMs.txt generation operation.
+    """
+    maxUrls: Optional[int] = 10
+    showFullText: Optional[bool] = False
+    __experimental_stream: Optional[bool] = None
+
 class FirecrawlApp:
     class SearchResponse(pydantic.BaseModel):
         """
@@ -756,6 +764,123 @@ class FirecrawlApp:
         except Exception as e:
             raise ValueError(str(e), 500)
 
+    def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
+        """
+        Generate LLMs.txt for a given URL and poll until completion.
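+
+        Blocks while polling the status endpoint every 2 seconds until the job
+        completes, fails, or leaves the 'processing' state.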
+
+        Args:
+            url (str): The URL to generate LLMs.txt from.
+            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the generation results. The structure includes:
+                - 'success' (bool): Indicates if the generation was successful.
+                - 'status' (str): The final status of the generation job.
+                - 'data' (Dict): The generated LLMs.txt data.
+                - 'error' (Optional[str]): Error message if the generation failed.
+                - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
+
+        Raises:
+            Exception: If the generation job fails or an error occurs during status checks.
+        """
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            generation_params = GenerateLLMsTextParams(**params)
+        else:
+            generation_params = params
+
+        response = self.async_generate_llms_text(url, generation_params)
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        while True:
+            status = self.check_generate_llms_text_status(job_id)
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            time.sleep(2)  # Polling interval
+
+        return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
+
+    def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
+        """
+        Initiate an asynchronous LLMs.txt generation operation.
+
+        Args:
+            url (str): The URL to generate LLMs.txt from.
+            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes:
+                - 'success' (bool): Indicates if the generation initiation was successful.
+                - 'id' (str): The unique identifier for the generation job.
+
+        Raises:
+            Exception: If the generation job initiation fails.
+        """
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            generation_params = GenerateLLMsTextParams(**params)
+        else:
+            generation_params = params
+
+        headers = self._prepare_headers()
+        json_data = {'url': url, **generation_params.dict(exclude_none=True)}
+
+        try:
+            response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
+            if response.status_code == 200:
+                try:
+                    return response.json()
+                except:
+                    raise Exception('Failed to parse Firecrawl response as JSON.')
+            else:
+                self._handle_error(response, 'start LLMs.txt generation')
+        except Exception as e:
+            raise ValueError(str(e))
+
+        return {'success': False, 'error': 'Internal server error'}
+
+    def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]:
+        """
+        Check the status of a LLMs.txt generation operation.
+
+        Args:
+            id (str): The ID of the LLMs.txt generation operation.
+
+        Returns:
+            Dict[str, Any]: The current status and results of the generation operation.
+
+        Raises:
+            Exception: If the status check fails.
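+            Exception: If the generation job is not found (the API returns 404).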
+ """ + headers = self._prepare_headers() + try: + response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers) + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + elif response.status_code == 404: + raise Exception('LLMs.txt generation job not found') + else: + self._handle_error(response, 'check LLMs.txt generation status') + except Exception as e: + raise ValueError(str(e)) + + return {'success': False, 'error': 'Internal server error'} + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests.