Add llmstxt generator endpoint (#1201)
* Nick:
* Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)"
  This reverts commit 586a10f40d354a038afc2b67809f20a7a829f8cb.
* Update deep-research-service.ts
* Nick:
* init
* part 2
* Update generate-llmstxt-service.ts
* Fix queue
* Update queue-worker.ts
* Almost there
* Final touches
* Update requests.http
* final touches
* Update requests.http
* Improve logging
* Change endpoint to /llmstxt
* Update queue-worker.ts
* Update generate-llmstxt-service.ts
* Nick: cache
* Update index.ts
* Update firecrawl.py
* Update package.json

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in: parent e373fab5c1, commit d984b50400

requests.http
@@ -100,3 +100,23 @@
 ###
 DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
+### Generate LLMs TXT
+# @name llmsTxt
+POST {{baseUrl}}/v1/llmstxt HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl.dev",
+    "maxUrls": 2,
+    "showFullText": false
+}
+
+
+### Check Generate LLMs TXT Status
+@llmsTxtId = {{llmsTxt.response.body.$.id}}
+# @name llmsTxtStatus
+GET {{baseUrl}}/v1/llmstxt/{{llmsTxtId}} HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
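
For reference, the same two-step flow works from any HTTP client. A minimal TypeScript sketch, not part of the commit; it assumes a Firecrawl API reachable at baseUrl, a valid API key, and an arbitrarily chosen polling interval:

// Sketch: start a llms.txt generation job, then poll until it settles.
const baseUrl = "http://localhost:3002"; // assumption: local dev server
const apiKey = process.env.TEST_API_KEY!;

async function generateLlmsTxt(url: string) {
  const start = await fetch(`${baseUrl}/v1/llmstxt`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({ url, maxUrls: 2, showFullText: false }),
  });
  const { id } = await start.json();

  // Poll the status endpoint until the job leaves "processing".
  while (true) {
    const res = await fetch(`${baseUrl}/v1/llmstxt/${id}`, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    const status = await res.json();
    if (status.status !== "processing") return status;
    await new Promise((r) => setTimeout(r, 2000));
  }
}
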
apps/api/src/controllers/v1/deep-research-status.ts
@@ -21,7 +21,7 @@ export async function deepResearchStatusController(
 
   let data: any = null;
 
-  if (research.status === "completed") {
+  if (research.status === "completed" && process.env.USE_DB_AUTHENTICATION === "true") {
     const jobData = await supabaseGetJobsById([req.params.jobId]);
     if (jobData && jobData.length > 0) {
       data = jobData[0].docs[0];

apps/api/src/controllers/v1/generate-llmstxt-status.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import { getGeneratedLlmsTxt, getGeneratedLlmsTxtExpiry } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { supabaseGetJobsById } from "../../lib/supabase-jobs";

export async function generateLLMsTextStatusController(
  req: RequestWithAuth<{ jobId: string }, any, any>,
  res: Response,
) {
  const generation = await getGeneratedLlmsTxt(req.params.jobId);
  const showFullText = generation?.showFullText ?? false;

  if (!generation) {
    return res.status(404).json({
      success: false,
      error: "llmsTxt generation job not found",
    });
  }

  let data: any = null;

  if (showFullText) {
    data = {
      llmstxt: generation.generatedText,
      llmsfulltxt: generation.fullText,
    };
  } else {
    data = {
      llmstxt: generation.generatedText,
    };
  }

  return res.status(200).json({
    success: generation.status === "failed" ? false : true,
    data: data,
    status: generation.status,
    error: generation?.error ?? undefined,
    expiresAt: (await getGeneratedLlmsTxtExpiry(req.params.jobId)).toISOString(),
  });
}
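
For a completed job with showFullText enabled, the controller above responds with a payload of this shape (values illustrative):

{
  "success": true,
  "data": {
    "llmstxt": "# https://firecrawl.dev llms.txt\n\n- [Title](url): description",
    "llmsfulltxt": "# https://firecrawl.dev llms-full.txt\n\n## Title\n..."
  },
  "status": "completed",
  "expiresAt": "2025-02-20T12:00:00.000Z"
}
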
apps/api/src/controllers/v1/generate-llmstxt.ts (new file, 89 lines)
@@ -0,0 +1,89 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { z } from "zod";

export const generateLLMsTextRequestSchema = z.object({
  url: z.string().url().describe('The URL to generate text from'),
  maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
  showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
  __experimental_stream: z.boolean().optional(),
});

export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;

export type GenerateLLMsTextResponse = {
  success: boolean;
  id: string;
};

/**
 * Initiates a text generation job based on the provided URL.
 * @param req - The request object containing authentication and generation parameters.
 * @param res - The response object to send the generation job ID.
 * @returns A promise that resolves when the generation job is queued.
 */
export async function generateLLMsTextController(
  req: RequestWithAuth<{}, GenerateLLMsTextResponse, GenerateLLMsTextRequest>,
  res: Response<GenerateLLMsTextResponse>,
) {
  req.body = generateLLMsTextRequestSchema.parse(req.body);

  const generationId = crypto.randomUUID();
  const jobData = {
    request: req.body,
    teamId: req.auth.team_id,
    plan: req.auth.plan,
    subId: req.acuc?.sub_id,
    generationId,
  };

  await saveGeneratedLlmsTxt(generationId, {
    id: generationId,
    team_id: req.auth.team_id,
    plan: req.auth.plan!, // Add non-null assertion since plan is required
    createdAt: Date.now(),
    status: "processing",
    url: req.body.url,
    maxUrls: req.body.maxUrls,
    showFullText: req.body.showFullText,
    generatedText: "",
    fullText: "",
  });

  if (Sentry.isInitialized()) {
    const size = JSON.stringify(jobData).length;
    await Sentry.startSpan(
      {
        name: "Add LLMstxt generation job",
        op: "queue.publish",
        attributes: {
          "messaging.message.id": generationId,
          "messaging.destination.name": getGenerateLlmsTxtQueue().name,
          "messaging.message.body.size": size,
        },
      },
      async (span) => {
        await getGenerateLlmsTxtQueue().add(generationId, {
          ...jobData,
          sentry: {
            trace: Sentry.spanToTraceHeader(span),
            baggage: Sentry.spanToBaggageHeader(span),
            size,
          },
        }, { jobId: generationId });
      },
    );
  } else {
    await getGenerateLlmsTxtQueue().add(generationId, jobData, {
      jobId: generationId,
    });
  }

  return res.status(200).json({
    success: true,
    id: generationId,
  });
}
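
Because maxUrls and showFullText carry zod defaults, a body containing only a url is accepted and parse() fills in the rest. A quick sketch, not part of the commit:

// Sketch: defaults applied by the schema above.
const parsed = generateLLMsTextRequestSchema.parse({ url: "https://firecrawl.dev" });
// parsed => { url: "https://firecrawl.dev", maxUrls: 10, showFullText: false }
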
apps/api/src/index.ts
@@ -8,6 +8,7 @@ import {
   getExtractQueue,
   getScrapeQueue,
   getIndexQueue,
+  getGenerateLlmsTxtQueue,
   getDeepResearchQueue,
 } from "./services/queue-service";
 import { v0Router } from "./routes/v0";
@@ -55,6 +56,7 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
     new BullAdapter(getScrapeQueue()),
     new BullAdapter(getExtractQueue()),
     new BullAdapter(getIndexQueue()),
+    new BullAdapter(getGenerateLlmsTxtQueue()),
     new BullAdapter(getDeepResearchQueue()),
   ],
   serverAdapter: serverAdapter,

apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts (new file, 70 lines)
@@ -0,0 +1,70 @@
import { redisConnection } from "../../services/queue-service";
import { logger as _logger } from "../logger";

export interface GenerationData {
  id: string;
  team_id: string;
  plan: string;
  createdAt: number;
  status: "processing" | "completed" | "failed";
  url: string;
  maxUrls: number;
  showFullText: boolean;
  generatedText: string;
  fullText: string;
  error?: string;
}

// TTL of 24 hours
const GENERATION_TTL = 24 * 60 * 60;

export async function saveGeneratedLlmsTxt(id: string, data: GenerationData): Promise<void> {
  _logger.debug("Saving llmstxt generation " + id + " to Redis...");
  await redisConnection.set("generation:" + id, JSON.stringify(data));
  await redisConnection.expire("generation:" + id, GENERATION_TTL);
}

export async function getGeneratedLlmsTxt(id: string): Promise<GenerationData | null> {
  const x = await redisConnection.get("generation:" + id);
  return x ? JSON.parse(x) : null;
}

export async function updateGeneratedLlmsTxt(
  id: string,
  data: Partial<GenerationData>,
): Promise<void> {
  const current = await getGeneratedLlmsTxt(id);
  if (!current) return;

  const updatedGeneration = {
    ...current,
    ...data
  };

  await redisConnection.set("generation:" + id, JSON.stringify(updatedGeneration));
  await redisConnection.expire("generation:" + id, GENERATION_TTL);
}

export async function getGeneratedLlmsTxtExpiry(id: string): Promise<Date> {
  const d = new Date();
  const ttl = await redisConnection.pttl("generation:" + id);
  d.setMilliseconds(d.getMilliseconds() + ttl);
  d.setMilliseconds(0);
  return d;
}

// Convenience method for status updates
export async function updateGeneratedLlmsTxtStatus(
  id: string,
  status: "processing" | "completed" | "failed",
  generatedText?: string,
  fullText?: string,
  error?: string,
): Promise<void> {
  const updates: Partial<GenerationData> = { status };
  if (generatedText !== undefined) updates.generatedText = generatedText;
  if (fullText !== undefined) updates.fullText = fullText;
  if (error !== undefined) updates.error = error;

  await updateGeneratedLlmsTxt(id, updates);
}
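
Two details worth noting in this module: every save and update refreshes the 24-hour TTL, and getGeneratedLlmsTxtExpiry rebuilds the client-facing expiry timestamp from Redis PTTL, which reports the key's remaining lifetime in milliseconds. A small sketch, assuming a generationId that was already saved:

// Sketch: the expiry is "now + remaining PTTL", with the milliseconds
// field zeroed by setMilliseconds(0) before the Date is returned.
const expiry = await getGeneratedLlmsTxtExpiry(generationId);
console.log(expiry.toISOString()); // roughly 24h after the last save/update
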
apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts (new file, 174 lines)
@@ -0,0 +1,174 @@
import { logger as _logger } from "../logger";
import { updateGeneratedLlmsTxt } from "./generate-llmstxt-redis";
import { getMapResults } from "../../controllers/v1/map";
import { MapResponse, ScrapeResponse, Document } from "../../controllers/v1/types";
import { Response } from "express";
import OpenAI from "openai";
import { zodResponseFormat } from "openai/helpers/zod";
import { z } from "zod";
import { scrapeDocument } from "../extract/document-scraper";
import { PlanType } from "../../types";
import { getLlmsTextFromCache, saveLlmsTextToCache } from "./generate-llmstxt-supabase";

interface GenerateLLMsTextServiceOptions {
  generationId: string;
  teamId: string;
  plan: PlanType;
  url: string;
  maxUrls: number;
  showFullText: boolean;
}

const DescriptionSchema = z.object({
  description: z.string(),
  title: z.string(),
});

export async function performGenerateLlmsTxt(options: GenerateLLMsTextServiceOptions) {
  const openai = new OpenAI();
  const { generationId, teamId, plan, url, maxUrls, showFullText } = options;

  const logger = _logger.child({
    module: "generate-llmstxt",
    method: "performGenerateLlmsTxt",
    generationId,
    teamId,
  });

  try {
    // Check cache first
    const cachedResult = await getLlmsTextFromCache(url, maxUrls);
    if (cachedResult) {
      logger.info("Found cached LLMs text", { url });

      // Update final result with cached text
      await updateGeneratedLlmsTxt(generationId, {
        status: "completed",
        generatedText: cachedResult.llmstxt,
        fullText: cachedResult.llmstxt_full,
        showFullText: showFullText,
      });

      return {
        success: true,
        data: {
          generatedText: cachedResult.llmstxt,
          fullText: cachedResult.llmstxt_full,
          showFullText: showFullText,
        },
      };
    }

    // If not in cache, proceed with generation
    // First, get all URLs from the map controller
    const mapResult = await getMapResults({
      url,
      teamId,
      plan,
      limit: maxUrls,
      includeSubdomains: false,
      ignoreSitemap: false,
      includeMetadata: true,
    });

    if (!mapResult || !mapResult.links) {
      throw new Error(`Failed to map URLs`);
    }

    _logger.debug("Mapping URLs", mapResult.links);

    const urls = mapResult.links;
    let llmstxt = `# ${url} llms.txt\n\n`;
    let llmsFulltxt = `# ${url} llms-full.txt\n\n`;

    // Scrape each URL
    for (const url of urls) {
      _logger.debug(`Scraping URL: ${url}`);
      const document = await scrapeDocument(
        {
          url,
          teamId,
          plan,
          origin: url,
          timeout: 30000,
          isSingleUrl: true,
        },
        [],
        logger,
        { onlyMainContent: true }
      );

      if (!document) {
        logger.error(`Failed to scrape URL ${url}`);
        continue;
      }

      // Process scraped result
      if (!document.markdown) continue;

      _logger.debug(`Generating description for ${document.metadata?.url}`);

      const completion = await openai.beta.chat.completions.parse({
        model: "gpt-4o-mini",
        messages: [
          {
            role: "user",
            content: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose. Here is the content: ${document.markdown}`
          }
        ],
        response_format: zodResponseFormat(DescriptionSchema, "description")
      });

      try {
        const parsedResponse = completion.choices[0].message.parsed;
        const description = parsedResponse!.description;
        const title = parsedResponse!.title;

        llmstxt += `- [${title}](${document.metadata?.url}): ${description}\n`;
        llmsFulltxt += `## ${title}\n${document.markdown}\n\n`;

        // Update progress with both generated text and full text
        await updateGeneratedLlmsTxt(generationId, {
          status: "processing",
          generatedText: llmstxt,
          fullText: llmsFulltxt,
        });
      } catch (error) {
        logger.error(`Failed to parse LLM response for ${document.metadata?.url}`, { error });
        continue;
      }
    }

    // After successful generation, save to cache
    await saveLlmsTextToCache(url, llmstxt, llmsFulltxt, maxUrls);

    // Update final result with both generated text and full text
    await updateGeneratedLlmsTxt(generationId, {
      status: "completed",
      generatedText: llmstxt,
      fullText: llmsFulltxt,
      showFullText: showFullText,
    });

    return {
      success: true,
      data: {
        generatedText: llmstxt,
        fullText: llmsFulltxt,
        showFullText: showFullText,
      },
    };

  } catch (error: any) {
    logger.error("Generate LLMs text error", { error });

    await updateGeneratedLlmsTxt(generationId, {
      status: "failed",
      error: error.message || "Unknown error occurred",
    });

    throw error;
  }
}
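
Given the header and entry templates above, a successful run yields a Markdown-style index. A hypothetical two-page result (titles and descriptions invented for illustration):

# https://firecrawl.dev llms.txt

- [Firecrawl Docs](https://firecrawl.dev/docs): Turn websites into clean, LLM-ready markdown and structured data
- [Firecrawl Pricing](https://firecrawl.dev/pricing): Plans, limits, and pricing for the Firecrawl API

The llms-full.txt variant carries the same titles as "##" headings, each followed by the page's full markdown.
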
apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts (new file, 82 lines)
@@ -0,0 +1,82 @@
import { supabase_service } from "../../services/supabase";
import { logger } from "../logger";
import { normalizeUrlOnlyHostname } from "../canonical-url";

interface LlmsTextCache {
  origin_url: string;
  llmstxt: string;
  llmstxt_full: string;
  max_urls: number;
}

export async function getLlmsTextFromCache(
  url: string,
  maxUrls: number,
): Promise<LlmsTextCache | null> {
  if (process.env.USE_DB_AUTHENTICATION !== "true") {
    return null;
  }

  const originUrl = normalizeUrlOnlyHostname(url);

  try {
    const { data, error } = await supabase_service
      .from("llm_texts")
      .select("*")
      .eq("origin_url", originUrl)
      .gte("max_urls", maxUrls) // Changed to gte since we want cached results with more URLs than requested
      .order("updated_at", { ascending: false })
      .limit(1)
      .single();

    if (error) {
      return null;
    }

    return data;
  } catch (error) {
    logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
    return null;
  }
}

export async function saveLlmsTextToCache(
  url: string,
  llmstxt: string,
  llmstxt_full: string,
  maxUrls: number,
): Promise<void> {
  if (process.env.USE_DB_AUTHENTICATION !== "true") {
    return;
  }

  const originUrl = normalizeUrlOnlyHostname(url);

  try {
    // First check if there's an existing entry with fewer URLs
    const { data: existingData } = await supabase_service
      .from("llm_texts")
      .select("*")
      .eq("origin_url", originUrl)
      .single();

    // Always update the entry for the origin URL
    const { error } = await supabase_service
      .from("llm_texts")
      .update({
        llmstxt,
        llmstxt_full,
        max_urls: maxUrls,
        updated_at: new Date().toISOString(),
      })
      .eq("origin_url", originUrl);

    if (error) {
      logger.error("Error saving LLMs text to cache", { error, originUrl });
    } else {
      logger.debug("Successfully cached LLMs text", { originUrl, maxUrls });
    }
  } catch (error) {
    logger.error("Failed to save LLMs text to cache", { error, originUrl });
  }
}
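
Both helpers are no-ops unless USE_DB_AUTHENTICATION is "true", and a lookup only hits when the llm_texts table has a row for the same normalized hostname whose max_urls is at least the requested count. A hypothetical lookup (sketch, not part of the commit):

// Sketch: a row cached for firecrawl.dev with max_urls = 10 satisfies
// any later request for 10 or fewer URLs against the same hostname.
const hit = await getLlmsTextFromCache("https://firecrawl.dev/docs", 5);
if (hit) {
  console.log(hit.llmstxt); // served from cache, no re-scrape needed
}
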
apps/api/src/routes/v1.ts
@@ -29,6 +29,8 @@ import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
 import { crawlErrorsController } from "../controllers/v1/crawl-errors";
+import { generateLLMsTextController } from "../controllers/v1/generate-llmstxt";
+import { generateLLMsTextStatusController } from "../controllers/v1/generate-llmstxt-status";
 import { deepResearchController } from "../controllers/v1/deep-research";
 import { deepResearchStatusController } from "../controllers/v1/deep-research-status";
 
@@ -242,6 +244,18 @@ v1Router.get(
   wrap(extractStatusController),
 );
 
+v1Router.post(
+  "/llmstxt",
+  authMiddleware(RateLimiterMode.Extract),
+  wrap(generateLLMsTextController),
+);
+
+v1Router.get(
+  "/llmstxt/:jobId",
+  authMiddleware(RateLimiterMode.ExtractStatus),
+  wrap(generateLLMsTextStatusController),
+);
+
 v1Router.post(
   "/deep-research",
   authMiddleware(RateLimiterMode.Extract),
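
Note that both new routes reuse the extract rate-limiter buckets (RateLimiterMode.Extract for the POST, RateLimiterMode.ExtractStatus for the GET) rather than introducing a dedicated llmstxt mode.
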
apps/api/src/services/queue-service.ts
@@ -7,6 +7,7 @@ let extractQueue: Queue;
 let loggingQueue: Queue;
 let indexQueue: Queue;
 let deepResearchQueue: Queue;
+let generateLlmsTxtQueue: Queue;
 
 export const redisConnection = new IORedis(process.env.REDIS_URL!, {
   maxRetriesPerRequest: null,
@@ -16,6 +17,7 @@ export const scrapeQueueName = "{scrapeQueue}";
 export const extractQueueName = "{extractQueue}";
 export const loggingQueueName = "{loggingQueue}";
 export const indexQueueName = "{indexQueue}";
+export const generateLlmsTxtQueueName = "{generateLlmsTxtQueue}";
 export const deepResearchQueueName = "{deepResearchQueue}";
 
 export function getScrapeQueue() {
@@ -72,6 +74,24 @@ export function getIndexQueue() {
   return indexQueue;
 }
 
+export function getGenerateLlmsTxtQueue() {
+  if (!generateLlmsTxtQueue) {
+    generateLlmsTxtQueue = new Queue(generateLlmsTxtQueueName, {
+      connection: redisConnection,
+      defaultJobOptions: {
+        removeOnComplete: {
+          age: 90000, // 25 hours
+        },
+        removeOnFail: {
+          age: 90000, // 25 hours
+        },
+      },
+    });
+    logger.info("LLMs TXT generation queue created");
+  }
+  return generateLlmsTxtQueue;
+}
+
 export function getDeepResearchQueue() {
   if (!deepResearchQueue) {
     deepResearchQueue = new Queue(deepResearchQueueName, {

apps/api/src/services/queue-worker.ts
@@ -11,6 +11,7 @@ import {
   extractQueueName,
   deepResearchQueueName,
   getIndexQueue,
+  getGenerateLlmsTxtQueue,
 } from "./queue-service";
 import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
@@ -69,6 +70,8 @@ import { billTeam } from "./billing/credit_billing";
 import { saveCrawlMap } from "./indexing/crawl-maps-index";
 import { updateDeepResearch } from "../lib/deep-research/deep-research-redis";
 import { performDeepResearch } from "../lib/deep-research/deep-research-service";
+import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
+import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 
 configDotenv();
 
@@ -446,6 +449,76 @@ const processDeepResearchJobInternal = async (
   }
 };
 
+const processGenerateLlmsTxtJobInternal = async (
+  token: string,
+  job: Job & { id: string },
+) => {
+  const logger = _logger.child({
+    module: "generate-llmstxt-worker",
+    method: "processJobInternal",
+    jobId: job.id,
+    generateId: job.data.generateId,
+    teamId: job.data?.teamId ?? undefined,
+  });
+
+  const extendLockInterval = setInterval(async () => {
+    logger.info(`🔄 Worker extending lock on job ${job.id}`);
+    await job.extendLock(token, jobLockExtensionTime);
+  }, jobLockExtendInterval);
+
+  try {
+    const result = await performGenerateLlmsTxt({
+      generationId: job.data.generationId,
+      teamId: job.data.teamId,
+      plan: job.data.plan,
+      url: job.data.request.url,
+      maxUrls: job.data.request.maxUrls,
+      showFullText: job.data.request.showFullText,
+    });
+
+    if (result.success) {
+      await job.moveToCompleted(result, token, false);
+      await updateGeneratedLlmsTxt(job.data.generateId, {
+        status: "completed",
+        generatedText: result.data.generatedText,
+        fullText: result.data.fullText,
+      });
+      return result;
+    } else {
+      const error = new Error("LLMs text generation failed without specific error");
+      await job.moveToFailed(error, token, false);
+      await updateGeneratedLlmsTxt(job.data.generateId, {
+        status: "failed",
+        error: error.message,
+      });
+      return { success: false, error: error.message };
+    }
+  } catch (error) {
+    logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });
+
+    Sentry.captureException(error, {
+      data: {
+        job: job.id,
+      },
+    });
+
+    try {
+      await job.moveToFailed(error, token, false);
+    } catch (e) {
+      logger.error("Failed to move job to failed state in Redis", { error });
+    }
+
+    await updateGeneratedLlmsTxt(job.data.generateId, {
+      status: "failed",
+      error: error.message || "Unknown error occurred",
+    });
+
+    return { success: false, error: error.message || "Unknown error occurred" };
+  } finally {
+    clearInterval(extendLockInterval);
+  }
+};
+
 let isShuttingDown = false;
 
 process.on("SIGINT", () => {
@@ -1170,6 +1243,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   workerFun(getScrapeQueue(), processJobInternal),
   workerFun(getExtractQueue(), processExtractJobInternal),
   workerFun(getDeepResearchQueue(), processDeepResearchJobInternal),
+  workerFun(getGenerateLlmsTxtQueue(), processGenerateLlmsTxtJobInternal),
 ]);
 
 console.log("All workers exited. Waiting for all jobs to finish...");

apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

apps/js-sdk/firecrawl/src/index.ts
@@ -413,6 +413,48 @@ export interface DeepResearchStatusResponse {
   summaries: string[];
 }
 
+/**
+ * Parameters for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextParams {
+  /**
+   * Maximum number of URLs to process (1-100)
+   * @default 10
+   */
+  maxUrls?: number;
+  /**
+   * Whether to show the full LLMs-full.txt in the response
+   * @default false
+   */
+  showFullText?: boolean;
+  /**
+   * Experimental flag for streaming
+   */
+  __experimental_stream?: boolean;
+}
+
+/**
+ * Response interface for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextResponse {
+  success: boolean;
+  id: string;
+}
+
+/**
+ * Status response interface for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextStatusResponse {
+  success: boolean;
+  data: {
+    llmstxt: string;
+    llmsfulltxt?: string;
+  };
+  status: "processing" | "completed" | "failed";
+  error?: string;
+  expiresAt: string;
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -1459,6 +1501,118 @@ export default class FirecrawlApp {
     }
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Generates LLMs.txt for a given URL and polls until completion.
+   * @param url - The URL to generate LLMs.txt from.
+   * @param params - Parameters for the LLMs.txt generation operation.
+   * @returns The final generation results.
+   */
+  async generateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
+    try {
+      const response = await this.asyncGenerateLLMsText(url, params);
+
+      if (!response.success || 'error' in response) {
+        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
+      }
+
+      if (!response.id) {
+        throw new FirecrawlError(`Failed to start LLMs.txt generation. No job ID returned.`, 500);
+      }
+
+      const jobId = response.id;
+      let generationStatus;
+
+      while (true) {
+        generationStatus = await this.checkGenerateLLMsTextStatus(jobId);
+
+        if ('error' in generationStatus && !generationStatus.success) {
+          return generationStatus;
+        }
+
+        if (generationStatus.status === "completed") {
+          return generationStatus;
+        }
+
+        if (generationStatus.status === "failed") {
+          throw new FirecrawlError(
+            `LLMs.txt generation job ${generationStatus.status}. Error: ${generationStatus.error}`,
+            500
+          );
+        }
+
+        if (generationStatus.status !== "processing") {
+          break;
+        }
+
+        await new Promise(resolve => setTimeout(resolve, 2000));
+      }
+
+      return { success: false, error: "LLMs.txt generation job terminated unexpectedly" };
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
+    }
+  }
+
+  /**
+   * Initiates a LLMs.txt generation operation without polling.
+   * @param url - The URL to generate LLMs.txt from.
+   * @param params - Parameters for the LLMs.txt generation operation.
+   * @returns The response containing the generation job ID.
+   */
+  async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        `${this.apiUrl}/v1/llmstxt`,
+        { url, ...params },
+        headers
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start LLMs.txt generation");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Checks the status of a LLMs.txt generation operation.
+   * @param id - The ID of the LLMs.txt generation operation.
+   * @returns The current status and results of the generation operation.
+   */
+  async checkGenerateLLMsTextStatus(id: string): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        `${this.apiUrl}/v1/llmstxt/${id}`,
+        headers
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else if (response.status === 404) {
+        throw new FirecrawlError("LLMs.txt generation job not found", 404);
+      } else {
+        this.handleError(response, "check LLMs.txt generation status");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
 }
 
 interface CrawlWatcherEvents {
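
A short usage sketch for the new SDK surface (not part of the commit; the API key is a placeholder):

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

// Blocking helper: starts the job and polls every 2 seconds until it settles.
const result = await app.generateLLMsText("https://firecrawl.dev", {
  maxUrls: 2,
  showFullText: true,
});

if (result.success && "data" in result) {
  console.log(result.data.llmstxt);     // the generated llms.txt
  console.log(result.data.llmsfulltxt); // present because showFullText is true
}
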
|
@ -33,6 +33,14 @@ class SearchParams(pydantic.BaseModel):
|
|||||||
timeout: Optional[int] = 60000
|
timeout: Optional[int] = 60000
|
||||||
scrapeOptions: Optional[Dict[str, Any]] = None
|
scrapeOptions: Optional[Dict[str, Any]] = None
|
||||||
|
|
||||||
|
class GenerateLLMsTextParams(pydantic.BaseModel):
|
||||||
|
"""
|
||||||
|
Parameters for the LLMs.txt generation operation.
|
||||||
|
"""
|
||||||
|
maxUrls: Optional[int] = 10
|
||||||
|
showFullText: Optional[bool] = False
|
||||||
|
__experimental_stream: Optional[bool] = None
|
||||||
|
|
||||||
class FirecrawlApp:
|
class FirecrawlApp:
|
||||||
class SearchResponse(pydantic.BaseModel):
|
class SearchResponse(pydantic.BaseModel):
|
||||||
"""
|
"""
|
||||||
@ -756,6 +764,123 @@ class FirecrawlApp:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(str(e), 500)
|
raise ValueError(str(e), 500)
|
||||||
|
|
||||||
|
def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Generate LLMs.txt for a given URL and poll until completion.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to generate LLMs.txt from.
|
||||||
|
params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: A dictionary containing the generation results. The structure includes:
|
||||||
|
- 'success' (bool): Indicates if the generation was successful.
|
||||||
|
- 'status' (str): The final status of the generation job.
|
||||||
|
- 'data' (Dict): The generated LLMs.txt data.
|
||||||
|
- 'error' (Optional[str]): Error message if the generation failed.
|
||||||
|
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the generation job fails or an error occurs during status checks.
|
||||||
|
"""
|
||||||
|
if params is None:
|
||||||
|
params = {}
|
||||||
|
|
||||||
|
if isinstance(params, dict):
|
||||||
|
generation_params = GenerateLLMsTextParams(**params)
|
||||||
|
else:
|
||||||
|
generation_params = params
|
||||||
|
|
||||||
|
response = self.async_generate_llms_text(url, generation_params)
|
||||||
|
if not response.get('success') or 'id' not in response:
|
||||||
|
return response
|
||||||
|
|
||||||
|
job_id = response['id']
|
||||||
|
while True:
|
||||||
|
status = self.check_generate_llms_text_status(job_id)
|
||||||
|
|
||||||
|
if status['status'] == 'completed':
|
||||||
|
return status
|
||||||
|
elif status['status'] == 'failed':
|
||||||
|
raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
|
||||||
|
elif status['status'] != 'processing':
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(2) # Polling interval
|
||||||
|
|
||||||
|
return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
|
||||||
|
|
||||||
|
def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Initiate an asynchronous LLMs.txt generation operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to generate LLMs.txt from.
|
||||||
|
params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes:
|
||||||
|
- 'success' (bool): Indicates if the generation initiation was successful.
|
||||||
|
- 'id' (str): The unique identifier for the generation job.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the generation job initiation fails.
|
||||||
|
"""
|
||||||
|
if params is None:
|
||||||
|
params = {}
|
||||||
|
|
||||||
|
if isinstance(params, dict):
|
||||||
|
generation_params = GenerateLLMsTextParams(**params)
|
||||||
|
else:
|
||||||
|
generation_params = params
|
||||||
|
|
||||||
|
headers = self._prepare_headers()
|
||||||
|
json_data = {'url': url, **generation_params.dict(exclude_none=True)}
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
|
||||||
|
if response.status_code == 200:
|
||||||
|
try:
|
||||||
|
return response.json()
|
||||||
|
except:
|
||||||
|
raise Exception('Failed to parse Firecrawl response as JSON.')
|
||||||
|
else:
|
||||||
|
self._handle_error(response, 'start LLMs.txt generation')
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(str(e))
|
||||||
|
|
||||||
|
return {'success': False, 'error': 'Internal server error'}
|
||||||
|
|
||||||
|
def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Check the status of a LLMs.txt generation operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
id (str): The ID of the LLMs.txt generation operation.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: The current status and results of the generation operation.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the status check fails.
|
||||||
|
"""
|
||||||
|
headers = self._prepare_headers()
|
||||||
|
try:
|
||||||
|
response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
|
||||||
|
if response.status_code == 200:
|
||||||
|
try:
|
||||||
|
return response.json()
|
||||||
|
except:
|
||||||
|
raise Exception('Failed to parse Firecrawl response as JSON.')
|
||||||
|
elif response.status_code == 404:
|
||||||
|
raise Exception('LLMs.txt generation job not found')
|
||||||
|
else:
|
||||||
|
self._handle_error(response, 'check LLMs.txt generation status')
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(str(e))
|
||||||
|
|
||||||
|
return {'success': False, 'error': 'Internal server error'}
|
||||||
|
|
||||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||||
"""
|
"""
|
||||||
Prepare the headers for API requests.
|
Prepare the headers for API requests.
|
||||||
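
The Python surface mirrors the JS SDK: generate_llms_text() delegates to async_generate_llms_text() to enqueue the job, then polls check_generate_llms_text_status() every two seconds until the job completes or fails.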