diff --git a/apps/api/requests.http b/apps/api/requests.http
index 7627979d..55d315d1 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -104,7 +104,8 @@ content-type: application/json
 
 {
   "url": "https://firecrawl.dev",
   "maxUrls": 1,
-  "showFullText": false
+  "showFullText": false,
+  "cache": true
 }
 
diff --git a/apps/api/src/controllers/v1/generate-llmstxt.ts b/apps/api/src/controllers/v1/generate-llmstxt.ts
index 52358ba8..aaf34c3b 100644
--- a/apps/api/src/controllers/v1/generate-llmstxt.ts
+++ b/apps/api/src/controllers/v1/generate-llmstxt.ts
@@ -42,6 +42,7 @@ export async function generateLLMsTextController(
     url: req.body.url,
     maxUrls: req.body.maxUrls,
     showFullText: req.body.showFullText,
+    cache: req.body.cache,
     generatedText: "",
     fullText: "",
   });
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 1525640a..0ae2acc3 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -1211,6 +1211,10 @@ export const generateLLMsTextRequestSchema = z.object({
     .boolean()
     .default(false)
     .describe("Whether to show the full LLMs-full.txt in the response"),
+  cache: z
+    .boolean()
+    .default(true)
+    .describe("Whether to use cached content if available"),
   __experimental_stream: z.boolean().optional(),
 });
 
diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts
index 980cec8e..673b7692 100644
--- a/apps/api/src/lib/canonical-url.test.ts
+++ b/apps/api/src/lib/canonical-url.test.ts
@@ -36,6 +36,18 @@ describe("normalizeUrlOnlyHostname", () => {
     const expected = "not a valid url";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
+
+  it("should handle URLs with subdomains", () => {
+    const url = "https://blog.example.com";
+    const expected = "blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it("should handle URLs with multiple subdomains", () => {
+    const url = "https://dev.blog.example.com";
+    const expected = "dev.blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
 });
 
 describe("normalizeUrl", () => {
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts
index b32d034d..c5bf6479 100644
--- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-redis.ts
@@ -9,6 +9,7 @@ export interface GenerationData {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   generatedText: string;
   fullText: string;
   error?: string;
@@ -66,4 +67,4 @@ export async function updateGeneratedLlmsTxtStatus(
   if (error !== undefined) updates.error = error;
 
   await updateGeneratedLlmsTxt(id, updates);
-}
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
index a3b30188..497cf734 100644
--- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
@@ -19,6 +19,7 @@ interface GenerateLLMsTextServiceOptions {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   subId?: string;
 }
 
@@ -63,7 +64,7 @@ function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
 export async function performGenerateLlmsTxt(
   options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
+  const { generationId, teamId, url, maxUrls = 100, showFullText, cache = true, subId } =
     options;
   const startTime = Date.now();
   const logger = _logger.child({
@@ -79,8 +80,8 @@ export async function performGenerateLlmsTxt(
   // Enforce max URL limit
   const effectiveMaxUrls = Math.min(maxUrls, 5000);
 
-  // Check cache first
-  const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
+  // Check cache first, unless cache is set to false
+  const cachedResult = cache ? await getLlmsTextFromCache(url, effectiveMaxUrls) : null;
 
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts
index 4120170a..ff4268e6 100644
--- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-supabase.ts
@@ -1,6 +1,6 @@
 import { supabase_service } from "../../services/supabase";
 import { logger } from "../logger";
-import { normalizeUrlOnlyHostname } from "../canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../canonical-url";
 
 interface LlmsTextCache {
   origin_url: string;
@@ -41,7 +41,7 @@ export async function getLlmsTextFromCache(
       return null;
     }
 
-    return data;
+    return data
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
     return null;
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 622d5cfb..72efb5d8 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -653,6 +653,7 @@ const processGenerateLlmsTxtJobInternal = async (
     maxUrls: job.data.request.maxUrls,
     showFullText: job.data.request.showFullText,
     subId: job.data.subId,
+    cache: job.data.request.cache,
   });
 
   if (result.success) {
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 905f0949..4583b6bf 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.24.0",
+  "version": "1.25.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index ccb64521..7639160e 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -520,6 +520,11 @@ export interface GenerateLLMsTextParams {
    * @default false
    */
   showFullText?: boolean;
+  /**
+   * Whether to use cached content if available
+   * @default true
+   */
+  cache?: boolean;
   /**
    * Experimental flag for streaming
    */
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 4031da9f..3c7c8b3b 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.5.4"
+__version__ = "2.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 1cf62cf7..81488c8a 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -347,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 
 class DeepResearchParams(pydantic.BaseModel):
@@ -1870,6 +1871,7 @@ class FirecrawlApp:
             *,
             max_urls: Optional[int] = None,
             show_full_text: Optional[bool] = None,
+            cache: Optional[bool] = None,
             experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
@@ -1878,6 +1880,7 @@ class FirecrawlApp:
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -1893,6 +1896,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
 
@@ -1900,6 +1904,7 @@ class FirecrawlApp:
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
 
@@ -1935,6 +1940,7 @@ class FirecrawlApp:
             *,
             max_urls: Optional[int] = None,
             show_full_text: Optional[bool] = None,
+            cache: Optional[bool] = None,
             experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
@@ -1943,6 +1949,7 @@ class FirecrawlApp:
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -1957,6 +1964,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
 
@@ -4001,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:
@@ -4027,6 +4036,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             *,
             max_urls: Optional[int] = None,
             show_full_text: Optional[bool] = None,
+            cache: Optional[bool] = None,
             experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
@@ -4035,6 +4045,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -4057,6 +4068,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
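
For reference, a minimal usage sketch of the new cache flag through the Python SDK. The keyword-only parameters mirror the signature changed above; the method name generate_llms_text, the FirecrawlApp(api_key=...) constructor, and the placeholder key are assumed from the existing SDK rather than shown in this diff:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# cache defaults to True, so existing callers keep using the cached llms.txt;
# pass cache=False to bypass the cache and force a fresh generation.
result = app.generate_llms_text(
    "https://firecrawl.dev",
    max_urls=10,
    show_full_text=False,
    cache=False,
)
print(result)

The same flag flows through the REST API ("cache": true in the request body) and the JS SDK's GenerateLLMsTextParams, since all three surface the new cache field added to generateLLMsTextRequestSchema.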