Fix LLMs.txt cache bug with subdomains and add bypass option (#1557)
* Fix LLMs.txt cache bug with subdomains and add bypass option (#1519)
  Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
* Nick:
* Update LLMs.txt test file to use helper functions and concurrent tests
  Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
* Remove LLMs.txt test file as requested
  Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
* Change parameter name to 'cache' and keep 7-day expiration
  Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
* Update generate-llmstxt-supabase.ts
* Update JS and Python SDKs to include cache parameter
  Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
* Fix LLMs.txt cache implementation to use normalizeUrl and exact matching
  Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
* Revert "Fix LLMs.txt cache implementation to use normalizeUrl and exact matching"
  This reverts commit d05b9964677b7b2384453329d2ac99d841467053.
* Nick:

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
parent ab30c8e4ac
commit 7ccbbec488
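For context, a minimal usage sketch of the new parameter from the JS SDK side: passing cache: false bypasses any stored LLMs.txt and forces a fresh generation. The generateLLMsText method name and its (url, params) signature are assumptions inferred from the SDK's GenerateLLMsTextParams interface in the diff below, and the API key is a placeholder.

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// cache defaults to true; false skips the 7-day cache and regenerates.
// Method name/signature assumed, not confirmed by this diff.
const result = await app.generateLLMsText("https://firecrawl.dev", {
  maxUrls: 1,
  showFullText: false,
  cache: false,
});

console.log(result);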
@@ -104,7 +104,8 @@ content-type: application/json
 {
   "url": "https://firecrawl.dev",
   "maxUrls": 1,
-  "showFullText": false
+  "showFullText": false,
+  "cache": true
 }
@@ -42,6 +42,7 @@ export async function generateLLMsTextController(
     url: req.body.url,
     maxUrls: req.body.maxUrls,
     showFullText: req.body.showFullText,
+    cache: req.body.cache,
     generatedText: "",
     fullText: "",
   });
@@ -1211,6 +1211,10 @@ export const generateLLMsTextRequestSchema = z.object({
     .boolean()
     .default(false)
     .describe("Whether to show the full LLMs-full.txt in the response"),
+  cache: z
+    .boolean()
+    .default(true)
+    .describe("Whether to use cached content if available"),
   __experimental_stream: z.boolean().optional(),
 });
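A standalone sketch of how the new field behaves under zod's defaults: omitting cache resolves to true (old behaviour), while an explicit false opts out. The schema below only mirrors the fields visible in this hunk; the real generateLLMsTextRequestSchema has more fields and its own defaults.

import { z } from "zod";

// Minimal mirror of the fields shown above; not the full request schema.
const sketchSchema = z.object({
  url: z.string().url(),
  maxUrls: z.number().optional(),
  showFullText: z.boolean().default(false),
  cache: z.boolean().default(true),
});

// Omitting cache resolves to true via .default(true)...
console.log(sketchSchema.parse({ url: "https://firecrawl.dev" }).cache); // true
// ...while an explicit false bypasses the cached LLMs.txt.
console.log(sketchSchema.parse({ url: "https://firecrawl.dev", cache: false }).cache); // false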
@@ -36,6 +36,18 @@ describe("normalizeUrlOnlyHostname", () => {
     const expected = "not a valid url";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
+
+  it("should handle URLs with subdomains", () => {
+    const url = "https://blog.example.com";
+    const expected = "blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it("should handle URLs with multiple subdomains", () => {
+    const url = "https://dev.blog.example.com";
+    const expected = "dev.blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
 });
 
 describe("normalizeUrl", () => {
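For reference, a minimal sketch of hostname-only normalization that would satisfy the tests above: every subdomain label is preserved and invalid input passes through unchanged. The project's actual helper in canonical-url may differ in details, so treat this as illustrative only.

// Illustrative only; not the project's canonical-url implementation.
function normalizeUrlOnlyHostnameSketch(url: string): string {
  try {
    // URL#hostname preserves subdomains: "dev.blog.example.com" stays intact.
    return new URL(url).hostname;
  } catch {
    // Invalid input is returned as-is, matching the "not a valid url" expectation.
    return url;
  }
}

normalizeUrlOnlyHostnameSketch("https://blog.example.com");     // "blog.example.com"
normalizeUrlOnlyHostnameSketch("https://dev.blog.example.com"); // "dev.blog.example.com"
normalizeUrlOnlyHostnameSketch("not a valid url");              // "not a valid url"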
@@ -9,6 +9,7 @@ export interface GenerationData {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   generatedText: string;
   fullText: string;
   error?: string;
@@ -66,4 +67,4 @@ export async function updateGeneratedLlmsTxtStatus(
   if (error !== undefined) updates.error = error;
 
   await updateGeneratedLlmsTxt(id, updates);
 }
@@ -19,6 +19,7 @@ interface GenerateLLMsTextServiceOptions {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   subId?: string;
 }
@@ -63,7 +64,7 @@ function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
 export async function performGenerateLlmsTxt(
   options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
+  const { generationId, teamId, url, maxUrls = 100, showFullText, cache = true, subId } =
     options;
   const startTime = Date.now();
   const logger = _logger.child({
@@ -79,8 +80,8 @@ export async function performGenerateLlmsTxt(
   // Enforce max URL limit
   const effectiveMaxUrls = Math.min(maxUrls, 5000);
 
-  // Check cache first
-  const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
+  // Check cache first, unless cache is set to false
+  const cachedResult = cache ? await getLlmsTextFromCache(url, effectiveMaxUrls) : null;
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
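The bypass in isolation: when cache is false the lookup is short-circuited and the generator always runs. Below is a generic sketch of that pattern; the helper names (lookupCache, generate, storeCache) are hypothetical stand-ins, not the service's real functions.

// Generic sketch of the bypassable-cache pattern used above.
async function getOrGenerate(
  url: string,
  cache: boolean,
  lookupCache: (url: string) => Promise<string | null>,
  generate: (url: string) => Promise<string>,
  storeCache: (url: string, text: string) => Promise<void>,
): Promise<string> {
  // With cache === false the lookup is skipped entirely, mirroring
  // `cache ? await getLlmsTextFromCache(...) : null` in the diff.
  const cached = cache ? await lookupCache(url) : null;
  if (cached) return cached;

  const fresh = await generate(url);
  await storeCache(url, fresh); // in this sketch, regenerated results still refresh the cache
  return fresh;
}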
@@ -1,6 +1,6 @@
 import { supabase_service } from "../../services/supabase";
 import { logger } from "../logger";
-import { normalizeUrlOnlyHostname } from "../canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../canonical-url";
 
 interface LlmsTextCache {
   origin_url: string;
@@ -41,7 +41,7 @@ export async function getLlmsTextFromCache(
       return null;
     }
 
-    return data;
+    return data
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
     return null;
@@ -653,6 +653,7 @@ const processGenerateLlmsTxtJobInternal = async (
     maxUrls: job.data.request.maxUrls,
     showFullText: job.data.request.showFullText,
     subId: job.data.subId,
+    cache: job.data.request.cache,
   });
 
   if (result.success) {
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.24.0",
+  "version": "1.25.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
|
@ -520,6 +520,11 @@ export interface GenerateLLMsTextParams {
|
|||||||
* @default false
|
* @default false
|
||||||
*/
|
*/
|
||||||
showFullText?: boolean;
|
showFullText?: boolean;
|
||||||
|
/**
|
||||||
|
* Whether to use cached content if available
|
||||||
|
* @default true
|
||||||
|
*/
|
||||||
|
cache?: boolean;
|
||||||
/**
|
/**
|
||||||
* Experimental flag for streaming
|
* Experimental flag for streaming
|
||||||
*/
|
*/
|
||||||
|
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.5.4"
+__version__ = "2.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -347,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 
 class DeepResearchParams(pydantic.BaseModel):
@@ -1870,6 +1871,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
@@ -1878,6 +1880,7 @@ class FirecrawlApp:
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -1893,6 +1896,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -1900,6 +1904,7 @@ class FirecrawlApp:
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
@@ -1935,6 +1940,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
@@ -1943,6 +1949,7 @@ class FirecrawlApp:
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -1957,6 +1964,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -4001,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:
@@ -4027,6 +4036,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
@@ -4035,6 +4045,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -4057,6 +4068,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )