feat: cache

commit 49ff37afb4 (parent a1c018fdb0)
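In short: this commit adds a Redis-backed scrape cache. A new lib/cache.ts module owns the connection plus the key and entry helpers, a new "cache" engine serves stored HTML before any real engine runs, a new saveToCache transformer writes each scrape result back to Redis, and the job polling interval is tightened from 500 ms to 100 ms.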
apps/api/src/lib/cache.ts (new file, +50)
@@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";

const logger = _logger.child({module: "cache"});

export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
  maxRetriesPerRequest: null,
}) : null;

export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
  if (!cacheRedis) return null;

  // these options disqualify a cache
  if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
    || (scrapeOptions.actions && scrapeOptions.actions.length > 0)
  ) {
    return null;
  }

  return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
}

export type CacheEntry = {
  url: string;
  html: string;
  statusCode: number;
  error?: string;
};

export async function saveEntryToCache(key: string, entry: CacheEntry) {
  if (!cacheRedis) return;

  try {
    await cacheRedis.set(key, JSON.stringify(entry));
  } catch (error) {
    logger.warn("Failed to save to cache", { key, error });
  }
}

export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
  if (!cacheRedis) return null;

  try {
    return JSON.parse(await cacheRedis.get(key) ?? "null");
  } catch (error) {
    logger.warn("Failed to get from cache", { key, error });
    return null;
  }
}
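For orientation, the three exports above compose into a read-through cache. The wrapper below is a hypothetical sketch, not part of this commit: in the real code the lookup lives in the new cache engine and the write-back in the saveToCache transformer (both later in this diff), and fetchFresh stands in for whichever real engine would otherwise run.

import { CacheEntry, cacheKey, getEntryFromCache, saveEntryToCache } from "./cache";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";

// Hypothetical read-through helper (illustration only, not in the commit).
async function scrapeWithCache(
  url: string,
  scrapeOptions: ScrapeOptions,
  internalOptions: InternalOptions,
  fetchFresh: () => Promise<CacheEntry>, // stands in for a real scrape engine
): Promise<CacheEntry> {
  // a null key means this scrape is not cacheable (actions, forced engine, ...)
  const key = cacheKey(url, scrapeOptions, internalOptions);
  if (key === null) return await fetchFresh();

  const hit = await getEntryFromCache(key);
  if (hit !== null) return hit; // cache hit: no scraping work at all

  const fresh = await fetchFresh();
  await saveEntryToCache(key, fresh); // logs and swallows Redis errors
  return fresh;
}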
apps/api/src/scraper/scrapeURL/engines/cache/index.ts (new file, +19, vendored)
@@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";

export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
  const key = cacheKey(meta.url, meta.options, meta.internalOptions);
  if (key === null) throw new EngineError("Scrape not eligible for caching");

  const entry = await getEntryFromCache(key);
  if (entry === null) throw new EngineError("Cache missed");

  return {
    url: entry.url,
    html: entry.html,
    statusCode: entry.statusCode,
    error: entry.error,
  };
}
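Both early exits throw EngineError instead of returning a sentinel. Assuming the engine loop in scrapeURL works roughly like the sketch below (hypothetical and heavily simplified; the real loop also weighs features and quality), that makes an ineligible scrape or a cache miss look like any other engine failure, so the pipeline simply falls through to the next engine in the list defined in the following hunks.

// Hypothetical, simplified fallback loop, written as if it lived inside
// engines/index.ts (so engines, engineHandlers, Meta, EngineError, and
// EngineScrapeResult are all in scope).
async function tryEnginesInOrder(meta: Meta): Promise<EngineScrapeResult> {
  for (const engine of engines) {
    try {
      return await engineHandlers[engine](meta); // "cache" runs first when enabled
    } catch (error) {
      if (error instanceof EngineError) continue; // e.g. "Cache missed": try the next engine
      throw error; // unexpected errors still propagate
    }
  }
  throw new Error("all engines failed");
}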
apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
 import { scrapeURLWithScrapingBee } from "./scrapingbee";
 import { scrapeURLWithFetch } from "./fetch";
 import { scrapeURLWithPlaywright } from "./playwright";
+import { scrapeCache } from "./cache";
 
-export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
+export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
 
 const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
 const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
 const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
+const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
 
 export const engines: Engine[] = [
+  ...(useCache ? [ "cache" as const ] : []),
   ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
   ...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
   ...(usePlaywright ? [ "playwright" as const ] : []),
@@ -74,6 +77,7 @@ export type EngineScrapeResult = {
 const engineHandlers: {
   [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
 } = {
+  "cache": scrapeCache,
   "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
   "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
   "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@@ -95,6 +99,22 @@ export const engineOptions: {
     quality: number,
   }
 } = {
+  "cache": {
+    features: {
+      "actions": false,
+      "waitFor": true,
+      "screenshot": false,
+      "screenshot@fullScreen": false,
+      "pdf": false, // TODO: figure this out
+      "docx": false, // TODO: figure this out
+      "atsv": false,
+      "location": false,
+      "mobile": false,
+      "skipTlsVerification": false,
+      "useFastMode": false,
+    },
+    quality: 1000, // cache should always be tried first
+  },
   "fire-engine;chrome-cdp": {
     features: {
       "actions": true,
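The quality value presumably drives engine priority: 1000 sorts the cache ahead of every real engine (matching the inline comment), while the feature flags stop the cache from being chosen for scrapes that need actions, screenshots, or anything else a stored HTML snapshot cannot provide.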
apps/api/src/scraper/scrapeURL/transformers/cache.ts (new file, +24)
@@ -0,0 +1,24 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";

export function saveToCache(meta: Meta, document: Document): Document {
  if (document.rawHtml === undefined) {
    throw new Error("rawHtml is undefined -- this transformer is being called out of order");
  }

  const key = cacheKey(meta.url, meta.options, meta.internalOptions);

  if (key !== null) {
    const entry: CacheEntry = {
      html: document.rawHtml!,
      statusCode: document.metadata.statusCode!,
      url: document.metadata.url ?? document.metadata.sourceURL!,
      error: document.metadata.error ?? undefined,
    };

    saveEntryToCache(key, entry);
  }

  return document;
}
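Note that saveEntryToCache is not awaited here: the write-back is fire-and-forget, and because the helper catches and logs its own Redis errors, a cache outage never blocks or fails the transformer pipeline.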
apps/api/src/scraper/scrapeURL/transformers/index.ts
@@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
 import { performLLMExtract } from "./llmExtract";
 import { uploadScreenshot } from "./uploadScreenshot";
 import { removeBase64Images } from "./removeBase64Images";
+import { saveToCache } from "./cache";
 
 export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
 
@@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
 
 // TODO: allow some of these to run in parallel
 export const transformerStack: Transformer[] = [
+  saveToCache,
   deriveHTMLFromRawHTML,
   deriveMarkdownFromHTML,
   deriveLinksFromHTML,
@@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
           }
         }
       }
-    }, 500);
+    }, 100);
   })
 }
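The polling interval for waiting on job results drops from 500 ms to 100 ms. With a warm cache a scrape can complete almost instantly, so the tighter loop presumably exists to hand cached results back to waiting callers sooner, at the cost of more frequent queue checks.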