feat: cache

This commit is contained in:
Gergő Móricz 2024-11-14 19:47:12 +01:00
parent a1c018fdb0
commit 49ff37afb4
6 changed files with 117 additions and 2 deletions

50
apps/api/src/lib/cache.ts Normal file
View File

@@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";
// Module-scoped child logger so cache failures are tagged with module: "cache".
const logger = _logger.child({module: "cache"});
// Dedicated Redis connection for the scrape cache. Null when CACHE_REDIS_URL is
// unset/empty, which disables caching throughout this module (cacheKey returns
// null and save/get become no-ops). maxRetriesPerRequest: null tells ioredis to
// keep retrying commands instead of failing them while the connection is down.
export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
maxRetriesPerRequest: null,
}) : null;
/**
 * Builds the Redis key under which a scrape of `url` is cached.
 *
 * Returns null when caching is disabled (no cache Redis configured) or when the
 * request carries options whose results are not safe to reuse across requests.
 */
export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
if (!cacheRedis) return null;

// these options disqualify a cache
const disqualified =
internalOptions.v0CrawlOnlyUrls ||
internalOptions.forceEngine ||
internalOptions.v0UseFastMode ||
internalOptions.atsv ||
(scrapeOptions.actions ?? []).length > 0;

if (disqualified) {
return null;
}

return `cache:${url}:waitFor:${scrapeOptions.waitFor}`;
}
// Shape of a cached scrape result, JSON-serialized into Redis by
// saveEntryToCache and parsed back out by getEntryFromCache.
export type CacheEntry = {
// URL recorded for the scrape (the writer prefers metadata.url over sourceURL,
// so this presumably reflects the post-redirect URL — confirm with caller).
url: string;
// Raw HTML captured by the scrape.
html: string;
// HTTP status code recorded for the scrape.
statusCode: number;
// Error message recorded for the scrape, if any.
error?: string;
};
/**
 * Serializes `entry` and stores it in the cache Redis under `key`.
 *
 * No-op when caching is disabled. Failures are logged and swallowed on purpose
 * so a broken cache can never fail the scrape that produced the entry.
 *
 * @param key Redis key, as produced by cacheKey().
 * @param entry Scrape result to persist.
 * @param ttlSeconds Optional expiry for the entry. When omitted, the entry is
 *   stored without a TTL (previous behavior) — note that this means the cache
 *   grows without bound; pass a TTL where eviction is desired.
 */
export async function saveEntryToCache(key: string, entry: CacheEntry, ttlSeconds?: number) {
if (!cacheRedis) return;

try {
const payload = JSON.stringify(entry);
if (ttlSeconds !== undefined) {
// SET with EX applies a relative expiry in seconds.
await cacheRedis.set(key, payload, "EX", ttlSeconds);
} else {
await cacheRedis.set(key, payload);
}
} catch (error) {
logger.warn("Failed to save to cache", { key, error });
}
}
/**
 * Loads and parses the cache entry stored under `key`.
 *
 * Returns null when caching is disabled, when the key is absent, or when the
 * lookup/parse fails (failures are logged, never thrown).
 */
export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
if (!cacheRedis) return null;

try {
const raw = await cacheRedis.get(key);
if (raw === null) {
return null;
}
return JSON.parse(raw);
} catch (error) {
logger.warn("Failed to get from cache", { key, error });
return null;
}
}

View File

@@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";
/**
 * "cache" engine: serves a previously stored scrape result from Redis.
 *
 * Throws EngineError when the request is not cacheable (cacheKey returns null)
 * or when no entry exists for the key, so the engine pipeline falls through to
 * the next engine.
 */
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key === null) {
throw new EngineError("Scrape not eligible for caching");
}

const entry = await getEntryFromCache(key);
if (entry === null) {
throw new EngineError("Cache missed");
}

const { url, html, statusCode, error } = entry;
return { url, html, statusCode, error };
}

View File

@@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
...(useCache ? [ "cache" as const ] : []),
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
...(usePlaywright ? [ "playwright" as const ] : []),
@@ -74,6 +77,7 @@ export type EngineScrapeResult = {
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
"cache": scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@@ -95,6 +99,22 @@ export const engineOptions: {
quality: number,
}
} = {
"cache": {
features: {
"actions": false,
"waitFor": true,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": false, // TODO: figure this out
"docx": false, // TODO: figure this out
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 1000, // cache should always be tried first
},
"fire-engine;chrome-cdp": {
features: {
"actions": true,

View File

@@ -0,0 +1,24 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
/**
 * Transformer that writes the scraped document into the cache, keyed by the
 * request's cacheKey. Must run before transformers that consume/strip rawHtml.
 *
 * Returns the document unchanged; the cache write is intentionally
 * fire-and-forget so a slow or broken cache never delays the scrape response
 * (saveEntryToCache handles its own errors internally).
 *
 * @throws Error when rawHtml is missing, i.e. the transformer stack ran this
 *   out of order.
 */
export function saveToCache(meta: Meta, document: Document): Document {
if (document.rawHtml === undefined) {
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
}

const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {
const entry: CacheEntry = {
// Non-null assertion removed: the guard above already narrows rawHtml.
html: document.rawHtml,
// NOTE(review): assumes statusCode is always set by the time transformers
// run — a missing value would poison the cache entry; confirm upstream.
statusCode: document.metadata.statusCode!,
url: document.metadata.url ?? document.metadata.sourceURL!,
error: document.metadata.error ?? undefined,
};
// Explicitly discard the promise: this is deliberate fire-and-forget.
void saveEntryToCache(key, entry);
}

return document;
}

View File

@@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache";
export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
@@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
saveToCache,
deriveHTMLFromRawHTML,
deriveMarkdownFromHTML,
deriveLinksFromHTML,

View File

@@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
}
}
}
}, 500);
}, 100);
})
}