From 49ff37afb4ee389e8fa6bc353862b1d141b9bda1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 14 Nov 2024 19:47:12 +0100
Subject: [PATCH] feat: cache

---
Notes (commentary only; text between the "---" line above and the first
"diff" header is not part of the change):

What this patch does
  * lib/cache.ts: creates an ioredis client only when CACHE_REDIS_URL is set,
    and exposes cacheKey(), saveEntryToCache() and getEntryFromCache().
    cacheKey() returns null (scrape not cacheable) when there is no client,
    when any of v0CrawlOnlyUrls / forceEngine / v0UseFastMode / atsv is set,
    or when actions are requested; otherwise the key is built from the URL
    and waitFor only.
  * engines/cache: a "cache" engine that throws EngineError on an ineligible
    scrape or a cache miss. It is listed first in `engines` and has
    quality 1000.
  * transformers/cache.ts: saveToCache() runs first in transformerStack,
    throws if rawHtml is undefined, and stores html, statusCode, url, error.
  * services/queue-jobs.ts: the numeric argument at the end of the callback
    inside waitForJob changes from 500 to 100 (presumably the polling
    interval in ms -- confirm against the full file).

Review observations (not changed here, so hunk counts stay valid)
  * saveEntryToCache() calls set() with no expiry arguments, so entries stay
    until something else removes them.
  * saveToCache() does not check statusCode before saving, so whatever
    status the scrape produced is stored.
  * saveToCache() does not await saveEntryToCache(); the latter catches and
    logs its own errors, so the write is best-effort.

Reconstruction caveats
  * This copy was rebuilt from a version whose line breaks, indentation and
    angle-bracketed text were lost. Generic type arguments were restored
    from usage (Promise<CacheEntry | null>, Promise<EngineScrapeResult>,
    Promise<Document>). The author e-mail address could not be recovered
    and was not invented.
  * Leading whitespace is a best guess. Apply with
    `git apply --ignore-whitespace`, or regenerate from the commit.

 apps/api/src/lib/cache.ts                     | 50 +++++++++++++++++++
 .../scraper/scrapeURL/engines/cache/index.ts  | 19 +++++++
 .../src/scraper/scrapeURL/engines/index.ts    | 22 +++++++-
 .../scraper/scrapeURL/transformers/cache.ts   | 24 +++++++++
 .../scraper/scrapeURL/transformers/index.ts   |  2 +
 apps/api/src/services/queue-jobs.ts           |  2 +-
 6 files changed, 117 insertions(+), 2 deletions(-)
 create mode 100644 apps/api/src/lib/cache.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/cache/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/transformers/cache.ts

diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts
new file mode 100644
index 00000000..896d9429
--- /dev/null
+++ b/apps/api/src/lib/cache.ts
@@ -0,0 +1,50 @@
+import IORedis from "ioredis";
+import { ScrapeOptions } from "../controllers/v1/types";
+import { InternalOptions } from "../scraper/scrapeURL";
+import { logger as _logger } from "./logger";
+const logger = _logger.child({module: "cache"});
+
+export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
+    maxRetriesPerRequest: null,
+}) : null;
+
+export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
+    if (!cacheRedis) return null;
+
+    // these options disqualify a cache
+    if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
+        || (scrapeOptions.actions && scrapeOptions.actions.length > 0)
+    ) {
+        return null;
+    }
+
+    return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
+}
+
+export type CacheEntry = {
+    url: string;
+    html: string;
+    statusCode: number;
+    error?: string;
+};
+
+export async function saveEntryToCache(key: string, entry: CacheEntry) {
+    if (!cacheRedis) return;
+
+    try {
+        await cacheRedis.set(key, JSON.stringify(entry));
+    } catch (error) {
+        logger.warn("Failed to save to cache", { key, error });
+    }
+}
+
+export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
+    if (!cacheRedis) return null;
+
+    try {
+        return JSON.parse(await cacheRedis.get(key) ?? "null");
+    } catch (error) {
+        logger.warn("Failed to get from cache", { key, error });
+        return null;
+    }
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
new file mode 100644
index 00000000..9506be0f
--- /dev/null
+++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
@@ -0,0 +1,19 @@
+import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
+import { EngineScrapeResult } from "..";
+import { Meta } from "../..";
+import { EngineError } from "../../error";
+
+export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
+    const key = cacheKey(meta.url, meta.options, meta.internalOptions);
+    if (key === null) throw new EngineError("Scrape not eligible for caching");
+
+    const entry = await getEntryFromCache(key);
+    if (entry === null) throw new EngineError("Cache missed");
+
+    return {
+        url: entry.url,
+        html: entry.html,
+        statusCode: entry.statusCode,
+        error: entry.error,
+    };
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts
index aadef7fc..d9168669 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
 import { scrapeURLWithScrapingBee } from "./scrapingbee";
 import { scrapeURLWithFetch } from "./fetch";
 import { scrapeURLWithPlaywright } from "./playwright";
+import { scrapeCache } from "./cache";
 
-export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
+export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
 
 const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
 const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
 const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
+const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
 
 export const engines: Engine[] = [
+    ...(useCache ? [ "cache" as const ] : []),
     ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
     ...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
     ...(usePlaywright ? [ "playwright" as const ] : []),
@@ -74,6 +77,7 @@ export type EngineScrapeResult = {
 const engineHandlers: {
     [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
 } = {
+    "cache": scrapeCache,
     "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
     "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
     "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@@ -95,6 +99,22 @@ export const engineOptions: {
         quality: number,
     }
 } = {
+    "cache": {
+        features: {
+            "actions": false,
+            "waitFor": true,
+            "screenshot": false,
+            "screenshot@fullScreen": false,
+            "pdf": false, // TODO: figure this out
+            "docx": false, // TODO: figure this out
+            "atsv": false,
+            "location": false,
+            "mobile": false,
+            "skipTlsVerification": false,
+            "useFastMode": false,
+        },
+        quality: 1000, // cache should always be tried first
+    },
     "fire-engine;chrome-cdp": {
         features: {
             "actions": true,
diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
new file mode 100644
index 00000000..785047a1
--- /dev/null
+++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
@@ -0,0 +1,24 @@
+import { Document } from "../../../controllers/v1/types";
+import { Meta } from "..";
+import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
+
+export function saveToCache(meta: Meta, document: Document): Document {
+    if (document.rawHtml === undefined) {
+        throw new Error("rawHtml is undefined -- this transformer is being called out of order");
+    }
+
+    const key = cacheKey(meta.url, meta.options, meta.internalOptions);
+
+    if (key !== null) {
+        const entry: CacheEntry = {
+            html: document.rawHtml!,
+            statusCode: document.metadata.statusCode!,
+            url: document.metadata.url ?? document.metadata.sourceURL!,
+            error: document.metadata.error ?? undefined,
+        };
+
+        saveEntryToCache(key, entry);
+    }
+
+    return document;
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts
index d839f8bc..b8063f7e 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/index.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts
@@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
 import { performLLMExtract } from "./llmExtract";
 import { uploadScreenshot } from "./uploadScreenshot";
 import { removeBase64Images } from "./removeBase64Images";
+import { saveToCache } from "./cache";
 
 export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
 
@@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
 
 // TODO: allow some of these to run in parallel
 export const transformerStack: Transformer[] = [
+    saveToCache,
     deriveHTMLFromRawHTML,
     deriveMarkdownFromHTML,
     deriveLinksFromHTML,
diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts
index e4a5ace8..d59056bb 100644
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@@ -109,6 +109,6 @@ export function waitForJob(jobId: string, timeout: number): Promise
           }
         }
       }
-    }, 500);
+    }, 100);
   })
 }