diff --git a/apps/api/src/__tests__/snips/pdf-cache.test.ts b/apps/api/src/__tests__/snips/pdf-cache.test.ts
new file mode 100644
index 00000000..e60b5f07
--- /dev/null
+++ b/apps/api/src/__tests__/snips/pdf-cache.test.ts
@@ -0,0 +1,116 @@
+import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache';
+
+jest.mock('@google-cloud/storage', () => {
+  const mockSave = jest.fn().mockResolvedValue(undefined);
+  const mockExists = jest.fn().mockResolvedValue([true]);
+  const mockDownload = jest.fn().mockResolvedValue([Buffer.from(JSON.stringify({
+    markdown: 'cached markdown',
+    html: 'cached html'
+  }))]);
+  const mockFile = jest.fn().mockImplementation((path) => ({
+    save: mockSave,
+    exists: mockExists,
+    download: mockDownload
+  }));
+
+  return {
+    Storage: jest.fn().mockImplementation(() => ({
+      bucket: jest.fn().mockImplementation(() => ({
+        file: mockFile
+      }))
+    })),
+    _getMockFile: () => mockFile,
+    _getMockSave: () => mockSave
+  };
+});
+
+process.env.GCS_BUCKET_NAME = 'test-bucket';
+
+describe('PDF Caching', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+  });
+
+  test('createPdfCacheKey generates consistent keys', () => {
+    const pdfContent1 = 'test-pdf-content';
+    const pdfContent2 = 'test-pdf-content';
+    const pdfContent3 = 'different-pdf-content';
+
+    const key1 = createPdfCacheKey(pdfContent1);
+    const key2 = createPdfCacheKey(pdfContent2);
+    const key3 = createPdfCacheKey(pdfContent3);
+
+    expect(key1).toBe(key2); // Same content should generate same key
+    expect(key1).not.toBe(key3); // Different content should generate different key
+
+    expect(key1).toMatch(/^[a-f0-9]{64}$/);
+  });
+
+  test('createPdfCacheKey works directly with base64 content', () => {
+    const base64Content = 'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy';
+
+    const key = createPdfCacheKey(base64Content);
+
+    expect(key).toMatch(/^[a-f0-9]{64}$/);
+
+    expect(createPdfCacheKey(base64Content)).toBe(key);
+
+  });
+
+  test('savePdfResultToCache saves results to GCS', async () => {
+    const pdfContent = 'test-pdf-content';
+    const result = { markdown: 'test markdown', html: 'test html' };
+
+    const { _getMockFile, _getMockSave } = require('@google-cloud/storage');
+    const mockFile = _getMockFile();
+    const mockSave = _getMockSave();
+
+    mockFile.mockClear();
+    mockSave.mockClear();
+
+    const cacheKey = await savePdfResultToCache(pdfContent, result);
+
+    expect(cacheKey).not.toBeNull();
+
+    expect(mockFile).toHaveBeenCalledWith(expect.stringContaining('pdf-cache/'));
+
+    expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), {
+      contentType: 'application/json',
+      metadata: expect.objectContaining({
+        source: 'runpod_pdf_conversion',
+        cache_type: 'pdf_markdown',
+        created_at: expect.any(String)
+      })
+    });
+  });
+
+  test('getPdfResultFromCache retrieves results from GCS', async () => {
+    const pdfContent = 'test-pdf-content';
+
+    const result = await getPdfResultFromCache(pdfContent);
+
+    expect(result).not.toBeNull();
+    expect(result?.markdown).toBe('cached markdown');
+    expect(result?.html).toBe('cached html');
+  });
+
+  test('getPdfResultFromCache returns null when cache miss', async () => {
+    const { Storage } = require('@google-cloud/storage');
+    const mockExists = Storage().bucket().file().exists;
+    mockExists.mockResolvedValueOnce([false]);
+
+    const result = await getPdfResultFromCache('uncached-content');
+
+    expect(result).toBeNull();
+  });
+
+  test('getPdfResultFromCache returns null for malformed cache entries', async () => {
+    const { Storage } = require('@google-cloud/storage');
+    const mockDownload = Storage().bucket().file().download;
+    mockDownload.mockResolvedValueOnce([Buffer.from(JSON.stringify({ markdown: 42 }))]);
+
+    const result = await getPdfResultFromCache('malformed-content');
+
+    expect(result).toBeNull();
+  });
+});
diff --git a/apps/api/src/lib/gcs-pdf-cache.ts b/apps/api/src/lib/gcs-pdf-cache.ts
new file mode 100644
index 00000000..90348974
--- /dev/null
+++ b/apps/api/src/lib/gcs-pdf-cache.ts
@@ -0,0 +1,151 @@
+import { Storage } from "@google-cloud/storage";
+import { logger } from "./logger";
+import crypto from "crypto";
+
+const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined;
+const PDF_CACHE_PREFIX = "pdf-cache/";
+const SAVE_MAX_ATTEMPTS = 3;
+
+/** Shape of a PDF conversion result as stored in the cache. */
+type PdfCacheEntry = { markdown: string; html: string };
+
+/**
+ * Checks at runtime that a parsed cache object carries string `markdown` and
+ * `html` fields, so corrupted JSON is never handed back as a cache hit.
+ */
+function isPdfCacheEntry(value: unknown): value is PdfCacheEntry {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const entry = value as Record<string, unknown>;
+  return typeof entry.markdown === "string" && typeof entry.html === "string";
+}
+
+/**
+ * Creates a SHA-256 hash of the PDF content to use as a cache key.
+ * The content is hashed exactly as given, without any conversion.
+ *
+ * @param pdfContent - PDF content to hash
+ * @returns the digest as a 64-character lowercase hex string
+ */
+export function createPdfCacheKey(pdfContent: string | Buffer): string {
+  return crypto
+    .createHash('sha256')
+    .update(pdfContent)
+    .digest('hex');
+}
+
+/**
+ * Save RunPod markdown results to GCS cache.
+ *
+ * Does nothing when GCS_BUCKET_NAME is unset. The upload is attempted up to
+ * SAVE_MAX_ATTEMPTS times; failures are logged and never thrown to the caller.
+ *
+ * @param pdfContent - PDF content the result was produced from (hashed into the key)
+ * @param result - conversion output to store as JSON
+ * @returns the cache key on success, or null if caching is disabled or failed
+ */
+export async function savePdfResultToCache(
+  pdfContent: string,
+  result: { markdown: string; html: string }
+): Promise<string | null> {
+  try {
+    if (!process.env.GCS_BUCKET_NAME) {
+      return null;
+    }
+
+    const cacheKey = createPdfCacheKey(pdfContent);
+    const storage = new Storage({ credentials });
+    const bucket = storage.bucket(process.env.GCS_BUCKET_NAME);
+    const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`);
+
+    for (let i = 0; i < SAVE_MAX_ATTEMPTS; i++) {
+      try {
+        await blob.save(JSON.stringify(result), {
+          contentType: "application/json",
+          metadata: {
+            source: "runpod_pdf_conversion",
+            cache_type: "pdf_markdown",
+            created_at: new Date().toISOString(),
+          }
+        });
+
+        logger.info(`Saved PDF RunPod result to GCS cache`, {
+          cacheKey,
+        });
+
+        return cacheKey;
+      } catch (error) {
+        // Out of attempts: hand the error to the outer catch, which logs it.
+        if (i === SAVE_MAX_ATTEMPTS - 1) {
+          throw error;
+        }
+
+        logger.error(`Error saving PDF RunPod result to GCS cache, retrying`, {
+          error,
+          cacheKey,
+          i,
+        });
+      }
+    }
+
+    // Only reachable if SAVE_MAX_ATTEMPTS < 1, in which case nothing was saved.
+    return null;
+  } catch (error) {
+    logger.error(`Error saving PDF RunPod result to GCS cache`, {
+      error,
+    });
+    return null;
+  }
+}
+
+/**
+ * Get cached RunPod markdown results from GCS.
+ *
+ * @param pdfContent - PDF content to look up (hashed into the key)
+ * @returns the cached result, or null when GCS_BUCKET_NAME is unset, the entry
+ *   is missing or malformed, or the lookup fails
+ */
+export async function getPdfResultFromCache(
+  pdfContent: string
+): Promise<{ markdown: string; html: string } | null> {
+  try {
+    if (!process.env.GCS_BUCKET_NAME) {
+      return null;
+    }
+
+    const cacheKey = createPdfCacheKey(pdfContent);
+    const storage = new Storage({ credentials });
+    const bucket = storage.bucket(process.env.GCS_BUCKET_NAME);
+    const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`);
+
+    const [exists] = await blob.exists();
+    if (!exists) {
+      logger.debug(`PDF RunPod result not found in GCS cache`, {
+        cacheKey,
+      });
+      return null;
+    }
+
+    const [content] = await blob.download();
+    const result: unknown = JSON.parse(content.toString());
+
+    if (!isPdfCacheEntry(result)) {
+      logger.error(`Malformed PDF RunPod result in GCS cache, ignoring`, {
+        cacheKey,
+      });
+      return null;
+    }
+
+    logger.info(`Retrieved PDF RunPod result from GCS cache`, {
+      cacheKey,
+    });
+
+    return result;
+  } catch (error) {
+    logger.error(`Error retrieving PDF RunPod result from GCS cache`, {
+      error,
+    });
+    return null;
+  }
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index 727e12c9..29a07a5d 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -11,6 +11,7 @@ import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../..
 import { readFile, unlink } from "node:fs/promises";
 import path from "node:path";
 import type { Response } from "undici";
+import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
 
 type PDFProcessorResult = { html: string; markdown?: string };
 
@@ -26,6 +27,22 @@ async function scrapePDFWithRunPodMU(
     tempFilePath,
   });
 
+  try {
+    const cachedResult = await getPdfResultFromCache(base64Content);
+
+    if (cachedResult) {
+      meta.logger.info("Using cached RunPod MU result for PDF", {
+        tempFilePath,
+      });
+      return cachedResult;
+    }
+  } catch (error) {
+    meta.logger.warn("Error checking PDF cache, proceeding with RunPod MU", {
+      error,
+      tempFilePath,
+    });
+  }
+
   const result = await robustFetch({
     url:
       "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
@@ -50,10 +67,21 @@ async function scrapePDFWithRunPodMU(
     mock: meta.mock,
   });
 
-  return {
+  const processorResult = {
     markdown: result.output.markdown,
     html: await marked.parse(result.output.markdown, { async: true }),
   };
+
+  try {
+    await savePdfResultToCache(base64Content, processorResult);
+  } catch (error) {
+    meta.logger.warn("Error saving PDF to cache", {
+      error,
+      tempFilePath,
+    });
+  }
+
+  return processorResult;
 }
 
 async function scrapePDFWithParsePDF(