mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 02:45:53 +08:00
Add caching for RunPod PDF markdown results in GCS (#1561)
* Add caching for RunPod PDF markdown results in GCS Co-Authored-By: thomas@sideguide.dev <thomas@sideguide.dev> * Update PDF caching to hash base64 directly and add metadata Co-Authored-By: thomas@sideguide.dev <thomas@sideguide.dev> * Fix PDF caching to directly hash content and fix test expectations Co-Authored-By: thomas@sideguide.dev <thomas@sideguide.dev> --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: thomas@sideguide.dev <thomas@sideguide.dev>
This commit is contained in:
parent
bd9673e104
commit
526165e1b9
106
apps/api/src/__tests__/snips/pdf-cache.test.ts
Normal file
106
apps/api/src/__tests__/snips/pdf-cache.test.ts
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache';
|
||||||
|
|
||||||
|
// Replace the real GCS client with an in-memory test double so no network
// calls are made. Every `bucket().file()` call returns an object backed by the
// same three shared mocks, which lets tests inspect calls regardless of which
// Storage instance the code under test created.
jest.mock('@google-cloud/storage', () => {
  // Uploads always succeed.
  const mockSave = jest.fn().mockResolvedValue(undefined);
  // Default to a cache hit; individual tests override with mockResolvedValueOnce.
  const mockExists = jest.fn().mockResolvedValue([true]);
  // download() resolves to a one-element tuple holding the file contents.
  const mockDownload = jest.fn().mockResolvedValue([
    Buffer.from(
      JSON.stringify({
        markdown: 'cached markdown',
        html: 'cached html',
      }),
    ),
  ]);
  const mockFile = jest.fn().mockImplementation(() => ({
    save: mockSave,
    exists: mockExists,
    download: mockDownload,
  }));

  return {
    Storage: jest.fn().mockImplementation(() => ({
      bucket: jest.fn().mockImplementation(() => ({
        file: mockFile,
      })),
    })),
    // Test-only accessors for the shared mocks (not part of the real package).
    _getMockFile: () => mockFile,
    _getMockSave: () => mockSave,
  };
});

// The cache helpers return null when no bucket is configured, so a bucket name
// must be present for the tests to exercise the GCS code path.
process.env.GCS_BUCKET_NAME = 'test-bucket';
|
||||||
|
|
||||||
|
// Unit tests for the GCS-backed PDF result cache. All GCS access goes through
// the mocked '@google-cloud/storage' module declared at the top of this file.
describe('PDF Caching', () => {
  beforeEach(() => {
    // Clears recorded calls on every mock; configured return values persist.
    jest.clearAllMocks();
  });

  test('createPdfCacheKey generates consistent keys', () => {
    const pdfContent1 = 'test-pdf-content';
    const pdfContent2 = 'test-pdf-content';
    const pdfContent3 = 'different-pdf-content';

    const key1 = createPdfCacheKey(pdfContent1);
    const key2 = createPdfCacheKey(pdfContent2);
    const key3 = createPdfCacheKey(pdfContent3);

    expect(key1).toBe(key2); // Same content should generate same key
    expect(key1).not.toBe(key3); // Different content should generate different key

    // SHA-256 rendered as lowercase hex is always 64 characters.
    expect(key1).toMatch(/^[a-f0-9]{64}$/);
  });

  test('createPdfCacheKey works directly with base64 content', () => {
    const base64Content =
      'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy';

    const key = createPdfCacheKey(base64Content);

    expect(key).toMatch(/^[a-f0-9]{64}$/);
    expect(createPdfCacheKey(base64Content)).toBe(key);
  });

  test('savePdfResultToCache saves results to GCS', async () => {
    const pdfContent = 'test-pdf-content';
    const result = { markdown: 'test markdown', html: 'test html' };

    const { _getMockFile, _getMockSave } = jest.requireMock<{
      _getMockFile: () => jest.Mock;
      _getMockSave: () => jest.Mock;
    }>('@google-cloud/storage');
    const mockFile = _getMockFile();
    const mockSave = _getMockSave();

    const cacheKey = await savePdfResultToCache(pdfContent, result);

    // The returned key is the content hash, and the object is stored under
    // the pdf-cache/ prefix using that hash as its file name.
    expect(cacheKey).toBe(createPdfCacheKey(pdfContent));
    expect(mockFile).toHaveBeenCalledWith(`pdf-cache/${cacheKey}.json`);

    expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), {
      contentType: 'application/json',
      metadata: expect.objectContaining({
        source: 'runpod_pdf_conversion',
        cache_type: 'pdf_markdown',
        created_at: expect.any(String),
      }),
    });
  });

  test('getPdfResultFromCache retrieves results from GCS', async () => {
    const pdfContent = 'test-pdf-content';

    const result = await getPdfResultFromCache(pdfContent);

    expect(result).not.toBeNull();
    expect(result?.markdown).toBe('cached markdown');
    expect(result?.html).toBe('cached html');
  });

  test('getPdfResultFromCache returns null when cache miss', async () => {
    const { Storage } = jest.requireMock<{ Storage: jest.Mock }>(
      '@google-cloud/storage',
    );
    // Every file() handle shares the same exists/download mocks, so grabbing
    // them from a throwaway handle affects the handle the code under test uses.
    const fileHandle = Storage().bucket().file();
    const mockExists: jest.Mock = fileHandle.exists;
    const mockDownload: jest.Mock = fileHandle.download;
    mockExists.mockResolvedValueOnce([false]);

    const result = await getPdfResultFromCache('uncached-content');

    expect(result).toBeNull();
    // A miss must short-circuit before any download is attempted.
    expect(mockDownload).not.toHaveBeenCalled();
  });
});
|
112
apps/api/src/lib/gcs-pdf-cache.ts
Normal file
112
apps/api/src/lib/gcs-pdf-cache.ts
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
import { Storage } from "@google-cloud/storage";
|
||||||
|
import { logger } from "./logger";
|
||||||
|
import crypto from "crypto";
|
||||||
|
|
||||||
|
// Service-account credentials are supplied as base64-encoded JSON in
// GCS_CREDENTIALS. When the variable is unset, `credentials` stays undefined
// and is passed through to the Storage constructor as-is.
// Buffer is used instead of the legacy `atob`, which yields a Latin-1 binary
// string and would corrupt any non-ASCII characters in the JSON.
const credentials = process.env.GCS_CREDENTIALS
  ? JSON.parse(
      Buffer.from(process.env.GCS_CREDENTIALS, "base64").toString("utf8"),
    )
  : undefined;

// Object-name prefix under which all cached PDF results are stored.
const PDF_CACHE_PREFIX = "pdf-cache/";
|
||||||
|
|
||||||
|
/**
 * Creates a SHA-256 hash of the PDF content to use as a cache key.
 *
 * The content is hashed exactly as given, with no decoding or conversion.
 * Strings are hashed as their UTF-8 bytes, so a base64 string and the Buffer
 * it decodes to produce different keys; callers must use the same
 * representation when saving and when looking up.
 *
 * @param pdfContent - PDF content, as a string (e.g. base64) or raw bytes.
 * @returns The digest as a 64-character lowercase hexadecimal string.
 */
export function createPdfCacheKey(pdfContent: string | Buffer): string {
  const hash = crypto.createHash("sha256");
  hash.update(pdfContent);
  return hash.digest("hex");
}
|
||||||
|
|
||||||
|
/**
 * Save RunPod markdown results to GCS cache.
 *
 * The result is serialized as JSON and stored at
 * `pdf-cache/<sha256 of pdfContent>.json` in the bucket named by
 * GCS_BUCKET_NAME. The upload is attempted up to three times.
 *
 * This function never throws: caching is best-effort, so every failure is
 * logged and reported as `null`.
 *
 * @param pdfContent - PDF content used to derive the cache key.
 * @param result - Conversion output to cache.
 * @returns The cache key on success; `null` if GCS_BUCKET_NAME is not set or
 *   the save ultimately failed.
 */
export async function savePdfResultToCache(
  pdfContent: string,
  result: { markdown: string; html: string },
): Promise<string | null> {
  try {
    if (!process.env.GCS_BUCKET_NAME) {
      return null;
    }

    const cacheKey = createPdfCacheKey(pdfContent);
    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_BUCKET_NAME);
    const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`);

    const maxAttempts = 3;
    let lastError: unknown;

    for (let i = 0; i < maxAttempts; i++) {
      try {
        // NOTE(review): the GCS client usually expects custom key/value
        // metadata nested under `metadata.metadata`; confirm these keys are
        // actually persisted on the object in this shape.
        await blob.save(JSON.stringify(result), {
          contentType: "application/json",
          metadata: {
            source: "runpod_pdf_conversion",
            cache_type: "pdf_markdown",
            created_at: new Date().toISOString(),
          },
        });

        logger.info(`Saved PDF RunPod result to GCS cache`, {
          cacheKey,
        });

        return cacheKey;
      } catch (error) {
        lastError = error;
        // The final failure is reported once by the outer handler instead.
        if (i < maxAttempts - 1) {
          logger.error(`Error saving PDF RunPod result to GCS cache, retrying`, {
            error,
            cacheKey,
            i,
          });
        }
      }
    }

    // Every attempt failed. Surface the last error rather than falling
    // through and reporting a key for an object that was never written.
    throw lastError;
  } catch (error) {
    logger.error(`Error saving PDF RunPod result to GCS cache`, {
      error,
    });
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Checks that a parsed cache entry has the `{ markdown, html }` string fields
 * that callers rely on.
 */
function isCachedPdfResult(
  value: unknown,
): value is { markdown: string; html: string } {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.markdown === "string" && typeof candidate.html === "string"
  );
}

/**
 * Get cached RunPod markdown results from GCS.
 *
 * Looks up `pdf-cache/<sha256 of pdfContent>.json` in the bucket named by
 * GCS_BUCKET_NAME and parses it as JSON.
 *
 * This function never throws: any failure is logged and reported as `null`,
 * which callers can treat the same as a cache miss.
 *
 * @param pdfContent - PDF content used to derive the cache key; must be the
 *   same representation that was passed when the result was saved.
 * @returns The cached result, or `null` if GCS_BUCKET_NAME is not set, the
 *   entry does not exist, the entry is malformed, or an error occurred.
 */
export async function getPdfResultFromCache(
  pdfContent: string,
): Promise<{ markdown: string; html: string } | null> {
  try {
    if (!process.env.GCS_BUCKET_NAME) {
      return null;
    }

    const cacheKey = createPdfCacheKey(pdfContent);
    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_BUCKET_NAME);
    const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`);

    const [exists] = await blob.exists();
    if (!exists) {
      logger.debug(`PDF RunPod result not found in GCS cache`, {
        cacheKey,
      });
      return null;
    }

    const [content] = await blob.download();
    const parsed: unknown = JSON.parse(content.toString());

    // Do not trust the stored payload blindly: a truncated or foreign object
    // would otherwise be returned to the scraper as a valid result.
    if (!isCachedPdfResult(parsed)) {
      logger.error(`Cached PDF RunPod result in GCS has an unexpected shape, ignoring`, {
        cacheKey,
      });
      return null;
    }

    logger.info(`Retrieved PDF RunPod result from GCS cache`, {
      cacheKey,
    });

    return parsed;
  } catch (error) {
    logger.error(`Error retrieving PDF RunPod result from GCS cache`, {
      error,
    });
    return null;
  }
}
|
@ -11,6 +11,7 @@ import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../..
|
|||||||
import { readFile, unlink } from "node:fs/promises";
|
import { readFile, unlink } from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import type { Response } from "undici";
|
import type { Response } from "undici";
|
||||||
|
import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
|
||||||
|
|
||||||
type PDFProcessorResult = { html: string; markdown?: string };
|
type PDFProcessorResult = { html: string; markdown?: string };
|
||||||
|
|
||||||
@ -26,6 +27,22 @@ async function scrapePDFWithRunPodMU(
|
|||||||
tempFilePath,
|
tempFilePath,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
const cachedResult = await getPdfResultFromCache(base64Content);
|
||||||
|
|
||||||
|
if (cachedResult) {
|
||||||
|
meta.logger.info("Using cached RunPod MU result for PDF", {
|
||||||
|
tempFilePath,
|
||||||
|
});
|
||||||
|
return cachedResult;
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.warn("Error checking PDF cache, proceeding with RunPod MU", {
|
||||||
|
error,
|
||||||
|
tempFilePath,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
const result = await robustFetch({
|
const result = await robustFetch({
|
||||||
url:
|
url:
|
||||||
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
||||||
@ -50,10 +67,21 @@ async function scrapePDFWithRunPodMU(
|
|||||||
mock: meta.mock,
|
mock: meta.mock,
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
const processorResult = {
|
||||||
markdown: result.output.markdown,
|
markdown: result.output.markdown,
|
||||||
html: await marked.parse(result.output.markdown, { async: true }),
|
html: await marked.parse(result.output.markdown, { async: true }),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
await savePdfResultToCache(base64Content, processorResult);
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.warn("Error saving PDF to cache", {
|
||||||
|
error,
|
||||||
|
tempFilePath,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return processorResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapePDFWithParsePDF(
|
async function scrapePDFWithParsePDF(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user