diff --git a/apps/api/src/__tests__/snips/pdf-cache.test.ts b/apps/api/src/__tests__/snips/pdf-cache.test.ts deleted file mode 100644 index e60b5f07..00000000 --- a/apps/api/src/__tests__/snips/pdf-cache.test.ts +++ /dev/null @@ -1,106 +0,0 @@ -import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache'; - -jest.mock('@google-cloud/storage', () => { - const mockSave = jest.fn().mockResolvedValue(undefined); - const mockExists = jest.fn().mockResolvedValue([true]); - const mockDownload = jest.fn().mockResolvedValue([Buffer.from(JSON.stringify({ - markdown: 'cached markdown', - html: 'cached html' - }))]); - const mockFile = jest.fn().mockImplementation((path) => ({ - save: mockSave, - exists: mockExists, - download: mockDownload - })); - - return { - Storage: jest.fn().mockImplementation(() => ({ - bucket: jest.fn().mockImplementation(() => ({ - file: mockFile - })) - })), - _getMockFile: () => mockFile, - _getMockSave: () => mockSave - }; -}); - -process.env.GCS_BUCKET_NAME = 'test-bucket'; - -describe('PDF Caching', () => { - beforeEach(() => { - jest.clearAllMocks(); - }); - - test('createPdfCacheKey generates consistent keys', () => { - const pdfContent1 = 'test-pdf-content'; - const pdfContent2 = 'test-pdf-content'; - const pdfContent3 = 'different-pdf-content'; - - const key1 = createPdfCacheKey(pdfContent1); - const key2 = createPdfCacheKey(pdfContent2); - const key3 = createPdfCacheKey(pdfContent3); - - expect(key1).toBe(key2); // Same content should generate same key - expect(key1).not.toBe(key3); // Different content should generate different key - - expect(key1).toMatch(/^[a-f0-9]{64}$/); - }); - - test('createPdfCacheKey works directly with base64 content', () => { - const base64Content = 'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy'; - - const key = createPdfCacheKey(base64Content); - - expect(key).toMatch(/^[a-f0-9]{64}$/); - - expect(createPdfCacheKey(base64Content)).toBe(key); - - }); - - test('savePdfResultToCache saves results to GCS', async () => { - const pdfContent = 'test-pdf-content'; - const result = { markdown: 'test markdown', html: 'test html' }; - - const { _getMockFile, _getMockSave } = require('@google-cloud/storage'); - const mockFile = _getMockFile(); - const mockSave = _getMockSave(); - - mockFile.mockClear(); - mockSave.mockClear(); - - const cacheKey = await savePdfResultToCache(pdfContent, result); - - expect(cacheKey).not.toBeNull(); - - expect(mockFile).toHaveBeenCalledWith(expect.stringContaining('pdf-cache/')); - - expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), { - contentType: 'application/json', - metadata: expect.objectContaining({ - source: 'runpod_pdf_conversion', - cache_type: 'pdf_markdown', - created_at: expect.any(String) - }) - }); - }); - - test('getPdfResultFromCache retrieves results from GCS', async () => { - const pdfContent = 'test-pdf-content'; - - const result = await getPdfResultFromCache(pdfContent); - - expect(result).not.toBeNull(); - expect(result?.markdown).toBe('cached markdown'); - expect(result?.html).toBe('cached html'); - }); - - test('getPdfResultFromCache returns null when cache miss', async () => { - const { Storage } = require('@google-cloud/storage'); - const mockExists = Storage().bucket().file().exists; - mockExists.mockResolvedValueOnce([false]); - - const result = await getPdfResultFromCache('uncached-content'); - - expect(result).toBeNull(); - }); -}); diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 98a35654..8eb38f81 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -6,7 +6,6 @@ import { } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; import { RateLimiterMode } from "../../types"; -import { logJob } from "../../services/logging/log_job"; import { fromLegacyCombo, TeamFlags, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ea43ead6..8da72dae 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -10,7 +10,6 @@ import { import { billTeam } from "../../services/billing/credit_billing"; import { v4 as uuidv4 } from "uuid"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; -import { logJob } from "../../services/logging/log_job"; import { getJobPriority } from "../../lib/job-priority"; import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById } from "../../lib/supabase-jobs"; diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 020cd563..908a51d3 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -749,6 +749,7 @@ export type Document = { statusCode: number; scrapeId?: string; error?: string; + numPages?: number; proxyUsed: "basic" | "stealth"; // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; }; diff --git a/apps/api/src/lib/gcs-pdf-cache.ts b/apps/api/src/lib/gcs-pdf-cache.ts index 90348974..90eaa67c 100644 --- a/apps/api/src/lib/gcs-pdf-cache.ts +++ b/apps/api/src/lib/gcs-pdf-cache.ts @@ -21,7 +21,7 @@ export function createPdfCacheKey(pdfContent: string | Buffer): string { */ export async function savePdfResultToCache( pdfContent: string, - result: { markdown: string; html: string } + result: { markdown: string; html: string; numPages: number } ): Promise { try { if (!process.env.GCS_BUCKET_NAME) { @@ -76,7 +76,7 @@ export async function savePdfResultToCache( */ export async function getPdfResultFromCache( pdfContent: string -): Promise<{ markdown: string; html: string } | null> { +): Promise<{ markdown: string; html: string; numPages: number } | null> { try { if (!process.env.GCS_BUCKET_NAME) { return null; @@ -102,7 +102,10 @@ export async function getPdfResultFromCache( cacheKey, }); - return result; + return { + ...result, + numPages: result.numPages ?? 1, // default to 1 page if cache is old + }; } catch (error) { logger.error(`Error retrieving PDF RunPod result from GCS cache`, { error, diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 08fb4a3b..b59313e3 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -109,6 +109,8 @@ export type EngineScrapeResult = { value: unknown }[]; }; + + numPages?: number; }; const engineHandlers: { diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index ca95cb32..d66eb3f5 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -13,7 +13,7 @@ import path from "node:path"; import type { Response } from "undici"; import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache"; -type PDFProcessorResult = { html: string; markdown?: string }; +type PDFProcessorResult = { html: string; markdown?: string; numPages: number }; const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB @@ -68,6 +68,7 @@ async function scrapePDFWithRunPodMU( schema: z.object({ output: z.object({ markdown: z.string(), + num_pages: z.number(), }), }), mock: meta.mock, @@ -77,6 +78,7 @@ async function scrapePDFWithRunPodMU( const processorResult = { markdown: result.output.markdown, html: await marked.parse(result.output.markdown, { async: true }), + numPages: result.output.num_pages, }; try { @@ -103,6 +105,7 @@ async function scrapePDFWithParsePDF( return { markdown: escaped, html: escaped, + numPages: result.numpages, }; } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index d5d5a680..dbe5867c 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -374,6 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise { url: result.result.url, statusCode: result.result.statusCode, error: result.result.error, + numPages: result.result.numPages, proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic", }, }; diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 7149709b..a1e0a8d5 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -103,6 +103,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { tokens_billed: job.tokens_billed, is_migrated: true, cost_tracking: job.cost_tracking, + pdf_num_pages: job.pdf_num_pages ?? null, }; // Send job to external server @@ -183,6 +184,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { retry: job.retry, tokens_billed: job.tokens_billed, cost_tracking: job.cost_tracking, + pdf_num_pages: job.pdf_num_pages, }, }; if (job.mode !== "single_urls") { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 3b4afa29..79ab6c69 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -1250,6 +1250,7 @@ async function processJob(job: Job & { id: string }, token: string) { origin: job.data.origin, crawl_id: job.data.crawl_id, cost_tracking: costTracking, + pdf_num_pages: doc.metadata.numPages, }, true, ); @@ -1370,6 +1371,7 @@ async function processJob(job: Job & { id: string }, token: string) { origin: job.data.origin, num_tokens: 0, // TODO: fix cost_tracking: costTracking, + pdf_num_pages: doc.metadata.numPages, }); indexJob(job, doc); diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 34e2f60f..7b2cc640 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -94,6 +94,7 @@ export interface FirecrawlJob { tokens_billed?: number; sources?: Record; cost_tracking?: CostTracking; + pdf_num_pages?: number; } export interface FirecrawlScrapeResponse {