feat(scrapeURL, logJob): log pdf page count to db (FIR-2068) (#1587)

* feat(scrapeURL, logJob): log pdf page count to db

* devin stop the test littering pls
Gergő Móricz 2025-05-22 22:26:01 +02:00 committed by GitHub
parent cc2c968425
commit fd74299134
11 changed files with 19 additions and 112 deletions


@@ -1,106 +0,0 @@
-import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache';
-
-jest.mock('@google-cloud/storage', () => {
-  const mockSave = jest.fn().mockResolvedValue(undefined);
-  const mockExists = jest.fn().mockResolvedValue([true]);
-  const mockDownload = jest.fn().mockResolvedValue([Buffer.from(JSON.stringify({
-    markdown: 'cached markdown',
-    html: 'cached html'
-  }))]);
-  const mockFile = jest.fn().mockImplementation((path) => ({
-    save: mockSave,
-    exists: mockExists,
-    download: mockDownload
-  }));
-
-  return {
-    Storage: jest.fn().mockImplementation(() => ({
-      bucket: jest.fn().mockImplementation(() => ({
-        file: mockFile
-      }))
-    })),
-    _getMockFile: () => mockFile,
-    _getMockSave: () => mockSave
-  };
-});
-
-process.env.GCS_BUCKET_NAME = 'test-bucket';
-
-describe('PDF Caching', () => {
-  beforeEach(() => {
-    jest.clearAllMocks();
-  });
-
-  test('createPdfCacheKey generates consistent keys', () => {
-    const pdfContent1 = 'test-pdf-content';
-    const pdfContent2 = 'test-pdf-content';
-    const pdfContent3 = 'different-pdf-content';
-
-    const key1 = createPdfCacheKey(pdfContent1);
-    const key2 = createPdfCacheKey(pdfContent2);
-    const key3 = createPdfCacheKey(pdfContent3);
-
-    expect(key1).toBe(key2); // Same content should generate same key
-    expect(key1).not.toBe(key3); // Different content should generate different key
-    expect(key1).toMatch(/^[a-f0-9]{64}$/);
-  });
-
-  test('createPdfCacheKey works directly with base64 content', () => {
-    const base64Content = 'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy';
-
-    const key = createPdfCacheKey(base64Content);
-
-    expect(key).toMatch(/^[a-f0-9]{64}$/);
-    expect(createPdfCacheKey(base64Content)).toBe(key);
-  });
-
-  test('savePdfResultToCache saves results to GCS', async () => {
-    const pdfContent = 'test-pdf-content';
-    const result = { markdown: 'test markdown', html: 'test html' };
-
-    const { _getMockFile, _getMockSave } = require('@google-cloud/storage');
-    const mockFile = _getMockFile();
-    const mockSave = _getMockSave();
-    mockFile.mockClear();
-    mockSave.mockClear();
-
-    const cacheKey = await savePdfResultToCache(pdfContent, result);
-
-    expect(cacheKey).not.toBeNull();
-    expect(mockFile).toHaveBeenCalledWith(expect.stringContaining('pdf-cache/'));
-    expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), {
-      contentType: 'application/json',
-      metadata: expect.objectContaining({
-        source: 'runpod_pdf_conversion',
-        cache_type: 'pdf_markdown',
-        created_at: expect.any(String)
-      })
-    });
-  });
-
-  test('getPdfResultFromCache retrieves results from GCS', async () => {
-    const pdfContent = 'test-pdf-content';
-
-    const result = await getPdfResultFromCache(pdfContent);
-
-    expect(result).not.toBeNull();
-    expect(result?.markdown).toBe('cached markdown');
-    expect(result?.html).toBe('cached html');
-  });
-
-  test('getPdfResultFromCache returns null when cache miss', async () => {
-    const { Storage } = require('@google-cloud/storage');
-    const mockExists = Storage().bucket().file().exists;
-    mockExists.mockResolvedValueOnce([false]);
-
-    const result = await getPdfResultFromCache('uncached-content');
-
-    expect(result).toBeNull();
-  });
-});

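For context on what the deleted test asserted: createPdfCacheKey must be deterministic and emit a 64-character lowercase hex digest, which is the shape of a SHA-256 hash computed directly over the (possibly base64) PDF content. A minimal sketch consistent with those assertions; the real implementation lives in the gcs-pdf-cache module and may differ:

import { createHash } from "node:crypto";

// Sketch only: a deterministic key of 64 lowercase hex chars (a SHA-256
// digest), matching the deleted test's /^[a-f0-9]{64}$/ expectation.
export function createPdfCacheKey(pdfContent: string | Buffer): string {
  return createHash("sha256").update(pdfContent).digest("hex");
}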

@@ -6,7 +6,6 @@ import {
 } from "../../services/billing/credit_billing";
 import { authenticateUser } from "../auth";
 import { RateLimiterMode } from "../../types";
-import { logJob } from "../../services/logging/log_job";
 import {
   fromLegacyCombo,
   TeamFlags,


@@ -10,7 +10,6 @@ import {
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from "uuid";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
-import { logJob } from "../../services/logging/log_job";
 import { getJobPriority } from "../../lib/job-priority";
 import { getScrapeQueue } from "../../services/queue-service";
 import { supabaseGetJobById } from "../../lib/supabase-jobs";


@@ -749,6 +749,7 @@ export type Document = {
     statusCode: number;
     scrapeId?: string;
     error?: string;
+    numPages?: number;
     proxyUsed: "basic" | "stealth";
     // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
   };

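Since numPages is optional on Document metadata, it is only populated for PDF scrapes; API consumers should treat its absence as "not a PDF" rather than an error. A hypothetical consumer-side helper:

// Hypothetical helper: numPages is only set when the PDF engine handled the URL.
function describePageCount(metadata: { numPages?: number }): string {
  return metadata.numPages !== undefined
    ? `PDF with ${metadata.numPages} page(s)`
    : "not a PDF (or page count unavailable)";
}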

@@ -21,7 +21,7 @@ export function createPdfCacheKey(pdfContent: string | Buffer): string {
  */
 export async function savePdfResultToCache(
   pdfContent: string,
-  result: { markdown: string; html: string }
+  result: { markdown: string; html: string; numPages: number }
 ): Promise<string | null> {
   try {
     if (!process.env.GCS_BUCKET_NAME) {
@@ -76,7 +76,7 @@ export async function savePdfResultToCache(
  */
 export async function getPdfResultFromCache(
   pdfContent: string
-): Promise<{ markdown: string; html: string } | null> {
+): Promise<{ markdown: string; html: string; numPages: number } | null> {
   try {
     if (!process.env.GCS_BUCKET_NAME) {
       return null;
@@ -102,7 +102,10 @@ export async function getPdfResultFromCache(
       cacheKey,
     });
-    return result;
+    return {
+      ...result,
+      numPages: result.numPages ?? 1, // default to 1 page if cache is old
+    };
   } catch (error) {
     logger.error(`Error retrieving PDF RunPod result from GCS cache`, {
       error,

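The ?? 1 fallback exists because cache entries written before this change carry no numPages field; the read path backfills a conservative default instead of widening the declared return type to numPages?: number. A small self-contained illustration of the same pattern:

// Entries persisted by the old code lack numPages, so reads backfill it.
type LegacyCachedPdf = { markdown: string; html: string; numPages?: number };

function withPageCount(cached: LegacyCachedPdf): Required<LegacyCachedPdf> {
  return { ...cached, numPages: cached.numPages ?? 1 }; // assume 1 page for legacy entries
}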

@@ -109,6 +109,8 @@ export type EngineScrapeResult = {
       value: unknown
     }[];
   };
+
+  numPages?: number;
 };
 
 const engineHandlers: {


@@ -13,7 +13,7 @@ import path from "node:path";
 import type { Response } from "undici";
 import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
 
-type PDFProcessorResult = { html: string; markdown?: string };
+type PDFProcessorResult = { html: string; markdown?: string; numPages: number };
 
 const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
@@ -68,6 +68,7 @@ async function scrapePDFWithRunPodMU(
       schema: z.object({
         output: z.object({
           markdown: z.string(),
+          num_pages: z.number(),
         }),
       }),
       mock: meta.mock,
@@ -77,6 +78,7 @@ async function scrapePDFWithRunPodMU(
   const processorResult = {
     markdown: result.output.markdown,
     html: await marked.parse(result.output.markdown, { async: true }),
+    numPages: result.output.num_pages,
   };
 
   try {
@@ -103,6 +105,7 @@ async function scrapePDFWithParsePDF(
   return {
     markdown: escaped,
     html: escaped,
+    numPages: result.numpages,
   };
 }

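Both PDF paths now report a page count: the RunPod MU response schema requires num_pages, so a payload without it fails validation instead of silently losing the count, and the fallback path forwards numpages from its parser result (the lowercase property suggests the pdf-parse package, though that is an inference from the diff). A standalone sketch of the tightened response validation, assuming zod:

import { z } from "zod";

// Sketch of the stricter RunPod MU response shape: markdown and a
// numeric page count are both mandatory after this change.
const muOutputSchema = z.object({
  output: z.object({
    markdown: z.string(),
    num_pages: z.number(),
  }),
});

// e.g. muOutputSchema.parse({ output: { markdown: "# Doc", num_pages: 3 } })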

@@ -374,6 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
         url: result.result.url,
         statusCode: result.result.statusCode,
         error: result.result.error,
+        numPages: result.result.numPages,
         proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
       },
     };


@@ -103,6 +103,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
       tokens_billed: job.tokens_billed,
       is_migrated: true,
       cost_tracking: job.cost_tracking,
+      pdf_num_pages: job.pdf_num_pages ?? null,
     };
 
     // Send job to external server
@@ -183,6 +184,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
         retry: job.retry,
         tokens_billed: job.tokens_billed,
         cost_tracking: job.cost_tracking,
+        pdf_num_pages: job.pdf_num_pages,
       },
     };
 
     if (job.mode !== "single_urls") {

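Note the asymmetry between the two payloads: the first coalesces to null while the second passes the raw value through. Under JSON serialization that distinction matters, because undefined keys are dropped entirely while null keys survive as explicit NULLs:

// undefined keys disappear from JSON; null keys are kept.
JSON.stringify({ pdf_num_pages: undefined }); // => "{}"
JSON.stringify({ pdf_num_pages: null });      // => '{"pdf_num_pages":null}'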

@@ -1250,6 +1250,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           origin: job.data.origin,
           crawl_id: job.data.crawl_id,
           cost_tracking: costTracking,
+          pdf_num_pages: doc.metadata.numPages,
         },
         true,
       );
@@ -1370,6 +1371,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       origin: job.data.origin,
       num_tokens: 0, // TODO: fix
       cost_tracking: costTracking,
+      pdf_num_pages: doc.metadata.numPages,
     });
 
     indexJob(job, doc);

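End to end, the count travels engine result → document metadata → logJob → the pdf_num_pages column. A compressed sketch of that hand-off, with hypothetical placeholder values:

// Hypothetical values tracing the page count through the pipeline:
const engineResult = { numPages: 12 };                           // from the PDF engine
const doc = { metadata: { numPages: engineResult.numPages } };   // scrapeURLLoop copies it over
const logRow = { pdf_num_pages: doc.metadata.numPages ?? null }; // logJob persists it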

@@ -94,6 +94,7 @@ export interface FirecrawlJob {
   tokens_billed?: number;
   sources?: Record<string, string[]>;
   cost_tracking?: CostTracking;
+  pdf_num_pages?: number;
 }
 
 export interface FirecrawlScrapeResponse {
export interface FirecrawlScrapeResponse {