Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-18 11:35:53 +08:00

feat(scrapeURL, logJob): log pdf page count to db (FIR-2068) (#1587)

* feat(scrapeURL, logJob): log pdf page count to db
* devin stop the test littering pls

This commit is contained in:
parent cc2c968425
commit fd74299134
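
The change threads a page count from the PDF engines through scrapeURL's document metadata and into the job log (it also deletes a stray Jest spec, per the second commit-message bullet). A minimal sketch of the end-to-end flow, using the field names the diff introduces but with hypothetical, condensed types:

    // Condensed stand-ins for types that live across several files below.
    type PDFProcessorResult = { html: string; markdown?: string; numPages: number }; // PDF engine output
    type DocumentMetadata = { statusCode: number; error?: string; numPages?: number }; // scrapeURL document metadata

    // The queue worker forwards metadata.numPages to logJob, which persists it as pdf_num_pages.
    function toJobRow(meta: DocumentMetadata): { pdf_num_pages: number | null } {
      return { pdf_num_pages: meta.numPages ?? null };
    }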
@@ -1,106 +0,0 @@
-import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache';
-
-jest.mock('@google-cloud/storage', () => {
-  const mockSave = jest.fn().mockResolvedValue(undefined);
-  const mockExists = jest.fn().mockResolvedValue([true]);
-  const mockDownload = jest.fn().mockResolvedValue([Buffer.from(JSON.stringify({
-    markdown: 'cached markdown',
-    html: 'cached html'
-  }))]);
-  const mockFile = jest.fn().mockImplementation((path) => ({
-    save: mockSave,
-    exists: mockExists,
-    download: mockDownload
-  }));
-
-  return {
-    Storage: jest.fn().mockImplementation(() => ({
-      bucket: jest.fn().mockImplementation(() => ({
-        file: mockFile
-      }))
-    })),
-    _getMockFile: () => mockFile,
-    _getMockSave: () => mockSave
-  };
-});
-
-process.env.GCS_BUCKET_NAME = 'test-bucket';
-
-describe('PDF Caching', () => {
-  beforeEach(() => {
-    jest.clearAllMocks();
-  });
-
-  test('createPdfCacheKey generates consistent keys', () => {
-    const pdfContent1 = 'test-pdf-content';
-    const pdfContent2 = 'test-pdf-content';
-    const pdfContent3 = 'different-pdf-content';
-
-    const key1 = createPdfCacheKey(pdfContent1);
-    const key2 = createPdfCacheKey(pdfContent2);
-    const key3 = createPdfCacheKey(pdfContent3);
-
-    expect(key1).toBe(key2); // Same content should generate same key
-    expect(key1).not.toBe(key3); // Different content should generate different key
-
-    expect(key1).toMatch(/^[a-f0-9]{64}$/);
-  });
-
-  test('createPdfCacheKey works directly with base64 content', () => {
-    const base64Content = 'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy';
-
-    const key = createPdfCacheKey(base64Content);
-
-    expect(key).toMatch(/^[a-f0-9]{64}$/);
-
-    expect(createPdfCacheKey(base64Content)).toBe(key);
-  });
-
-  test('savePdfResultToCache saves results to GCS', async () => {
-    const pdfContent = 'test-pdf-content';
-    const result = { markdown: 'test markdown', html: 'test html' };
-
-    const { _getMockFile, _getMockSave } = require('@google-cloud/storage');
-    const mockFile = _getMockFile();
-    const mockSave = _getMockSave();
-
-    mockFile.mockClear();
-    mockSave.mockClear();
-
-    const cacheKey = await savePdfResultToCache(pdfContent, result);
-
-    expect(cacheKey).not.toBeNull();
-
-    expect(mockFile).toHaveBeenCalledWith(expect.stringContaining('pdf-cache/'));
-
-    expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), {
-      contentType: 'application/json',
-      metadata: expect.objectContaining({
-        source: 'runpod_pdf_conversion',
-        cache_type: 'pdf_markdown',
-        created_at: expect.any(String)
-      })
-    });
-  });
-
-  test('getPdfResultFromCache retrieves results from GCS', async () => {
-    const pdfContent = 'test-pdf-content';
-
-    const result = await getPdfResultFromCache(pdfContent);
-
-    expect(result).not.toBeNull();
-    expect(result?.markdown).toBe('cached markdown');
-    expect(result?.html).toBe('cached html');
-  });
-
-  test('getPdfResultFromCache returns null when cache miss', async () => {
-    const { Storage } = require('@google-cloud/storage');
-    const mockExists = Storage().bucket().file().exists;
-    mockExists.mockResolvedValueOnce([false]);
-
-    const result = await getPdfResultFromCache('uncached-content');
-
-    expect(result).toBeNull();
-  });
-});
@@ -6,7 +6,6 @@ import {
 } from "../../services/billing/credit_billing";
 import { authenticateUser } from "../auth";
 import { RateLimiterMode } from "../../types";
-import { logJob } from "../../services/logging/log_job";
 import {
   fromLegacyCombo,
   TeamFlags,
@@ -10,7 +10,6 @@ import {
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from "uuid";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
-import { logJob } from "../../services/logging/log_job";
 import { getJobPriority } from "../../lib/job-priority";
 import { getScrapeQueue } from "../../services/queue-service";
 import { supabaseGetJobById } from "../../lib/supabase-jobs";
@@ -749,6 +749,7 @@ export type Document = {
     statusCode: number;
     scrapeId?: string;
     error?: string;
+    numPages?: number;
     proxyUsed: "basic" | "stealth";
     // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
   };
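
Downstream, API consumers can read the count off the document metadata. A hypothetical usage sketch; the /v1/scrape endpoint and { success, data } envelope are assumed from Firecrawl's public API, and only metadata.numPages comes from this diff:

    const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
      method: "POST",
      headers: { "Content-Type": "application/json", Authorization: "Bearer fc-..." },
      body: JSON.stringify({ url: "https://example.com/report.pdf" }),
    });
    const { data } = await res.json();
    if (data.metadata.numPages !== undefined) {
      // Only PDF scrapes populate numPages; HTML pages leave it undefined.
      console.log(`parsed ${data.metadata.numPages} page(s)`);
    }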
@@ -21,7 +21,7 @@ export function createPdfCacheKey(pdfContent: string | Buffer): string {
  */
 export async function savePdfResultToCache(
   pdfContent: string,
-  result: { markdown: string; html: string }
+  result: { markdown: string; html: string; numPages: number }
 ): Promise<string | null> {
   try {
     if (!process.env.GCS_BUCKET_NAME) {
@@ -76,7 +76,7 @@ export async function savePdfResultToCache(
  */
 export async function getPdfResultFromCache(
   pdfContent: string
-): Promise<{ markdown: string; html: string } | null> {
+): Promise<{ markdown: string; html: string; numPages: number } | null> {
   try {
     if (!process.env.GCS_BUCKET_NAME) {
       return null;
@@ -102,7 +102,10 @@ export async function getPdfResultFromCache(
       cacheKey,
     });
 
-    return result;
+    return {
+      ...result,
+      numPages: result.numPages ?? 1, // default to 1 page if cache is old
+    };
   } catch (error) {
     logger.error(`Error retrieving PDF RunPod result from GCS cache`, {
       error,
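
Cache entries written before this commit have no numPages field, so the read path backfills a default instead of returning an object that violates the new return type. A self-contained illustration of that backfill:

    // A stale entry persisted before this commit (numPages absent).
    const stale: { markdown: string; html: string; numPages?: number } = {
      markdown: "cached markdown",
      html: "cached html",
    };

    // The same nullish-coalescing backfill as in the diff above.
    const fresh = { ...stale, numPages: stale.numPages ?? 1 };
    console.log(fresh.numPages); // 1: old entries are treated as one-page PDFs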
@@ -109,6 +109,8 @@ export type EngineScrapeResult = {
       value: unknown
     }[];
   };
+
+  numPages?: number;
 };
 
 const engineHandlers: {
@@ -13,7 +13,7 @@ import path from "node:path";
 import type { Response } from "undici";
 import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
 
-type PDFProcessorResult = { html: string; markdown?: string };
+type PDFProcessorResult = { html: string; markdown?: string; numPages: number };
 
 const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
 
@@ -68,6 +68,7 @@ async function scrapePDFWithRunPodMU(
       schema: z.object({
         output: z.object({
           markdown: z.string(),
+          num_pages: z.number(),
         }),
       }),
       mock: meta.mock,
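
Because the RunPod MU response is validated with zod, a payload missing num_pages now fails parsing instead of silently yielding an undefined count. A standalone sketch of that schema; only markdown and num_pages are taken from the diff, and the sample payload is made up:

    import { z } from "zod";

    const muResponse = z.object({
      output: z.object({
        markdown: z.string(),
        num_pages: z.number(),
      }),
    });

    // Throws a ZodError if RunPod ever stops returning num_pages.
    const parsed = muResponse.parse({ output: { markdown: "# Title", num_pages: 3 } });
    console.log(parsed.output.num_pages); // 3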
@@ -77,6 +78,7 @@ async function scrapePDFWithRunPodMU(
   const processorResult = {
     markdown: result.output.markdown,
     html: await marked.parse(result.output.markdown, { async: true }),
+    numPages: result.output.num_pages,
   };
 
   try {
@@ -103,6 +105,7 @@ async function scrapePDFWithParsePDF(
   return {
     markdown: escaped,
     html: escaped,
+    numPages: result.numpages,
   };
 }
 
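The fallback engine's count comes from pdf-parse, which exposes the page total as the lowercase numpages property on its result; the hunk above maps it onto the camelCase numPages field. A hedged usage sketch (the file path is illustrative):

    import { readFile } from "node:fs/promises";
    import pdf from "pdf-parse";

    const data = await pdf(await readFile("sample.pdf"));
    console.log(data.numpages); // total page count reported by pdf-parse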
@@ -374,6 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       url: result.result.url,
       statusCode: result.result.statusCode,
       error: result.result.error,
+      numPages: result.result.numPages,
       proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
     },
   };
@@ -103,6 +103,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
     tokens_billed: job.tokens_billed,
     is_migrated: true,
     cost_tracking: job.cost_tracking,
+    pdf_num_pages: job.pdf_num_pages ?? null,
   };
 
   // Send job to external server
@@ -183,6 +184,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
       retry: job.retry,
       tokens_billed: job.tokens_billed,
       cost_tracking: job.cost_tracking,
+      pdf_num_pages: job.pdf_num_pages,
     },
   };
   if (job.mode !== "single_urls") {
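
Note the asymmetry between the two logJob blocks: the database row coalesces to null so non-PDF jobs store an explicit NULL, while this second, external payload passes the possibly-undefined value through, and undefined keys simply drop out of the serialized JSON. A tiny illustration:

    const job: { pdf_num_pages?: number } = {}; // a non-PDF job
    console.log(JSON.stringify({ pdf_num_pages: job.pdf_num_pages ?? null })); // {"pdf_num_pages":null}
    console.log(JSON.stringify({ pdf_num_pages: job.pdf_num_pages }));         // {}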
@@ -1250,6 +1250,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         origin: job.data.origin,
         crawl_id: job.data.crawl_id,
         cost_tracking: costTracking,
+        pdf_num_pages: doc.metadata.numPages,
       },
       true,
     );
@@ -1370,6 +1371,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       origin: job.data.origin,
       num_tokens: 0, // TODO: fix
       cost_tracking: costTracking,
+      pdf_num_pages: doc.metadata.numPages,
     });
 
     indexJob(job, doc);
@@ -94,6 +94,7 @@ export interface FirecrawlJob {
   tokens_billed?: number;
   sources?: Record<string, string[]>;
   cost_tracking?: CostTracking;
+  pdf_num_pages?: number;
 }
 
 export interface FirecrawlScrapeResponse {