From d0d0845a6698787bf498200c5addfb848241527f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 14 May 2025 23:56:00 +0200 Subject: [PATCH] feat(scrapeURL/pdf): bill n credits per page --- apps/api/src/controllers/v1/scrape.ts | 3 +++ apps/api/src/controllers/v1/types.ts | 1 + apps/api/src/scraper/scrapeURL/engines/index.ts | 2 ++ apps/api/src/scraper/scrapeURL/engines/pdf/index.ts | 10 +++++++--- apps/api/src/scraper/scrapeURL/index.ts | 1 + apps/api/src/services/queue-worker.ts | 3 +++ 6 files changed, 17 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index d492156f..3edb107a 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -153,6 +153,9 @@ export async function scrapeController( } else { creditsToBeBilled = 150; } + } else if (doc.metadata.numPages !== undefined && doc.metadata.numPages > 1) { + const creditsPerPDFPage = 5; + creditsToBeBilled = creditsPerPDFPage * doc.metadata.numPages; } if (req.body.proxy === "stealth") { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 1525640a..1654d299 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -738,6 +738,7 @@ export type Document = { statusCode: number; scrapeId?: string; error?: string; + numPages?: number; // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; }; serpResults?: { diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 6a2a2e40..235ff17c 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -101,6 +101,8 @@ export type EngineScrapeResult = { value: unknown }[]; }; + + numPages?: number; }; const engineHandlers: { diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 727e12c9..944f384c 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -12,7 +12,7 @@ import { readFile, unlink } from "node:fs/promises"; import path from "node:path"; import type { Response } from "undici"; -type PDFProcessorResult = { html: string; markdown?: string }; +type PDFProcessorResult = { html: string; markdown?: string; numPages: number }; const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB @@ -45,6 +45,7 @@ async function scrapePDFWithRunPodMU( schema: z.object({ output: z.object({ markdown: z.string(), + num_pages: z.number(), }), }), mock: meta.mock, @@ -53,6 +54,7 @@ async function scrapePDFWithRunPodMU( return { markdown: result.output.markdown, html: await marked.parse(result.output.markdown, { async: true }), + numPages: result.output.num_pages, }; } @@ -68,6 +70,7 @@ async function scrapePDFWithParsePDF( return { markdown: escaped, html: escaped, + numPages: result.numpages, }; } @@ -172,7 +175,8 @@ export async function scrapePDF( return { url: response.url ?? meta.url, statusCode: response.status, - html: result?.html ?? "", - markdown: result?.markdown ?? "", + html: result.html ?? "", + markdown: result.markdown ?? "", + numPages: result.numPages, }; } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index baf29752..5f604dbf 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -368,6 +368,7 @@ async function scrapeURLLoop(meta: Meta): Promise { url: result.result.url, statusCode: result.result.statusCode, error: result.result.error, + numPages: result.result.numPages, }, }; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 2a1e47f7..21e4f3b3 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -1365,6 +1365,9 @@ async function processJob(job: Job & { id: string }, token: string) { } else { creditsToBeBilled = 150; } + } else if (doc.metadata.numPages !== undefined && doc.metadata.numPages > 1) { + const creditsPerPDFPage = 5; + creditsToBeBilled = creditsPerPDFPage * doc.metadata.numPages; } if (job.data.scrapeOptions.proxy === "stealth") {