mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 00:25:58 +08:00
feat(scrapeURL/pdf): bill n credits per page
This commit is contained in:
parent
cee481a3a9
commit
d0d0845a66
@@ -153,6 +153,9 @@ export async function scrapeController(
|
||||
} else {
|
||||
creditsToBeBilled = 150;
|
||||
}
|
||||
+} else if (doc.metadata.numPages !== undefined && doc.metadata.numPages > 1) {
|
||||
+const creditsPerPDFPage = 5;
|
||||
+creditsToBeBilled = creditsPerPDFPage * doc.metadata.numPages;
|
||||
}
|
||||
|
||||
if (req.body.proxy === "stealth") {
|
||||
|
@@ -738,6 +738,7 @@ export type Document = {
|
||||
statusCode: number;
|
||||
scrapeId?: string;
|
||||
error?: string;
|
||||
+numPages?: number;
|
||||
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
||||
};
|
||||
serpResults?: {
|
||||
|
@@ -101,6 +101,8 @@ export type EngineScrapeResult = {
|
||||
value: unknown
|
||||
}[];
|
||||
};
|
||||
|
||||
+numPages?: number;
|
||||
};
|
||||
|
||||
const engineHandlers: {
|
||||
|
@@ -12,7 +12,7 @@ import { readFile, unlink } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import type { Response } from "undici";
|
||||
|
||||
-type PDFProcessorResult = { html: string; markdown?: string };
|
||||
+type PDFProcessorResult = { html: string; markdown?: string; numPages: number };
|
||||
|
||||
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
|
||||
|
||||
@@ -45,6 +45,7 @@ async function scrapePDFWithRunPodMU(
|
||||
schema: z.object({
|
||||
output: z.object({
|
||||
markdown: z.string(),
|
||||
+num_pages: z.number(),
|
||||
}),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
@@ -53,6 +54,7 @@ async function scrapePDFWithRunPodMU(
|
||||
return {
|
||||
markdown: result.output.markdown,
|
||||
html: await marked.parse(result.output.markdown, { async: true }),
|
||||
+numPages: result.output.num_pages,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -68,6 +70,7 @@ async function scrapePDFWithParsePDF(
|
||||
return {
|
||||
markdown: escaped,
|
||||
html: escaped,
|
||||
+numPages: result.numpages,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -172,7 +175,8 @@ export async function scrapePDF(
|
||||
return {
|
||||
url: response.url ?? meta.url,
|
||||
statusCode: response.status,
|
||||
-html: result?.html ?? "",
|
||||
-markdown: result?.markdown ?? "",
|
||||
+html: result.html ?? "",
|
||||
+markdown: result.markdown ?? "",
|
||||
+numPages: result.numPages,
|
||||
};
|
||||
}
|
||||
|
@@ -368,6 +368,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
url: result.result.url,
|
||||
statusCode: result.result.statusCode,
|
||||
error: result.result.error,
|
||||
+numPages: result.result.numPages,
|
||||
},
|
||||
};
|
||||
|
||||
|
@@ -1365,6 +1365,9 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
} else {
|
||||
creditsToBeBilled = 150;
|
||||
}
|
||||
+} else if (doc.metadata.numPages !== undefined && doc.metadata.numPages > 1) {
|
||||
+const creditsPerPDFPage = 5;
|
||||
+creditsToBeBilled = creditsPerPDFPage * doc.metadata.numPages;
|
||||
}
|
||||
|
||||
if (job.data.scrapeOptions.proxy === "stealth") {
|
||||
|
Loading…
x
Reference in New Issue
Block a user