feat: allow passing base64 encoded pdf

2025-08-19 01:55:59 +08:00 · 2024-08-22 14:56:09 +08:00 · 2024-08-22 14:56:09 +08:00 · 080056e889
commit 080056e889
parent de50c93825
3 changed files with 66 additions and 6 deletions
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@ -977,6 +977,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;

            return;
        }
+
+        if (crawlerOpts?.pdf) {
+            const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
+            const fakeSnapshot = {
+                href: urlToCrawl.toString(),
+                html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
+                title: '',
+                text: '',
+                pdfs: [pdfDataUrl],
+            } as PageSnapshot;
+
+            yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
+
+            return;
+        }
+
        let cache;

        const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@ -128,6 +128,11 @@ export class CrawlerOptions extends AutoCastable {
    @Prop()
    html?: string;

+    // Optional PDF content passed in directly as a Base64-encoded string.
+    @Prop({
+        desc: 'Base64 encoded PDF.',
+    })
+    pdf?: string;
+
    @Prop({
        default: 'default',
    })
--- a/backend/functions/src/services/pdf-extract.ts
+++ b/backend/functions/src/services/pdf-extract.ts
@ -8,6 +8,7 @@ import { PDFContent } from '../db/pdf';
 import dayjs from 'dayjs';
 import { FirebaseStorageBucketControl } from '../shared';
 import { randomUUID } from 'crypto';
+import { PDFDocumentLoadingTask } from 'pdfjs-dist';
 const utc = require('dayjs/plugin/utc');  // Import the UTC plugin
 dayjs.extend(utc);  // Extend dayjs with the UTC plugin
 const timezone = require('dayjs/plugin/timezone');
@ -62,12 +63,45 @@ export class PDFExtractor extends AsyncService {
        this.emit('ready');
    }

+    /**
+     * Tells whether the given string has the shape of a base64 `data:` URL,
+     * i.e. `data:<something>/<something>;base64,<payload>`.
+     *
+     * Only the shape is checked; the media type is not restricted here.
+     *
+     * @param url - Candidate string to inspect.
+     * @returns `true` when the string matches the base64 data URL pattern.
+     */
+    isDataUrl(url: string) {
+        const base64DataUrlPattern = /^data:.+\/(.+);base64,(.*)$/;
+
+        return url.match(base64DataUrlPattern) !== null;
+    }
+
+    /**
+     * Splits a base64 PDF data URL into its subtype and its raw payload.
+     *
+     * The header (everything before the first comma) must start with `data:`
+     * and end with `;base64`. Media-type parameters between the MIME type and
+     * the `;base64` flag (e.g. `;name=file.pdf`) are tolerated, and the
+     * subtype is compared case-insensitively.
+     *
+     * @param url - The data URL to parse.
+     * @returns An object with `type` (always `'pdf'`) and `data`, the text
+     *          following the first comma, returned without any decoding.
+     * @throws Error `'Invalid data URL'` when the string is not a base64 data
+     *         URL with a `type/subtype` media type.
+     * @throws Error `'Invalid data URL type'` when the subtype is not `pdf`.
+     */
+    parseDataUrl(url: string) {
+        const scheme = 'data:';
+        const base64Flag = ';base64';
+
+        // Plain slicing instead of a greedy regex: the payload may be very
+        // large, and only the short header needs to be inspected.
+        const commaAt = url.indexOf(',');
+        const header = commaAt === -1 ? '' : url.slice(0, commaAt);
+        if (!header.startsWith(scheme) || !header.endsWith(base64Flag)) {
+            throw new Error('Invalid data URL');
+        }
+
+        // Drop optional media-type parameters, keeping only `type/subtype`.
+        const mediaType = header.slice(scheme.length, header.length - base64Flag.length);
+        const paramsAt = mediaType.indexOf(';');
+        const essence = paramsAt === -1 ? mediaType : mediaType.slice(0, paramsAt);
+
+        const slashAt = essence.indexOf('/');
+        if (slashAt <= 0 || slashAt === essence.length - 1) {
+            throw new Error('Invalid data URL');
+        }
+
+        // MIME types are case-insensitive, so normalise before comparing.
+        const subtype = essence.slice(slashAt + 1).toLowerCase();
+        if (subtype !== 'pdf') {
+            throw new Error('Invalid data URL type');
+        }
+
+        return {
+            type: subtype,
+            data: url.slice(commaAt + 1),
+        };
+    }
+
    async extract(url: string | URL) {
-        const loadingTask = this.pdfjs.getDocument({
+        let loadingTask: PDFDocumentLoadingTask;
+
+        if (typeof url === 'string' && this.isDataUrl(url)) {
+            const { data } = this.parseDataUrl(url);
+
+            loadingTask = this.pdfjs.getDocument({
+                data: atob(decodeURIComponent(data)),
+                disableFontFace: true,
+                verbosity: 0
+            });
+        } else {
+            loadingTask = this.pdfjs.getDocument({
                url,
                disableFontFace: true,
                verbosity: 0
            });
+        }
+

        const doc = await loadingTask.promise;
        const meta = await doc.getMetadata();
@ -237,6 +271,11 @@ export class PDFExtractor extends AsyncService {

        const digest = md5Hasher.hash(url.toString());

+        const data = url;
+        if (typeof url === 'string' && this.isDataUrl(url)) {
+            url = `dataurl://digest:${digest}`;
+        }
+
        const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

        if (cache) {
@ -275,7 +314,7 @@ export class PDFExtractor extends AsyncService {
        let extracted;

        try {
-            extracted = await this.extract(url);
+            extracted = await this.extract(data);

            const theID = randomUUID();
            await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,