Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git (synced 2025-08-19 06:45:53 +08:00)
feat: allow passing base64 encoded pdf
commit 080056e889
parent de50c93825
@@ -977,6 +977,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             return;
         }
 
+        if (crawlerOpts?.pdf) {
+            const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
+            const fakeSnapshot = {
+                href: urlToCrawl.toString(),
+                html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
+                title: '',
+                text: '',
+                pdfs: [pdfDataUrl],
+            } as PageSnapshot;
+
+            yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
+
+            return;
+        }
+
         let cache;
 
         const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
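Note: the base64 payload is URI-encoded when the data URL is built here and reversed later in PDFExtractor.extract() via atob(decodeURIComponent(...)). A minimal sketch of that round trip, with an illustrative sample payload that is not part of the commit:

// Sketch of the encode/decode round trip for the PDF payload (Node 18+ globals).
const pdfBase64 = Buffer.from('%PDF-1.4 sample').toString('base64');

// crawler side: wrap the URI-encoded base64 payload in a data URL
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(pdfBase64)}`;

// extractor side: strip the prefix, then undo the encoding to get a binary string
const payload = pdfDataUrl.slice(pdfDataUrl.indexOf(';base64,') + ';base64,'.length);
console.log(atob(decodeURIComponent(payload))); // '%PDF-1.4 sample'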
@@ -128,6 +128,11 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     html?: string;
 
+    @Prop({
+        desc: 'Base64 encoded PDF.',
+    })
+    pdf?: string;
+
     @Prop({
         default: 'default',
     })
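Note: a sketch of how a caller might exercise the new option, assuming the reader endpoint accepts CrawlerOptions fields as a JSON POST body. The endpoint URL and the accompanying url field are assumptions for illustration; only the pdf field itself comes from this commit.

// Hypothetical caller: base64-encode a local PDF and pass it via the new `pdf` option.
import { readFile } from 'fs/promises';

async function readLocalPdf(path: string): Promise<string> {
    const pdfBase64 = (await readFile(path)).toString('base64');
    const resp = await fetch('https://reader.example.com/', { // assumed endpoint, not from the commit
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            url: 'https://example.com/placeholder.pdf', // assumed: used for href / cache digest
            pdf: pdfBase64,                              // new field introduced by this commit
        }),
    });
    return resp.text();
}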
@@ -8,6 +8,7 @@ import { PDFContent } from '../db/pdf';
 import dayjs from 'dayjs';
 import { FirebaseStorageBucketControl } from '../shared';
 import { randomUUID } from 'crypto';
+import { PDFDocumentLoadingTask } from 'pdfjs-dist';
 const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
 dayjs.extend(utc); // Extend dayjs with the UTC plugin
 const timezone = require('dayjs/plugin/timezone');
@@ -62,12 +63,45 @@ export class PDFExtractor extends AsyncService {
         this.emit('ready');
     }
 
+    isDataUrl(url: string) {
+        return /^data:.+\/(.+);base64,(.*)$/.test(url);
+    }
+
+    parseDataUrl(url: string) {
+        const matches = url.match(/^data:.+\/(.+);base64,(.*)$/);
+        if (!matches || matches.length !== 3) {
+            throw new Error('Invalid data URL');
+        }
+
+        if (matches[1] !== 'pdf') {
+            throw new Error('Invalid data URL type');
+        }
+
+        return {
+            type: matches[1],
+            data: matches[2]
+        }
+    }
+
     async extract(url: string | URL) {
-        const loadingTask = this.pdfjs.getDocument({
+        let loadingTask: PDFDocumentLoadingTask;
+
+        if (typeof url === 'string' && this.isDataUrl(url)) {
+            const { data } = this.parseDataUrl(url);
+
+            loadingTask = this.pdfjs.getDocument({
+                data: atob(decodeURIComponent(data)),
+                disableFontFace: true,
+                verbosity: 0
+            });
+        } else {
+            loadingTask = this.pdfjs.getDocument({
             url,
             disableFontFace: true,
             verbosity: 0
         });
+        }
+
 
         const doc = await loadingTask.promise;
         const meta = await doc.getMetadata();
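Note: a standalone sketch of what the new helpers accept and return; the regex is copied from the commit, the sample payload is made up.

// Mirrors isDataUrl()/parseDataUrl(): capture group 1 is the subtype, group 2 the payload.
const DATA_URL_RE = /^data:.+\/(.+);base64,(.*)$/;

const sample = `data:application/pdf;base64,${encodeURIComponent(Buffer.from('%PDF-1.4').toString('base64'))}`;

console.log(DATA_URL_RE.test(sample));              // true  -> extract() takes the data branch
console.log(DATA_URL_RE.test('https://a.b/x.pdf')); // false -> plain URLs keep using getDocument({ url })

const matches = sample.match(DATA_URL_RE)!;
console.log(matches[1]);                            // 'pdf'; anything else throws 'Invalid data URL type'
console.log(atob(decodeURIComponent(matches[2])))   // '%PDF-1.4', the binary string passed to pdf.js as `data`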
@@ -237,6 +271,11 @@ export class PDFExtractor extends AsyncService {
 
         const digest = md5Hasher.hash(url.toString());
 
+        const data = url;
+        if (typeof url === 'string' && this.isDataUrl(url)) {
+            url = `dataurl://digest:${digest}`;
+        }
+
         const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
 
         if (cache) {
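Note: the digest is computed from the original URL (which may be an entire data URL) before the URL is swapped for a short placeholder, while data keeps the full payload for the extract(data) call in the next hunk. A sketch of that behavior, using Node's crypto as a stand-in for the repo's md5Hasher (an assumption):

import { createHash } from 'crypto';

let url = 'data:application/pdf;base64,JVBERi0xLjQ%3D'; // potentially megabytes long
const digest = createHash('md5').update(url).digest('hex'); // stand-in for md5Hasher.hash(url.toString())

const data = url;                        // the full data URL, still used for extraction
if (/^data:.+\/(.+);base64,(.*)$/.test(url)) {
    url = `dataurl://digest:${digest}`;  // short, stable identifier recorded instead of the raw payload
}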
@@ -275,7 +314,7 @@ export class PDFExtractor extends AsyncService {
         let extracted;
 
         try {
-            extracted = await this.extract(url);
+            extracted = await this.extract(data);
 
             const theID = randomUUID();
             await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,