feat: allow passing base64 encoded pdf

This commit is contained in:
Zhaofeng Miao 2024-08-22 14:56:09 +08:00
parent de50c93825
commit 080056e889
3 changed files with 66 additions and 6 deletions

View File

@ -977,6 +977,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return; return;
} }
if (crawlerOpts?.pdf) {
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
title: '',
text: '',
pdfs: [pdfDataUrl],
} as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
}
let cache; let cache;
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs; const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;

View File

@ -128,6 +128,11 @@ export class CrawlerOptions extends AutoCastable {
@Prop() @Prop()
html?: string; html?: string;
@Prop({
desc: 'Base64 encoded PDF.',
})
pdf?: string;
@Prop({ @Prop({
default: 'default', default: 'default',
}) })

View File

@ -8,6 +8,7 @@ import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs'; import dayjs from 'dayjs';
import { FirebaseStorageBucketControl } from '../shared'; import { FirebaseStorageBucketControl } from '../shared';
import { randomUUID } from 'crypto'; import { randomUUID } from 'crypto';
import { PDFDocumentLoadingTask } from 'pdfjs-dist';
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
dayjs.extend(utc); // Extend dayjs with the UTC plugin dayjs.extend(utc); // Extend dayjs with the UTC plugin
const timezone = require('dayjs/plugin/timezone'); const timezone = require('dayjs/plugin/timezone');
@ -62,12 +63,45 @@ export class PDFExtractor extends AsyncService {
this.emit('ready'); this.emit('ready');
} }
/**
 * Tells whether `url` has the shape of a base64 `data:` URL, i.e.
 * `data:<type>/<subtype>;base64,<payload>`.
 *
 * Only the overall shape is checked; the media type is not restricted
 * to PDF here and the payload is not validated as base64.
 *
 * @param url - Candidate string to inspect.
 * @returns `true` when the string matches the data URL pattern.
 */
isDataUrl(url: string) {
    const base64DataUrlPattern = /^data:.+\/(.+);base64,(.*)$/;
    const looksLikeDataUrl = base64DataUrlPattern.test(url);

    return looksLikeDataUrl;
}
/**
 * Splits a base64 PDF `data:` URL into its media subtype and its payload.
 *
 * The expected shape is `data:<type>/<subtype>[;param=value...];base64,<payload>`.
 * Optional media-type parameters after the subtype are tolerated and
 * ignored, and the subtype comparison is case-insensitive.
 *
 * @param url - The data URL to parse.
 * @returns `type` (always `'pdf'`) and `data`, the raw text following
 *          `;base64,` exactly as it appears in the URL (not decoded).
 * @throws Error 'Invalid data URL' when the string does not match the
 *         data URL pattern.
 * @throws Error 'Invalid data URL type' when the subtype is not `pdf`.
 */
parseDataUrl(url: string) {
    const matches = url.match(/^data:.+\/(.+);base64,(.*)$/);
    if (!matches) {
        throw new Error('Invalid data URL');
    }

    const [, rawSubtype = '', data = ''] = matches;

    // The capture group also swallows any media-type parameters that sit
    // between the subtype and ";base64" (e.g. "pdf;name=report.pdf"), so
    // strip them before comparing.
    const paramStart = rawSubtype.indexOf(';');
    const bareSubtype = paramStart === -1 ? rawSubtype : rawSubtype.slice(0, paramStart);
    const type = bareSubtype.toLowerCase();

    if (type !== 'pdf') {
        throw new Error('Invalid data URL type');
    }

    return {
        type,
        data
    };
}
async extract(url: string | URL) { async extract(url: string | URL) {
const loadingTask = this.pdfjs.getDocument({ let loadingTask: PDFDocumentLoadingTask;
url,
disableFontFace: true, if (typeof url === 'string' && this.isDataUrl(url)) {
verbosity: 0 const { data } = this.parseDataUrl(url);
});
loadingTask = this.pdfjs.getDocument({
data: atob(decodeURIComponent(data)),
disableFontFace: true,
verbosity: 0
});
} else {
loadingTask = this.pdfjs.getDocument({
url,
disableFontFace: true,
verbosity: 0
});
}
const doc = await loadingTask.promise; const doc = await loadingTask.promise;
const meta = await doc.getMetadata(); const meta = await doc.getMetadata();
@ -237,6 +271,11 @@ export class PDFExtractor extends AsyncService {
const digest = md5Hasher.hash(url.toString()); const digest = md5Hasher.hash(url.toString());
const data = url;
if (typeof url === 'string' && this.isDataUrl(url)) {
url = `dataurl://digest:${digest}`;
}
const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0]; const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
if (cache) { if (cache) {
@ -275,7 +314,7 @@ export class PDFExtractor extends AsyncService {
let extracted; let extracted;
try { try {
extracted = await this.extract(url); extracted = await this.extract(data);
const theID = randomUUID(); const theID = randomUUID();
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`, await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,