mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 01:55:59 +08:00
feat: allow passing base64 encoded pdf
This commit is contained in:
parent
de50c93825
commit
080056e889
@ -977,6 +977,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawlerOpts?.pdf) {
|
||||
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
|
||||
const fakeSnapshot = {
|
||||
href: urlToCrawl.toString(),
|
||||
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
||||
title: '',
|
||||
text: '',
|
||||
pdfs: [pdfDataUrl],
|
||||
} as PageSnapshot;
|
||||
|
||||
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
let cache;
|
||||
|
||||
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
|
||||
|
@ -128,6 +128,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
html?: string;
|
||||
|
||||
@Prop({
|
||||
desc: 'Base64 encoded PDF.',
|
||||
})
|
||||
pdf?: string;
|
||||
|
||||
@Prop({
|
||||
default: 'default',
|
||||
})
|
||||
|
@ -8,6 +8,7 @@ import { PDFContent } from '../db/pdf';
|
||||
import dayjs from 'dayjs';
|
||||
import { FirebaseStorageBucketControl } from '../shared';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
||||
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
|
||||
dayjs.extend(utc); // Extend dayjs with the UTC plugin
|
||||
const timezone = require('dayjs/plugin/timezone');
|
||||
@ -62,12 +63,45 @@ export class PDFExtractor extends AsyncService {
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
/**
 * Report whether `url` has the shape of a base64 `data:` URL, i.e.
 * `data:<type>/<subtype>;base64,<payload>`.
 *
 * Only the shape is checked: any media type passes, not just PDF, and the
 * payload is not validated as base64.
 *
 * @param url - String to inspect.
 * @returns `true` when the whole string matches the pattern, `false` otherwise.
 */
isDataUrl(url: string) {
    const base64DataUrlPattern = /^data:.+\/(.+);base64,(.*)$/;

    return base64DataUrlPattern.test(url);
}
|
||||
|
||||
/**
 * Split a base64 `data:` URL into its media subtype and its payload.
 *
 * Only PDF payloads are accepted. Media-type parameters that follow the
 * subtype (e.g. `;charset=utf-8`) are ignored and the subtype is compared
 * case-insensitively, so `data:application/PDF;charset=utf-8;base64,...`
 * is treated the same as `data:application/pdf;base64,...`.
 *
 * @param url - Candidate data URL.
 * @returns `type` (always `'pdf'`) and `data`, the text following `;base64,`
 *          exactly as it appears in `url` (nothing is decoded here).
 * @throws Error `Invalid data URL` when `url` is not a base64 data URL.
 * @throws Error `Invalid data URL type` when the media subtype is not `pdf`.
 */
parseDataUrl(url: string) {
    const matches = /^data:.+\/(.+);base64,(.*)$/.exec(url);
    if (!matches) {
        throw new Error('Invalid data URL');
    }

    const [, rawSubtype = '', data = ''] = matches;

    // The capture may carry ";key=value" media-type parameters; only the part
    // before the first ';' is the subtype, and media types are case-insensitive.
    const type = rawSubtype.split(';', 1)[0].trim().toLowerCase();
    if (type !== 'pdf') {
        throw new Error('Invalid data URL type');
    }

    return {
        type,
        data
    };
}
|
||||
|
||||
async extract(url: string | URL) {
|
||||
const loadingTask = this.pdfjs.getDocument({
|
||||
let loadingTask: PDFDocumentLoadingTask;
|
||||
|
||||
if (typeof url === 'string' && this.isDataUrl(url)) {
|
||||
const { data } = this.parseDataUrl(url);
|
||||
|
||||
loadingTask = this.pdfjs.getDocument({
|
||||
data: atob(decodeURIComponent(data)),
|
||||
disableFontFace: true,
|
||||
verbosity: 0
|
||||
});
|
||||
} else {
|
||||
loadingTask = this.pdfjs.getDocument({
|
||||
url,
|
||||
disableFontFace: true,
|
||||
verbosity: 0
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
const doc = await loadingTask.promise;
|
||||
const meta = await doc.getMetadata();
|
||||
@ -237,6 +271,11 @@ export class PDFExtractor extends AsyncService {
|
||||
|
||||
const digest = md5Hasher.hash(url.toString());
|
||||
|
||||
const data = url;
|
||||
if (typeof url === 'string' && this.isDataUrl(url)) {
|
||||
url = `dataurl://digest:${digest}`;
|
||||
}
|
||||
|
||||
const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
|
||||
|
||||
if (cache) {
|
||||
@ -275,7 +314,7 @@ export class PDFExtractor extends AsyncService {
|
||||
let extracted;
|
||||
|
||||
try {
|
||||
extracted = await this.extract(url);
|
||||
extracted = await this.extract(data);
|
||||
|
||||
const theID = randomUUID();
|
||||
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
||||
|
Loading…
x
Reference in New Issue
Block a user