Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git (synced 2025-08-19 06:45:53 +08:00)
feat: allow passing base64 encoded pdf
commit 080056e889
parent de50c93825
@@ -977,6 +977,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             return;
         }
 
+        if (crawlerOpts?.pdf) {
+            const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
+            const fakeSnapshot = {
+                href: urlToCrawl.toString(),
+                html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
+                title: '',
+                text: '',
+                pdfs: [pdfDataUrl],
+            } as PageSnapshot;
+
+            yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
+
+            return;
+        }
+
         let cache;
 
         const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
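Note: the base64 payload is URI-encoded when the data URL is built here and reversed later in PDFExtractor.extract() via atob(decodeURIComponent(...)). A minimal sketch of that round trip, with an illustrative sample payload that is not part of the commit:

// Sketch of the encode/decode round trip for the PDF payload (Node 18+ globals).
const pdfBase64 = Buffer.from('%PDF-1.4 sample').toString('base64');

// crawler side: wrap the URI-encoded base64 payload in a data URL
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(pdfBase64)}`;

// extractor side: strip the prefix, then undo the encoding to get a binary string
const payload = pdfDataUrl.slice(pdfDataUrl.indexOf(';base64,') + ';base64,'.length);
console.log(atob(decodeURIComponent(payload))); // '%PDF-1.4 sample'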
@@ -128,6 +128,11 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     html?: string;
 
+    @Prop({
+        desc: 'Base64 encoded PDF.',
+    })
+    pdf?: string;
+
     @Prop({
         default: 'default',
     })
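Note: a sketch of how a caller might exercise the new option, assuming the reader endpoint accepts CrawlerOptions fields as a JSON POST body. The endpoint URL and the accompanying url field are assumptions for illustration; only the pdf field itself comes from this commit.

// Hypothetical caller: base64-encode a local PDF and pass it via the new `pdf` option.
import { readFile } from 'fs/promises';

async function readLocalPdf(path: string): Promise<string> {
    const pdfBase64 = (await readFile(path)).toString('base64');
    const resp = await fetch('https://reader.example.com/', { // assumed endpoint, not from the commit
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            url: 'https://example.com/placeholder.pdf', // assumed: used for href / cache digest
            pdf: pdfBase64,                              // new field introduced by this commit
        }),
    });
    return resp.text();
}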
@@ -8,6 +8,7 @@ import { PDFContent } from '../db/pdf';
 import dayjs from 'dayjs';
 import { FirebaseStorageBucketControl } from '../shared';
 import { randomUUID } from 'crypto';
+import { PDFDocumentLoadingTask } from 'pdfjs-dist';
 const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
 dayjs.extend(utc); // Extend dayjs with the UTC plugin
 const timezone = require('dayjs/plugin/timezone');
@@ -62,12 +63,45 @@ export class PDFExtractor extends AsyncService {
         this.emit('ready');
     }
 
+    isDataUrl(url: string) {
+        return /^data:.+\/(.+);base64,(.*)$/.test(url);
+    }
+
+    parseDataUrl(url: string) {
+        const matches = url.match(/^data:.+\/(.+);base64,(.*)$/);
+        if (!matches || matches.length !== 3) {
+            throw new Error('Invalid data URL');
+        }
+
+        if (matches[1] !== 'pdf') {
+            throw new Error('Invalid data URL type');
+        }
+
+        return {
+            type: matches[1],
+            data: matches[2]
+        }
+    }
+
     async extract(url: string | URL) {
-        const loadingTask = this.pdfjs.getDocument({
+        let loadingTask: PDFDocumentLoadingTask;
+
+        if (typeof url === 'string' && this.isDataUrl(url)) {
+            const { data } = this.parseDataUrl(url);
+
+            loadingTask = this.pdfjs.getDocument({
+                data: atob(decodeURIComponent(data)),
+                disableFontFace: true,
+                verbosity: 0
+            });
+        } else {
+            loadingTask = this.pdfjs.getDocument({
             url,
             disableFontFace: true,
             verbosity: 0
         });
+        }
+
 
         const doc = await loadingTask.promise;
         const meta = await doc.getMetadata();
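Note: a standalone sketch of what the new helpers accept and return; the regex is copied from the commit, the sample payload is made up.

// Mirrors isDataUrl()/parseDataUrl(): capture group 1 is the subtype, group 2 the payload.
const DATA_URL_RE = /^data:.+\/(.+);base64,(.*)$/;

const sample = `data:application/pdf;base64,${encodeURIComponent(Buffer.from('%PDF-1.4').toString('base64'))}`;

console.log(DATA_URL_RE.test(sample));              // true  -> extract() takes the data branch
console.log(DATA_URL_RE.test('https://a.b/x.pdf')); // false -> plain URLs keep using getDocument({ url })

const matches = sample.match(DATA_URL_RE)!;
console.log(matches[1]);                            // 'pdf'; anything else throws 'Invalid data URL type'
console.log(atob(decodeURIComponent(matches[2])))   // '%PDF-1.4', the binary string passed to pdf.js as `data`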
@@ -237,6 +271,11 @@ export class PDFExtractor extends AsyncService {
 
         const digest = md5Hasher.hash(url.toString());
 
+        const data = url;
+        if (typeof url === 'string' && this.isDataUrl(url)) {
+            url = `dataurl://digest:${digest}`;
+        }
+
         const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
 
         if (cache) {
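Note: the digest is computed from the original URL (which may be an entire data URL) before the URL is swapped for a short placeholder, while data keeps the full payload for the extract(data) call in the next hunk. A sketch of that behavior, using Node's crypto as a stand-in for the repo's md5Hasher (an assumption):

import { createHash } from 'crypto';

let url = 'data:application/pdf;base64,JVBERi0xLjQ%3D'; // potentially megabytes long
const digest = createHash('md5').update(url).digest('hex'); // stand-in for md5Hasher.hash(url.toString())

const data = url;                        // the full data URL, still used for extraction
if (/^data:.+\/(.+);base64,(.*)$/.test(url)) {
    url = `dataurl://digest:${digest}`;  // short, stable identifier recorded instead of the raw payload
}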
@@ -275,7 +314,7 @@ export class PDFExtractor extends AsyncService {
         let extracted;
 
         try {
-            extracted = await this.extract(url);
+            extracted = await this.extract(data);
 
             const theID = randomUUID();
             await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,