mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-06 08:36:02 +08:00
fix: pdf upload in multipart
This commit is contained in:
parent
deb0b6dc23
commit
f6c89e878c
24
backend/functions/package-lock.json
generated
24
backend/functions/package-lock.json
generated
@ -15,13 +15,14 @@
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.2-2eddf1b",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
"express": "^4.19.2",
|
||||
"firebase-admin": "^12.1.0",
|
||||
"firebase-functions": "^6.1.0",
|
||||
"firebase-functions": "^6.1.1",
|
||||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"langdetect": "^0.2.1",
|
||||
@ -48,6 +49,7 @@
|
||||
"devDependencies": {
|
||||
"@types/archiver": "^5.3.4",
|
||||
"@types/bcrypt": "^5.0.0",
|
||||
"@types/busboy": "^1.5.4",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/generic-pool": "^3.8.1",
|
||||
"@types/node": "^20.14.13",
|
||||
@ -2135,6 +2137,16 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/busboy": {
|
||||
"version": "1.5.4",
|
||||
"resolved": "https://registry.npmjs.org/@types/busboy/-/busboy-1.5.4.tgz",
|
||||
"integrity": "sha512-kG7WrUuAKK0NoyxfQHsVE6j1m01s6kMma64E+OZenQABMQyTJop1DumUWcLwAQ2JzpefU7PDYoRDKl8uZosFjw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/cacheable-request": {
|
||||
"version": "6.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
|
||||
@ -3540,7 +3552,6 @@
|
||||
"version": "1.6.0",
|
||||
"resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
|
||||
"integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"streamsearch": "^1.1.0"
|
||||
},
|
||||
@ -5539,9 +5550,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/firebase-functions": {
|
||||
"version": "6.1.0",
|
||||
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
|
||||
"integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
|
||||
"version": "6.1.1",
|
||||
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.1.tgz",
|
||||
"integrity": "sha512-q+4zsQhX04YJUz6hqaiH/j5kixljPj0PMxkm8KN3juYp3I4NC6CZ4qfy5JRfwvV8VfXM2KkJrZuyJtLyZr97aw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/cors": "^2.8.5",
|
||||
@ -5557,7 +5568,7 @@
|
||||
"node": ">=14.10.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"firebase-admin": "^11.10.0 || ^12.0.0"
|
||||
"firebase-admin": "^11.10.0 || ^12.0.0 || ^13.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/firebase-functions-test": {
|
||||
@ -10960,7 +10971,6 @@
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
|
||||
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
}
|
||||
|
@ -35,13 +35,14 @@
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.2-2eddf1b",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
"express": "^4.19.2",
|
||||
"firebase-admin": "^12.1.0",
|
||||
"firebase-functions": "^6.1.0",
|
||||
"firebase-functions": "^6.1.1",
|
||||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"langdetect": "^0.2.1",
|
||||
@ -68,6 +69,7 @@
|
||||
"devDependencies": {
|
||||
"@types/archiver": "^5.3.4",
|
||||
"@types/bcrypt": "^5.0.0",
|
||||
"@types/busboy": "^1.5.4",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/generic-pool": "^3.8.1",
|
||||
"@types/node": "^20.14.13",
|
||||
|
@ -374,7 +374,8 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const targetUrlFromGet = originPath.slice(1);
|
||||
if (crawlerOptions.pdf) {
|
||||
url = `file://pdf.${md5Hasher.hash(crawlerOptions.pdf)}`;
|
||||
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
||||
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
|
||||
} else if (targetUrlFromGet) {
|
||||
url = targetUrlFromGet.trim();
|
||||
} else if (crawlerOptions.url) {
|
||||
@ -552,7 +553,9 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
if (crawlerOpts?.pdf) {
|
||||
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
|
||||
|
||||
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
||||
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
||||
const fakeSnapshot = {
|
||||
href: urlToCrawl.toString(),
|
||||
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
||||
|
@ -171,8 +171,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
|
||||
@Prop({
|
||||
desc: 'Base64 encoded PDF.',
|
||||
type: [File, String]
|
||||
})
|
||||
pdf?: string;
|
||||
pdf?: File | string;
|
||||
|
||||
@Prop({
|
||||
default: CONTENT_FORMAT.CONTENT,
|
||||
|
@ -64,23 +64,25 @@ export class PDFExtractor extends AsyncService {
|
||||
}
|
||||
|
||||
isDataUrl(url: string) {
|
||||
return /^data:.+\/(.+);base64,(.*)$/.test(url);
|
||||
return url.startsWith('data:');
|
||||
}
|
||||
|
||||
parseDataUrl(url: string) {
|
||||
const matches = url.match(/^data:.+\/(.+);base64,(.*)$/);
|
||||
if (!matches || matches.length !== 3) {
|
||||
const protocol = url.slice(0, url.indexOf(':'));
|
||||
const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';'));
|
||||
const data = url.slice(url.indexOf(',') + 1);
|
||||
if (protocol !== 'data' || !data) {
|
||||
throw new Error('Invalid data URL');
|
||||
}
|
||||
|
||||
if (matches[1] !== 'pdf') {
|
||||
if (contentType !== 'application/pdf') {
|
||||
throw new Error('Invalid data URL type');
|
||||
}
|
||||
|
||||
return {
|
||||
type: matches[1],
|
||||
data: matches[2]
|
||||
}
|
||||
type: contentType,
|
||||
data: data
|
||||
};
|
||||
}
|
||||
|
||||
async extract(url: string | URL) {
|
||||
@ -88,9 +90,9 @@ export class PDFExtractor extends AsyncService {
|
||||
|
||||
if (typeof url === 'string' && this.isDataUrl(url)) {
|
||||
const { data } = this.parseDataUrl(url);
|
||||
|
||||
const binary = Uint8Array.from(Buffer.from(data, 'base64'));
|
||||
loadingTask = this.pdfjs.getDocument({
|
||||
data: atob(decodeURIComponent(data)),
|
||||
data: binary,
|
||||
disableFontFace: true,
|
||||
verbosity: 0
|
||||
});
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 296fe56d235c08978eda384d8fcddbacdd6f7863
|
||||
Subproject commit a90669ca91d2c8cb470e75bf2cdfa06812e5ba7a
|
Loading…
x
Reference in New Issue
Block a user