fix: pdf upload in multipart

This commit is contained in:
yanlong.wang 2024-11-25 17:50:01 +08:00
parent deb0b6dc23
commit f6c89e878c
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
6 changed files with 39 additions and 21 deletions

View File

@@ -15,13 +15,14 @@
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.2-2eddf1b",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.0",
"firebase-functions": "^6.1.1",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",
@@ -48,6 +49,7 @@
"devDependencies": {
"@types/archiver": "^5.3.4",
"@types/bcrypt": "^5.0.0",
"@types/busboy": "^1.5.4",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^20.14.13",
@@ -2135,6 +2137,16 @@
"@types/node": "*"
}
},
"node_modules/@types/busboy": {
"version": "1.5.4",
"resolved": "https://registry.npmjs.org/@types/busboy/-/busboy-1.5.4.tgz",
"integrity": "sha512-kG7WrUuAKK0NoyxfQHsVE6j1m01s6kMma64E+OZenQABMQyTJop1DumUWcLwAQ2JzpefU7PDYoRDKl8uZosFjw==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/cacheable-request": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
@@ -3540,7 +3552,6 @@
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
"integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
"optional": true,
"dependencies": {
"streamsearch": "^1.1.0"
},
@@ -5539,9 +5550,9 @@
}
},
"node_modules/firebase-functions": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
"integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
"version": "6.1.1",
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.1.tgz",
"integrity": "sha512-q+4zsQhX04YJUz6hqaiH/j5kixljPj0PMxkm8KN3juYp3I4NC6CZ4qfy5JRfwvV8VfXM2KkJrZuyJtLyZr97aw==",
"license": "MIT",
"dependencies": {
"@types/cors": "^2.8.5",
@@ -5557,7 +5568,7 @@
"node": ">=14.10.0"
},
"peerDependencies": {
"firebase-admin": "^11.10.0 || ^12.0.0"
"firebase-admin": "^11.10.0 || ^12.0.0 || ^13.0.0"
}
},
"node_modules/firebase-functions-test": {
@@ -10960,7 +10971,6 @@
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
"optional": true,
"engines": {
"node": ">=10.0.0"
}

View File

@@ -35,13 +35,14 @@
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.2-2eddf1b",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.0",
"firebase-functions": "^6.1.1",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",
@@ -68,6 +69,7 @@
"devDependencies": {
"@types/archiver": "^5.3.4",
"@types/bcrypt": "^5.0.0",
"@types/busboy": "^1.5.4",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^20.14.13",

View File

@@ -374,7 +374,8 @@ export class CrawlerHost extends RPCHost {
const targetUrlFromGet = originPath.slice(1);
if (crawlerOptions.pdf) {
url = `file://pdf.${md5Hasher.hash(crawlerOptions.pdf)}`;
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
} else if (targetUrlFromGet) {
url = targetUrlFromGet.trim();
} else if (crawlerOptions.url) {
@@ -552,7 +553,9 @@ export class CrawlerHost extends RPCHost {
}
if (crawlerOpts?.pdf) {
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,

View File

@@ -171,8 +171,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop({
desc: 'Base64 encoded PDF.',
type: [File, String]
})
pdf?: string;
pdf?: File | string;
@Prop({
default: CONTENT_FORMAT.CONTENT,

View File

@@ -64,23 +64,25 @@ export class PDFExtractor extends AsyncService {
}
isDataUrl(url: string) {
return /^data:.+\/(.+);base64,(.*)$/.test(url);
return url.startsWith('data:');
}
parseDataUrl(url: string) {
const matches = url.match(/^data:.+\/(.+);base64,(.*)$/);
if (!matches || matches.length !== 3) {
const protocol = url.slice(0, url.indexOf(':'));
const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';'));
const data = url.slice(url.indexOf(',') + 1);
if (protocol !== 'data' || !data) {
throw new Error('Invalid data URL');
}
if (matches[1] !== 'pdf') {
if (contentType !== 'application/pdf') {
throw new Error('Invalid data URL type');
}
return {
type: matches[1],
data: matches[2]
}
type: contentType,
data: data
};
}
async extract(url: string | URL) {
@@ -88,9 +90,9 @@ export class PDFExtractor extends AsyncService {
if (typeof url === 'string' && this.isDataUrl(url)) {
const { data } = this.parseDataUrl(url);
const binary = Uint8Array.from(Buffer.from(data, 'base64'));
loadingTask = this.pdfjs.getDocument({
data: atob(decodeURIComponent(data)),
data: binary,
disableFontFace: true,
verbosity: 0
});

@@ -1 +1 @@
Subproject commit 296fe56d235c08978eda384d8fcddbacdd6f7863
Subproject commit a90669ca91d2c8cb470e75bf2cdfa06812e5ba7a