diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index a20de7c..69795f9 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -15,13 +15,14 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", + "busboy": "^1.6.0", "civkit": "^0.8.2-2eddf1b", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", "express": "^4.19.2", "firebase-admin": "^12.1.0", - "firebase-functions": "^6.1.0", + "firebase-functions": "^6.1.1", "htmlparser2": "^9.0.0", "jose": "^5.1.0", "langdetect": "^0.2.1", @@ -48,6 +49,7 @@ "devDependencies": { "@types/archiver": "^5.3.4", "@types/bcrypt": "^5.0.0", + "@types/busboy": "^1.5.4", "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", "@types/node": "^20.14.13", @@ -2135,6 +2137,16 @@ "@types/node": "*" } }, + "node_modules/@types/busboy": { + "version": "1.5.4", + "resolved": "https://registry.npmjs.org/@types/busboy/-/busboy-1.5.4.tgz", + "integrity": "sha512-kG7WrUuAKK0NoyxfQHsVE6j1m01s6kMma64E+OZenQABMQyTJop1DumUWcLwAQ2JzpefU7PDYoRDKl8uZosFjw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/cacheable-request": { "version": "6.0.3", "resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz", @@ -3540,7 +3552,6 @@ "version": "1.6.0", "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz", "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==", - "optional": true, "dependencies": { "streamsearch": "^1.1.0" }, @@ -5539,9 +5550,9 @@ } }, "node_modules/firebase-functions": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz", - "integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==", + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.1.tgz", + "integrity": "sha512-q+4zsQhX04YJUz6hqaiH/j5kixljPj0PMxkm8KN3juYp3I4NC6CZ4qfy5JRfwvV8VfXM2KkJrZuyJtLyZr97aw==", "license": "MIT", "dependencies": { "@types/cors": "^2.8.5", @@ -5557,7 +5568,7 @@ "node": ">=14.10.0" }, "peerDependencies": { - "firebase-admin": "^11.10.0 || ^12.0.0" + "firebase-admin": "^11.10.0 || ^12.0.0 || ^13.0.0" } }, "node_modules/firebase-functions-test": { @@ -10960,7 +10971,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==", - "optional": true, "engines": { "node": ">=10.0.0" } diff --git a/backend/functions/package.json b/backend/functions/package.json index e3be929..6c14517 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -35,13 +35,14 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", + "busboy": "^1.6.0", "civkit": "^0.8.2-2eddf1b", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", "express": "^4.19.2", "firebase-admin": "^12.1.0", - "firebase-functions": "^6.1.0", + "firebase-functions": "^6.1.1", "htmlparser2": "^9.0.0", "jose": "^5.1.0", "langdetect": "^0.2.1", @@ -68,6 +69,7 @@ "devDependencies": { "@types/archiver": "^5.3.4", "@types/bcrypt": "^5.0.0", + "@types/busboy": "^1.5.4", "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", "@types/node": "^20.14.13", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 30af2ef..d235777 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -374,7 +374,8 @@ export class CrawlerHost extends RPCHost { const targetUrlFromGet = originPath.slice(1); if (crawlerOptions.pdf) { - url = `file://pdf.${md5Hasher.hash(crawlerOptions.pdf)}`; + const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64'); + url = `file://pdf.${md5Hasher.hash(pdfBuf)}`; } else if (targetUrlFromGet) { url = targetUrlFromGet.trim(); } else if (crawlerOptions.url) { @@ -552,7 +553,9 @@ export class CrawlerHost extends RPCHost { } if (crawlerOpts?.pdf) { - const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`; + + const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64'); + const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`; const fakeSnapshot = { href: urlToCrawl.toString(), html: ``, diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index cd8ae66..94f1a54 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -171,8 +171,9 @@ export class CrawlerOptions extends AutoCastable { @Prop({ desc: 'Base64 encoded PDF.', + type: [File, String] }) - pdf?: string; + pdf?: File | string; @Prop({ default: CONTENT_FORMAT.CONTENT, diff --git a/backend/functions/src/services/pdf-extract.ts b/backend/functions/src/services/pdf-extract.ts index 7de7e54..396ef97 100644 --- a/backend/functions/src/services/pdf-extract.ts +++ b/backend/functions/src/services/pdf-extract.ts @@ -64,23 +64,25 @@ export class PDFExtractor extends AsyncService { } isDataUrl(url: string) { - return /^data:.+\/(.+);base64,(.*)$/.test(url); + return url.startsWith('data:'); } parseDataUrl(url: string) { - const matches = url.match(/^data:.+\/(.+);base64,(.*)$/); - if (!matches || matches.length !== 3) { + const protocol = url.slice(0, url.indexOf(':')); + const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';')); + const data = url.slice(url.indexOf(',') + 1); + if (protocol !== 'data' || !data) { throw new Error('Invalid data URL'); } - if (matches[1] !== 'pdf') { + if (contentType !== 'application/pdf') { throw new Error('Invalid data URL type'); } return { - type: matches[1], - data: matches[2] - } + type: contentType, + data: data + }; } async extract(url: string | URL) { @@ -88,9 +90,9 @@ export class PDFExtractor extends AsyncService { if (typeof url === 'string' && this.isDataUrl(url)) { const { data } = this.parseDataUrl(url); - + const binary = Uint8Array.from(Buffer.from(data, 'base64')); loadingTask = this.pdfjs.getDocument({ - data: atob(decodeURIComponent(data)), + data: binary, disableFontFace: true, verbosity: 0 }); diff --git a/thinapps-shared b/thinapps-shared index 296fe56..a90669c 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 296fe56d235c08978eda384d8fcddbacdd6f7863 +Subproject commit a90669ca91d2c8cb470e75bf2cdfa06812e5ba7a