mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-04-18 11:50:00 +08:00
fix: do img filtering in node instead of browser
This commit is contained in:
parent
3b0e0207f7
commit
a471a6137c
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.9.0-848ef4e",
|
||||
"civkit": "^0.9.0-2570394",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -4003,9 +4003,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.9.0-848ef4e",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-848ef4e.tgz",
|
||||
"integrity": "sha512-yxk5AKaiZSN4ntlwybVHYgUer402CSw06KzN7wvfaYra9evZkZ7MiFHGULqMnY7657k3CH0WV4n6jGfRj1Vpvw==",
|
||||
"version": "0.9.0-2570394",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-2570394.tgz",
|
||||
"integrity": "sha512-w77agnElTEP6g+l66KhX1Ib9z7JXbR3FaR5/2yTUPIPjm32qsWkmKRvv0mZ83IcMSSmTjF9LxboYAliyTx7cIA==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
|
@ -26,7 +26,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.9.0-848ef4e",
|
||||
"civkit": "^0.9.0-2570394",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
|
@ -1069,7 +1069,6 @@ export class CrawlerHost extends RPCHost {
|
||||
title: snapshot.title,
|
||||
content: snapshot.parsed?.textContent,
|
||||
url: presumedURL?.href || snapshot.href,
|
||||
[Symbol.dispose]: () => undefined,
|
||||
};
|
||||
|
||||
Object.defineProperty(output, 'textRepresentation', {
|
||||
|
@ -33,8 +33,11 @@ export class AltTextService extends AsyncService {
|
||||
try {
|
||||
const img = await this.canvasService.loadImage(url);
|
||||
const contentTypeHint = Reflect.get(img, 'contentType');
|
||||
if (Math.min(img.naturalHeight, img.naturalWidth) <= 1) {
|
||||
return `A ${img.naturalWidth}x${img.naturalHeight} image, likely be a tacker probe`;
|
||||
}
|
||||
if (Math.min(img.naturalHeight, img.naturalWidth) < 64) {
|
||||
throw new AssertionFailureError({ message: `Image is too small to generate alt text for url ${url}` });
|
||||
return `A ${img.naturalWidth}x${img.naturalHeight} small image, likely a logo, icon or avatar`;
|
||||
}
|
||||
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
|
||||
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
|
||||
@ -63,6 +66,32 @@ export class AltTextService extends AsyncService {
|
||||
}
|
||||
const digest = md5Hasher.hash(imgBrief.src);
|
||||
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
||||
let dims: number[] = [];
|
||||
do {
|
||||
if (imgBrief.loaded) {
|
||||
if (imgBrief.naturalWidth && imgBrief.naturalHeight) {
|
||||
if (Math.min(imgBrief.naturalWidth, imgBrief.naturalHeight) < 64) {
|
||||
dims = [imgBrief.naturalWidth, imgBrief.naturalHeight];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (imgBrief.width && imgBrief.height) {
|
||||
if (Math.min(imgBrief.width, imgBrief.height) < 64) {
|
||||
dims = [imgBrief.width, imgBrief.height];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} while (false);
|
||||
|
||||
if (Math.min(...dims) <= 1) {
|
||||
return `A ${dims[0]}x${dims[1]} image, likely be a tacker probe`;
|
||||
}
|
||||
if (Math.min(...dims) < 64) {
|
||||
return `A ${dims[0]}x${dims[1]} small image, likely a logo, icon or avatar`;
|
||||
}
|
||||
|
||||
const existing = await ImgAlt.fromFirestore(shortDigest);
|
||||
|
||||
@ -102,4 +131,4 @@ export class AltTextService extends AsyncService {
|
||||
|
||||
return generatedCaption;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -407,18 +407,7 @@ function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
|
||||
if (document.baseURI !== r.href) {
|
||||
r.rebase = document.baseURI;
|
||||
}
|
||||
r.imgs = briefImgs().filter((x)=> {
|
||||
if (x.complete) {
|
||||
if (Math.min(x.width, x.height, x.naturalWidth, x.naturalHeight) < 64) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const m = Math.min(x.width, x.height);
|
||||
if (m && m < 64) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
r.imgs = briefImgs();
|
||||
|
||||
return r;
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ export interface FormattedPage {
|
||||
|
||||
textRepresentation?: string;
|
||||
|
||||
[Symbol.dispose]: () => void;
|
||||
[Symbol.dispose]?: () => void;
|
||||
}
|
||||
|
||||
export const md5Hasher = new HashManager('md5', 'hex');
|
||||
@ -199,7 +199,6 @@ export class SnapshotFormatter extends AsyncService {
|
||||
description: (snapshot.description || '').trim(),
|
||||
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
||||
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
||||
[Symbol.dispose]: () => { },
|
||||
};
|
||||
|
||||
Object.assign(f, formatted);
|
||||
@ -395,7 +394,6 @@ export class SnapshotFormatter extends AsyncService {
|
||||
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
||||
content: contentText,
|
||||
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
||||
[Symbol.dispose]: () => { },
|
||||
};
|
||||
|
||||
if (snapshot.status) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user