fix: do img filtering in node instead of browser

This commit is contained in:
Yanlong Wang 2025-03-22 12:33:06 +08:00
parent 3b0e0207f7
commit a471a6137c
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
6 changed files with 38 additions and 23 deletions

8
package-lock.json generated
View File

@ -17,7 +17,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.9.0-848ef4e",
"civkit": "^0.9.0-2570394",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -4003,9 +4003,9 @@
}
},
"node_modules/civkit": {
"version": "0.9.0-848ef4e",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-848ef4e.tgz",
"integrity": "sha512-yxk5AKaiZSN4ntlwybVHYgUer402CSw06KzN7wvfaYra9evZkZ7MiFHGULqMnY7657k3CH0WV4n6jGfRj1Vpvw==",
"version": "0.9.0-2570394",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-2570394.tgz",
"integrity": "sha512-w77agnElTEP6g+l66KhX1Ib9z7JXbR3FaR5/2yTUPIPjm32qsWkmKRvv0mZ83IcMSSmTjF9LxboYAliyTx7cIA==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",

View File

@ -26,7 +26,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.9.0-848ef4e",
"civkit": "^0.9.0-2570394",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",

View File

@ -1069,7 +1069,6 @@ export class CrawlerHost extends RPCHost {
title: snapshot.title,
content: snapshot.parsed?.textContent,
url: presumedURL?.href || snapshot.href,
[Symbol.dispose]: () => undefined,
};
Object.defineProperty(output, 'textRepresentation', {

View File

@ -33,8 +33,11 @@ export class AltTextService extends AsyncService {
try {
const img = await this.canvasService.loadImage(url);
const contentTypeHint = Reflect.get(img, 'contentType');
if (Math.min(img.naturalHeight, img.naturalWidth) <= 1) {
return `A ${img.naturalWidth}x${img.naturalHeight} image, likely be a tacker probe`;
}
if (Math.min(img.naturalHeight, img.naturalWidth) < 64) {
throw new AssertionFailureError({ message: `Image is too small to generate alt text for url ${url}` });
return `A ${img.naturalWidth}x${img.naturalHeight} small image, likely a logo, icon or avatar`;
}
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
@ -63,6 +66,32 @@ export class AltTextService extends AsyncService {
}
const digest = md5Hasher.hash(imgBrief.src);
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
let dims: number[] = [];
do {
if (imgBrief.loaded) {
if (imgBrief.naturalWidth && imgBrief.naturalHeight) {
if (Math.min(imgBrief.naturalWidth, imgBrief.naturalHeight) < 64) {
dims = [imgBrief.naturalWidth, imgBrief.naturalHeight];
break;
}
}
}
if (imgBrief.width && imgBrief.height) {
if (Math.min(imgBrief.width, imgBrief.height) < 64) {
dims = [imgBrief.width, imgBrief.height];
break;
}
}
} while (false);
if (Math.min(...dims) <= 1) {
return `A ${dims[0]}x${dims[1]} image, likely be a tacker probe`;
}
if (Math.min(...dims) < 64) {
return `A ${dims[0]}x${dims[1]} small image, likely a logo, icon or avatar`;
}
const existing = await ImgAlt.fromFirestore(shortDigest);
@ -102,4 +131,4 @@ export class AltTextService extends AsyncService {
return generatedCaption;
}
}
};

View File

@ -407,18 +407,7 @@ function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}
r.imgs = briefImgs().filter((x)=> {
if (x.complete) {
if (Math.min(x.width, x.height, x.naturalWidth, x.naturalHeight) < 64) {
return false;
}
}
const m = Math.min(x.width, x.height);
if (m && m < 64) {
return false;
}
return true;
});
r.imgs = briefImgs();
return r;
}

View File

@ -43,7 +43,7 @@ export interface FormattedPage {
textRepresentation?: string;
[Symbol.dispose]: () => void;
[Symbol.dispose]?: () => void;
}
export const md5Hasher = new HashManager('md5', 'hex');
@ -199,7 +199,6 @@ export class SnapshotFormatter extends AsyncService {
description: (snapshot.description || '').trim(),
url: nominalUrl?.toString() || snapshot.href?.trim(),
publishedTime: snapshot.parsed?.publishedTime || undefined,
[Symbol.dispose]: () => { },
};
Object.assign(f, formatted);
@ -395,7 +394,6 @@ export class SnapshotFormatter extends AsyncService {
url: nominalUrl?.toString() || snapshot.href?.trim(),
content: contentText,
publishedTime: snapshot.parsed?.publishedTime || undefined,
[Symbol.dispose]: () => { },
};
if (snapshot.status) {