From e824303d87cc68c2e32859db40c57350b3111626 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 16 Jan 2025 16:51:33 +0100
Subject: [PATCH] feat(html): always pick largest image from srcset

---
 .../scrapeURL/lib/removeUnwantedElements.ts   | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
index 3536211d..2c23c2f1 100644
--- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
@@ -114,6 +114,47 @@ export const removeUnwantedElements = (
     });
   }
 
+  // Always return the biggest image: point `src` at the largest srcset candidate.
+  soup("img[srcset]").each((_, el) => {
+    // Match candidates instead of splitting on ",": URLs may contain commas
+    // (CDN transform params, data: URIs). Group 1 is the URL (a whitespace-free
+    // run with trailing commas stripped), group 2 the optional descriptor text
+    // up to the next comma.
+    const candidateRe = /([^\s,](?:\S*[^\s,])?)(?:\s+([^,]*))?/g;
+    const sizes: { url: string; size: number; isX: boolean }[] = [];
+
+    let match: RegExpExecArray | null;
+    while ((match = candidateRe.exec(el.attribs.srcset)) !== null) {
+      // A missing descriptor means "1x"; anything other than a single
+      // "<number>x" / "<number>w" is malformed, so the candidate is skipped.
+      const descriptor = (match[2] ?? "").trim() || "1x";
+      const parsed = /^(\d+(?:\.\d+)?|\.\d+)([wx])$/.exec(descriptor);
+      if (parsed === null) continue;
+
+      sizes.push({
+        url: match[1],
+        // parseFloat, not parseInt: densities such as "1.5x" are fractional.
+        size: parseFloat(parsed[1]),
+        isX: parsed[2] === "x",
+      });
+    }
+
+    // With density descriptors only, the plain `src` is the implicit 1x candidate.
+    if (sizes.every((x) => x.isX) && el.attribs.src) {
+      sizes.push({
+        url: el.attribs.src,
+        size: 1,
+        isX: true,
+      });
+    }
+
+    // Nothing usable: leave the existing `src` untouched instead of clobbering it.
+    if (sizes.length === 0) return;
+
+    // Strict ">" keeps the earliest candidate on ties.
+    el.attribs.src = sizes.reduce((best, x) => (x.size > best.size ? x : best)).url;
+  });
+
   const cleanedHtml = soup.html();
   return cleanedHtml;
 };