From 6b1f30e0fbf2e4f0aa1a28c2e81ba19d101687a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 4 Dec 2024 19:05:12 +0100 Subject: [PATCH] fix(scrapeURL/removeUnwantedElements): try to fix onlyMainContent for poorly structured sites --- .../src/scraper/scrapeURL/lib/removeUnwantedElements.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts index e06ee337..9458ed0f 100644 --- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts +++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts @@ -50,6 +50,10 @@ const excludeNonMainTags = [ "#cookie" ]; +const forceIncludeMainTags = [ + "#main" +]; + export const removeUnwantedElements = ( html: string, scrapeOptions: ScrapeOptions @@ -101,7 +105,9 @@ export const removeUnwantedElements = ( if (scrapeOptions.onlyMainContent) { excludeNonMainTags.forEach((tag) => { - const elementsToRemove = soup(tag); + const elementsToRemove = soup(tag) + .filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join("")); + elementsToRemove.remove(); }); }