fix(scrapeURL/removeUnwantedElements): try to fix onlyMainContent for poorly structured sites

This commit is contained in:
Gergő Móricz 2024-12-04 19:05:12 +01:00
parent 88a16b18a3
commit 6b1f30e0fb

View File

@ -50,6 +50,10 @@ const excludeNonMainTags = [
"#cookie" "#cookie"
]; ];
/**
 * CSS selectors that mark main content which must be preserved.
 *
 * When `onlyMainContent` is enabled, an element matched by one of the
 * `excludeNonMainTags` selectors is kept (not removed) if it contains a
 * descendant matching any selector listed here; each entry is wrapped in
 * `:not(:has(...))` to build the filter applied before removal.
 */
const forceIncludeMainTags: string[] = ["#main"];
export const removeUnwantedElements = ( export const removeUnwantedElements = (
html: string, html: string,
scrapeOptions: ScrapeOptions scrapeOptions: ScrapeOptions
@ -101,7 +105,9 @@ export const removeUnwantedElements = (
if (scrapeOptions.onlyMainContent) { if (scrapeOptions.onlyMainContent) {
excludeNonMainTags.forEach((tag) => { excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag); const elementsToRemove = soup(tag)
.filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join(""));
elementsToRemove.remove(); elementsToRemove.remove();
}); });
} }