fix(scrapeURL/removeUnwantedElements): try to fix onlyMainContent for poorly structured sites

This commit is contained in:
Gergő Móricz 2024-12-04 19:05:12 +01:00
parent 88a16b18a3
commit 6b1f30e0fb

View File

@@ -50,6 +50,10 @@ const excludeNonMainTags = [
"#cookie"
];
/**
 * Selectors that protect an element from `onlyMainContent` pruning: an element
 * matched by an exclusion selector is kept when it contains a match for any
 * selector listed here.
 */
const forceIncludeMainTags: string[] = ["#main"];
export const removeUnwantedElements = (
html: string,
scrapeOptions: ScrapeOptions
@@ -101,7 +105,9 @@ export const removeUnwantedElements = (
if (scrapeOptions.onlyMainContent) {
excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag);
const elementsToRemove = soup(tag)
.filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join(""));
elementsToRemove.remove();
});
}