import { AnyNode, Cheerio, load } from "cheerio"; import { PageOptions } from "../../../lib/entities"; import { excludeNonMainTags } from "./excludeTags"; export const removeUnwantedElements = ( html: string, pageOptions: PageOptions, ) => { let soup = load(html); if ( pageOptions.onlyIncludeTags && pageOptions.onlyIncludeTags.length > 0 && pageOptions.onlyIncludeTags[0] !== "" ) { if (typeof pageOptions.onlyIncludeTags === "string") { pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags]; } if (pageOptions.onlyIncludeTags.length !== 0) { // Create a new root element to hold the tags to keep const newRoot = load("
")("div"); pageOptions.onlyIncludeTags.forEach((tag) => { soup(tag).each((index, element) => { newRoot.append(soup(element).clone()); }); }); soup = load(newRoot.html()); } } soup("script, style, iframe, noscript, meta, head").remove(); if ( pageOptions.removeTags && pageOptions.removeTags.length > 0 && pageOptions.removeTags[0] !== "" ) { if (typeof pageOptions.removeTags === "string") { pageOptions.removeTags = [pageOptions.removeTags]; } if (Array.isArray(pageOptions.removeTags)) { pageOptions.removeTags.forEach((tag) => { let elementsToRemove: Cheerio