Nick: small changes

This commit is contained in:
Nicolas 2024-06-26 21:15:42 -03:00
parent 1d4907acc9
commit 388ce3cbce
2 changed files with 28 additions and 23 deletions

View File

@ -129,11 +129,11 @@ describe('removeUnwantedElements', () => {
expect(result).not.toContain('<aside>'); expect(result).not.toContain('<aside>');
}); });
it('should handle no specified tags', () => { it('should not handle no specified tags, return full content', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`; const html = `<html><body><div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div></body></html>`;
const options: PageOptions = { onlyIncludeTags: [] }; const options: PageOptions = { onlyIncludeTags: [] };
const result = removeUnwantedElements(html, options); const result = removeUnwantedElements(html, options);
expect(result).toBe(''); expect(result).toBe(html);
}); });
it('should handle specified tags as a string', () => { it('should handle specified tags as a string', () => {

View File

@ -2,27 +2,32 @@ import cheerio, { AnyNode, Cheerio } from "cheerio";
import { PageOptions } from "../../../lib/entities"; import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags"; import { excludeNonMainTags } from "./excludeTags";
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { export const removeUnwantedElements = (
html: string,
pageOptions: PageOptions
) => {
const soup = cheerio.load(html); const soup = cheerio.load(html);
if (pageOptions.onlyIncludeTags) { if (pageOptions.onlyIncludeTags) {
if (typeof pageOptions.onlyIncludeTags === 'string') { if (typeof pageOptions.onlyIncludeTags === "string") {
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags]; pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
} }
// Create a new root element to hold the tags to keep if (pageOptions.onlyIncludeTags.length !== 0) {
const newRoot = cheerio.load('<div></div>')('div'); // Create a new root element to hold the tags to keep
pageOptions.onlyIncludeTags.forEach(tag => { const newRoot = cheerio.load("<div></div>")("div");
soup(tag).each((index, element) => { pageOptions.onlyIncludeTags.forEach((tag) => {
newRoot.append(soup(element).clone()); soup(tag).each((index, element) => {
newRoot.append(soup(element).clone());
});
}); });
}); return newRoot.html();
return newRoot.html(); }
} }
soup("script, style, iframe, noscript, meta, head").remove(); soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) { if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') { if (typeof pageOptions.removeTags === "string") {
pageOptions.removeTags = [pageOptions.removeTags]; pageOptions.removeTags = [pageOptions.removeTags];
} }
@ -32,16 +37,16 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
if (tag.startsWith("*") && tag.endsWith("*")) { if (tag.startsWith("*") && tag.endsWith("*")) {
let classMatch = false; let classMatch = false;
const regexPattern = new RegExp(tag.slice(1, -1), 'i'); const regexPattern = new RegExp(tag.slice(1, -1), "i");
elementsToRemove = soup('*').filter((i, element) => { elementsToRemove = soup("*").filter((i, element) => {
if (element.type === 'tag') { if (element.type === "tag") {
const attributes = element.attribs; const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name); const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some(attr => const attributesMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`${attr}="${attributes[attr]}"`) regexPattern.test(`${attr}="${attributes[attr]}"`)
); );
if (tag.startsWith('*.')) { if (tag.startsWith("*.")) {
classMatch = Object.keys(attributes).some(attr => classMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`class="${attributes[attr]}"`) regexPattern.test(`class="${attributes[attr]}"`)
); );
} }