This commit is contained in:
Nicolas 2024-06-26 21:02:58 -03:00
parent 3b92fb8433
commit 1d4907acc9
3 changed files with 60 additions and 0 deletions

View File

@ -21,6 +21,7 @@ export type PageOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean;
removeTags?: string | string[];
onlyIncludeTags?: string | string[];
};
export type ExtractorOptions = {

View File

@ -100,4 +100,48 @@ describe('removeUnwantedElements', () => {
expect(result).not.toContain('id="remove-this"');
expect(result).toContain('class="keep"');
});
// With onlyIncludeTags set to main and footer, both elements must appear in the
// output and the sibling <aside> must not.
it('should only include specified tags', () => {
  const markup = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: ['main', 'footer'] };

  const output = removeUnwantedElements(markup, pageOptions);

  for (const kept of ['<main>Main Content</main>', '<footer>Footer Content</footer>']) {
    expect(output).toContain(kept);
  }
  expect(output).not.toContain('<aside>');
});
// Three tags are listed (header, main, footer); each must be present in the
// output while the unlisted <aside> is absent.
it('should handle multiple specified tags', () => {
  const markup = `<div><header>Header Content</header><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: ['header', 'main', 'footer'] };

  const output = removeUnwantedElements(markup, pageOptions);

  const expectedFragments = [
    '<header>Header Content</header>',
    '<main>Main Content</main>',
    '<footer>Footer Content</footer>',
  ];
  for (const fragment of expectedFragments) {
    expect(output).toContain(fragment);
  }
  expect(output).not.toContain('<aside>');
});
// A listed tag (<main>) that wraps an unlisted child (<section>) must be kept
// together with that child; the unlisted sibling <aside> must be dropped.
it('should handle nested specified tags', () => {
  const markup = `<div><main><section>Main Section</section></main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: ['main', 'footer'] };

  const output = removeUnwantedElements(markup, pageOptions);

  for (const kept of ['<main><section>Main Section</section></main>', '<footer>Footer Content</footer>']) {
    expect(output).toContain(kept);
  }
  expect(output).not.toContain('<aside>');
});
// An empty onlyIncludeTags array is expected to produce an empty string.
it('should handle no specified tags', () => {
  const markup = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: [] };

  const output = removeUnwantedElements(markup, pageOptions);

  expect(output).toBe('');
});
// onlyIncludeTags may be a single string rather than an array; only that tag
// is kept, so both <aside> and <footer> must be missing from the output.
it('should handle specified tags as a string', () => {
  const markup = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: 'main' };

  const output = removeUnwantedElements(markup, pageOptions);

  expect(output).toContain('<main>Main Content</main>');
  for (const dropped of ['<aside>', '<footer>']) {
    expect(output).not.toContain(dropped);
  }
});
});

View File

@ -4,6 +4,21 @@ import { excludeNonMainTags } from "./excludeTags";
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html);
// onlyIncludeTags mode: the output is built solely from clones of the elements
// matching the requested selectors. Matches are appended selector by selector,
// in the order the selectors are listed.
if (pageOptions.onlyIncludeTags) {
  // Normalize to an array in a local variable rather than writing it back to
  // pageOptions, so the caller's options object is left untouched.
  const tagsToKeep =
    typeof pageOptions.onlyIncludeTags === 'string'
      ? [pageOptions.onlyIncludeTags]
      : pageOptions.onlyIncludeTags;

  // Detached <div> that collects the cloned matches.
  const newRoot = cheerio.load('<div></div>')('div');
  tagsToKeep.forEach(tag => {
    soup(tag).each((index, element) => {
      newRoot.append(soup(element).clone());
    });
  });

  // Early return: the removal steps that follow this block do not run in this mode.
  return newRoot.html();
}
soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) {