mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 16:19:00 +08:00
Nick:
This commit is contained in:
parent
3b92fb8433
commit
1d4907acc9
@ -21,6 +21,7 @@ export type PageOptions = {
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
onlyIncludeTags?: string | string[];
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
|
@ -100,4 +100,48 @@ describe('removeUnwantedElements', () => {
|
||||
expect(result).not.toContain('id="remove-this"');
|
||||
expect(result).toContain('class="keep"');
|
||||
});
|
||||
|
||||
// With onlyIncludeTags listing 'main' and 'footer', both elements are expected
// in the output while the sibling <aside> is expected to be absent.
it('should only include specified tags', () => {
  const markup = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: ['main', 'footer'] };

  const filtered = removeUnwantedElements(markup, pageOptions);

  // Fragments that must be present, checked in document order.
  for (const kept of ['<main>Main Content</main>', '<footer>Footer Content</footer>']) {
    expect(filtered).toContain(kept);
  }
  expect(filtered).not.toContain('<aside>');
});
|
||||
|
||||
// Three tags are listed in onlyIncludeTags; each matching element is expected
// in the output and the unlisted <aside> is expected to be absent.
it('should handle multiple specified tags', () => {
  const markup = `<div><header>Header Content</header><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: ['header', 'main', 'footer'] };

  const filtered = removeUnwantedElements(markup, pageOptions);

  // Fragments that must be present, checked in document order.
  const expectedFragments = [
    '<header>Header Content</header>',
    '<main>Main Content</main>',
    '<footer>Footer Content</footer>',
  ];
  for (const kept of expectedFragments) {
    expect(filtered).toContain(kept);
  }
  expect(filtered).not.toContain('<aside>');
});
|
||||
|
||||
// A listed tag (<main>) that wraps a child <section> is expected to appear in
// the output together with that child; the unlisted <aside> is expected to be absent.
it('should handle nested specified tags', () => {
  const markup = `<div><main><section>Main Section</section></main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: ['main', 'footer'] };

  const filtered = removeUnwantedElements(markup, pageOptions);

  // Fragments that must be present, checked in document order.
  for (const kept of ['<main><section>Main Section</section></main>', '<footer>Footer Content</footer>']) {
    expect(filtered).toContain(kept);
  }
  expect(filtered).not.toContain('<aside>');
});
|
||||
|
||||
// An empty onlyIncludeTags array is expected to produce an empty string.
it('should handle no specified tags', () => {
  const markup = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const emptyFilter: PageOptions = { onlyIncludeTags: [] };

  expect(removeUnwantedElements(markup, emptyFilter)).toBe('');
});
|
||||
|
||||
// onlyIncludeTags is given as a single string instead of an array; only the
// <main> element is expected in the output.
it('should handle specified tags as a string', () => {
  const markup = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
  const pageOptions: PageOptions = { onlyIncludeTags: 'main' };

  const filtered = removeUnwantedElements(markup, pageOptions);

  expect(filtered).toContain('<main>Main Content</main>');
  // Opening tags of the elements that were not requested.
  for (const dropped of ['<aside>', '<footer>']) {
    expect(filtered).not.toContain(dropped);
  }
});
|
||||
});
|
||||
|
@ -4,6 +4,21 @@ import { excludeNonMainTags } from "./excludeTags";
|
||||
|
||||
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||
const soup = cheerio.load(html);
|
||||
|
||||
if (pageOptions.onlyIncludeTags) {
|
||||
if (typeof pageOptions.onlyIncludeTags === 'string') {
|
||||
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
||||
}
|
||||
// Create a new root element to hold the tags to keep
|
||||
const newRoot = cheerio.load('<div></div>')('div');
|
||||
pageOptions.onlyIncludeTags.forEach(tag => {
|
||||
soup(tag).each((index, element) => {
|
||||
newRoot.append(soup(element).clone());
|
||||
});
|
||||
});
|
||||
return newRoot.html();
|
||||
}
|
||||
|
||||
soup("script, style, iframe, noscript, meta, head").remove();
|
||||
|
||||
if (pageOptions.removeTags) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user