diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 12d8c366..2f43b9a4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -21,6 +21,7 @@ export type PageOptions = { replaceAllPathsWithAbsolutePaths?: boolean; parsePDF?: boolean; removeTags?: string | string[]; + onlyIncludeTags?: string | string[]; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts index 0dc24c89..23b9cb69 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts @@ -100,4 +100,48 @@ describe('removeUnwantedElements', () => { expect(result).not.toContain('id="remove-this"'); expect(result).toContain('class="keep"'); }); + + it('should only include specified tags', () => { + const html = `
Main Content
`; + const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] }; + const result = removeUnwantedElements(html, options); + expect(result).toContain('
Main Content
'); + expect(result).toContain(''); + expect(result).not.toContain('