diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 12d8c366..2f43b9a4 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -21,6 +21,7 @@ export type PageOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean;
removeTags?: string | string[];
+ onlyIncludeTags?: string | string[];
};
export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
index 0dc24c89..23b9cb69 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
@@ -100,4 +100,48 @@ describe('removeUnwantedElements', () => {
expect(result).not.toContain('id="remove-this"');
expect(result).toContain('class="keep"');
});
+
+ it('should only include specified tags', () => {
+ const html = `
`;
+ const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
+ const result = removeUnwantedElements(html, options);
+ expect(result).toContain('Main Content');
+ expect(result).toContain('');
+ expect(result).not.toContain('