diff --git a/apps/api/src/lib/__tests__/html-transformer.test.ts b/apps/api/src/lib/__tests__/html-transformer.test.ts
new file mode 100644
index 00000000..866a3950
--- /dev/null
+++ b/apps/api/src/lib/__tests__/html-transformer.test.ts
@@ -0,0 +1,325 @@
+import {
+  extractLinks,
+  extractMetadata,
+  transformHtml,
+} from "../html-transformer";
+
+/**
+ * Behavioural tests for the HTML transformer helpers: link extraction
+ * (extractLinks), metadata extraction (extractMetadata) and tag-based
+ * filtering of markup (transformHtml).
+ *
+ * NOTE(review): the HTML fixtures were rebuilt from the values the
+ * assertions expect; confirm them against the upstream test file.
+ */
+describe("HTML Transformer", () => {
+  // extractLinks: the assertions expect the raw href values back.
+  describe("extractLinks", () => {
+    it("should return empty array for null or undefined input", async () => {
+      expect(await extractLinks(null)).toEqual([]);
+      expect(await extractLinks(undefined)).toEqual([]);
+    });
+
+    it("should extract links from HTML content", async () => {
+      const html = `
+        <html>
+          <body>
+            <a href="https://example.com">Example</a>
+            <a href="https://test.com">Test</a>
+          </body>
+        </html>
+      `;
+      const links = await extractLinks(html);
+      expect(links).toContain("https://example.com");
+      expect(links).toContain("https://test.com");
+    });
+
+    it("should handle relative links", async () => {
+      const html = `
+        <html>
+          <body>
+            <a href="/path/to/page">Relative</a>
+            <a href="../another/page">Parent Path</a>
+            <a href="./local/page">Local Path</a>
+            <a href="relative/path">Implicit Relative</a>
+            <a href="?param=value">Query Param</a>
+            <a href="#section">Hash Link</a>
+          </body>
+        </html>
+      `;
+      const links = await extractLinks(html);
+      // Relative hrefs are expected back verbatim and in document order.
+      expect(links).toEqual([
+        "/path/to/page",
+        "../another/page",
+        "./local/page",
+        "relative/path",
+        "?param=value",
+        "#section",
+      ]);
+    });
+
+    it("should handle complex nested HTML structure", async () => {
+      const html = `
+        <html>
+          <body>
+            <div>
+              <nav>
+                <ul>
+                  <li><a href="https://nav1.com">Nav 1</a></li>
+                  <li><a href="https://nav2.com">Nav 2</a></li>
+                </ul>
+              </nav>
+              <main>
+                <p>Text with an <a href="https://inline.com">inline link</a></p>
+                <div>
+                  <a href="https://nested.com">Nested link</a>
+                </div>
+              </main>
+            </div>
+          </body>
+        </html>
+      `;
+      const links = await extractLinks(html);
+      expect(links).toContain("https://nav1.com");
+      expect(links).toContain("https://nav2.com");
+      expect(links).toContain("https://inline.com");
+      expect(links).toContain("https://nested.com");
+    });
+
+    it("should handle malformed HTML gracefully", async () => {
+      const html = `
+        <div>
+          <a href="https://valid.com">Valid</a>
+          <a href="invalid">Invalid</a>
+          <a>No href</a>
+          <a href="">Empty href</a>
+          <a href="javascript:void(0)">JavaScript href</a>
+          <a href="mailto:test@example.com">Email link</a>
+        </div>
+      `;
+      const links = await extractLinks(html);
+      expect(links).toContain("https://valid.com");
+      // Other links should be filtered out or handled appropriately
+    });
+  });
+
+  // extractMetadata: [] for nullish input; otherwise asserted as a keyed object.
+  describe("extractMetadata", () => {
+    it("should return empty array for null or undefined input", async () => {
+      expect(await extractMetadata(null)).toEqual([]);
+      expect(await extractMetadata(undefined)).toEqual([]);
+    });
+
+    it("should extract comprehensive metadata from HTML content", async () => {
+      const html = `
+        <html>
+          <head>
+            <title>Test Page Title</title>
+            <meta name="description" content="Detailed page description">
+            <meta name="keywords" content="test,page,keywords">
+            <meta name="author" content="Test Author">
+            <meta property="og:title" content="OpenGraph Title">
+            <meta property="og:description" content="OpenGraph Description">
+            <meta property="og:image" content="https://example.com/image.jpg">
+            <meta name="twitter:card" content="summary">
+            <meta name="twitter:title" content="Twitter Title">
+          </head>
+          <body></body>
+        </html>
+      `;
+      const metadata = await extractMetadata(html);
+      // Open Graph values are asserted under both the raw ("og:title") and
+      // the camel-cased ("ogTitle") key.
+      expect(metadata).toMatchObject({
+        "twitter:title": "Twitter Title",
+        ogImage: "https://example.com/image.jpg",
+        "og:image": "https://example.com/image.jpg",
+        ogDescription: "OpenGraph Description",
+        "twitter:card": "summary",
+        title: "Test Page Title",
+        ogTitle: "OpenGraph Title",
+        author: "Test Author",
+        keywords: "test,page,keywords",
+        "og:title": "OpenGraph Title",
+        "og:description": "OpenGraph Description",
+        description: "Detailed page description",
+      });
+    });
+
+    it("should handle metadata with special characters and encoding", async () => {
+      const html = `
+        <html>
+          <head>
+            <title>Test & Page with ©️ symbols</title>
+            <meta name="description" content="Description with &quot;quotes&quot; and 'apostrophes'">
+          </head>
+          <body></body>
+        </html>
+      `;
+      const metadata = await extractMetadata(html);
+      expect(metadata.title).toContain("&");
+      expect(metadata.description).toContain("quotes");
+    });
+
+    it("should handle missing or malformed metadata gracefully", async () => {
+      const html = `
+        <html>
+          <head>
+            <meta name="description">
+            <meta property="og:title">
+            <meta content="orphaned content">
+          </head>
+          <body></body>
+        </html>
+      `;
+      const metadata = await extractMetadata(html);
+      expect(metadata).toBeDefined();
+    });
+  });
+
+  // transformHtml: driven by an options object; the result is asserted as a string.
+  describe("transformHtml", () => {
+    it("should transform HTML content according to options", async () => {
+      const options = {
+        html: "<div><p>Test</p><span>Remove me</span></div>",
+        url: "https://example.com",
+        include_tags: ["p"],
+        exclude_tags: ["span"],
+        only_main_content: true,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toContain("<p>");
+      expect(result).not.toContain("<span>");
+    });
+
+    it("should handle complex content filtering", async () => {
+      const options = {
+        html: `
+          <div class="container">
+            <nav>Navigation</nav>
+            <main>
+              <article>
+                <h1>Title</h1>
+                <p>Important content</p>
+                <div class="ads">Advertisement</div>
+                <aside class="social-share">Share buttons</aside>
+              </article>
+            </main>
+            <footer>Footer content</footer>
+          </div>
+        `,
+        url: "https://example.com",
+        include_tags: ["article", "h1", "p"],
+        exclude_tags: ["nav", "aside", "footer", ".ads", ".social-share"],
+        only_main_content: true,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toContain("<h1>Title</h1>");
+      expect(result).toContain("<p>Important content</p>");
+      expect(result).not.toContain("Navigation");
+      expect(result).not.toContain("Advertisement");
+      expect(result).not.toContain("Share buttons");
+      expect(result).not.toContain("Footer content");
+    });
+
+    it("should handle nested content preservation and absolute links", async () => {
+      const options = {
+        html: `
+          <article>
+            <div class="content">
+              <h2>Section</h2>
+              <p>Text with <strong>bold</strong> and <em>emphasis</em></p>
+              <ul>
+                <li><a href="/path">Link</a></li>
+              </ul>
+            </div>
+          </article>
+        `,
+        url: "https://example.com",
+        include_tags: ["article", "p", "ul", "li"],
+        exclude_tags: [],
+        only_main_content: true,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toContain("<strong>bold</strong>");
+      expect(result).toContain("<em>emphasis</em>");
+      expect(result).toContain('<a href="https://example.com/path">');
+    });
+
+    it("should handle empty HTML content", async () => {
+      const options = {
+        html: "",
+        url: "https://example.com",
+        include_tags: [],
+        exclude_tags: [],
+        only_main_content: false,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toBe("");
+    });
+
+    it("should handle malformed HTML", async () => {
+      const options = {
+        html: "<div>Unclosed div",
+        url: "https://example.com",
+        include_tags: [],
+        exclude_tags: [],
+        only_main_content: false,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toBe("<div>Unclosed div</div>");
+    });
+
+    it("should handle HTML with comments and scripts", async () => {
+      const options = {
+        html: `
+          <div>
+            <!-- Comment -->
+            <script>alert('test');</script>
+            <p>Real content</p>
+            <style>.test { color: red; }</style>
+            <noscript>Enable JavaScript</noscript>
+          </div>
+        `,
+        url: "https://example.com",
+        include_tags: ["p"],
+        exclude_tags: ["script", "style", "noscript"],
+        only_main_content: true,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toContain("<p>Real content</p>");
+      expect(result).not.toContain("alert");
+      expect(result).not.toContain("color: red");
+      expect(result).not.toContain("Enable JavaScript");
+    });
+
+    it("should handle special characters and encoding", async () => {
+      const options = {
+        html: `
+          <div>
+            <p>© 2024</p>
+            <p>&lt;tag&gt;</p>
+            <p>Special chars: á é í ó ú ñ</p>
+            <p>Emojis: 🎉 👍 🚀</p>
+          </div>
+        `,
+        url: "https://example.com",
+        include_tags: ["p"],
+        exclude_tags: [],
+        only_main_content: true,
+      };
+
+      const result = await transformHtml(options);
+      expect(result).toContain("©");
+      expect(result).toContain("á é í ó ú ñ");
+      expect(result).toContain("🎉 👍 🚀");
+    });
+  });
+});