diff --git a/apps/api/src/lib/__tests__/html-transformer.test.ts b/apps/api/src/lib/__tests__/html-transformer.test.ts
new file mode 100644
index 00000000..866a3950
--- /dev/null
+++ b/apps/api/src/lib/__tests__/html-transformer.test.ts
@@ -0,0 +1,319 @@
+import {
+ extractLinks,
+ extractMetadata,
+ transformHtml,
+} from "../html-transformer";
+
+describe("HTML Transformer", () => {
+ describe("extractLinks", () => {
+ it("should return empty array for null or undefined input", async () => {
+ expect(await extractLinks(null)).toEqual([]);
+ expect(await extractLinks(undefined)).toEqual([]);
+ });
+
+ it("should extract links from HTML content", async () => {
+ const html = `
+
+
+ Example
+ Test
+
+
+ `;
+ const links = await extractLinks(html);
+ expect(links).toContain("https://example.com");
+ expect(links).toContain("https://test.com");
+ });
+
+ it("should handle relative links", async () => {
+ const html = `
+
+
+ Relative
+ Parent Path
+ Local Path
+ Implicit Relative
+ Query Param
+ Hash Link
+
+
+ `;
+ const links = await extractLinks(html);
+ expect(links).toEqual([
+ "/path/to/page",
+ "../another/page",
+ "./local/page",
+ "relative/path",
+ "?param=value",
+ "#section",
+ ]);
+ });
+
+ it("should handle complex nested HTML structure", async () => {
+ const html = `
+
+
+
+
+
+
+ Some text with a link
+
+
+
+
+
+
+ `;
+ const links = await extractLinks(html);
+ expect(links).toContain("https://nav1.com");
+ expect(links).toContain("https://nav2.com");
+ expect(links).toContain("https://inline.com");
+ expect(links).toContain("https://nested.com");
+ });
+
+ it("should handle malformed HTML gracefully", async () => {
+ const html = `
+
+ `;
+ const links = await extractLinks(html);
+ expect(links).toContain("https://valid.com");
+ // Other links should be filtered out or handled appropriately
+ });
+ });
+
+ describe("extractMetadata", () => {
+ it("should return empty array for null or undefined input", async () => {
+ expect(await extractMetadata(null)).toEqual([]);
+ expect(await extractMetadata(undefined)).toEqual([]);
+ });
+
+ it("should extract comprehensive metadata from HTML content", async () => {
+ const html = `
+
+
+ Test Page Title
+
+
+
+
+
+
+
+
+
+
+
+
+ `;
+ const metadata = await extractMetadata(html);
+ expect(metadata).toMatchObject({
+ "twitter:title": "Twitter Title",
+ ogImage: "https://example.com/image.jpg",
+ "og:image": "https://example.com/image.jpg",
+ ogDescription: "OpenGraph Description",
+ "twitter:card": "summary",
+ title: "Test Page Title",
+ ogTitle: "OpenGraph Title",
+ author: "Test Author",
+ keywords: "test,page,keywords",
+ "og:title": "OpenGraph Title",
+ "og:description": "OpenGraph Description",
+ description: "Detailed page description",
+ });
+ });
+
+ it("should handle metadata with special characters and encoding", async () => {
+ const html = `
+
+
+ Test & Page with ©️ symbols
+
+
+
+
+
+ `;
+ const metadata = await extractMetadata(html);
+ expect(metadata.title).toContain("&");
+ expect(metadata.description).toContain("quotes");
+ });
+
+ it("should handle missing or malformed metadata gracefully", async () => {
+ const html = `
+
+
+
+
+
+
+
+
+ `;
+ const metadata = await extractMetadata(html);
+ expect(metadata).toBeDefined();
+ });
+ });
+
+ describe("transformHtml", () => {
+ it("should transform HTML content according to options", async () => {
+ const options = {
+ html: "",
+ url: "https://example.com",
+ include_tags: ["p"],
+ exclude_tags: ["span"],
+ only_main_content: true,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toContain("");
+ expect(result).not.toContain("");
+ });
+
+ it("should handle complex content filtering", async () => {
+ const options = {
+ html: `
+
+
+
+
+ Title
+ Important content
+ Advertisement
+
+ Share buttons
+
+
+
+
+ `,
+ url: "https://example.com",
+ include_tags: ["article", "h1", "p"],
+ exclude_tags: ["nav", "aside", "footer", ".ads", ".social-share"],
+ only_main_content: true,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toContain("Title
");
+ expect(result).toContain("Important content
");
+ expect(result).not.toContain("Navigation");
+ expect(result).not.toContain("Advertisement");
+ expect(result).not.toContain("Share buttons");
+ expect(result).not.toContain("Footer content");
+ });
+
+ it("should handle nested content preservation and absolute links", async () => {
+ const options = {
+ html: `
+
+
+
Section
+
Text with bold and emphasis
+
+
+
+ `,
+ url: "https://example.com",
+ include_tags: ["article", "p", "ul", "li"],
+ exclude_tags: [],
+ only_main_content: true,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toContain("bold");
+ expect(result).toContain("emphasis");
+ expect(result).toContain('');
+ });
+
+ it("should handle empty HTML content", async () => {
+ const options = {
+ html: "",
+ url: "https://example.com",
+ include_tags: [],
+ exclude_tags: [],
+ only_main_content: false,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toBe("");
+ });
+
+ it("should handle malformed HTML", async () => {
+ const options = {
+ html: "Unclosed div",
+ url: "https://example.com",
+ include_tags: [],
+ exclude_tags: [],
+ only_main_content: false,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toBe("
Unclosed div
");
+ });
+
+ it("should handle HTML with comments and scripts", async () => {
+ const options = {
+ html: `
+
+
+
+
Real content
+
+
+
+ `,
+ url: "https://example.com",
+ include_tags: ["p"],
+ exclude_tags: ["script", "style", "noscript"],
+ only_main_content: true,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toContain("
Real content
");
+ expect(result).not.toContain("alert");
+ expect(result).not.toContain("color: red");
+ expect(result).not.toContain("Enable JavaScript");
+ });
+
+ it("should handle special characters and encoding", async () => {
+ const options = {
+ html: `
+
+
© 2024
+
<tag>
+
Special chars: á é í ó ú ñ
+
Emojis: 🎉 👍 🚀
+
+ `,
+ url: "https://example.com",
+ include_tags: ["p"],
+ exclude_tags: [],
+ only_main_content: true,
+ };
+
+ const result = await transformHtml(options);
+ expect(result).toContain("©");
+ expect(result).toContain("á é í ó ú ñ");
+ expect(result).toContain("🎉 👍 🚀");
+ });
+ });
+});