Nick: added html-transformer unit tests

This commit is contained in:
Nicolas 2025-01-25 14:28:09 -03:00
parent 7fdecdc4d3
commit 02caa72f02

View File

@ -0,0 +1,319 @@
import {
extractLinks,
extractMetadata,
transformHtml,
} from "../html-transformer";
describe("HTML Transformer", () => {
describe("extractLinks", () => {
it("should return empty array for null or undefined input", async () => {
expect(await extractLinks(null)).toEqual([]);
expect(await extractLinks(undefined)).toEqual([]);
});
it("should extract links from HTML content", async () => {
const html = `
<html>
<body>
<a href="https://example.com">Example</a>
<a href="https://test.com">Test</a>
</body>
</html>
`;
const links = await extractLinks(html);
expect(links).toContain("https://example.com");
expect(links).toContain("https://test.com");
});
it("should handle relative links", async () => {
const html = `
<html>
<body>
<a href="/path/to/page">Relative</a>
<a href="../another/page">Parent Path</a>
<a href="./local/page">Local Path</a>
<a href="relative/path">Implicit Relative</a>
<a href="?param=value">Query Param</a>
<a href="#section">Hash Link</a>
</body>
</html>
`;
const links = await extractLinks(html);
expect(links).toEqual([
"/path/to/page",
"../another/page",
"./local/page",
"relative/path",
"?param=value",
"#section",
]);
});
it("should handle complex nested HTML structure", async () => {
const html = `
<html>
<body>
<div class="container">
<nav>
<ul>
<li><a href="https://nav1.com">Nav 1</a></li>
<li><a href="https://nav2.com">Nav 2</a></li>
</ul>
</nav>
<main>
<article>
<p>Some text with a <a href="https://inline.com">link</a></p>
<div class="nested">
<a href="https://nested.com">Nested Link</a>
</div>
</article>
</main>
</div>
</body>
</html>
`;
const links = await extractLinks(html);
expect(links).toContain("https://nav1.com");
expect(links).toContain("https://nav2.com");
expect(links).toContain("https://inline.com");
expect(links).toContain("https://nested.com");
});
it("should handle malformed HTML gracefully", async () => {
const html = `
<div>
<a href="https://valid.com">Valid</a>
<a href="invalid">Invalid</a>
<a>No href</a>
<a href="">Empty href</a>
<a href="javascript:void(0)">JavaScript href</a>
<a href="mailto:test@example.com">Email link</a>
</div>
`;
const links = await extractLinks(html);
expect(links).toContain("https://valid.com");
// Other links should be filtered out or handled appropriately
});
});
describe("extractMetadata", () => {
it("should return empty array for null or undefined input", async () => {
expect(await extractMetadata(null)).toEqual([]);
expect(await extractMetadata(undefined)).toEqual([]);
});
it("should extract comprehensive metadata from HTML content", async () => {
const html = `
<html>
<head>
<title>Test Page Title</title>
<meta name="description" content="Detailed page description">
<meta name="keywords" content="test,page,keywords">
<meta name="author" content="Test Author">
<meta property="og:title" content="OpenGraph Title">
<meta property="og:description" content="OpenGraph Description">
<meta property="og:image" content="https://example.com/image.jpg">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Twitter Title">
<link rel="canonical" href="https://example.com/canonical">
</head>
<body></body>
</html>
`;
const metadata = await extractMetadata(html);
expect(metadata).toMatchObject({
"twitter:title": "Twitter Title",
ogImage: "https://example.com/image.jpg",
"og:image": "https://example.com/image.jpg",
ogDescription: "OpenGraph Description",
"twitter:card": "summary",
title: "Test Page Title",
ogTitle: "OpenGraph Title",
author: "Test Author",
keywords: "test,page,keywords",
"og:title": "OpenGraph Title",
"og:description": "OpenGraph Description",
description: "Detailed page description",
});
});
it("should handle metadata with special characters and encoding", async () => {
const html = `
<html>
<head>
<title>Test &amp; Page with © symbols</title>
<meta name="description" content="Description with &quot;quotes&quot; and émojis 🎉">
<meta property="og:title" content="Title with < and > symbols">
</head>
<body></body>
</html>
`;
const metadata = await extractMetadata(html);
expect(metadata.title).toContain("&");
expect(metadata.description).toContain("quotes");
});
it("should handle missing or malformed metadata gracefully", async () => {
const html = `
<html>
<head>
<meta name="description" content="">
<meta property="og:title">
<meta name="keywords" content=" ">
</head>
<body></body>
</html>
`;
const metadata = await extractMetadata(html);
expect(metadata).toBeDefined();
});
});
describe("transformHtml", () => {
it("should transform HTML content according to options", async () => {
const options = {
html: "<div><p>Test</p><span>Remove me</span></div>",
url: "https://example.com",
include_tags: ["p"],
exclude_tags: ["span"],
only_main_content: true,
};
const result = await transformHtml(options);
expect(result).toContain("<p>");
expect(result).not.toContain("<span>");
});
it("should handle complex content filtering", async () => {
const options = {
html: `
<div class="wrapper">
<header>
<nav>Navigation</nav>
</header>
<main>
<article>
<h1>Title</h1>
<p>Important content</p>
<div class="ads">Advertisement</div>
<aside>Sidebar</aside>
<div class="social-share">Share buttons</div>
</article>
</main>
<footer>Footer content</footer>
</div>
`,
url: "https://example.com",
include_tags: ["article", "h1", "p"],
exclude_tags: ["nav", "aside", "footer", ".ads", ".social-share"],
only_main_content: true,
};
const result = await transformHtml(options);
expect(result).toContain("<h1>Title</h1>");
expect(result).toContain("<p>Important content</p>");
expect(result).not.toContain("Navigation");
expect(result).not.toContain("Advertisement");
expect(result).not.toContain("Share buttons");
expect(result).not.toContain("Footer content");
});
it("should handle nested content preservation and absolute links", async () => {
const options = {
html: `
<article>
<div class="content">
<h2>Section</h2>
<p>Text with <strong>bold</strong> and <em>emphasis</em></p>
<ul>
<li>Item 1</li>
<li>Item 2 <a href="#">with link</a></li>
</ul>
</div>
</article>
`,
url: "https://example.com",
include_tags: ["article", "p", "ul", "li"],
exclude_tags: [],
only_main_content: true,
};
const result = await transformHtml(options);
expect(result).toContain("<strong>bold</strong>");
expect(result).toContain("<em>emphasis</em>");
expect(result).toContain('<a href="https://example.com/#">');
});
it("should handle empty HTML content", async () => {
const options = {
html: "",
url: "https://example.com",
include_tags: [],
exclude_tags: [],
only_main_content: false,
};
const result = await transformHtml(options);
expect(result).toBe("<html><body></body></html>");
});
it("should handle malformed HTML", async () => {
const options = {
html: "<div>Unclosed div",
url: "https://example.com",
include_tags: [],
exclude_tags: [],
only_main_content: false,
};
const result = await transformHtml(options);
expect(result).toBe("<html><body><div>Unclosed div</div></body></html>");
});
it("should handle HTML with comments and scripts", async () => {
const options = {
html: `
<div>
<!-- Comment -->
<script>alert('test');</script>
<p>Real content</p>
<style>.test { color: red; }</style>
<noscript>Enable JavaScript</noscript>
</div>
`,
url: "https://example.com",
include_tags: ["p"],
exclude_tags: ["script", "style", "noscript"],
only_main_content: true,
};
const result = await transformHtml(options);
expect(result).toContain("<p>Real content</p>");
expect(result).not.toContain("alert");
expect(result).not.toContain("color: red");
expect(result).not.toContain("Enable JavaScript");
});
it("should handle special characters and encoding", async () => {
const options = {
html: `
<div>
<p>&copy; 2024</p>
<p>&lt;tag&gt;</p>
<p>Special chars: á é í ó ú ñ</p>
<p>Emojis: 🎉 👍 🚀</p>
</div>
`,
url: "https://example.com",
include_tags: ["p"],
exclude_tags: [],
only_main_content: true,
};
const result = await transformHtml(options);
expect(result).toContain("©");
expect(result).toContain("á é í ó ú ñ");
expect(result).toContain("🎉 👍 🚀");
});
});
});