From 0b3c0ede49a77689082acb708c110aaae2fca327 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 16 Jul 2024 21:15:59 -0700
Subject: [PATCH] Added tests per @nicks request
---
.../WebScraper/__tests__/single_url.test.ts | 77 +++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 63408eaf..30a836ba 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -22,6 +22,81 @@ describe('scrapSingleUrl', () => {
}, 10000);
});
+import { scrapSingleUrl } from '../single_url';
+import { PageOptions } from '../../../lib/entities';
+
+// Stub fetchHtmlContent with a fixed HTML page holding one link of each kind the suite asserts on (absolute, root-relative, relative, fragment-only, mailto).
+jest.mock('../single_url', () => {
+  const originalModule = jest.requireActual('../single_url'); // start from the real module so every other export keeps its implementation
+  originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(`
+    <!DOCTYPE html>
+    <html>
+      <head><title>Test Page</title></head>
+      <body>
+        <a href="https://example.com">Absolute Link</a>
+        <a href="/relative">Relative Link</a>
+        <a href="page">Page Link</a>
+        <a href="#fragment">Fragment Link</a>
+        <a href="mailto:test@example.com">Email Link</a>
+      </body>
+    </html>
+  `);
+  return originalModule; // NOTE(review): the stub only reaches code that looks fetchHtmlContent up on the module exports — confirm scrapSingleUrl does
+});
+
+describe('scrapSingleUrl with linksOnPage', () => { // Checks the linksOnPage field of the scrapSingleUrl result; expected links presumably come from the mocked fetchHtmlContent fixture — TODO confirm
+ const baseUrl = 'https://test.com'; // URL passed to every scrapSingleUrl call in this suite
+
+ it('should not include linksOnPage when option is false', async () => {
+ const pageOptions: PageOptions = {}; // no options set; nothing here explicitly disables link extraction
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).toBeUndefined();
+ });
+
+ it('should include linksOnPage when option is true', async () => {
+ const pageOptions: PageOptions = { }; // NOTE(review): same empty options as the "option is false" test, yet the opposite result is asserted — one of the two must fail; the enabling option looks to be missing
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).toBeDefined();
+ expect(Array.isArray(result.linksOnPage)).toBe(true);
+ });
+
+ it('should correctly handle absolute URLs', async () => {
+ const pageOptions: PageOptions = { };
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).toContain('https://example.com'); // absolute link expected unchanged
+ });
+
+ it('should correctly handle relative URLs', async () => {
+ const pageOptions: PageOptions = { };
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).toContain('https://test.com/relative'); // relative link expected resolved against baseUrl
+ });
+
+ it('should correctly handle page URLs', async () => {
+ const pageOptions: PageOptions = { };
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).toContain('https://test.com/page'); // relative link expected resolved against baseUrl
+ });
+
+ it('should not include fragment-only links', async () => {
+ const pageOptions: PageOptions = { };
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).not.toContain('#fragment'); // raw fragment must be absent
+ expect(result.linksOnPage).not.toContain('https://test.com/#fragment'); // and so must its resolved form
+ });
+
+ it('should include mailto links', async () => {
+ const pageOptions: PageOptions = { };
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ expect(result.linksOnPage).toContain('mailto:test@example.com'); // mailto link expected verbatim
+ });
+
+ it('should return unique links', async () => {
+ const pageOptions: PageOptions = { };
+ const result = await scrapSingleUrl(baseUrl, pageOptions);
+ const uniqueLinks = new Set(result.linksOnPage); // a Set drops duplicates, so its size is the count of distinct links
+ expect(result.linksOnPage?.length).toBe(uniqueLinks.size); // fails if linksOnPage is undefined: undefined is compared against size 0
+ });
+});
it('should return a list of links on the mendable.ai page', async () => {
const url = 'https://mendable.ai';
@@ -36,3 +111,5 @@ it('should return a list of links on the mendable.ai page', async () => {
}, 10000);
+
+