Add unit tests for the linksOnPage field returned by scrapSingleUrl.

Review notes on this patch:
* The anchor markup inside the mocked HTML fixture and the blank context
  lines around each hunk were reconstructed; verify both against the base
  file before applying.
* The post-image blob id on the "index" line predates the fixture and
  comment changes below.

diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 63408eaf..30a836ba 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -22,6 +22,92 @@ describe('scrapSingleUrl', () => {
   }, 10000);
 });
 
+import { scrapSingleUrl } from '../single_url';
+import { PageOptions } from '../../../lib/entities';
+// NOTE(review): if the top of this file already imports these names, these two
+// lines are duplicate declarations and should be dropped - confirm.
+
+// Stub fetchHtmlContent on the real '../single_url' module so it resolves to a
+// fixed HTML fixture with one anchor per case asserted below: absolute,
+// root-relative, page-relative, fragment-only and mailto.
+// NOTE(review): this only reassigns the property on the module's export object.
+// If scrapSingleUrl reaches fetchHtmlContent through a local binding the stub is
+// bypassed and the tests hit the network - confirm against single_url.ts.
+jest.mock('../single_url', () => {
+  const originalModule = jest.requireActual('../single_url');
+  originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(`
+    <html>
+      <head><title>Test Page</title></head>
+      <body>
+        <a href="https://example.com">Absolute Link</a>
+        <a href="/relative">Relative Link</a>
+        <a href="page">Page Link</a>
+        <a href="#fragment">Fragment Link</a>
+        <a href="mailto:test@example.com">Email Link</a>
+      </body>
+    </html>
+  `);
+  return originalModule;
+});
+
+// Assertions on the linksOnPage field of the scrapSingleUrl result for https://test.com.
+describe('scrapSingleUrl with linksOnPage', () => {
+  const baseUrl = 'https://test.com';
+
+  // NOTE(review): this case and the next pass identical (empty) PageOptions but
+  // expect opposite results, so they cannot both pass. Set the option that
+  // controls link extraction here, or drop this case if links are always returned.
+  it('should not include linksOnPage when option is false', async () => {
+    const pageOptions: PageOptions = {};
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toBeUndefined();
+  });
+
+  it('should include linksOnPage when option is true', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toBeDefined();
+    expect(Array.isArray(result.linksOnPage)).toBe(true);
+  });
+
+  it('should correctly handle absolute URLs', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('https://example.com');
+  });
+
+  it('should correctly handle relative URLs', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('https://test.com/relative');
+  });
+
+  it('should correctly handle page URLs', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('https://test.com/page');
+  });
+
+  it('should not include fragment-only links', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).not.toContain('#fragment');
+    expect(result.linksOnPage).not.toContain('https://test.com/#fragment');
+  });
+
+  it('should include mailto links', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('mailto:test@example.com');
+  });
+
+  it('should return unique links', async () => {
+    const pageOptions: PageOptions = { };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    const uniqueLinks = new Set(result.linksOnPage);
+    expect(result.linksOnPage?.length).toBe(uniqueLinks.size);
+  });
+});
 
 it('should return a list of links on the mendable.ai page', async () => {
   const url = 'https://mendable.ai';
@@ -36,3 +122,5 @@ it('should return a list of links on the mendable.ai page', async () => {
 }, 10000);
 
 
+
+