From d39d3be64938082b6fb19e367b1d852f7844c442 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 18:38:03 -0700 Subject: [PATCH 1/6] Caleb: now extracting and returning a list of all links on the page for a customer --- apps/api/src/lib/entities.ts | 4 +- apps/api/src/scraper/WebScraper/single_url.ts | 44 +++++++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 089d373c..f60e197f 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -89,7 +89,8 @@ export class Document { warning?: string; index?: number; - + linksOnPage?: string[]; // Add this new field as a separate property + constructor(data: Partial) { if (!data.content) { throw new Error("Missing required fields"); @@ -102,6 +103,7 @@ export class Document { this.markdown = data.markdown || ""; this.childrenLinks = data.childrenLinks || undefined; this.provider = data.provider || undefined; + this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided } } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d24e5c2e..0aef2577 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -109,6 +109,38 @@ function getScrapingFallbackOrder( return scrapersInOrder as (typeof baseScrapers)[number][]; } +function extractLinks(html: string, baseUrl: string): string[] { + const $ = cheerio.load(html); + const links: string[] = []; + + // Parse the base URL to get the origin + const urlObject = new URL(baseUrl); + const origin = urlObject.origin; + + $('a').each((_, element) => { + const href = $(element).attr('href'); + if (href) { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to origin + links.push(`${origin}${href}`); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(`${baseUrl}/${href}`); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } + }); + + // Remove duplicates and return + return [...new Set(links)]; +} + export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { @@ -234,7 +266,6 @@ export async function scrapSingleUrl( scraperResponse.text = customScrapedContent.html; screenshot = customScrapedContent.screenshot; } - //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); return { @@ -309,6 +340,10 @@ export async function scrapSingleUrl( const soup = cheerio.load(rawHtml); const metadata = extractMetadata(soup, urlToScrap); + let linksOnPage: string[] | undefined; + + linksOnPage = extractLinks(rawHtml, urlToScrap); + let document: Document; if (screenshot && screenshot.length > 0) { document = { @@ -317,9 +352,10 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, + linksOnPage, metadata: { ...metadata, screenshot: screenshot, @@ -335,7 +371,7 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, metadata: { @@ -344,6 +380,7 @@ export async function scrapSingleUrl( pageStatusCode: pageStatusCode, pageError: pageError, }, + linksOnPage, }; } @@ -354,6 +391,7 @@ export async function scrapSingleUrl( content: "", markdown: "", html: "", + linksOnPage: [], metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode, From 98c788ca7a0a27f1c9da5a94971f59647634f0f3 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:13:52 -0700 Subject: [PATCH 2/6] Caleb: added a test to ensure links on page exists and isn't zero on mendable --- .../WebScraper/__tests__/single_url.test.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 7966648b..63408eaf 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -22,3 +22,17 @@ describe('scrapSingleUrl', () => { }, 10000); }); + +it('should return a list of links on the mendable.ai page', async () => { + const url = 'https://mendable.ai'; + const pageOptions: PageOptions = { includeHtml: true }; + + const result = await scrapSingleUrl(url, pageOptions); + + // Check if the result contains a list of links + expect(result.linksOnPage).toBeDefined(); + expect(Array.isArray(result.linksOnPage)).toBe(true); + expect(result.linksOnPage.length).toBeGreaterThan(0); +}, 10000); + + From 0b3c0ede49a77689082acb708c110aaae2fca327 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:15:59 -0700 Subject: [PATCH 3/6] Added tests per @nicks request --- .../WebScraper/__tests__/single_url.test.ts | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 63408eaf..30a836ba 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -22,6 +22,81 @@ describe('scrapSingleUrl', () => { }, 10000); }); +import { scrapSingleUrl } from '../single_url'; +import { PageOptions } from '../../../lib/entities'; + +// Mock the fetchHtmlContent function +jest.mock('../single_url', () => { + const originalModule = jest.requireActual('../single_url'); + originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(` + + Test Page + + Absolute Link + Relative Link + Page Link + Fragment Link + Email Link + + + `); + return originalModule; +}); + +describe('scrapSingleUrl with linksOnPage', () => { + const baseUrl = 'https://test.com'; + + it('should not include linksOnPage when option is false', async () => { + const pageOptions: PageOptions = {}; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toBeUndefined(); + }); + + it('should include linksOnPage when option is true', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toBeDefined(); + expect(Array.isArray(result.linksOnPage)).toBe(true); + }); + + it('should correctly handle absolute URLs', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('https://example.com'); + }); + + it('should correctly handle relative URLs', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('https://test.com/relative'); + }); + + it('should correctly handle page URLs', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('https://test.com/page'); + }); + + it('should not include fragment-only links', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).not.toContain('#fragment'); + expect(result.linksOnPage).not.toContain('https://test.com/#fragment'); + }); + + it('should include mailto links', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('mailto:test@example.com'); + }); + + it('should return unique links', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + const uniqueLinks = new Set(result.linksOnPage); + expect(result.linksOnPage?.length).toBe(uniqueLinks.size); + }); +}); it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; @@ -36,3 +111,5 @@ it('should return a list of links on the mendable.ai page', async () => { }, 10000); + + From da3c6bca374c9d51a21ede7812730b04465b315a Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:23:22 -0700 Subject: [PATCH 4/6] Caleb: added a simple test --- .../WebScraper/__tests__/single_url.test.ts | 87 +------------------ 1 file changed, 4 insertions(+), 83 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 30a836ba..0ee3493b 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -1,3 +1,7 @@ +import { scrapSingleUrl } from '../single_url'; +import { PageOptions } from '../../../lib/entities'; + + jest.mock('../single_url', () => { const originalModule = jest.requireActual('../single_url'); originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('Test

Roast

'); @@ -5,9 +9,6 @@ jest.mock('../single_url', () => { return originalModule; }); -import { scrapSingleUrl } from '../single_url'; -import { PageOptions } from '../../../lib/entities'; - describe('scrapSingleUrl', () => { it('should handle includeHtml option correctly', async () => { const url = 'https://roastmywebsite.ai'; @@ -22,82 +23,6 @@ describe('scrapSingleUrl', () => { }, 10000); }); -import { scrapSingleUrl } from '../single_url'; -import { PageOptions } from '../../../lib/entities'; - -// Mock the fetchHtmlContent function -jest.mock('../single_url', () => { - const originalModule = jest.requireActual('../single_url'); - originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(` - - Test Page - - Absolute Link - Relative Link - Page Link - Fragment Link - Email Link - - - `); - return originalModule; -}); - -describe('scrapSingleUrl with linksOnPage', () => { - const baseUrl = 'https://test.com'; - - it('should not include linksOnPage when option is false', async () => { - const pageOptions: PageOptions = {}; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toBeUndefined(); - }); - - it('should include linksOnPage when option is true', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toBeDefined(); - expect(Array.isArray(result.linksOnPage)).toBe(true); - }); - - it('should correctly handle absolute URLs', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('https://example.com'); - }); - - it('should correctly handle relative URLs', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('https://test.com/relative'); - }); - - it('should correctly handle page URLs', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('https://test.com/page'); - }); - - it('should not include fragment-only links', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).not.toContain('#fragment'); - expect(result.linksOnPage).not.toContain('https://test.com/#fragment'); - }); - - it('should include mailto links', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('mailto:test@example.com'); - }); - - it('should return unique links', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - const uniqueLinks = new Set(result.linksOnPage); - expect(result.linksOnPage?.length).toBe(uniqueLinks.size); - }); -}); - it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; const pageOptions: PageOptions = { includeHtml: true }; @@ -109,7 +34,3 @@ it('should return a list of links on the mendable.ai page', async () => { expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); }, 10000); - - - - From c5d1e7260d95d60b1369eab82ce7e5c0af28acff Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:29:05 -0700 Subject: [PATCH 5/6] Caleb: made changes per Rafaels requests --- .../WebScraper/__tests__/single_url.test.ts | 1 + apps/api/src/scraper/WebScraper/single_url.ts | 31 +---------------- .../api/src/scraper/WebScraper/utils/utils.ts | 34 +++++++++++++++++++ 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 0ee3493b..3ef138a5 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -33,4 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => { expect(result.linksOnPage).toBeDefined(); expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); + expect(result.linksOnPage).toContain('https://www.mendable.ai/blog') }, 10000); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0aef2577..f66a7c06 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; +import { extractLinks } from "./utils/utils"; dotenv.config(); @@ -109,37 +110,7 @@ function getScrapingFallbackOrder( return scrapersInOrder as (typeof baseScrapers)[number][]; } -function extractLinks(html: string, baseUrl: string): string[] { - const $ = cheerio.load(html); - const links: string[] = []; - // Parse the base URL to get the origin - const urlObject = new URL(baseUrl); - const origin = urlObject.origin; - - $('a').each((_, element) => { - const href = $(element).attr('href'); - if (href) { - if (href.startsWith('http://') || href.startsWith('https://')) { - // Absolute URL, add as is - links.push(href); - } else if (href.startsWith('/')) { - // Relative URL starting with '/', append to origin - links.push(`${origin}${href}`); - } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { - // Relative URL not starting with '/', append to base URL - links.push(`${baseUrl}/${href}`); - } else if (href.startsWith('mailto:')) { - // mailto: links, add as is - links.push(href); - } - // Fragment-only links (#) are ignored - } - }); - - // Remove duplicates and return - return [...new Set(links)]; -} export async function scrapSingleUrl( urlToScrap: string, diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index f9ce9b3c..3aa021a6 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -1,4 +1,6 @@ import axios from "axios"; +import * as cheerio from "cheerio"; + export async function attemptScrapWithRequests( urlToScrap: string @@ -21,3 +23,35 @@ export async function attemptScrapWithRequests( export function sanitizeText(text: string): string { return text.replace("\u0000", ""); } + +export function extractLinks(html: string, baseUrl: string): string[] { + const $ = cheerio.load(html); + const links: string[] = []; + + // Parse the base URL to get the origin + const urlObject = new URL(baseUrl); + const origin = urlObject.origin; + + $('a').each((_, element) => { + const href = $(element).attr('href'); + if (href) { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to origin + links.push(`${origin}${href}`); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(`${baseUrl}/${href}`); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } + }); + + // Remove duplicates and return + return [...new Set(links)]; +} \ No newline at end of file From 5b24d26c84ca68301af50199994f57021a15e424 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:33:12 -0700 Subject: [PATCH 6/6] Caleb; fixed test --- apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 3ef138a5..8a9df227 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -33,5 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => { expect(result.linksOnPage).toBeDefined(); expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); - expect(result.linksOnPage).toContain('https://www.mendable.ai/blog') + expect(result.linksOnPage).toContain('https://mendable.ai/blog') }, 10000);