From d39d3be64938082b6fb19e367b1d852f7844c442 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 16 Jul 2024 18:38:03 -0700
Subject: [PATCH 1/6] Caleb: now extracting and returning a list of all links
 on the page for a customer

---
 apps/api/src/lib/entities.ts                  |  4 +-
 apps/api/src/scraper/WebScraper/single_url.ts | 44 +++++++++++++++++--
 2 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 089d373c..f60e197f 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -89,7 +89,8 @@ export class Document {
   warning?: string;
 
   index?: number;
-
+  linksOnPage?: string[]; // Add this new field as a separate property
+  
   constructor(data: Partial<Document>) {
     if (!data.content) {
       throw new Error("Missing required fields");
@@ -102,6 +103,7 @@ export class Document {
     this.markdown = data.markdown || "";
     this.childrenLinks = data.childrenLinks || undefined;
     this.provider = data.provider || undefined;
+    this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
   }
 }
 
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index d24e5c2e..0aef2577 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -109,6 +109,38 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
+function extractLinks(html: string, baseUrl: string): string[] {
+  const $ = cheerio.load(html);
+  const links: string[] = [];
+
+  // Parse the base URL to get the origin
+  const urlObject = new URL(baseUrl);
+  const origin = urlObject.origin;
+
+  $('a').each((_, element) => {
+    const href = $(element).attr('href');
+    if (href) {
+      if (href.startsWith('http://') || href.startsWith('https://')) {
+        // Absolute URL, add as is
+        links.push(href);
+      } else if (href.startsWith('/')) {
+        // Relative URL starting with '/', append to origin
+        links.push(`${origin}${href}`);
+      } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
+        // Relative URL not starting with '/', append to base URL
+        links.push(`${baseUrl}/${href}`);
+      } else if (href.startsWith('mailto:')) {
+        // mailto: links, add as is
+        links.push(href);
+      }
+      // Fragment-only links (#) are ignored
+    }
+  });
+
+  // Remove duplicates and return
+  return [...new Set(links)];
+}
+
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = {
@@ -234,7 +266,6 @@ export async function scrapSingleUrl(
       scraperResponse.text = customScrapedContent.html;
       screenshot = customScrapedContent.screenshot;
     }
-
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
     return {
@@ -309,6 +340,10 @@ export async function scrapSingleUrl(
     const soup = cheerio.load(rawHtml);
     const metadata = extractMetadata(soup, urlToScrap);
 
+    let linksOnPage: string[] | undefined;
+
+    linksOnPage = extractLinks(rawHtml, urlToScrap);
+
     let document: Document;
     if (screenshot && screenshot.length > 0) {
       document = {
@@ -317,9 +352,10 @@ export async function scrapSingleUrl(
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
-          extractorOptions.mode === "llm-extraction-from-raw-html"
+            extractorOptions.mode === "llm-extraction-from-raw-html"
             ? rawHtml
             : undefined,
+        linksOnPage,
         metadata: {
           ...metadata,
           screenshot: screenshot,
@@ -335,7 +371,7 @@ export async function scrapSingleUrl(
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
-          extractorOptions.mode === "llm-extraction-from-raw-html"
+            extractorOptions.mode === "llm-extraction-from-raw-html"
             ? rawHtml
             : undefined,
         metadata: {
@@ -344,6 +380,7 @@ export async function scrapSingleUrl(
           pageStatusCode: pageStatusCode,
           pageError: pageError,
         },
+        linksOnPage,
       };
     }
 
@@ -354,6 +391,7 @@ export async function scrapSingleUrl(
       content: "",
       markdown: "",
       html: "",
+      linksOnPage: [],
       metadata: {
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,

From 98c788ca7a0a27f1c9da5a94971f59647634f0f3 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 16 Jul 2024 21:13:52 -0700
Subject: [PATCH 2/6] Caleb: added a test to ensure links on page exists and
 isn't zero on mendable

---
 .../WebScraper/__tests__/single_url.test.ts        | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 7966648b..63408eaf 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -22,3 +22,17 @@ describe('scrapSingleUrl', () => {
   }, 10000);
 });
 
+
+it('should return a list of links on the mendable.ai page', async () => {
+  const url = 'https://mendable.ai';
+  const pageOptions: PageOptions = { includeHtml: true };
+
+  const result = await scrapSingleUrl(url, pageOptions);
+
+  // Check if the result contains a list of links
+  expect(result.linksOnPage).toBeDefined();
+  expect(Array.isArray(result.linksOnPage)).toBe(true);
+  expect(result.linksOnPage.length).toBeGreaterThan(0);
+}, 10000);
+
+

From 0b3c0ede49a77689082acb708c110aaae2fca327 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 16 Jul 2024 21:15:59 -0700
Subject: [PATCH 3/6] Added tests per @nicks request

---
 .../WebScraper/__tests__/single_url.test.ts   | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 63408eaf..30a836ba 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -22,6 +22,81 @@ describe('scrapSingleUrl', () => {
   }, 10000);
 });
 
+import { scrapSingleUrl } from '../single_url';
+import { PageOptions } from '../../../lib/entities';
+
+// Mock the fetchHtmlContent function
+jest.mock('../single_url', () => {
+  const originalModule = jest.requireActual('../single_url');
+  originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(`
+    <html>
+      <head><title>Test Page</title></head>
+      <body>
+        <a href="https://example.com">Absolute Link</a>
+        <a href="/relative">Relative Link</a>
+        <a href="page">Page Link</a>
+        <a href="#fragment">Fragment Link</a>
+        <a href="mailto:test@example.com">Email Link</a>
+      </body>
+    </html>
+  `);
+  return originalModule;
+});
+
+describe('scrapSingleUrl with linksOnPage', () => {
+  const baseUrl = 'https://test.com';
+
+  it('should not include linksOnPage when option is false', async () => {
+    const pageOptions: PageOptions = {};
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toBeUndefined();
+  });
+
+  it('should include linksOnPage when option is true', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toBeDefined();
+    expect(Array.isArray(result.linksOnPage)).toBe(true);
+  });
+
+  it('should correctly handle absolute URLs', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('https://example.com');
+  });
+
+  it('should correctly handle relative URLs', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('https://test.com/relative');
+  });
+
+  it('should correctly handle page URLs', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('https://test.com/page');
+  });
+
+  it('should not include fragment-only links', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).not.toContain('#fragment');
+    expect(result.linksOnPage).not.toContain('https://test.com/#fragment');
+  });
+
+  it('should include mailto links', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    expect(result.linksOnPage).toContain('mailto:test@example.com');
+  });
+
+  it('should return unique links', async () => {
+    const pageOptions: PageOptions = {  };
+    const result = await scrapSingleUrl(baseUrl, pageOptions);
+    const uniqueLinks = new Set(result.linksOnPage);
+    expect(result.linksOnPage?.length).toBe(uniqueLinks.size);
+  });
+});
 
 it('should return a list of links on the mendable.ai page', async () => {
   const url = 'https://mendable.ai';
@@ -36,3 +111,5 @@ it('should return a list of links on the mendable.ai page', async () => {
 }, 10000);
 
 
+
+

From da3c6bca374c9d51a21ede7812730b04465b315a Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 16 Jul 2024 21:23:22 -0700
Subject: [PATCH 4/6] Caleb: added a simple test

---
 .../WebScraper/__tests__/single_url.test.ts   | 87 +------------------
 1 file changed, 4 insertions(+), 83 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 30a836ba..0ee3493b 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -1,3 +1,7 @@
+import { scrapSingleUrl } from '../single_url';
+import { PageOptions } from '../../../lib/entities';
+
+
 jest.mock('../single_url', () => {
   const originalModule = jest.requireActual('../single_url');
   originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
@@ -5,9 +9,6 @@ jest.mock('../single_url', () => {
   return originalModule;
 });
 
-import { scrapSingleUrl } from '../single_url';
-import { PageOptions } from '../../../lib/entities';
-
 describe('scrapSingleUrl', () => {
   it('should handle includeHtml option correctly', async () => {
     const url = 'https://roastmywebsite.ai';
@@ -22,82 +23,6 @@ describe('scrapSingleUrl', () => {
   }, 10000);
 });
 
-import { scrapSingleUrl } from '../single_url';
-import { PageOptions } from '../../../lib/entities';
-
-// Mock the fetchHtmlContent function
-jest.mock('../single_url', () => {
-  const originalModule = jest.requireActual('../single_url');
-  originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(`
-    <html>
-      <head><title>Test Page</title></head>
-      <body>
-        <a href="https://example.com">Absolute Link</a>
-        <a href="/relative">Relative Link</a>
-        <a href="page">Page Link</a>
-        <a href="#fragment">Fragment Link</a>
-        <a href="mailto:test@example.com">Email Link</a>
-      </body>
-    </html>
-  `);
-  return originalModule;
-});
-
-describe('scrapSingleUrl with linksOnPage', () => {
-  const baseUrl = 'https://test.com';
-
-  it('should not include linksOnPage when option is false', async () => {
-    const pageOptions: PageOptions = {};
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).toBeUndefined();
-  });
-
-  it('should include linksOnPage when option is true', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).toBeDefined();
-    expect(Array.isArray(result.linksOnPage)).toBe(true);
-  });
-
-  it('should correctly handle absolute URLs', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).toContain('https://example.com');
-  });
-
-  it('should correctly handle relative URLs', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).toContain('https://test.com/relative');
-  });
-
-  it('should correctly handle page URLs', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).toContain('https://test.com/page');
-  });
-
-  it('should not include fragment-only links', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).not.toContain('#fragment');
-    expect(result.linksOnPage).not.toContain('https://test.com/#fragment');
-  });
-
-  it('should include mailto links', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    expect(result.linksOnPage).toContain('mailto:test@example.com');
-  });
-
-  it('should return unique links', async () => {
-    const pageOptions: PageOptions = {  };
-    const result = await scrapSingleUrl(baseUrl, pageOptions);
-    const uniqueLinks = new Set(result.linksOnPage);
-    expect(result.linksOnPage?.length).toBe(uniqueLinks.size);
-  });
-});
-
 it('should return a list of links on the mendable.ai page', async () => {
   const url = 'https://mendable.ai';
   const pageOptions: PageOptions = { includeHtml: true };
@@ -109,7 +34,3 @@ it('should return a list of links on the mendable.ai page', async () => {
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
 }, 10000);
-
-
-
-

From c5d1e7260d95d60b1369eab82ce7e5c0af28acff Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Wed, 17 Jul 2024 11:29:05 -0700
Subject: [PATCH 5/6] Caleb: made changes per Rafael's requests

---
 .../WebScraper/__tests__/single_url.test.ts   |  1 +
 apps/api/src/scraper/WebScraper/single_url.ts | 31 +----------------
 .../api/src/scraper/WebScraper/utils/utils.ts | 34 +++++++++++++++++++
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 0ee3493b..3ef138a5 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -33,4 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => {
   expect(result.linksOnPage).toBeDefined();
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
+  expect(result.linksOnPage).toContain('https://www.mendable.ai/blog')
 }, 10000);
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 0aef2577..f66a7c06 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch";
 import { scrapWithFireEngine } from "./scrapers/fireEngine";
 import { scrapWithPlaywright } from "./scrapers/playwright";
 import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
+import { extractLinks } from "./utils/utils";
 
 dotenv.config();
 
@@ -109,37 +110,7 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
-function extractLinks(html: string, baseUrl: string): string[] {
-  const $ = cheerio.load(html);
-  const links: string[] = [];
 
-  // Parse the base URL to get the origin
-  const urlObject = new URL(baseUrl);
-  const origin = urlObject.origin;
-
-  $('a').each((_, element) => {
-    const href = $(element).attr('href');
-    if (href) {
-      if (href.startsWith('http://') || href.startsWith('https://')) {
-        // Absolute URL, add as is
-        links.push(href);
-      } else if (href.startsWith('/')) {
-        // Relative URL starting with '/', append to origin
-        links.push(`${origin}${href}`);
-      } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
-        // Relative URL not starting with '/', append to base URL
-        links.push(`${baseUrl}/${href}`);
-      } else if (href.startsWith('mailto:')) {
-        // mailto: links, add as is
-        links.push(href);
-      }
-      // Fragment-only links (#) are ignored
-    }
-  });
-
-  // Remove duplicates and return
-  return [...new Set(links)];
-}
 
 export async function scrapSingleUrl(
   urlToScrap: string,
diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts
index f9ce9b3c..3aa021a6 100644
--- a/apps/api/src/scraper/WebScraper/utils/utils.ts
+++ b/apps/api/src/scraper/WebScraper/utils/utils.ts
@@ -1,4 +1,6 @@
 import axios from "axios";
+import * as cheerio from "cheerio";
+
 
 export async function attemptScrapWithRequests(
   urlToScrap: string
@@ -21,3 +23,35 @@ export async function attemptScrapWithRequests(
 export function sanitizeText(text: string): string {
   return text.replace("\u0000", "");
 }
+
+/**
+ * Collects unique links from the `<a href>` elements of `html`, in document order.
+ *
+ * Absolute http(s) and `mailto:` hrefs are kept verbatim. Other hrefs are resolved
+ * against `baseUrl` with `new URL(href, base)` (so `page`, `../up`, `/root` and
+ * `//host/x` resolve relative to the page instead of by string concatenation) and
+ * kept only when the result is http(s); empty and fragment-only hrefs are skipped.
+ *
+ * @throws TypeError when `baseUrl` is not a valid absolute URL.
+ */
+export function extractLinks(html: string, baseUrl: string): string[] {
+  const $ = cheerio.load(html);
+  const base = new URL(baseUrl); // parsed once; throws early on a bad base URL
+  const links = new Set<string>(); // de-duplicates while keeping insertion order
+
+  $('a').each((_, element) => {
+    const href = $(element).attr('href')?.trim();
+    if (!href || href.startsWith('#')) return;
+    if (/^(https?:\/\/|mailto:)/i.test(href)) {
+      links.add(href);
+      return;
+    }
+    try {
+      const resolved = new URL(href, base);
+      if (/^https?:$/.test(resolved.protocol)) links.add(resolved.href);
+    } catch {
+      // Unparsable href: skip it rather than fail the whole scrape.
+    }
+  });
+  return [...links];
+}
\ No newline at end of file

From 5b24d26c84ca68301af50199994f57021a15e424 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Wed, 17 Jul 2024 11:33:12 -0700
Subject: [PATCH 6/6] Caleb: fixed test

---
 apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index 3ef138a5..8a9df227 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -33,5 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => {
   expect(result.linksOnPage).toBeDefined();
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
-  expect(result.linksOnPage).toContain('https://www.mendable.ai/blog')
+  expect(result.linksOnPage).toContain('https://mendable.ai/blog')
 }, 10000);