/**
 * Removes every NUL (`\u0000`) character from `text`.
 *
 * @param text - Text to clean.
 * @returns `text` with all NUL characters stripped.
 */
export function sanitizeText(text: string): string {
  // A string pattern makes replace() stop after the first match, so a global
  // regex is required to strip every occurrence.
  return text.replace(/\u0000/g, "");
}

/**
 * Collects the targets of every `<a href>` found in an HTML document.
 *
 * - `http(s)` URLs and `mailto:` links are returned as written (trimmed).
 * - Scheme-less hrefs (root-relative, path-relative, protocol-relative,
 *   query-only) are resolved against `baseUrl` with WHATWG URL rules, i.e.
 *   the way a browser resolves them.
 * - Empty hrefs, fragment-only hrefs (`#...`) and hrefs using any other scheme
 *   (`tel:`, `javascript:`, `data:`, ...) are skipped.
 *
 * @param html - HTML markup to scan.
 * @param baseUrl - Absolute URL of the page the markup was fetched from.
 * @returns De-duplicated links, in order of first appearance.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);

  // Parse once up front: rejects an invalid baseUrl immediately and avoids
  // re-parsing the base for every anchor.
  const base = new URL(baseUrl);

  // Leading "scheme:" as defined by RFC 3986 (letter, then letters/digits/+/./-).
  const schemePattern = /^[a-z][a-z0-9+.-]*:/i;
  const keptSchemes = new Set(["http", "https", "mailto"]);

  // A Set de-duplicates while preserving insertion (document) order.
  const links = new Set<string>();

  $("a").each((_, element) => {
    const href = ($(element).attr("href") ?? "").trim();

    // Nothing to follow: missing/empty href or a same-page fragment.
    if (href === "" || href.startsWith("#")) {
      return;
    }

    if (schemePattern.test(href)) {
      // Already absolute: keep it untouched if it is a scheme we report,
      // otherwise drop it instead of mangling it into a fake relative path.
      const scheme = href.slice(0, href.indexOf(":")).toLowerCase();
      if (keptSchemes.has(scheme)) {
        links.add(href);
      }
      return;
    }

    try {
      // Handles "/x", "x", "../x", "?q=1" and "//host/x" uniformly.
      links.add(new URL(href, base).href);
    } catch {
      // Unresolvable href (e.g. a bare "//"): skip this link rather than
      // failing the whole extraction.
    }
  });

  return [...links];
}