diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 089d373c..f60e197f 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -89,7 +89,8 @@ export class Document { warning?: string; index?: number; - + linksOnPage?: string[]; // Add this new field as a separate property + constructor(data: Partial<Document>) { if (!data.content) { throw new Error("Missing required fields"); @@ -102,6 +103,7 @@ export class Document { this.markdown = data.markdown || ""; this.childrenLinks = data.childrenLinks || undefined; this.provider = data.provider || undefined; + this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided } } diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 7966648b..8a9df227 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -1,3 +1,7 @@ +import { scrapSingleUrl } from '../single_url'; +import { PageOptions } from '../../../lib/entities'; + + jest.mock('../single_url', () => { const originalModule = jest.requireActual('../single_url'); originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
@@ -5,9 +9,6 @@ jest.mock('../single_url', () => { return originalModule; }); -import { scrapSingleUrl } from '../single_url'; -import { PageOptions } from '../../../lib/entities'; - describe('scrapSingleUrl', () => { it('should handle includeHtml option correctly', async () => { const url = 'https://roastmywebsite.ai'; @@ -22,3 +23,15 @@ describe('scrapSingleUrl', () => { }, 10000); }); +it('should return a list of links on the mendable.ai page', async () => { + const url = 'https://mendable.ai'; + const pageOptions: PageOptions = { includeHtml: true }; + + const result = await scrapSingleUrl(url, pageOptions); + + // Check if the result contains a list of links + expect(result.linksOnPage).toBeDefined(); + expect(Array.isArray(result.linksOnPage)).toBe(true); + expect(result.linksOnPage.length).toBeGreaterThan(0); + expect(result.linksOnPage).toContain('https://mendable.ai/blog') +}, 10000); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d24e5c2e..f66a7c06 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; +import { extractLinks } from "./utils/utils"; dotenv.config(); @@ -109,6 +110,8 @@ function getScrapingFallbackOrder( return scrapersInOrder as (typeof baseScrapers)[number][]; } + + export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { @@ -234,7 +237,6 @@ export async function scrapSingleUrl( scraperResponse.text = customScrapedContent.html; screenshot = customScrapedContent.screenshot; } - //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); return {
@@ -309,6 +311,10 @@ export async function scrapSingleUrl( const soup = cheerio.load(rawHtml); const metadata = extractMetadata(soup, urlToScrap); + let linksOnPage: string[] | undefined; + + linksOnPage = extractLinks(rawHtml, urlToScrap); + let document: Document; if (screenshot && screenshot.length > 0) { document = { @@ -317,9 +323,10 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, + linksOnPage, metadata: { ...metadata, screenshot: screenshot, @@ -335,7 +342,7 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, metadata: { @@ -344,6 +351,7 @@ export async function scrapSingleUrl( pageStatusCode: pageStatusCode, pageError: pageError, }, + linksOnPage, }; } @@ -354,6 +362,7 @@ export async function scrapSingleUrl( content: "", markdown: "", html: "", + linksOnPage: [], metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode, diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index f9ce9b3c..3aa021a6 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -1,4 +1,6 @@ import axios from "axios"; +import * as cheerio from "cheerio"; + export async function attemptScrapWithRequests( urlToScrap: string @@ -21,3 +23,35 @@ export async function attemptScrapWithRequests( export function sanitizeText(text: string): string { return text.replace("\u0000", ""); } + +export function extractLinks(html: string, baseUrl: string): string[] { + const $ = cheerio.load(html); + const links: string[] = []; + + // Parse the 
base URL to get the origin + const urlObject = new URL(baseUrl); + const origin = urlObject.origin; + + $('a').each((_, element) => { + const href = $(element).attr('href'); + if (href) { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to origin + links.push(`${origin}${href}`); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(`${baseUrl}/${href}`); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } + }); + + // Remove duplicates and return + return [...new Set(links)]; +} \ No newline at end of file