Caleb: now extracting and returning a list of all links on the page for a customer

2025-07-03 20:35:09 +08:00 · 2024-07-16 18:38:03 -07:00 · 2024-07-16 18:38:03 -07:00 · d39d3be649
commit d39d3be649
parent db0545014f
2 changed files with 44 additions and 4 deletions
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -89,6 +89,7 @@ export class Document {
  warning?: string;
  index?: number;
  linksOnPage?: string[]; // Add this new field as a separate property
  constructor(data: Partial<Document>) {
    if (!data.content) {
@ -102,6 +103,7 @@ export class Document {
    this.markdown = data.markdown || "";
    this.childrenLinks = data.childrenLinks || undefined;
    this.provider = data.provider || undefined;
    this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
  }
 }
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -109,6 +109,38 @@ function getScrapingFallbackOrder(
  return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  const links: string[] = [];
  // Parse the base URL to get the origin
  const urlObject = new URL(baseUrl);
  const origin = urlObject.origin;
  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (href) {
      if (href.startsWith('http://') || href.startsWith('https://')) {
        // Absolute URL, add as is
        links.push(href);
      } else if (href.startsWith('/')) {
        // Relative URL starting with '/', append to origin
        links.push(`${origin}${href}`);
      } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
        // Relative URL not starting with '/', append to base URL
        links.push(`${baseUrl}/${href}`);
      } else if (href.startsWith('mailto:')) {
        // mailto: links, add as is
        links.push(href);
      }
      // Fragment-only links (#) are ignored
    }
  });
  // Remove duplicates and return
  return [...new Set(links)];
 }
 export async function scrapSingleUrl(
  urlToScrap: string,
  pageOptions: PageOptions = {
@ -234,7 +266,6 @@ export async function scrapSingleUrl(
      scraperResponse.text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;
    }
    //* TODO: add an optional to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
    return {
@ -309,6 +340,10 @@ export async function scrapSingleUrl(
    const soup = cheerio.load(rawHtml);
    const metadata = extractMetadata(soup, urlToScrap);
    let linksOnPage: string[] | undefined;
    linksOnPage = extractLinks(rawHtml, urlToScrap);
    let document: Document;
    if (screenshot && screenshot.length > 0) {
      document = {
@ -317,9 +352,10 @@ export async function scrapSingleUrl(
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
-          extractorOptions.mode === "llm-extraction-from-raw-html"
+            extractorOptions.mode === "llm-extraction-from-raw-html"
            ? rawHtml
            : undefined,
        linksOnPage,
        metadata: {
          ...metadata,
          screenshot: screenshot,
@ -335,7 +371,7 @@ export async function scrapSingleUrl(
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
-          extractorOptions.mode === "llm-extraction-from-raw-html"
+            extractorOptions.mode === "llm-extraction-from-raw-html"
            ? rawHtml
            : undefined,
        metadata: {
@ -344,6 +380,7 @@ export async function scrapSingleUrl(
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
        linksOnPage,
      };
    }
@ -354,6 +391,7 @@ export async function scrapSingleUrl(
      content: "",
      markdown: "",
      html: "",
      linksOnPage: [],
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,