mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 04:16:00 +08:00

new feature allowExternalContentLinks

This commit is contained in:
parent 9bf74bc774
commit a5fb45988c
@@ -51,6 +51,7 @@ export type CrawlerOptions = {
   ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
   allowBackwardCrawling?: boolean;
+  allowExternalContentLinks?: boolean;
 }

 export type WebScraperOptions = {
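With the new option on CrawlerOptions, a caller can opt into following links that leave the starting domain. A minimal sketch of enabling it (the type is restated locally so the snippet stands alone; the field set is taken from the hunk above):

type CrawlerOptions = {
  ignoreSitemap?: boolean;
  mode?: "default" | "fast";
  allowBackwardCrawling?: boolean;
  allowExternalContentLinks?: boolean;
};

const crawlerOptions: CrawlerOptions = {
  mode: "default",
  allowBackwardCrawling: false,
  allowExternalContentLinks: true, // new: keep content links that point off-domain
};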
@@ -23,6 +23,7 @@ export class WebCrawler {
   private robots: any;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
+  private allowExternalContentLinks: boolean;

   constructor({
     initialUrl,
@@ -32,7 +33,8 @@ export class WebCrawler {
     limit = 10000,
     generateImgAltText = false,
     maxCrawledDepth = 10,
-    allowBackwardCrawling = false
+    allowBackwardCrawling = false,
+    allowExternalContentLinks = false
   }: {
     initialUrl: string;
     includes?: string[];
@@ -42,6 +44,7 @@ export class WebCrawler {
     generateImgAltText?: boolean;
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
+    allowExternalContentLinks?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -55,6 +58,7 @@ export class WebCrawler {
     this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
   }

   private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@@ -98,9 +102,10 @@ export class WebCrawler {
       const linkHostname = normalizedLink.hostname.replace(/^www\./, '');

       // Ensure the protocol and hostname match, and the path starts with the initial URL's path
-      if (linkHostname !== initialHostname) {
-        return false;
-      }
+      // commented out so external links can be handled by allowExternalContentLinks
+      // if (linkHostname !== initialHostname) {
+      //   return false;
+      // }

       if (!this.allowBackwardCrawling) {
         if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
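With the hostname guard commented out, filterLinks no longer drops cross-domain links up front; classification now happens where links are collected. For reference, the removed guard amounts to this standalone check (a restatement for illustration, with a leading www. stripped as in the surrounding code):

// The check that was disabled, as a pure function.
const sameHost = (link: string, initialUrl: string): boolean =>
  new URL(link).hostname.replace(/^www\./, '') ===
  new URL(initialUrl).hostname.replace(/^www\./, '');

console.log(sameHost("https://www.example.com/a", "https://example.com")); // true
console.log(sameHost("https://partner.org/a", "https://example.com"));     // false: previously rejected here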
@@ -278,15 +283,26 @@ export class WebCrawler {
       const path = urlObj.pathname;


-      if (
-        this.isInternalLink(fullUrl) &&
-        this.noSections(fullUrl) &&
-        // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-        // this.matchesIncludes(path) &&
-        !this.matchesExcludes(path) &&
-        this.isRobotsAllowed(fullUrl)
-      ) {
-        links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+      if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
+        if (this.isInternalLink(fullUrl) &&
+          this.noSections(fullUrl) &&
+          // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
+          // this.matchesIncludes(path) &&
+          !this.matchesExcludes(path) &&
+          this.isRobotsAllowed(fullUrl)
+        ) {
+          links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+        }
+      } else { // EXTERNAL LINKS
+        if (
+          this.isInternalLink(url) && // avoid queuing links that were found on external pages
+          this.allowExternalContentLinks &&
+          !this.isSocialMediaOrEmail(fullUrl) &&
+          !this.matchesExcludes(fullUrl, true) &&
+          !this.isExternalMainPage(fullUrl)
+        ) {
+          links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+        }
       }
     });
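Restated as a predicate, the new external branch keeps a link only when it was found on an internal page, the flag is on, and the target is neither a blocked social/email host, an excluded domain, nor a bare external homepage. A hedged sketch (the helpers here are simplified stand-ins for the class methods in the diff, not their real implementations):

// Stand-in helpers: simplified assumptions for illustration only.
const isInternal = (url: string, baseOrigin: string): boolean =>
  new URL(url).origin === baseOrigin;

function keepExternalLink(
  sourceUrl: string,   // the page the link was found on
  fullUrl: string,     // the candidate link
  baseOrigin: string,
  allowExternalContentLinks: boolean,
): boolean {
  return (
    isInternal(sourceUrl, baseOrigin) &&      // avoid queuing links found on external pages
    allowExternalContentLinks &&
    !/twitter\.com|facebook\.com|^mailto:/.test(fullUrl) && // stand-in for isSocialMediaOrEmail
    new URL(fullUrl).pathname !== "/"         // stand-in for !isExternalMainPage
  );
}

console.log(keepExternalLink(
  "https://example.com/blog",        // internal source page
  "https://partner.org/articles/42", // deep external target
  "https://example.com",
  true,
)); // true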
@@ -320,9 +336,41 @@ export class WebCrawler {
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
   }

-  private matchesExcludes(url: string): boolean {
-    if (this.excludes.length === 0 || this.excludes[0] == "") return false;
-    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
+  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
+    return this.excludes.some((pattern) => {
+      if (onlyDomains)
+        return this.matchesExcludesExternalDomains(url);
+
+      return new RegExp(pattern).test(url);
+    });
   }

+  // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
+  private matchesExcludesExternalDomains(url: string) {
+    try {
+      const urlObj = new URL(url);
+      const hostname = urlObj.hostname;
+      const pathname = urlObj.pathname;
+
+      for (let domain of this.excludes) {
+        let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
+        let domainHostname = domainObj.hostname;
+        let domainPathname = domainObj.pathname;
+
+        if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
+          if (pathname.startsWith(domainPathname)) {
+            return true;
+          }
+        }
+      }
+      return false;
+    } catch (e) {
+      return false;
+    }
+  }
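The matcher accepts bare domains, subdomains, and domain/path prefixes. A standalone copy for experimentation (this.excludes becomes a parameter; the logic is otherwise as above):

function matchesExcludesExternalDomains(url: string, excludes: string[]): boolean {
  try {
    const { hostname, pathname } = new URL(url);
    for (const domain of excludes) {
      const domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
      if (
        (hostname === domainObj.hostname || hostname.endsWith(`.${domainObj.hostname}`)) &&
        pathname.startsWith(domainObj.pathname)
      ) {
        return true;
      }
    }
    return false;
  } catch {
    return false; // unparseable URLs are simply not excluded
  }
}

console.log(matchesExcludesExternalDomains("https://blog.example.com/post", ["example.com"]));      // true (subdomain)
console.log(matchesExcludesExternalDomains("https://example.com/blog/post", ["example.com/blog"])); // true (path prefix)
console.log(matchesExcludesExternalDomains("https://other.com/", ["example.com"]));                 // false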
+
+  private isExternalMainPage(url: string): boolean {
+    return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length)
+  }

   private noSections(link: string): boolean {
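isExternalMainPage flags bare homepages, so a footer link to a partner's front page is skipped while deep links are kept. The one-liner, copied out and exercised:

// True when the URL has no non-empty path segments after "https://host".
const isExternalMainPage = (url: string): boolean =>
  !Boolean(url.split("/").slice(3).filter(s => s.length > 0).length);

console.log(isExternalMainPage("https://example.com"));           // true
console.log(isExternalMainPage("https://example.com/"));          // true
console.log(isExternalMainPage("https://example.com/blog/post")); // false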
@@ -375,6 +423,10 @@ export class WebCrawler {
       "instagram.com",
       "pinterest.com",
       "mailto:",
+      "github.com",
+      "calendly.com",
+      "discord.gg",
+      "discord.com",
     ];
     return socialMediaOrEmail.some((ext) => url.includes(ext));
   }
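Note that this blocklist test is a plain substring match rather than a hostname comparison, which keeps it simple but deliberately broad:

// Substring matching also catches paths and, as a side effect,
// unrelated hosts that merely contain a blocked string.
const socialMediaOrEmail = ["github.com", "discord.gg", "mailto:"];
const isBlocked = (url: string) => socialMediaOrEmail.some((ext) => url.includes(ext));

console.log(isBlocked("https://github.com/mendableai/firecrawl")); // true
console.log(isBlocked("https://notgithub.com/page"));              // true (substring side effect)
console.log(isBlocked("https://example.com/contact"));             // false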
@@ -40,6 +40,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
+  private allowExternalContentLinks: boolean = false;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -172,6 +173,7 @@ export class WebScraperDataProvider {
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
       allowBackwardCrawling: this.allowBackwardCrawling,
+      allowExternalContentLinks: this.allowExternalContentLinks,
     });

     let links = await crawler.start(
@@ -489,6 +491,7 @@ export class WebScraperDataProvider {
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
     this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;

     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
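End to end, the flag defaults to false at every layer, so existing crawls are unaffected; a caller opts in through crawlerOptions. A hedged sketch of a request payload (the endpoint and surrounding fields are assumptions based on the option names in this diff, not confirmed by the commit):

// Hypothetical request sketch; only the crawlerOptions fields shown
// in this diff are taken from the commit.
async function startCrawl(): Promise<void> {
  const res = await fetch("https://api.firecrawl.dev/v0/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR_KEY", // placeholder key
    },
    body: JSON.stringify({
      url: "https://example.com",
      crawlerOptions: {
        allowBackwardCrawling: false,
        allowExternalContentLinks: true, // the new flag
      },
    }),
  });
  console.log(await res.json());
}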