From a5fb45988cc3688ae884bb69016752e588ff85f4 Mon Sep 17 00:00:00 2001
From: Jeff Pereira
Date: Fri, 28 Jun 2024 17:23:40 -0700
Subject: [PATCH 1/3] new feature allowExternalContentLinks

---
 apps/api/src/lib/entities.ts               |  1 +
 apps/api/src/scraper/WebScraper/crawler.ts | 84 +++++++++++++++++-----
 apps/api/src/scraper/WebScraper/index.ts   |  3 +
 3 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 2f43b9a4..c7d110bd 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -51,6 +51,7 @@ export type CrawlerOptions = {
   ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
   allowBackwardCrawling?: boolean;
+  allowExternalContentLinks?: boolean;
 }
 
 export type WebScraperOptions = {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 5003845e..29baa1ce 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -23,6 +23,7 @@ export class WebCrawler {
   private robots: any;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
+  private allowExternalContentLinks: boolean;
 
   constructor({
     initialUrl,
@@ -32,7 +33,8 @@
     limit = 10000,
     generateImgAltText = false,
     maxCrawledDepth = 10,
-    allowBackwardCrawling = false
+    allowBackwardCrawling = false,
+    allowExternalContentLinks = false
   }: {
     initialUrl: string;
     includes?: string[];
@@ -42,6 +44,7 @@
     generateImgAltText?: boolean;
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
+    allowExternalContentLinks?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -55,6 +58,7 @@
     this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -98,9 +102,10 @@ export class WebCrawler { const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - if (linkHostname !== initialHostname) { - return false; - } + // commented to able to handling external link on allowExternalContentLinks + // if (linkHostname !== initialHostname) { + // return false; + // } if (!this.allowBackwardCrawling) { if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { @@ -278,15 +283,26 @@ export class WebCrawler { const path = urlObj.pathname; - if ( - this.isInternalLink(fullUrl) && - this.noSections(fullUrl) && - // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards - // this.matchesIncludes(path) && - !this.matchesExcludes(path) && - this.isRobotsAllowed(fullUrl) - ) { - links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + if(this.isInternalLink(fullUrl)){ // INTERNAL LINKS + if (this.isInternalLink(fullUrl) && + this.noSections(fullUrl) && + // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards + // this.matchesIncludes(path) && + !this.matchesExcludes(path) && + this.isRobotsAllowed(fullUrl) + ) { + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + } + }else{ // EXTERNAL LINKS + if( + this.isInternalLink(url) && //its avoid to add links from external pages on the queue + this.allowExternalContentLinks && + !this.isSocialMediaOrEmail(fullUrl) && + !this.matchesExcludes(fullUrl, true) && + !this.isExternalMainPage(fullUrl) + ){ + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + } } } }); @@ -320,9 +336,41 @@ export class WebCrawler { return this.includes.some((pattern) => new RegExp(pattern).test(url)); } - private matchesExcludes(url: string): boolean { - if (this.excludes.length === 0 || this.excludes[0] == "") return false; - return this.excludes.some((pattern) => new RegExp(pattern).test(url)); + private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { + return this.excludes.some((pattern) => { + if (onlyDomains) + return this.matchesExcludesExternalDomains(url); + + return this.excludes.some((pattern) => new RegExp(pattern).test(url)); + }); + } + + // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com" + private matchesExcludesExternalDomains(url: string) { + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + const pathname = urlObj.pathname; + + for (let domain of this.excludes) { + let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, '')); + let domainHostname = domainObj.hostname; + let domainPathname = domainObj.pathname; + + if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) { + if (pathname.startsWith(domainPathname)) { + return true; + } + } + } + return false; + } catch (e) { + return false; + } + } + + private isExternalMainPage(url:string):boolean { + return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length) } private noSections(link: string): boolean { @@ -375,6 +423,10 @@ export class WebCrawler { "instagram.com", "pinterest.com", "mailto:", + "github.com", + "calendly.com", + "discord.gg", + "discord.com", ]; return socialMediaOrEmail.some((ext) => url.includes(ext)); } diff --git 
From 4d6e25619b5c8aec8698ab90ea5532a8b924096d Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 1 Jul 2024 16:05:34 -0300
Subject: [PATCH 2/3] minor spacing and comment stuff

---
 apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 29baa1ce..831970ea 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -283,24 +283,22 @@ export class WebCrawler {
 
       const path = urlObj.pathname;
 
-      if(this.isInternalLink(fullUrl)){ // INTERNAL LINKS
+      if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
         if (this.isInternalLink(fullUrl) &&
           this.noSections(fullUrl) &&
-          // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-          // this.matchesIncludes(path) &&
           !this.matchesExcludes(path) &&
           this.isRobotsAllowed(fullUrl)
         ) {
           links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
         }
-      }else{ // EXTERNAL LINKS
-        if(
-          this.isInternalLink(url) && //its avoid to add links from external pages on the queue
+      } else { // EXTERNAL LINKS
+        if (
+          this.isInternalLink(url) &&
           this.allowExternalContentLinks &&
           !this.isSocialMediaOrEmail(fullUrl) &&
           !this.matchesExcludes(fullUrl, true) &&
          !this.isExternalMainPage(fullUrl)
-        ){
+        ) {
          links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
        }
      }
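Patch 2 above only adjusts spacing and comments; behavior is unchanged. The least obvious piece of the feature is still the domain-based exclusion introduced in patch 1, so here is a self-contained sketch of that matching rule, written under the assumption that it mirrors matchesExcludesExternalDomains exactly; the function name and sample domains are invented for illustration.

    // Assumed-equivalent standalone version of the hostname/path check from patch 1.
    function isExcludedExternal(url: string, excludes: string[]): boolean {
      try {
        const { hostname, pathname } = new URL(url);
        for (const domain of excludes) {
          // Normalizes "example.com", "blog.example.com", "example.com/blog", "https://example.com"
          const d = new URL("http://" + domain.replace(/^https?:\/\//, ""));
          const hostMatches = hostname === d.hostname || hostname.endsWith(`.${d.hostname}`);
          if (hostMatches && pathname.startsWith(d.pathname)) return true;
        }
        return false;
      } catch {
        return false; // unparsable URLs are never treated as excluded
      }
    }

    isExcludedExternal("https://example.com/blog/post-1", ["example.com/blog"]); // true: inside excluded subtree
    isExcludedExternal("https://example.com/about", ["example.com/blog"]);       // false: path does not match
    isExcludedExternal("https://blog.example.com/x", ["example.com"]);           // true: subdomain of excluded host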
From db4a7433655f16d38fcc3b6390b58acaeb8a07c7 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 2 Jul 2024 09:44:08 -0300
Subject: [PATCH 3/3] Added e2e test

---
 .../__tests__/e2e_full_withAuth/index.test.ts | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
index c8281edd..144661bb 100644
--- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -804,6 +804,46 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
     }, 180000);
 
+    it.concurrent("should crawl external content links when allowed", async () => {
+      const crawlInitResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          crawlerOptions: {
+            allowExternalContentLinks: true,
+            ignoreSitemap: true,
+            returnOnlyUrls: true,
+            limit: 50
+          }
+        });
+
+      expect(crawlInitResponse.statusCode).toBe(200);
+      expect(crawlInitResponse.body).toHaveProperty("jobId");
+
+      let crawlStatus: string;
+      let crawlData = [];
+      while (crawlStatus !== "completed") {
+        const statusResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        crawlStatus = statusResponse.body.status;
+        if (statusResponse.body.data) {
+          crawlData = statusResponse.body.data;
+        }
+        if (crawlStatus !== "completed") {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      console.log(crawlData);
+      expect(crawlData.length).toBeGreaterThan(0);
+      expect(crawlData).toEqual(expect.arrayContaining([
+        expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
+        expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
+        expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
+      ]));
+    }, 180000); // 3 minutes timeout
   });
 
   describe("POST /v0/crawlWebsitePreview", () => {
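Taken together, the three patches expose the feature through the public crawl endpoint. Below is a minimal client-side sketch mirroring the request shape used in the e2e test above; the base URL and the API-key variable are placeholders.

    // Hypothetical client call; the /v0/crawl path and body shape follow the e2e test.
    const res = await fetch("https://api.example.com/v0/crawl", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: "https://mendable.ai",
        crawlerOptions: { allowExternalContentLinks: true, limit: 50 },
      }),
    });
    const { jobId } = (await res.json()) as { jobId: string };
    // Poll /v0/crawl/status/{jobId} until status === "completed", as the test does.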