From a5fb45988cc3688ae884bb69016752e588ff85f4 Mon Sep 17 00:00:00 2001
From: Jeff Pereira
Date: Fri, 28 Jun 2024 17:23:40 -0700
Subject: [PATCH 1/3] new feature allowExternalContentLinks

---
 apps/api/src/lib/entities.ts               |  1 +
 apps/api/src/scraper/WebScraper/crawler.ts | 84 +++++++++++++++++-----
 apps/api/src/scraper/WebScraper/index.ts   |  3 +
 3 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 2f43b9a4..c7d110bd 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -51,6 +51,7 @@ export type CrawlerOptions = {
   ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
   allowBackwardCrawling?: boolean;
+  allowExternalContentLinks?: boolean;
 }
 
 export type WebScraperOptions = {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 5003845e..29baa1ce 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -23,6 +23,7 @@ export class WebCrawler {
   private robots: any;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
+  private allowExternalContentLinks: boolean;
 
   constructor({
     initialUrl,
@@ -32,7 +33,8 @@
     limit = 10000,
     generateImgAltText = false,
     maxCrawledDepth = 10,
-    allowBackwardCrawling = false
+    allowBackwardCrawling = false,
+    allowExternalContentLinks = false
   }: {
     initialUrl: string;
     includes?: string[];
@@ -42,6 +44,7 @@
     generateImgAltText?: boolean;
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
+    allowExternalContentLinks?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -55,6 +58,7 @@
     this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -98,9 +102,10 @@ export class WebCrawler { const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - if (linkHostname !== initialHostname) { - return false; - } + // commented to able to handling external link on allowExternalContentLinks + // if (linkHostname !== initialHostname) { + // return false; + // } if (!this.allowBackwardCrawling) { if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { @@ -278,15 +283,26 @@ export class WebCrawler { const path = urlObj.pathname; - if ( - this.isInternalLink(fullUrl) && - this.noSections(fullUrl) && - // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards - // this.matchesIncludes(path) && - !this.matchesExcludes(path) && - this.isRobotsAllowed(fullUrl) - ) { - links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + if(this.isInternalLink(fullUrl)){ // INTERNAL LINKS + if (this.isInternalLink(fullUrl) && + this.noSections(fullUrl) && + // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards + // this.matchesIncludes(path) && + !this.matchesExcludes(path) && + this.isRobotsAllowed(fullUrl) + ) { + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + } + }else{ // EXTERNAL LINKS + if( + this.isInternalLink(url) && //its avoid to add links from external pages on the queue + this.allowExternalContentLinks && + !this.isSocialMediaOrEmail(fullUrl) && + !this.matchesExcludes(fullUrl, true) && + !this.isExternalMainPage(fullUrl) + ){ + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + } } } }); @@ -320,9 +336,41 @@ export class WebCrawler { return this.includes.some((pattern) => new RegExp(pattern).test(url)); } - private matchesExcludes(url: string): boolean { - if (this.excludes.length === 0 || this.excludes[0] == "") return false; - return this.excludes.some((pattern) => new RegExp(pattern).test(url)); + private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { + return this.excludes.some((pattern) => { + if (onlyDomains) + return this.matchesExcludesExternalDomains(url); + + return this.excludes.some((pattern) => new RegExp(pattern).test(url)); + }); + } + + // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com" + private matchesExcludesExternalDomains(url: string) { + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + const pathname = urlObj.pathname; + + for (let domain of this.excludes) { + let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, '')); + let domainHostname = domainObj.hostname; + let domainPathname = domainObj.pathname; + + if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) { + if (pathname.startsWith(domainPathname)) { + return true; + } + } + } + return false; + } catch (e) { + return false; + } + } + + private isExternalMainPage(url:string):boolean { + return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length) } private noSections(link: string): boolean { @@ -375,6 +423,10 @@ export class WebCrawler { "instagram.com", "pinterest.com", "mailto:", + "github.com", + "calendly.com", + "discord.gg", + "discord.com", ]; return socialMediaOrEmail.some((ext) => url.includes(ext)); } diff --git 
From 4d6e25619b5c8aec8698ab90ea5532a8b924096d Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 1 Jul 2024 16:05:34 -0300
Subject: [PATCH 2/3] minor spacing and comment stuff

---
 apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 29baa1ce..831970ea 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -283,24 +283,22 @@ export class WebCrawler {
 
       const path = urlObj.pathname;
 
-      if(this.isInternalLink(fullUrl)){ // INTERNAL LINKS
+      if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
         if (this.isInternalLink(fullUrl) &&
           this.noSections(fullUrl) &&
-          // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-          // this.matchesIncludes(path) &&
           !this.matchesExcludes(path) &&
           this.isRobotsAllowed(fullUrl)
         ) {
           links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
         }
-      }else{ // EXTERNAL LINKS
-        if(
-          this.isInternalLink(url) && //its avoid to add links from external pages on the queue
+      } else { // EXTERNAL LINKS
+        if (
+          this.isInternalLink(url) &&
           this.allowExternalContentLinks &&
           !this.isSocialMediaOrEmail(fullUrl) &&
           !this.matchesExcludes(fullUrl, true) &&
          !this.isExternalMainPage(fullUrl)
-        ){
+        ) {
          links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
        }
      }
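Patch 2 above only adjusts spacing and comments; behavior is unchanged. The least obvious piece of the feature is still the domain-based exclusion introduced in patch 1, so here is a self-contained sketch of that matching rule, written under the assumption that it mirrors matchesExcludesExternalDomains exactly; the function name and sample domains are invented for illustration.

    // Assumed-equivalent standalone version of the hostname/path check from patch 1.
    function isExcludedExternal(url: string, excludes: string[]): boolean {
      try {
        const { hostname, pathname } = new URL(url);
        for (const domain of excludes) {
          // Normalizes "example.com", "blog.example.com", "example.com/blog", "https://example.com"
          const d = new URL("http://" + domain.replace(/^https?:\/\//, ""));
          const hostMatches = hostname === d.hostname || hostname.endsWith(`.${d.hostname}`);
          if (hostMatches && pathname.startsWith(d.pathname)) return true;
        }
        return false;
      } catch {
        return false; // unparsable URLs are never treated as excluded
      }
    }

    isExcludedExternal("https://example.com/blog/post-1", ["example.com/blog"]); // true: inside excluded subtree
    isExcludedExternal("https://example.com/about", ["example.com/blog"]);       // false: path does not match
    isExcludedExternal("https://blog.example.com/x", ["example.com"]);           // true: subdomain of excluded host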
From db4a7433655f16d38fcc3b6390b58acaeb8a07c7 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 2 Jul 2024 09:44:08 -0300
Subject: [PATCH 3/3] Added e2e test

---
 .../__tests__/e2e_full_withAuth/index.test.ts | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
index c8281edd..144661bb 100644
--- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -804,6 +804,46 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
     }, 180000);
 
+    it.concurrent("should crawl external content links when allowed", async () => {
+      const crawlInitResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          crawlerOptions: {
+            allowExternalContentLinks: true,
+            ignoreSitemap: true,
+            returnOnlyUrls: true,
+            limit: 50
+          }
+        });
+
+      expect(crawlInitResponse.statusCode).toBe(200);
+      expect(crawlInitResponse.body).toHaveProperty("jobId");
+
+      let crawlStatus: string;
+      let crawlData = [];
+      while (crawlStatus !== "completed") {
+        const statusResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        crawlStatus = statusResponse.body.status;
+        if (statusResponse.body.data) {
+          crawlData = statusResponse.body.data;
+        }
+        if (crawlStatus !== "completed") {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      console.log(crawlData);
+      expect(crawlData.length).toBeGreaterThan(0);
+      expect(crawlData).toEqual(expect.arrayContaining([
+        expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
+        expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
+        expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
+      ]));
+    }, 180000); // 3 minutes timeout
   });
 
   describe("POST /v0/crawlWebsitePreview", () => {
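Taken together, the three patches expose the feature through the public crawl endpoint. Below is a minimal client-side sketch mirroring the request shape used in the e2e test above; the base URL and the API-key variable are placeholders.

    // Hypothetical client call; the /v0/crawl path and body shape follow the e2e test.
    const res = await fetch("https://api.example.com/v0/crawl", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: "https://mendable.ai",
        crawlerOptions: { allowExternalContentLinks: true, limit: 50 },
      }),
    });
    const { jobId } = (await res.json()) as { jobId: string };
    // Poll /v0/crawl/status/{jobId} until status === "completed", as the test does.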