diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
index af7fe4a3..3e324d39 100644
--- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -826,6 +826,49 @@ describe("E2E Tests for API Routes", () => {
     expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
   }, 180000);
 
+  it.concurrent("should crawl external content links when allowed", async () => {
+    const crawlInitResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://mendable.ai",
+        crawlerOptions: {
+          allowExternalContentLinks: true,
+          ignoreSitemap: true,
+          returnOnlyUrls: true,
+          limit: 50
+        }
+      });
+
+    expect(crawlInitResponse.statusCode).toBe(200);
+    expect(crawlInitResponse.body).toHaveProperty("jobId");
+
+    // Initialize before the first read: strict TS flags an uninitialized
+    // `let` used in the loop condition as "used before being assigned".
+    let crawlStatus: string = "";
+    let crawlData: { url?: string }[] = [];
+    while (crawlStatus !== "completed") {
+      const statusResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      crawlStatus = statusResponse.body.status;
+      // Bail out immediately if the job dies instead of spinning until the jest timeout.
+      expect(crawlStatus).not.toBe("failed");
+      if (statusResponse.body.data) {
+        crawlData = statusResponse.body.data;
+      }
+      if (crawlStatus !== "completed") {
+        await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait 1 second before polling again
+      }
+    }
+    expect(crawlData.length).toBeGreaterThan(0);
+    expect(crawlData).toEqual(expect.arrayContaining([
+      expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
+      expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
+      expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
+    ]));
+  }, 180000); // 3 minutes timeout
 });
 
 describe("POST /v0/crawlWebsitePreview", () => {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index d2b3b002..3cd59b6c 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -52,6 +52,7 @@ export type CrawlerOptions = {
   ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
   allowBackwardCrawling?: boolean;
+  allowExternalContentLinks?: boolean;
 }
 
 export type WebScraperOptions = {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 5003845e..831970ea 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -23,6 +23,7 @@ export class WebCrawler {
   private robots: any;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
+  private allowExternalContentLinks: boolean;
 
   constructor({
     initialUrl,
@@ -32,7 +33,8 @@ export class WebCrawler {
     limit = 10000,
     generateImgAltText = false,
     maxCrawledDepth = 10,
-    allowBackwardCrawling = false
+    allowBackwardCrawling = false,
+    allowExternalContentLinks = false
   }: {
     initialUrl: string;
     includes?: string[];
@@ -42,6 +44,7 @@ export class WebCrawler {
     generateImgAltText?: boolean;
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
+    allowExternalContentLinks?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -55,6 +58,7 @@ export class WebCrawler {
     this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
   }
 
   private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@@ -98,9 +102,11 @@ export class WebCrawler {
       const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
 
       // Ensure the protocol and hostname match, and the path starts with the initial URL's path
-      if (linkHostname !== initialHostname) {
-        return false;
-      }
+      // External hostnames are only allowed through when allowExternalContentLinks
+      // is enabled; otherwise keep the original same-host restriction.
+      if (!this.allowExternalContentLinks && linkHostname !== initialHostname) {
+        return false;
+      }
 
       if (!this.allowBackwardCrawling) {
         if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
@@ -278,15 +284,25 @@ export class WebCrawler {
           const path = urlObj.pathname;
 
-          if (
-            this.isInternalLink(fullUrl) &&
-            this.noSections(fullUrl) &&
-            // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-            // this.matchesIncludes(path) &&
-            !this.matchesExcludes(path) &&
-            this.isRobotsAllowed(fullUrl)
-          ) {
-            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
-          }
+          if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
+            if (
+              this.noSections(fullUrl) &&
+              !this.matchesExcludes(path) &&
+              this.isRobotsAllowed(fullUrl)
+            ) {
+              links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+            }
+          } else { // EXTERNAL LINKS
+            // Only follow external links that were discovered on an internal page.
+            if (
+              this.isInternalLink(url) &&
+              this.allowExternalContentLinks &&
+              !this.isSocialMediaOrEmail(fullUrl) &&
+              !this.matchesExcludes(fullUrl, true) &&
+              !this.isExternalMainPage(fullUrl)
+            ) {
+              links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+            }
+          }
         }
       });
 
@@ -320,9 +336,43 @@ export class WebCrawler {
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
   }
 
-  private matchesExcludes(url: string): boolean {
-    if (this.excludes.length === 0 || this.excludes[0] == "") return false;
-    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
-  }
+  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
+    // Preserve the original short-circuit for an unset exclude list.
+    if (this.excludes.length === 0 || this.excludes[0] == "") return false;
+    // Domain-only matching iterates the excludes itself; call it once.
+    if (onlyDomains) return this.matchesExcludesExternalDomains(url);
+    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
+  }
+
+  // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
+  private matchesExcludesExternalDomains(url: string): boolean {
+    try {
+      const urlObj = new URL(url);
+      const hostname = urlObj.hostname;
+      const pathname = urlObj.pathname;
+
+      for (const domain of this.excludes) {
+        const domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
+        const domainHostname = domainObj.hostname;
+        const domainPathname = domainObj.pathname;
+
+        if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
+          if (pathname.startsWith(domainPathname)) {
+            return true;
+          }
+        }
+      }
+      return false;
+    } catch (e) {
+      // A malformed URL or exclude entry is treated as "not excluded".
+      return false;
+    }
+  }
+
+  // True when the external URL has no non-empty path segments (e.g. "https://example.com/"),
+  // i.e. it is a site's main page rather than a specific content page.
+  private isExternalMainPage(url: string): boolean {
+    return !url.split("/").slice(3).some((segment) => segment.length > 0);
+  }
 
   private noSections(link: string): boolean {
@@ -375,6 +425,10 @@ export class WebCrawler {
       "instagram.com",
       "pinterest.com",
       "mailto:",
+      "github.com",
+      "calendly.com",
+      "discord.gg",
+      "discord.com",
     ];
     return socialMediaOrEmail.some((ext) => url.includes(ext));
   }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 3badfa19..8b7de28a 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -40,6 +40,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
+  private allowExternalContentLinks: boolean = false;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -173,6 +174,7 @@ export class WebScraperDataProvider {
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
       allowBackwardCrawling: this.allowBackwardCrawling,
+      allowExternalContentLinks: this.allowExternalContentLinks,
     });
 
     let links = await crawler.start(
@@ -496,6 +498,7 @@ export class WebScraperDataProvider {
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
     this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {