From 79a75e088a58fdf8c2c2cd10acfc1d1a7f2409da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 19 Nov 2024 18:38:59 +0100 Subject: [PATCH] feat(crawl): allowSubdomains --- apps/api/src/controllers/v1/types.ts | 3 +++ apps/api/src/lib/crawl-redis.ts | 1 + apps/api/src/scraper/WebScraper/crawler.ts | 14 +++++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 2059ac8d..eefcd10e 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -211,6 +211,7 @@ const crawlerOptions = z.object({ limit: z.number().default(10000), // default? allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? allowExternalLinks: z.boolean().default(false), + allowSubdomains: z.boolean().default(false), ignoreSitemap: z.boolean().default(true), deduplicateSimilarURLs: z.boolean().default(true), ignoreQueryParameters: z.boolean().default(false), @@ -468,6 +469,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) { generateImgAltText: false, allowBackwardCrawling: x.allowBackwardLinks, allowExternalContentLinks: x.allowExternalLinks, + allowSubdomains: x.allowSubdomains, ignoreSitemap: x.ignoreSitemap, deduplicateSimilarURLs: x.deduplicateSimilarURLs, ignoreQueryParameters: x.ignoreQueryParameters, @@ -483,6 +485,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions maxDepth: x.maxDepth, allowBackwardLinks: x.allowBackwardCrawling, allowExternalLinks: x.allowExternalContentLinks, + allowSubdomains: x.allowSubdomains, ignoreSitemap: x.ignoreSitemap, deduplicateSimilarURLs: x.deduplicateSimilarURLs, ignoreQueryParameters: x.ignoreQueryParameters, diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 9bce160b..b6bae396 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -179,6 +179,7 @@ export function 
crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false, allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false, allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false, + allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false, }); if (sc.robots !== undefined) { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index c00fed78..3828f830 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -23,6 +23,7 @@ export class WebCrawler { private generateImgAltText: boolean; private allowBackwardCrawling: boolean; private allowExternalContentLinks: boolean; + private allowSubdomains: boolean; constructor({ jobId, @@ -35,7 +36,8 @@ export class WebCrawler { generateImgAltText = false, maxCrawledDepth = 10, allowBackwardCrawling = false, - allowExternalContentLinks = false + allowExternalContentLinks = false, + allowSubdomains = false, }: { jobId: string; initialUrl: string; @@ -48,6 +50,7 @@ export class WebCrawler { maxCrawledDepth?: number; allowBackwardCrawling?: boolean; allowExternalContentLinks?: boolean; + allowSubdomains?: boolean; }) { this.jobId = jobId; this.initialUrl = initialUrl; @@ -63,6 +66,7 @@ export class WebCrawler { this.generateImgAltText = generateImgAltText ?? false; this.allowBackwardCrawling = allowBackwardCrawling ?? false; this.allowExternalContentLinks = allowExternalContentLinks ?? false; + this.allowSubdomains = allowSubdomains ?? 
false; } public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] { @@ -214,6 +218,10 @@ export class WebCrawler { } } + if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) { + return fullUrl; + } + return null; } @@ -297,6 +305,10 @@ export class WebCrawler { return linkDomain === baseDomain; } + private isSubdomain(link: string): boolean { + return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".")); + } + public isFile(url: string): boolean { const fileExtensions = [ ".png",