feat(crawl): allowSubdomains

This commit is contained in:
Gergő Móricz 2024-11-19 18:38:59 +01:00
parent 91caa01c5e
commit 79a75e088a
3 changed files with 17 additions and 1 deletion

View File

@ -211,6 +211,7 @@ const crawlerOptions = z.object({
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
allowSubdomains: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
deduplicateSimilarURLs: z.boolean().default(true),
ignoreQueryParameters: z.boolean().default(false),
@ -468,6 +469,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
generateImgAltText: false,
allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
@ -483,6 +485,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
maxDepth: x.maxDepth,
allowBackwardLinks: x.allowBackwardCrawling,
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,

View File

@ -179,6 +179,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
});
if (sc.robots !== undefined) {

View File

@ -23,6 +23,7 @@ export class WebCrawler {
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
private allowSubdomains: boolean;
constructor({
jobId,
@ -35,7 +36,8 @@ export class WebCrawler {
generateImgAltText = false,
maxCrawledDepth = 10,
allowBackwardCrawling = false,
allowExternalContentLinks = false
allowExternalContentLinks = false,
allowSubdomains = false,
}: {
jobId: string;
initialUrl: string;
@ -48,6 +50,7 @@ export class WebCrawler {
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
allowSubdomains?: boolean;
}) {
this.jobId = jobId;
this.initialUrl = initialUrl;
@ -63,6 +66,7 @@ export class WebCrawler {
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
this.allowSubdomains = allowSubdomains ?? false;
}
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@ -214,6 +218,10 @@ export class WebCrawler {
}
}
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
return fullUrl;
}
return null;
}
@ -297,6 +305,10 @@ export class WebCrawler {
return linkDomain === baseDomain;
}
/**
 * Reports whether `link` points at a subdomain of the crawl's base site.
 *
 * The base site is approximated by the last two dot-separated labels of the
 * base URL's hostname (e.g. `docs.example.com` -> `example.com`). A link
 * matches when its hostname ends with `"." + <those labels>`, so the bare
 * two-label domain itself does not match.
 *
 * NOTE(review): this heuristic is not Public Suffix List aware. For a base
 * such as `shop.example.co.uk` the computed suffix is `co.uk`, so every
 * `*.co.uk` host matches. Fixing that needs a PSL lookup — TODO confirm
 * whether adding such a dependency is acceptable.
 *
 * @param link - Absolute or relative URL; relative values are resolved
 *   against `this.baseUrl`.
 * @returns `true` when the link's hostname is a subdomain of the base site.
 * @throws TypeError if `link` or `this.baseUrl` cannot be parsed as a URL.
 */
private isSubdomain(link: string): boolean {
  const baseHostname = new URL(this.baseUrl).hostname;
  const linkHostname = new URL(link, this.baseUrl).hostname;

  // An IPv4 literal has no subdomains. Without this guard a base of
  // `10.0.1.1` would yield the suffix `1.1` and wrongly accept any other
  // address ending in `.1.1` (e.g. `192.168.1.1`).
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(baseHostname)) {
    return false;
  }

  const siteSuffix = baseHostname.split(".").slice(-2).join(".");
  return linkHostname.endsWith("." + siteSuffix);
}
public isFile(url: string): boolean {
const fileExtensions = [
".png",