feat(crawl): allowSubdomains

This commit is contained in:
Gergő Móricz 2024-11-19 18:38:59 +01:00
parent 91caa01c5e
commit 79a75e088a
3 changed files with 17 additions and 1 deletion

View File

@ -211,6 +211,7 @@ const crawlerOptions = z.object({
limit: z.number().default(10000), // default? limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false), allowExternalLinks: z.boolean().default(false),
allowSubdomains: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true), ignoreSitemap: z.boolean().default(true),
deduplicateSimilarURLs: z.boolean().default(true), deduplicateSimilarURLs: z.boolean().default(true),
ignoreQueryParameters: z.boolean().default(false), ignoreQueryParameters: z.boolean().default(false),
@ -468,6 +469,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
generateImgAltText: false, generateImgAltText: false,
allowBackwardCrawling: x.allowBackwardLinks, allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks, allowExternalContentLinks: x.allowExternalLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap, ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs, deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters, ignoreQueryParameters: x.ignoreQueryParameters,
@ -483,6 +485,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
maxDepth: x.maxDepth, maxDepth: x.maxDepth,
allowBackwardLinks: x.allowBackwardCrawling, allowBackwardLinks: x.allowBackwardCrawling,
allowExternalLinks: x.allowExternalContentLinks, allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap, ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs, deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters, ignoreQueryParameters: x.ignoreQueryParameters,

View File

@ -179,6 +179,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false, generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false, allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false, allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
}); });
if (sc.robots !== undefined) { if (sc.robots !== undefined) {

View File

@ -23,6 +23,7 @@ export class WebCrawler {
private generateImgAltText: boolean; private generateImgAltText: boolean;
private allowBackwardCrawling: boolean; private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean; private allowExternalContentLinks: boolean;
private allowSubdomains: boolean;
constructor({ constructor({
jobId, jobId,
@ -35,7 +36,8 @@ export class WebCrawler {
generateImgAltText = false, generateImgAltText = false,
maxCrawledDepth = 10, maxCrawledDepth = 10,
allowBackwardCrawling = false, allowBackwardCrawling = false,
allowExternalContentLinks = false allowExternalContentLinks = false,
allowSubdomains = false,
}: { }: {
jobId: string; jobId: string;
initialUrl: string; initialUrl: string;
@ -48,6 +50,7 @@ export class WebCrawler {
maxCrawledDepth?: number; maxCrawledDepth?: number;
allowBackwardCrawling?: boolean; allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean; allowExternalContentLinks?: boolean;
allowSubdomains?: boolean;
}) { }) {
this.jobId = jobId; this.jobId = jobId;
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
@ -63,6 +66,7 @@ export class WebCrawler {
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false; this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false; this.allowExternalContentLinks = allowExternalContentLinks ?? false;
this.allowSubdomains = allowSubdomains ?? false;
} }
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] { public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@ -214,6 +218,10 @@ export class WebCrawler {
} }
} }
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
return fullUrl;
}
return null; return null;
} }
@ -297,6 +305,10 @@ export class WebCrawler {
return linkDomain === baseDomain; return linkDomain === baseDomain;
} }
/**
 * Reports whether `link` points at a subdomain of the crawl's base site.
 *
 * The base site is approximated as the last two dot-separated labels of the
 * hostname of `this.baseUrl` (e.g. `docs.example.com` -> `example.com`).
 * `link` is resolved against `this.baseUrl` and qualifies when its hostname
 * ends with `"." + <that suffix>`; the bare base domain itself does not match.
 *
 * NOTE(review): the two-label heuristic is not public-suffix aware, so a base
 * such as `example.co.uk` yields the suffix `co.uk` — confirm whether that is
 * acceptable or whether a public-suffix lookup is needed.
 *
 * @param link Absolute or relative URL to test.
 * @returns `true` when the link's hostname lies under the base domain;
 *          `false` otherwise, including when either URL cannot be parsed or
 *          the base host is an IPv4 literal.
 */
private isSubdomain(link: string): boolean {
  let linkHost: string;
  let baseHost: string;
  try {
    linkHost = new URL(link, this.baseUrl).hostname;
    baseHost = new URL(this.baseUrl).hostname;
  } catch (_) {
    // `new URL` throws a TypeError on malformed input; an unparsable link
    // cannot be a subdomain, so answer `false` instead of crashing the caller.
    return false;
  }

  // An IPv4 literal has no domain hierarchy: taking its last two octets as a
  // "domain" would make unrelated addresses (x.y.1.10) look like subdomains.
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(baseHost)) {
    return false;
  }

  const baseDomain = baseHost.split(".").slice(-2).join(".");
  return linkHost.endsWith("." + baseDomain);
}
public isFile(url: string): boolean { public isFile(url: string): boolean {
const fileExtensions = [ const fileExtensions = [
".png", ".png",