mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 14:28:59 +08:00
feat(crawl): allowSubdomain
This commit is contained in:
parent
91caa01c5e
commit
79a75e088a
@ -211,6 +211,7 @@ const crawlerOptions = z.object({
|
||||
limit: z.number().default(10000), // default?
|
||||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
allowSubdomains: z.boolean().default(false),
|
||||
ignoreSitemap: z.boolean().default(true),
|
||||
deduplicateSimilarURLs: z.boolean().default(true),
|
||||
ignoreQueryParameters: z.boolean().default(false),
|
||||
@ -468,6 +469,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
||||
generateImgAltText: false,
|
||||
allowBackwardCrawling: x.allowBackwardLinks,
|
||||
allowExternalContentLinks: x.allowExternalLinks,
|
||||
allowSubdomains: x.allowSubdomains,
|
||||
ignoreSitemap: x.ignoreSitemap,
|
||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||
@ -483,6 +485,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
|
||||
maxDepth: x.maxDepth,
|
||||
allowBackwardLinks: x.allowBackwardCrawling,
|
||||
allowExternalLinks: x.allowExternalContentLinks,
|
||||
allowSubdomains: x.allowSubdomains,
|
||||
ignoreSitemap: x.ignoreSitemap,
|
||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||
|
@ -179,6 +179,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
|
||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
||||
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
|
||||
});
|
||||
|
||||
if (sc.robots !== undefined) {
|
||||
|
@ -23,6 +23,7 @@ export class WebCrawler {
|
||||
private generateImgAltText: boolean;
|
||||
private allowBackwardCrawling: boolean;
|
||||
private allowExternalContentLinks: boolean;
|
||||
private allowSubdomains: boolean;
|
||||
|
||||
constructor({
|
||||
jobId,
|
||||
@ -35,7 +36,8 @@ export class WebCrawler {
|
||||
generateImgAltText = false,
|
||||
maxCrawledDepth = 10,
|
||||
allowBackwardCrawling = false,
|
||||
allowExternalContentLinks = false
|
||||
allowExternalContentLinks = false,
|
||||
allowSubdomains = false,
|
||||
}: {
|
||||
jobId: string;
|
||||
initialUrl: string;
|
||||
@ -48,6 +50,7 @@ export class WebCrawler {
|
||||
maxCrawledDepth?: number;
|
||||
allowBackwardCrawling?: boolean;
|
||||
allowExternalContentLinks?: boolean;
|
||||
allowSubdomains?: boolean;
|
||||
}) {
|
||||
this.jobId = jobId;
|
||||
this.initialUrl = initialUrl;
|
||||
@ -63,6 +66,7 @@ export class WebCrawler {
|
||||
this.generateImgAltText = generateImgAltText ?? false;
|
||||
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
|
||||
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
||||
this.allowSubdomains = allowSubdomains ?? false;
|
||||
}
|
||||
|
||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
|
||||
@ -214,6 +218,10 @@ export class WebCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
|
||||
return fullUrl;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -297,6 +305,10 @@ export class WebCrawler {
|
||||
return linkDomain === baseDomain;
|
||||
}
|
||||
|
||||
/**
 * Reports whether `link` resolves to a host that is a subdomain of the
 * crawl's base site.
 *
 * The base site is approximated as the last two DNS labels of the
 * `this.baseUrl` hostname (e.g. `docs.example.com` -> `example.com`), and a
 * link matches when its hostname ends with `"." + <those two labels>`.
 * A link whose hostname is exactly those two labels does not match,
 * because the leading dot is required.
 *
 * NOTE(review): the two-label heuristic is wrong for multi-label public
 * suffixes such as `co.uk` (every `*.co.uk` host would match). Fixing that
 * needs a Public Suffix List lookup, which this file does not have.
 *
 * @param link Absolute or relative URL; relative values are resolved
 *   against `this.baseUrl`.
 * @returns `true` when the link's hostname is a subdomain of the base site;
 *   `false` otherwise, including when either URL cannot be parsed or the
 *   base host is an IPv4 literal.
 */
private isSubdomain(link: string): boolean {
  let linkHost: string;
  let baseHost: string;
  try {
    // A trailing root dot ("example.com.") names the same host; drop it so
    // it does not end up inside the suffix being compared.
    linkHost = new URL(link, this.baseUrl).hostname.replace(/\.$/, "");
    baseHost = new URL(this.baseUrl).hostname.replace(/\.$/, "");
  } catch {
    // An unparseable URL cannot be classified as a subdomain.
    return false;
  }

  // IPv4 literals have no DNS hierarchy. Without this guard a base of
  // 192.168.1.1 would produce the suffix ".1.1" and match unrelated IPs.
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(baseHost)) {
    return false;
  }

  const siteDomain = baseHost.split(".").slice(-2).join(".");
  return linkHost.endsWith("." + siteDomain);
}
|
||||
|
||||
public isFile(url: string): boolean {
|
||||
const fileExtensions = [
|
||||
".png",
|
||||
|
Loading…
x
Reference in New Issue
Block a user