mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 03:59:05 +08:00
feat(crawl): allowSubdomain
This commit is contained in:
parent
91caa01c5e
commit
79a75e088a
@ -211,6 +211,7 @@ const crawlerOptions = z.object({
|
|||||||
limit: z.number().default(10000), // default?
|
limit: z.number().default(10000), // default?
|
||||||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||||
allowExternalLinks: z.boolean().default(false),
|
allowExternalLinks: z.boolean().default(false),
|
||||||
|
allowSubdomains: z.boolean().default(false),
|
||||||
ignoreSitemap: z.boolean().default(true),
|
ignoreSitemap: z.boolean().default(true),
|
||||||
deduplicateSimilarURLs: z.boolean().default(true),
|
deduplicateSimilarURLs: z.boolean().default(true),
|
||||||
ignoreQueryParameters: z.boolean().default(false),
|
ignoreQueryParameters: z.boolean().default(false),
|
||||||
@ -468,6 +469,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
|||||||
generateImgAltText: false,
|
generateImgAltText: false,
|
||||||
allowBackwardCrawling: x.allowBackwardLinks,
|
allowBackwardCrawling: x.allowBackwardLinks,
|
||||||
allowExternalContentLinks: x.allowExternalLinks,
|
allowExternalContentLinks: x.allowExternalLinks,
|
||||||
|
allowSubdomains: x.allowSubdomains,
|
||||||
ignoreSitemap: x.ignoreSitemap,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||||
@ -483,6 +485,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
|
|||||||
maxDepth: x.maxDepth,
|
maxDepth: x.maxDepth,
|
||||||
allowBackwardLinks: x.allowBackwardCrawling,
|
allowBackwardLinks: x.allowBackwardCrawling,
|
||||||
allowExternalLinks: x.allowExternalContentLinks,
|
allowExternalLinks: x.allowExternalContentLinks,
|
||||||
|
allowSubdomains: x.allowSubdomains,
|
||||||
ignoreSitemap: x.ignoreSitemap,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||||
|
@ -179,6 +179,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
|
|||||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||||
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
||||||
|
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (sc.robots !== undefined) {
|
if (sc.robots !== undefined) {
|
||||||
|
@ -23,6 +23,7 @@ export class WebCrawler {
|
|||||||
private generateImgAltText: boolean;
|
private generateImgAltText: boolean;
|
||||||
private allowBackwardCrawling: boolean;
|
private allowBackwardCrawling: boolean;
|
||||||
private allowExternalContentLinks: boolean;
|
private allowExternalContentLinks: boolean;
|
||||||
|
private allowSubdomains: boolean;
|
||||||
|
|
||||||
constructor({
|
constructor({
|
||||||
jobId,
|
jobId,
|
||||||
@ -35,7 +36,8 @@ export class WebCrawler {
|
|||||||
generateImgAltText = false,
|
generateImgAltText = false,
|
||||||
maxCrawledDepth = 10,
|
maxCrawledDepth = 10,
|
||||||
allowBackwardCrawling = false,
|
allowBackwardCrawling = false,
|
||||||
allowExternalContentLinks = false
|
allowExternalContentLinks = false,
|
||||||
|
allowSubdomains = false,
|
||||||
}: {
|
}: {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
@ -48,6 +50,7 @@ export class WebCrawler {
|
|||||||
maxCrawledDepth?: number;
|
maxCrawledDepth?: number;
|
||||||
allowBackwardCrawling?: boolean;
|
allowBackwardCrawling?: boolean;
|
||||||
allowExternalContentLinks?: boolean;
|
allowExternalContentLinks?: boolean;
|
||||||
|
allowSubdomains?: boolean;
|
||||||
}) {
|
}) {
|
||||||
this.jobId = jobId;
|
this.jobId = jobId;
|
||||||
this.initialUrl = initialUrl;
|
this.initialUrl = initialUrl;
|
||||||
@ -63,6 +66,7 @@ export class WebCrawler {
|
|||||||
this.generateImgAltText = generateImgAltText ?? false;
|
this.generateImgAltText = generateImgAltText ?? false;
|
||||||
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
|
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
|
||||||
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
||||||
|
this.allowSubdomains = allowSubdomains ?? false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
|
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
|
||||||
@ -214,6 +218,10 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
|
||||||
|
return fullUrl;
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -297,6 +305,10 @@ export class WebCrawler {
|
|||||||
return linkDomain === baseDomain;
|
return linkDomain === baseDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `link` resolves to a host that is a subdomain of the
 * crawl's base site.
 *
 * The base site is approximated as the last two labels of the base URL's
 * hostname (e.g. `www.example.com` -> `example.com`), so sibling subdomains
 * such as `blog.example.com` match. The bare two-label domain itself is not
 * considered a subdomain.
 *
 * NOTE(review): the two-label heuristic over-matches on multi-label public
 * suffixes (for base `shop.example.co.uk` the derived suffix is `co.uk`).
 * Fixing that requires a Public Suffix List lookup, which is outside what
 * this method can do without a new dependency.
 *
 * @param link - Absolute or relative URL; relative values are resolved
 *   against `this.baseUrl`.
 * @returns `true` when the resolved hostname ends with `.<base site>`;
 *   `false` otherwise, including when either URL cannot be parsed or the
 *   base host is an IPv4 literal.
 */
private isSubdomain(link: string): boolean {
  let linkHost: string;
  let baseHost: string;
  try {
    linkHost = new URL(link, this.baseUrl).hostname;
    baseHost = new URL(this.baseUrl).hostname;
  } catch {
    // A link that cannot be parsed cannot be a subdomain; a predicate used
    // for filtering should not abort the caller by throwing.
    return false;
  }

  // Drop a trailing root dot ("example.com.") so label slicing and the
  // suffix comparison see the same canonical form on both sides.
  linkHost = linkHost.replace(/\.$/, "");
  baseHost = baseHost.replace(/\.$/, "");

  // IPv4 literals have no subdomains; without this guard "10.0.1.1" would
  // be treated as a subdomain of "192.168.1.1" because both end in ".1.1".
  if (baseHost === "" || /^\d{1,3}(\.\d{1,3}){3}$/.test(baseHost)) {
    return false;
  }

  const siteDomain = baseHost.split(".").slice(-2).join(".");
  return linkHost.endsWith("." + siteDomain);
}
|
||||||
|
|
||||||
public isFile(url: string): boolean {
|
public isFile(url: string): boolean {
|
||||||
const fileExtensions = [
|
const fileExtensions = [
|
||||||
".png",
|
".png",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user