Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-11 21:09:00 +08:00)
fix(crawl/maxDepth): fix maxDepth behaviour
This commit is contained in:
parent 7d576d13bf
commit 68c9615f2d
@ -3,6 +3,7 @@ import { ScrapeOptions } from "../controllers/v1/types";
|
|||||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||||
import { redisConnection } from "../services/queue-service";
|
import { redisConnection } from "../services/queue-service";
|
||||||
import { logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
|
import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
|
||||||
|
|
||||||
export type StoredCrawl = {
|
export type StoredCrawl = {
|
||||||
originUrl?: string;
|
originUrl?: string;
|
||||||
@ -172,7 +173,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
|||||||
includes: sc.crawlerOptions?.includes ?? [],
|
includes: sc.crawlerOptions?.includes ?? [],
|
||||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
|
maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
|
||||||
limit: sc.crawlerOptions?.limit ?? 10000,
|
limit: sc.crawlerOptions?.limit ?? 10000,
|
||||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||||
|
@ -7,6 +7,6 @@ export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function getURLDepth(url: string): number {
|
export function getURLDepth(url: string): number {
|
||||||
const pathSplits = new URL(url).pathname.split('/');
|
const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html");
|
||||||
return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
|
return pathSplits.length;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user