mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 16:59:05 +08:00
(fix/map) Map failed to filter by path if indexed (#1333)
* Nick:
* Update map.ts
* Update map.ts
This commit is contained in:
parent
f87e11712c
commit
134de67a3b
@ -56,6 +56,7 @@ export async function getMapResults({
|
|||||||
allowExternalLinks,
|
allowExternalLinks,
|
||||||
abort = new AbortController().signal, // noop
|
abort = new AbortController().signal, // noop
|
||||||
mock,
|
mock,
|
||||||
|
filterByPath = true,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
search?: string;
|
search?: string;
|
||||||
@ -70,6 +71,7 @@ export async function getMapResults({
|
|||||||
allowExternalLinks?: boolean;
|
allowExternalLinks?: boolean;
|
||||||
abort?: AbortSignal;
|
abort?: AbortSignal;
|
||||||
mock?: string;
|
mock?: string;
|
||||||
|
filterByPath?: boolean;
|
||||||
}): Promise<MapResult> {
|
}): Promise<MapResult> {
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
let links: string[] = [url];
|
let links: string[] = [url];
|
||||||
@ -247,6 +249,29 @@ export async function getMapResults({
|
|||||||
links = links.filter((x) => isSameSubdomain(x, url));
|
links = links.filter((x) => isSameSubdomain(x, url));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Filter by path if enabled
|
||||||
|
if (filterByPath && !allowExternalLinks) {
|
||||||
|
try {
|
||||||
|
const urlObj = new URL(url);
|
||||||
|
const urlPath = urlObj.pathname;
|
||||||
|
// Only apply path filtering if the URL has a significant path (not just '/' or empty)
|
||||||
|
// This means we only filter by path if the user has not selected a root domain
|
||||||
|
if (urlPath && urlPath !== '/' && urlPath.length > 1) {
|
||||||
|
links = links.filter(link => {
|
||||||
|
try {
|
||||||
|
const linkObj = new URL(link);
|
||||||
|
return linkObj.pathname.startsWith(urlPath);
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// If URL parsing fails, continue without path filtering
|
||||||
|
logger.warn(`Failed to parse URL for path filtering: ${url}`, { error: e });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// remove duplicates that could be due to http/https or www
|
// remove duplicates that could be due to http/https or www
|
||||||
links = removeDuplicateUrls(links);
|
links = removeDuplicateUrls(links);
|
||||||
}
|
}
|
||||||
@ -300,6 +325,7 @@ export async function mapController(
|
|||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
abort: abort.signal,
|
abort: abort.signal,
|
||||||
mock: req.body.useMock,
|
mock: req.body.useMock,
|
||||||
|
filterByPath: req.body.filterByPath !== false,
|
||||||
}),
|
}),
|
||||||
...(req.body.timeout !== undefined ? [
|
...(req.body.timeout !== undefined ? [
|
||||||
new Promise((resolve, reject) => setTimeout(() => {
|
new Promise((resolve, reject) => setTimeout(() => {
|
||||||
|
@ -506,6 +506,7 @@ export const mapRequestSchema = crawlerOptions
|
|||||||
limit: z.number().min(1).max(30000).default(5000),
|
limit: z.number().min(1).max(30000).default(5000),
|
||||||
timeout: z.number().positive().finite().optional(),
|
timeout: z.number().positive().finite().optional(),
|
||||||
useMock: z.string().optional(),
|
useMock: z.string().optional(),
|
||||||
|
filterByPath: z.boolean().default(true),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user