mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 16:20:48 +08:00
fix(map): make sitemapOnly simpler
This commit is contained in:
parent
a4d3dba865
commit
1b032b05fa
@ -61,6 +61,15 @@ export async function mapController(
|
|||||||
sitemap.forEach((x) => {
|
sitemap.forEach((x) => {
|
||||||
links.push(x.url);
|
links.push(x.url);
|
||||||
});
|
});
|
||||||
|
links = links.slice(1)
|
||||||
|
.map((x) => {
|
||||||
|
try {
|
||||||
|
return checkAndUpdateURLForMap(x).url.trim();
|
||||||
|
} catch (_) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter((x) => x !== null) as string[];
|
||||||
// links = links.slice(1, limit); // don't slice, unnecessary
|
// links = links.slice(1, limit); // don't slice, unnecessary
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -139,35 +148,35 @@ export async function mapController(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Perform cosine similarity between the search query and the list of links
|
||||||
|
if (req.body.search) {
|
||||||
|
const searchQuery = req.body.search.toLowerCase();
|
||||||
|
|
||||||
|
links = performCosineSimilarity(links, searchQuery);
|
||||||
|
}
|
||||||
|
|
||||||
|
links = links
|
||||||
|
.map((x) => {
|
||||||
|
try {
|
||||||
|
return checkAndUpdateURLForMap(x).url.trim();
|
||||||
|
} catch (_) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter((x) => x !== null) as string[];
|
||||||
|
|
||||||
|
// allows for subdomains to be included
|
||||||
|
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||||
|
|
||||||
|
// if includeSubdomains is false, filter out subdomains
|
||||||
|
if (!req.body.includeSubdomains) {
|
||||||
|
links = links.filter((x) => isSameSubdomain(x, req.body.url));
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove duplicates that could be due to http/https or www
|
||||||
|
links = removeDuplicateUrls(links);
|
||||||
|
links.slice(0, limit);
|
||||||
}
|
}
|
||||||
// Perform cosine similarity between the search query and the list of links
|
|
||||||
if (req.body.search) {
|
|
||||||
const searchQuery = req.body.search.toLowerCase();
|
|
||||||
|
|
||||||
links = performCosineSimilarity(links, searchQuery);
|
|
||||||
}
|
|
||||||
|
|
||||||
links = links
|
|
||||||
.map((x) => {
|
|
||||||
try {
|
|
||||||
return checkAndUpdateURLForMap(x).url.trim();
|
|
||||||
} catch (_) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.filter((x) => x !== null) as string[];
|
|
||||||
|
|
||||||
// allows for subdomains to be included
|
|
||||||
links = links.filter((x) => isSameDomain(x, req.body.url));
|
|
||||||
|
|
||||||
// if includeSubdomains is false, filter out subdomains
|
|
||||||
if (!req.body.includeSubdomains) {
|
|
||||||
links = links.filter((x) => isSameSubdomain(x, req.body.url));
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove duplicates that could be due to http/https or www
|
|
||||||
links = removeDuplicateUrls(links);
|
|
||||||
|
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
@ -179,14 +188,12 @@ export async function mapController(
|
|||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
|
|
||||||
const linksToReturn = links.slice(0, limit);
|
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
job_id: id,
|
job_id: id,
|
||||||
success: links.length > 0,
|
success: links.length > 0,
|
||||||
message: "Map completed",
|
message: "Map completed",
|
||||||
num_docs: linksToReturn.length,
|
num_docs: links.length,
|
||||||
docs: linksToReturn,
|
docs: links,
|
||||||
time_taken: timeTakenInSeconds,
|
time_taken: timeTakenInSeconds,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
mode: "map",
|
mode: "map",
|
||||||
@ -199,7 +206,7 @@ export async function mapController(
|
|||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
links: linksToReturn,
|
links: links,
|
||||||
scrape_id: req.body.origin?.includes("website") ? id : undefined,
|
scrape_id: req.body.origin?.includes("website") ? id : undefined,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user