diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index a9c61d04..6b13f762 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -19,8 +19,15 @@ import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { performCosineSimilarity } from "../../lib/map-cosine"; import { Logger } from "../../lib/logger"; +import Redis from "ioredis"; configDotenv(); +const redis = new Redis(process.env.REDIS_URL); + +// Max Links that /map can return +const MAX_MAP_LIMIT = 5000; +// Max Links that "Smart /map" can return +const MAX_FIRE_ENGINE_RESULTS = 1000; export async function mapController( req: RequestWithAuth<{}, MapResponse, MapRequest>, @@ -30,8 +37,7 @@ export async function mapController( req.body = mapRequestSchema.parse(req.body); - - const limit : number = req.body.limit ?? 5000; + const limit: number = req.body.limit ?? MAX_MAP_LIMIT; const id = uuidv4(); let links: string[] = [req.body.url]; @@ -47,24 +53,61 @@ export async function mapController( const crawler = crawlToCrawler(id, sc); - const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap(); - - if (sitemap !== null) { - sitemap.map((x) => { - links.push(x.url); - }); - } - let urlWithoutWww = req.body.url.replace("www.", ""); let mapUrl = req.body.search ? `"${req.body.search}" site:${urlWithoutWww}` : `site:${req.body.url}`; - // www. 
seems to exclude subdomains in some cases - const mapResults = await fireEngineMap(mapUrl, { - // limit to 100 results (beta) - numResults: Math.min(limit, 100), - }); + + const resultsPerPage = 100; + const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); + + const cacheKey = `fireEngineMap:${mapUrl}`; + const cachedResult = await redis.get(cacheKey); + + let allResults: any[]; + let pagePromises: Promise<any>[]; + + if (cachedResult) { + allResults = JSON.parse(cachedResult); + } else { + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page, + }); + }; + + pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); + allResults = await Promise.all(pagePromises); + + await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours + } + + // Parallelize sitemap fetch with serper search + const [sitemap, ...searchResults] = await Promise.all([ + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + ...(cachedResult ?
[] : pagePromises), + ]); + + if (!cachedResult) { + allResults = searchResults; + } + + if (sitemap !== null) { + sitemap.forEach((x) => { + links.push(x.url); + }); + } + + let mapResults = allResults + .flat() + .filter((result) => result !== null && result !== undefined); + + const minimumCutoff = Math.min(MAX_MAP_LIMIT, limit); + if (mapResults.length > minimumCutoff) { + mapResults = mapResults.slice(0, minimumCutoff); + } if (mapResults.length > 0) { if (req.body.search) { @@ -84,17 +127,19 @@ export async function mapController( // Perform cosine similarity between the search query and the list of links if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); - + links = performCosineSimilarity(links, searchQuery); } - links = links.map((x) => { - try { - return checkAndUpdateURLForMap(x).url.trim() - } catch (_) { - return null; - } - }).filter(x => x !== null); + links = links + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null); // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); @@ -107,8 +152,10 @@ export async function mapController( // remove duplicates that could be due to http/https or www links = removeDuplicateUrls(links); - billTeam(req.auth.team_id, 1).catch(error => { - Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`); + billTeam(req.auth.team_id, 1).catch((error) => { + Logger.error( + `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}` + ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -116,7 +163,7 @@ export async function mapController( const timeTakenInSeconds = (endTime - startTime) / 1000; const linksToReturn = links.slice(0, limit); - + logJob({ job_id: id, success: links.length > 0, @@ -140,3 +187,51 @@ export async function mapController( scrape_id: req.body.origin?.includes("website") ?
id : undefined, }); } + +// Subdomain sitemap url checking + +// // For each result, check for subdomains, get their sitemaps and add them to the links +// const processedUrls = new Set(); +// const processedSubdomains = new Set(); + +// for (const result of links) { +// let url; +// let hostParts; +// try { +// url = new URL(result); +// hostParts = url.hostname.split('.'); +// } catch (e) { +// continue; +// } + +// console.log("hostParts", hostParts); +// // Check if it's a subdomain (more than 2 parts, and not 'www') +// if (hostParts.length > 2 && hostParts[0] !== 'www') { +// const subdomain = hostParts[0]; +// console.log("subdomain", subdomain); +// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`; +// console.log("subdomainUrl", subdomainUrl); + +// if (!processedSubdomains.has(subdomainUrl)) { +// processedSubdomains.add(subdomainUrl); + +// const subdomainCrawl = crawlToCrawler(id, { +// originUrl: subdomainUrl, +// crawlerOptions: legacyCrawlerOptions(req.body), +// pageOptions: {}, +// team_id: req.auth.team_id, +// createdAt: Date.now(), +// plan: req.auth.plan, +// }); +// const subdomainSitemap = await subdomainCrawl.tryGetSitemap(); +// if (subdomainSitemap) { +// subdomainSitemap.forEach((x) => { +// if (!processedUrls.has(x.url)) { +// processedUrls.add(x.url); +// links.push(x.url); +// } +// }); +// } +// } +// } +// } diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index b1a6a6ff..13dfc26e 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -36,17 +36,15 @@ export async function getLinksFromSitemap( const root = parsed.urlset || parsed.sitemapindex; if (root && root.sitemap) { - for (const sitemap of root.sitemap) { - if (sitemap.loc && sitemap.loc.length > 0) { - await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }); - } - } + const sitemapPromises = root.sitemap + .filter(sitemap 
=> sitemap.loc && sitemap.loc.length > 0) + .map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode })); + await Promise.all(sitemapPromises); } else if (root && root.url) { - for (const url of root.url) { - if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) { - allUrls.push(url.loc[0]); - } - } + const validUrls = root.url + .filter(url => url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) + .map(url => url.loc[0]); + allUrls.push(...validUrls); } } catch (error) { Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);