From 2d78c20d686b9aea0b0c85770d357c47aa9d375f Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 26 Aug 2024 16:56:27 -0300
Subject: [PATCH] Nick:

---
 apps/api/src/controllers/v1/map.ts   | 48 +++++++++++++-----
 apps/api/src/controllers/v1/types.ts |  1 +
 apps/api/src/lib/validateUrl.test.ts | 75 +++++++++++++++++++++++++++-
 apps/api/src/lib/validateUrl.ts      | 29 +++++++++++
 apps/api/src/search/fireEngine.ts    |  1 +
 5 files changed, 140 insertions(+), 14 deletions(-)

diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 76cf1498..a1a6cec9 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -12,9 +12,11 @@ import {
   checkAndUpdateURLForMap,
   isSameDomain,
   isSameSubdomain,
+  removeDuplicateUrls,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
 import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";
 
 configDotenv();
 
@@ -22,12 +24,14 @@ export async function mapController(
   req: RequestWithAuth<{}, MapResponse, MapRequest>,
   res: Response
 ) {
+  const startTime = new Date().getTime();
+
   req.body = mapRequestSchema.parse(req.body);
+  const limit = req.body.limit;
   const id = uuidv4();
 
   let links: string[] = [req.body.url];
 
-
   const sc: StoredCrawl = {
     originUrl: req.body.url,
     crawlerOptions: legacyCrawlerOptions(req.body),
@@ -38,10 +42,7 @@ export async function mapController(
 
   const crawler = crawlToCrawler(id, sc);
 
-  const sitemap =
-    req.body.ignoreSitemap
-      ? null
-      : await crawler.tryGetSitemap();
+  const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
   if (sitemap !== null) {
     sitemap.map((x) => {
@@ -50,19 +51,24 @@ export async function mapController(
   }
 
   let urlWithoutWww = req.body.url.replace("www.", "");
-
+
   let mapUrl = req.body.search
     ? `"${req.body.search}" site:${urlWithoutWww}`
     : `site:${req.body.url}`;
   // www. seems to exclude subdomains in some cases
   const mapResults = await fireEngineMap(mapUrl, {
-    numResults: 50,
+    // limit to 50 results (beta)
+    numResults: Math.min(limit, 50),
   });
 
   if (mapResults.length > 0) {
     if (req.body.search) {
       // Ensure all map results are first, maintaining their order
-      links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
+      links = [
+        mapResults[0].url,
+        ...mapResults.slice(1).map((x) => x.url),
+        ...links,
+      ];
     } else {
       mapResults.map((x) => {
         links.push(x.url);
@@ -72,8 +78,6 @@ export async function mapController(
 
   links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
 
-
-
   // allows for subdomains to be included
   links = links.filter((x) => isSameDomain(x, req.body.url));
 
@@ -83,12 +87,32 @@ export async function mapController(
   }
 
   // remove duplicates that could be due to http/https or www
-  links = [...new Set(links)];
+  links = removeDuplicateUrls(links);
 
   await billTeam(req.auth.team_id, 1);
 
+  const endTime = new Date().getTime();
+  const timeTakenInSeconds = (endTime - startTime) / 1000;
+
+  logJob({
+    job_id: id,
+    success: true,
+    message: "Map completed",
+    num_docs: 1,
+    docs: links,
+    time_taken: timeTakenInSeconds,
+    team_id: req.auth.team_id,
+    mode: "map",
+    url: req.body.url,
+    crawlerOptions: {},
+    pageOptions: {},
+    origin: req.body.origin,
+    extractor_options: { mode: "markdown" },
+    num_tokens: 0,
+  });
+
   return res.status(200).json({
     success: true,
-    links,
+    links: links.slice(0, limit),
   });
 }
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 91f436ae..9b726a90 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -121,6 +121,7 @@ export const mapRequestSchema = crawlerOptions.extend({
   includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
   ignoreSitemap: z.boolean().default(false),
+  limit: z.number().min(1).max(5000).default(5000),
 }).strict(strictMessage);
 
 // export type MapRequest = {
diff --git a/apps/api/src/lib/validateUrl.test.ts b/apps/api/src/lib/validateUrl.test.ts
index 43730584..eec39f97 100644
--- a/apps/api/src/lib/validateUrl.test.ts
+++ b/apps/api/src/lib/validateUrl.test.ts
@@ -1,4 +1,4 @@
-import { isSameDomain } from "./validateUrl";
+import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
 import { isSameSubdomain } from "./validateUrl";
 
 describe("isSameDomain", () => {
@@ -85,4 +85,75 @@ describe("isSameSubdomain", () => {
     const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
     expect(result).toBe(false);
   });
-});
\ No newline at end of file
+});
+
+describe("removeDuplicateUrls", () => {
+  it("should remove duplicate URLs with different protocols", () => {
+    const urls = [
+      "http://example.com",
+      "https://example.com",
+      "http://www.example.com",
+      "https://www.example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should keep URLs with different paths", () => {
+    const urls = [
+      "https://example.com/page1",
+      "https://example.com/page2",
+      "https://example.com/page1?param=1",
+      "https://example.com/page1#section1"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual([
+      "https://example.com/page1",
+      "https://example.com/page2",
+      "https://example.com/page1?param=1",
+      "https://example.com/page1#section1"
+    ]);
+  });
+
+  it("should prefer https over http", () => {
+    const urls = [
+      "http://example.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should prefer non-www over www", () => {
+    const urls = [
+      "https://www.example.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should handle empty input", () => {
+    const urls: string[] = [];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual([]);
+  });
+
+  it("should handle URLs with different cases", () => {
+    const urls = [
+      "https://EXAMPLE.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://EXAMPLE.com"]);
+  });
+
+  it("should handle URLs with trailing slashes", () => {
+    const urls = [
+      "https://example.com",
+      "https://example.com/"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+});
diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts
index fa2698e7..bb83a8fa 100644
--- a/apps/api/src/lib/validateUrl.ts
+++ b/apps/api/src/lib/validateUrl.ts
@@ -120,3 +120,32 @@ export const checkAndUpdateURLForMap = (url: string) => {
 
 
 
+
+
+export function removeDuplicateUrls(urls: string[]): string[] {
+  const urlMap = new Map();
+
+  for (const url of urls) {
+    const parsedUrl = new URL(url);
+    const protocol = parsedUrl.protocol;
+    const hostname = parsedUrl.hostname.replace(/^www\./, '');
+    const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
+
+    const key = `${hostname}${path}`;
+
+    if (!urlMap.has(key)) {
+      urlMap.set(key, url);
+    } else {
+      const existingUrl = new URL(urlMap.get(key)!);
+      const existingProtocol = existingUrl.protocol;
+
+      if (protocol === 'https:' && existingProtocol === 'http:') {
+        urlMap.set(key, url);
+      } else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
+        urlMap.set(key, url);
+      }
+    }
+  }
+
+  return [...new Set(Array.from(urlMap.values()))];
+}
\ No newline at end of file
diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts
index c316f8f7..7c6d8a4d 100644
--- a/apps/api/src/search/fireEngine.ts
+++ b/apps/api/src/search/fireEngine.ts
@@ -24,6 +24,7 @@ export async function fireEngineMap(q: string, options: {
   });
 
   if (!process.env.FIRE_ENGINE_BETA_URL) {
+    console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
     return [];
   }
 
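
A note on the dedup strategy above, for reviewers: removeDuplicateUrls keys each URL by hostname (with any leading "www." stripped) plus path, query, and fragment, so protocol and www-variants collapse into one entry while distinct paths survive. Within a collision, https beats http, then non-www beats www at equal protocol, and otherwise the first URL seen is kept. A minimal usage sketch (the sample URLs are illustrative, not from the patch; the relative import assumes a sibling module, as in the tests):

import { removeDuplicateUrls } from "./validateUrl";

// "example.com/" is the shared key for the first three entries: the http
// duplicate loses to https, then the www variant loses to the bare host.
// The distinct "/pricing" path keeps its own entry.
const links = [
  "http://www.example.com/",
  "https://www.example.com/",
  "https://example.com/",
  "https://example.com/pricing",
];

console.log(removeDuplicateUrls(links));
// => ["https://example.com/", "https://example.com/pricing"]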
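Likewise, the limit plumbing in map.ts is easiest to see end to end: the Zod default fills in limit when the caller omits it, Math.min caps the beta fire-engine query at 50 results, and the final slice applies the caller's cap to the merged search + sitemap link set. A condensed, self-contained sketch of that flow (planMap and the sample inputs are hypothetical, not part of the patch):

import { z } from "zod";

const mapRequestSchema = z.object({
  url: z.string().url(),
  limit: z.number().min(1).max(5000).default(5000),
});

function planMap(body: unknown, searchResults: string[], sitemapLinks: string[]): string[] {
  const req = mapRequestSchema.parse(body); // limit falls back to 5000 when omitted
  const numResults = Math.min(req.limit, 50); // beta search backend returns at most 50
  const links = [...searchResults.slice(0, numResults), ...sitemapLinks];
  return links.slice(0, req.limit); // caller-facing cap over the merged set
}

// With no explicit limit, all merged links (up to 5000) come back:
planMap({ url: "https://example.com" }, ["https://example.com/a"], ["https://example.com/b"]);
// => ["https://example.com/a", "https://example.com/b"]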