From 8e4e49e471eb9cf0fa87a001efbf3b04c99dc395 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Mon, 11 Nov 2024 20:29:17 +0100
Subject: [PATCH] feat(generateURLPermutations): add tests

Move the URL permutation logic out of lockURL into an exported
generateURLPermutations helper and cover it with tests.

The helper returns every href only once. lockURL compares the number of
members SADD added with permutations.length, so a repeated href would make
that comparison fail even for a URL that has never been visited.

---
Review notes (not part of the commit message):

- This patch was rebuilt from a copy whose line breaks had been collapsed.
  Indentation is assumed to be 4 spaces; if the context lines do not match
  the target file, apply with `git apply --ignore-whitespace`.
- The lockURL context line is written as `Promise<boolean>`. The collapsed
  copy showed a bare `Promise`, which does not compile; the type argument is
  inferred from the boolean `res` assignments. TODO confirm against the
  target file.
- The blob ids on the `index` lines come from the original patch and no
  longer describe the revised post-images.
- The new test file now ends with a newline.

 apps/api/src/lib/crawl-redis.test.ts | 40 ++++++++++++++++++++
 apps/api/src/lib/crawl-redis.ts      | 80 ++++++++++++++++++++++++++--------------
 2 files changed, 93 insertions(+), 27 deletions(-)
 create mode 100644 apps/api/src/lib/crawl-redis.test.ts

diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts
new file mode 100644
index 00000000..eb9c81f1
--- /dev/null
+++ b/apps/api/src/lib/crawl-redis.test.ts
@@ -0,0 +1,40 @@
+import { generateURLPermutations } from "./crawl-redis";
+
+describe("generateURLPermutations", () => {
+    it("generates permutations correctly", () => {
+        const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
+        expect(bareHttps.length).toBe(4);
+        expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
+        expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+        expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
+        expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+        const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
+        expect(bareHttp.length).toBe(4);
+        expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
+        expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+        expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
+        expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+
+        const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
+        expect(wwwHttps.length).toBe(4);
+        expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
+        expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+        expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
+        expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+        const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
+        expect(wwwHttp.length).toBe(4);
+        expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
+        expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+        expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
+        expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+    })
+
+    it("does not repeat a URL when the hostname cannot take a www. prefix", () => {
+        const ipv6 = generateURLPermutations("http://[::1]:3000/").map(x => x.href);
+        expect(ipv6.length).toBe(2);
+        expect(ipv6.includes("http://[::1]:3000/")).toBe(true);
+        expect(ipv6.includes("https://[::1]:3000/")).toBe(true);
+    })
+});
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 3d918263..34b164d2 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -97,6 +97,58 @@ export function normalizeURL(url: string): string {
     return urlO.href;
 }
 
+/**
+ * Builds the URLs that count as the same page as `url` for deduplication:
+ * the hostname with and without a leading "www.", each over http and https.
+ * A URL with any other protocol only gets the two hostname variants.
+ *
+ * Each href appears at most once. When the hostname cannot take a "www."
+ * prefix (e.g. an IPv6 literal, where the hostname setter rejects the new
+ * value), fewer than four URLs are returned instead of repeated ones.
+ *
+ * @param url - Absolute URL to expand.
+ * @returns The distinct permutations, in www.-first, http-first order.
+ * @throws TypeError if `url` cannot be parsed by `new URL`.
+ */
+export function generateURLPermutations(url: string | URL): URL[] {
+    const urlO = new URL(url);
+
+    // Construct two versions, one with www., one without
+    const urlWithWWW = new URL(urlO);
+    const urlWithoutWWW = new URL(urlO);
+    if (urlO.hostname.startsWith("www.")) {
+        urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+    } else {
+        urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+    }
+
+    // Construct more versions for http/https
+    const permutations = [urlWithWWW, urlWithoutWWW].flatMap(hostVariant => {
+        if (!["http:", "https:"].includes(hostVariant.protocol)) {
+            return [hostVariant];
+        }
+
+        const urlWithHTTP = new URL(hostVariant);
+        const urlWithHTTPS = new URL(hostVariant);
+        urlWithHTTP.protocol = "http:";
+        urlWithHTTPS.protocol = "https:";
+
+        return [urlWithHTTP, urlWithHTTPS];
+    });
+
+    // The hostname setter ignores values it cannot parse, so both hostname
+    // variants can come out identical. lockURL compares permutations.length
+    // with the number of members SADD added, so drop repeated hrefs here.
+    const seen = new Set<string>();
+    return permutations.filter(permutation => {
+        if (seen.has(permutation.href)) {
+            return false;
+        }
+        seen.add(permutation.href);
+        return true;
+    });
+}
+
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
     if (typeof sc.crawlerOptions?.limit === "number") {
         if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -110,33 +162,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
     if (!sc.crawlerOptions.deduplicateSimilarURLs) {
         res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
     } else {
-        const urlO = new URL(url);
-
-        // Construct two versions, one with www., one without
-        const urlWithWWW = new URL(urlO);
-        const urlWithoutWWW = new URL(urlO);
-        if (urlO.hostname.startsWith("www.")) {
-            urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
-        } else {
-            urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
-        }
-
-        let permutations = [urlWithWWW, urlWithoutWWW];
-
-        // Construct more versions for http/https
-        permutations = permutations.flatMap(urlO => {
-            if (!["http:", "https:"].includes(urlO.protocol)) {
-                return [urlO];
-            }
-
-            const urlWithHTTP = new URL(urlO);
-            const urlWithHTTPS = new URL(urlO);
-            urlWithHTTP.protocol = "http:";
-            urlWithHTTPS.protocol = "https:";
-
-            return [urlWithHTTP, urlWithHTTPS];
-        });
-
+        const permutations = generateURLPermutations(url);
         res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
     }
 