From 34fe360dc1ac9b11d40273630520e69d617d2a5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 5 Feb 2025 15:44:56 +0100 Subject: [PATCH] fix(crawl-redis/generateURLPermutations): dedupe index.html/index.php/slash/bare URL ends (FIR-827) (#1134) * fix(crawl-redis/generateURLPermutations): dedupe index.html/index.php/slash/bare URL ends * fix(crawl-redis/tests): adjust tests --- apps/api/src/lib/crawl-redis.test.ts | 40 +++++++++++++++++++++++++--- apps/api/src/lib/crawl-redis.ts | 28 +++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts index 65d4e13a..f27598b1 100644 --- a/apps/api/src/lib/crawl-redis.test.ts +++ b/apps/api/src/lib/crawl-redis.test.ts @@ -5,37 +5,69 @@ describe("generateURLPermutations", () => { const bareHttps = generateURLPermutations("https://firecrawl.dev").map( (x) => x.href, ); - expect(bareHttps.length).toBe(4); + expect(bareHttps.length).toBe(16); expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("https://firecrawl.dev/index.html")).toBe(true); + expect(bareHttps.includes("https://firecrawl.dev/index.php")).toBe(true); expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("https://www.firecrawl.dev/index.html")).toBe(true); + expect(bareHttps.includes("https://www.firecrawl.dev/index.php")).toBe(true); expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("http://firecrawl.dev/index.html")).toBe(true); + expect(bareHttps.includes("http://firecrawl.dev/index.php")).toBe(true); expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("http://www.firecrawl.dev/index.html")).toBe(true); + expect(bareHttps.includes("http://www.firecrawl.dev/index.php")).toBe(true); const bareHttp = generateURLPermutations("http://firecrawl.dev").map( (x) => x.href, ); - expect(bareHttp.length).toBe(4); + expect(bareHttp.length).toBe(16); expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("https://firecrawl.dev/index.html")).toBe(true); + expect(bareHttp.includes("https://firecrawl.dev/index.php")).toBe(true); expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("https://www.firecrawl.dev/index.html")).toBe(true); + expect(bareHttp.includes("https://www.firecrawl.dev/index.php")).toBe(true); expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("http://firecrawl.dev/index.html")).toBe(true); + expect(bareHttp.includes("http://firecrawl.dev/index.php")).toBe(true); expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("http://www.firecrawl.dev/index.html")).toBe(true); + expect(bareHttp.includes("http://www.firecrawl.dev/index.php")).toBe(true); const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map( (x) => x.href, ); - expect(wwwHttps.length).toBe(4); + expect(wwwHttps.length).toBe(16); expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("https://firecrawl.dev/index.html")).toBe(true); + expect(wwwHttps.includes("https://firecrawl.dev/index.php")).toBe(true); expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("https://www.firecrawl.dev/index.html")).toBe(true); + expect(wwwHttps.includes("https://www.firecrawl.dev/index.php")).toBe(true); expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("http://firecrawl.dev/index.html")).toBe(true); + expect(wwwHttps.includes("http://firecrawl.dev/index.php")).toBe(true); expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("http://www.firecrawl.dev/index.html")).toBe(true); + expect(wwwHttps.includes("http://www.firecrawl.dev/index.php")).toBe(true); const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map( (x) => x.href, ); - expect(wwwHttp.length).toBe(4); + expect(wwwHttp.length).toBe(16); expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("https://firecrawl.dev/index.html")).toBe(true); + expect(wwwHttp.includes("https://firecrawl.dev/index.php")).toBe(true); expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("https://www.firecrawl.dev/index.html")).toBe(true); + expect(wwwHttp.includes("https://www.firecrawl.dev/index.php")).toBe(true); expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("http://firecrawl.dev/index.html")).toBe(true); + expect(wwwHttp.includes("http://firecrawl.dev/index.php")).toBe(true); expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("http://www.firecrawl.dev/index.html")).toBe(true); + expect(wwwHttp.includes("http://www.firecrawl.dev/index.php")).toBe(true); }); }); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 553934d7..c0870586 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -229,6 +229,34 @@ export function generateURLPermutations(url: string | URL): URL[] { return [urlWithHTTP, urlWithHTTPS]; }); + // Construct more versions for index.html/index.php + permutations = permutations.flatMap((urlO) => { + const urlWithHTML = new URL(urlO); + const urlWithPHP = new URL(urlO); + const urlWithBare = new URL(urlO); + const urlWithSlash = new URL(urlO); + + if (urlO.pathname.endsWith("/")) { + urlWithBare.pathname = urlWithBare.pathname.length === 1 ? urlWithBare.pathname : urlWithBare.pathname.slice(0, -1); + urlWithHTML.pathname += "index.html"; + urlWithPHP.pathname += "index.php"; + } else if (urlO.pathname.endsWith("/index.html")) { + urlWithPHP.pathname = urlWithPHP.pathname.slice(0, -"index.html".length) + "index.php"; + urlWithSlash.pathname = urlWithSlash.pathname.slice(0, -"index.html".length); + urlWithBare.pathname = urlWithBare.pathname.slice(0, -"/index.html".length); + } else if (urlO.pathname.endsWith("/index.php")) { + urlWithHTML.pathname = urlWithHTML.pathname.slice(0, -"index.php".length) + "index.html"; + urlWithSlash.pathname = urlWithSlash.pathname.slice(0, -"index.php".length); + urlWithBare.pathname = urlWithBare.pathname.slice(0, -"/index.php".length); + } else { + urlWithSlash.pathname += "/"; + urlWithHTML.pathname += "/index.html"; + urlWithPHP.pathname += "/index.php"; + } + + return [urlWithHTML, urlWithPHP, urlWithSlash, urlWithBare]; + }); + return permutations; }