fix(crawl-redis/generateURLPermutations): dedupe index.html/index.php/slash/bare URL ends (FIR-827) (#1134)

* fix(crawl-redis/generateURLPermutations): dedupe index.html/index.php/slash/bare URL ends

* fix(crawl-redis/tests): adjust tests
This commit is contained in:
Gergő Móricz 2025-02-05 15:44:56 +01:00 committed by GitHub
parent aa1e820d52
commit 34fe360dc1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 64 additions and 4 deletions

View File

@ -5,37 +5,69 @@ describe("generateURLPermutations", () => {
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(
(x) => x.href,
);
expect(bareHttps.length).toBe(4);
expect(bareHttps.length).toBe(16);
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("https://firecrawl.dev/index.html")).toBe(true);
expect(bareHttps.includes("https://firecrawl.dev/index.php")).toBe(true);
expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("https://www.firecrawl.dev/index.html")).toBe(true);
expect(bareHttps.includes("https://www.firecrawl.dev/index.php")).toBe(true);
expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("http://firecrawl.dev/index.html")).toBe(true);
expect(bareHttps.includes("http://firecrawl.dev/index.php")).toBe(true);
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("http://www.firecrawl.dev/index.html")).toBe(true);
expect(bareHttps.includes("http://www.firecrawl.dev/index.php")).toBe(true);
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(
(x) => x.href,
);
expect(bareHttp.length).toBe(4);
expect(bareHttp.length).toBe(16);
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("https://firecrawl.dev/index.html")).toBe(true);
expect(bareHttp.includes("https://firecrawl.dev/index.php")).toBe(true);
expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("https://www.firecrawl.dev/index.html")).toBe(true);
expect(bareHttp.includes("https://www.firecrawl.dev/index.php")).toBe(true);
expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("http://firecrawl.dev/index.html")).toBe(true);
expect(bareHttp.includes("http://firecrawl.dev/index.php")).toBe(true);
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("http://www.firecrawl.dev/index.html")).toBe(true);
expect(bareHttp.includes("http://www.firecrawl.dev/index.php")).toBe(true);
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(
(x) => x.href,
);
expect(wwwHttps.length).toBe(4);
expect(wwwHttps.length).toBe(16);
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("https://firecrawl.dev/index.html")).toBe(true);
expect(wwwHttps.includes("https://firecrawl.dev/index.php")).toBe(true);
expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("https://www.firecrawl.dev/index.html")).toBe(true);
expect(wwwHttps.includes("https://www.firecrawl.dev/index.php")).toBe(true);
expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("http://firecrawl.dev/index.html")).toBe(true);
expect(wwwHttps.includes("http://firecrawl.dev/index.php")).toBe(true);
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("http://www.firecrawl.dev/index.html")).toBe(true);
expect(wwwHttps.includes("http://www.firecrawl.dev/index.php")).toBe(true);
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(
(x) => x.href,
);
expect(wwwHttp.length).toBe(4);
expect(wwwHttp.length).toBe(16);
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("https://firecrawl.dev/index.html")).toBe(true);
expect(wwwHttp.includes("https://firecrawl.dev/index.php")).toBe(true);
expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("https://www.firecrawl.dev/index.html")).toBe(true);
expect(wwwHttp.includes("https://www.firecrawl.dev/index.php")).toBe(true);
expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("http://firecrawl.dev/index.html")).toBe(true);
expect(wwwHttp.includes("http://firecrawl.dev/index.php")).toBe(true);
expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("http://www.firecrawl.dev/index.html")).toBe(true);
expect(wwwHttp.includes("http://www.firecrawl.dev/index.php")).toBe(true);
});
});

View File

@ -229,6 +229,34 @@ export function generateURLPermutations(url: string | URL): URL[] {
return [urlWithHTTP, urlWithHTTPS];
});
// Construct more versions for index.html/index.php
// Every URL currently in `permutations` is expanded into four variants that
// differ only in how the path ends: ".../index.html", ".../index.php",
// ".../" (trailing slash) and the bare path without a trailing slash.
// Each variant starts as a copy of the input URL; only the copies whose
// ending differs from the input's ending get their pathname rewritten.
permutations = permutations.flatMap((urlO) => {
const urlWithHTML = new URL(urlO);
const urlWithPHP = new URL(urlO);
const urlWithBare = new URL(urlO);
const urlWithSlash = new URL(urlO);
if (urlO.pathname.endsWith("/")) {
// Input already ends with a slash, so urlWithSlash stays as copied.
// The bare variant drops the trailing slash, except when the path is just
// "/" (length 1), where it is left unchanged -- in that case the bare and
// slash variants carry the same pathname.
urlWithBare.pathname = urlWithBare.pathname.length === 1 ? urlWithBare.pathname : urlWithBare.pathname.slice(0, -1);
urlWithHTML.pathname += "index.html";
urlWithPHP.pathname += "index.php";
} else if (urlO.pathname.endsWith("/index.html")) {
// Input is the index.html form, so urlWithHTML stays as copied.
// Swap the file name for the PHP variant, cut the file name (keeping the
// slash) for the slash variant, and cut "/index.html" for the bare variant.
// NOTE(review): for a top-level "/index.html" the bare pathname is assigned
// an empty string; the URL pathname setter is presumably expected to
// normalize that to "/" -- confirm.
urlWithPHP.pathname = urlWithPHP.pathname.slice(0, -"index.html".length) + "index.php";
urlWithSlash.pathname = urlWithSlash.pathname.slice(0, -"index.html".length);
urlWithBare.pathname = urlWithBare.pathname.slice(0, -"/index.html".length);
} else if (urlO.pathname.endsWith("/index.php")) {
// Mirror of the index.html branch: urlWithPHP stays as copied and the
// other three variants are derived by replacing or cutting "index.php".
urlWithHTML.pathname = urlWithHTML.pathname.slice(0, -"index.php".length) + "index.html";
urlWithSlash.pathname = urlWithSlash.pathname.slice(0, -"index.php".length);
urlWithBare.pathname = urlWithBare.pathname.slice(0, -"/index.php".length);
} else {
// Any other path is treated as the bare form (urlWithBare stays as copied):
// append "/" and the two index file names to build the remaining variants.
urlWithSlash.pathname += "/";
urlWithHTML.pathname += "/index.html";
urlWithPHP.pathname += "/index.php";
}
// All four variants are returned without de-duplication at this step.
return [urlWithHTML, urlWithPHP, urlWithSlash, urlWithBare];
});
return permutations;
}