mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 03:49:01 +08:00
fix(crawl-redis/generateURLPermutations): dedupe index.html/index.php/slash/bare URL ends (FIR-827) (#1134)
* fix(crawl-redis/generateURLPermutations): dedupe index.html/index.php/slash/bare URL ends
* fix(crawl-redis/tests): adjust tests
This commit is contained in:
parent
aa1e820d52
commit
34fe360dc1
@ -5,37 +5,69 @@ describe("generateURLPermutations", () => {
|
||||
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(
|
||||
(x) => x.href,
|
||||
);
|
||||
expect(bareHttps.length).toBe(4);
|
||||
expect(bareHttps.length).toBe(16);
|
||||
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttps.includes("https://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttps.includes("https://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttps.includes("https://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttps.includes("https://www.firecrawl.dev/index.php")).toBe(true);
|
||||
expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttps.includes("http://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttps.includes("http://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttps.includes("http://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttps.includes("http://www.firecrawl.dev/index.php")).toBe(true);
|
||||
|
||||
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(
|
||||
(x) => x.href,
|
||||
);
|
||||
expect(bareHttp.length).toBe(4);
|
||||
expect(bareHttp.length).toBe(16);
|
||||
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttp.includes("https://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttp.includes("https://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttp.includes("https://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttp.includes("https://www.firecrawl.dev/index.php")).toBe(true);
|
||||
expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttp.includes("http://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttp.includes("http://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||
expect(bareHttp.includes("http://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(bareHttp.includes("http://www.firecrawl.dev/index.php")).toBe(true);
|
||||
|
||||
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(
|
||||
(x) => x.href,
|
||||
);
|
||||
expect(wwwHttps.length).toBe(4);
|
||||
expect(wwwHttps.length).toBe(16);
|
||||
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttps.includes("https://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttps.includes("https://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttps.includes("https://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttps.includes("https://www.firecrawl.dev/index.php")).toBe(true);
|
||||
expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttps.includes("http://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttps.includes("http://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttps.includes("http://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttps.includes("http://www.firecrawl.dev/index.php")).toBe(true);
|
||||
|
||||
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(
|
||||
(x) => x.href,
|
||||
);
|
||||
expect(wwwHttp.length).toBe(4);
|
||||
expect(wwwHttp.length).toBe(16);
|
||||
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttp.includes("https://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttp.includes("https://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttp.includes("https://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttp.includes("https://www.firecrawl.dev/index.php")).toBe(true);
|
||||
expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttp.includes("http://firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttp.includes("http://firecrawl.dev/index.php")).toBe(true);
|
||||
expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||
expect(wwwHttp.includes("http://www.firecrawl.dev/index.html")).toBe(true);
|
||||
expect(wwwHttp.includes("http://www.firecrawl.dev/index.php")).toBe(true);
|
||||
});
|
||||
});
|
||||
|
@ -229,6 +229,34 @@ export function generateURLPermutations(url: string | URL): URL[] {
|
||||
return [urlWithHTTP, urlWithHTTPS];
|
||||
});
|
||||
|
||||
// Construct more versions for index.html/index.php
|
||||
permutations = permutations.flatMap((urlO) => {
|
||||
const urlWithHTML = new URL(urlO);
|
||||
const urlWithPHP = new URL(urlO);
|
||||
const urlWithBare = new URL(urlO);
|
||||
const urlWithSlash = new URL(urlO);
|
||||
|
||||
if (urlO.pathname.endsWith("/")) {
|
||||
urlWithBare.pathname = urlWithBare.pathname.length === 1 ? urlWithBare.pathname : urlWithBare.pathname.slice(0, -1);
|
||||
urlWithHTML.pathname += "index.html";
|
||||
urlWithPHP.pathname += "index.php";
|
||||
} else if (urlO.pathname.endsWith("/index.html")) {
|
||||
urlWithPHP.pathname = urlWithPHP.pathname.slice(0, -"index.html".length) + "index.php";
|
||||
urlWithSlash.pathname = urlWithSlash.pathname.slice(0, -"index.html".length);
|
||||
urlWithBare.pathname = urlWithBare.pathname.slice(0, -"/index.html".length);
|
||||
} else if (urlO.pathname.endsWith("/index.php")) {
|
||||
urlWithHTML.pathname = urlWithHTML.pathname.slice(0, -"index.php".length) + "index.html";
|
||||
urlWithSlash.pathname = urlWithSlash.pathname.slice(0, -"index.php".length);
|
||||
urlWithBare.pathname = urlWithBare.pathname.slice(0, -"/index.php".length);
|
||||
} else {
|
||||
urlWithSlash.pathname += "/";
|
||||
urlWithHTML.pathname += "/index.html";
|
||||
urlWithPHP.pathname += "/index.php";
|
||||
}
|
||||
|
||||
return [urlWithHTML, urlWithPHP, urlWithSlash, urlWithBare];
|
||||
});
|
||||
|
||||
return permutations;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user