mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 15:39:13 +08:00
feat(generateURLPermutations): add tests
This commit is contained in:
parent
1acef8e49b
commit
8e4e49e471
33
apps/api/src/lib/crawl-redis.test.ts
Normal file
33
apps/api/src/lib/crawl-redis.test.ts
Normal file
@ -0,0 +1,33 @@
|
||||
import { generateURLPermutations } from "./crawl-redis";
|
||||
|
||||
describe("generateURLPermutations", () => {
  it("generates permutations correctly", () => {
    // Whatever scheme / "www." spelling the input uses, the result must be
    // the same four hrefs: {http, https} x {bare host, www. host}.
    const expectedHrefs = [
      "https://firecrawl.dev/",
      "https://www.firecrawl.dev/",
      "http://firecrawl.dev/",
      "http://www.firecrawl.dev/",
    ];

    // Bare https, bare http, www https, www http -- in that order.
    const inputs = [
      "https://firecrawl.dev",
      "http://firecrawl.dev",
      "https://www.firecrawl.dev",
      "http://www.firecrawl.dev",
    ];

    for (const input of inputs) {
      const hrefs = generateURLPermutations(input).map(permutation => permutation.href);

      expect(hrefs.length).toBe(4);
      for (const expectedHref of expectedHrefs) {
        expect(hrefs.includes(expectedHref)).toBe(true);
      }
    }
  });
});
|
@ -97,6 +97,37 @@ export function normalizeURL(url: string): string {
|
||||
return urlO.href;
|
||||
}
|
||||
|
||||
/**
 * Expands a URL into the variants that are treated as "the same page":
 * the host with and without a leading `www.`, each under both `http:` and
 * `https:` (the scheme is only varied when the input is already http/https).
 *
 * The returned URLs are unique by `href`. This matters when the `www.` variant
 * cannot be formed: for IP-literal hosts (e.g. `127.0.0.1`, `[::1]`) the
 * `hostname` setter silently rejects the invalid value, which would otherwise
 * leave two identical host variants and therefore duplicate results. Callers
 * that compare a Redis `SADD` count against `permutations.length` rely on
 * there being no duplicates.
 *
 * @param url - Absolute URL, as a string or `URL` object. The input is not mutated.
 * @returns Distinct URL variants, in the order www/http, www/https, bare/http, bare/https
 *          (for non-http(s) schemes: www, bare).
 * @throws TypeError if `url` is a string that is not a valid absolute URL.
 */
export function generateURLPermutations(url: string | URL): URL[] {
  const urlO = new URL(url);

  // Construct two versions, one with www., one without
  const urlWithWWW = new URL(urlO.href);
  const urlWithoutWWW = new URL(urlO.href);
  if (urlO.hostname.startsWith("www.")) {
    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
  } else {
    // NOTE: this assignment is a no-op when "www." + host is not a valid host
    // (IP literals); the de-duplication below handles that case.
    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
  }

  // Construct more versions for http/https
  const permutations = [urlWithWWW, urlWithoutWWW].flatMap(hostVariant => {
    if (!["http:", "https:"].includes(hostVariant.protocol)) {
      return [hostVariant];
    }

    const urlWithHTTP = new URL(hostVariant.href);
    const urlWithHTTPS = new URL(hostVariant.href);
    urlWithHTTP.protocol = "http:";
    urlWithHTTPS.protocol = "https:";

    return [urlWithHTTP, urlWithHTTPS];
  });

  // Drop duplicates (by serialized href), keeping the first occurrence so the
  // output order stays stable.
  const seenHrefs = new Set<string>();
  return permutations.filter(permutation => {
    if (seenHrefs.has(permutation.href)) {
      return false;
    }
    seenHrefs.add(permutation.href);
    return true;
  });
}
|
||||
|
||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
||||
@ -110,33 +141,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
||||
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
} else {
|
||||
const urlO = new URL(url);
|
||||
|
||||
// Construct two versions, one with www., one without
|
||||
const urlWithWWW = new URL(urlO);
|
||||
const urlWithoutWWW = new URL(urlO);
|
||||
if (urlO.hostname.startsWith("www.")) {
|
||||
urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
|
||||
} else {
|
||||
urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
|
||||
}
|
||||
|
||||
let permutations = [urlWithWWW, urlWithoutWWW];
|
||||
|
||||
// Construct more versions for http/https
|
||||
permutations = permutations.flatMap(urlO => {
|
||||
if (!["http:", "https:"].includes(urlO.protocol)) {
|
||||
return [urlO];
|
||||
}
|
||||
|
||||
const urlWithHTTP = new URL(urlO);
|
||||
const urlWithHTTPS = new URL(urlO);
|
||||
urlWithHTTP.protocol = "http:";
|
||||
urlWithHTTPS.protocol = "https:";
|
||||
|
||||
return [urlWithHTTP, urlWithHTTPS];
|
||||
});
|
||||
|
||||
const permutations = generateURLPermutations(url);
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user