feat(generateURLPermutations): add tests

This commit is contained in:
Gergő Móricz 2024-11-11 20:29:17 +01:00
parent 1acef8e49b
commit 8e4e49e471
2 changed files with 65 additions and 27 deletions

View File

@ -0,0 +1,33 @@
import { generateURLPermutations } from "./crawl-redis";
describe("generateURLPermutations", () => {
it("generates permutations correctly", () => {
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
expect(bareHttps.length).toBe(4);
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
expect(bareHttp.length).toBe(4);
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
expect(wwwHttps.length).toBe(4);
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
expect(wwwHttp.length).toBe(4);
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
})
});

View File

@ -97,19 +97,7 @@ export function normalizeURL(url: string): string {
return urlO.href;
}
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") {
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
return false;
}
}
url = normalizeURL(url);
let res: boolean;
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
} else {
export function generateURLPermutations(url: string | URL): URL[] {
const urlO = new URL(url);
// Construct two versions, one with www., one without
@ -137,6 +125,23 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
return [urlWithHTTP, urlWithHTTPS];
});
return permutations;
}
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") {
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
return false;
}
}
url = normalizeURL(url);
let res: boolean;
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
} else {
const permutations = generateURLPermutations(url);
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
}