mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 17:48:59 +08:00
feat(generateURLPermutations): add tests
This commit is contained in:
parent
1acef8e49b
commit
8e4e49e471
33
apps/api/src/lib/crawl-redis.test.ts
Normal file
33
apps/api/src/lib/crawl-redis.test.ts
Normal file
@ -0,0 +1,33 @@
|
||||
import { generateURLPermutations } from "./crawl-redis";
|
||||
|
||||
// Whatever scheme / www. variant is passed in, the result is expected to be
// the same set of four hrefs: {http, https} x {bare host, www. host}.
describe("generateURLPermutations", () => {
  it("generates permutations correctly", () => {
    // The four hrefs every input below should expand to.
    const expectedHrefs = [
      "https://firecrawl.dev/",
      "https://www.firecrawl.dev/",
      "http://firecrawl.dev/",
      "http://www.firecrawl.dev/",
    ];

    // One input per scheme / www. combination, checked in this order.
    const inputs = [
      "https://firecrawl.dev",
      "http://firecrawl.dev",
      "https://www.firecrawl.dev",
      "http://www.firecrawl.dev",
    ];

    for (const input of inputs) {
      const hrefs = generateURLPermutations(input).map((permutation) => permutation.href);

      // Exactly four permutations, and each expected href is among them.
      expect(hrefs.length).toBe(4);
      for (const href of expectedHrefs) {
        expect(hrefs.includes(href)).toBe(true);
      }
    }
  });
});
|
@ -97,19 +97,7 @@ export function normalizeURL(url: string): string {
|
||||
return urlO.href;
|
||||
}
|
||||
|
||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
url = normalizeURL(url);
|
||||
|
||||
let res: boolean;
|
||||
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
} else {
|
||||
export function generateURLPermutations(url: string | URL): URL[] {
|
||||
const urlO = new URL(url);
|
||||
|
||||
// Construct two versions, one with www., one without
|
||||
@ -137,6 +125,23 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
||||
return [urlWithHTTP, urlWithHTTPS];
|
||||
});
|
||||
|
||||
return permutations;
|
||||
}
|
||||
|
||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
url = normalizeURL(url);
|
||||
|
||||
let res: boolean;
|
||||
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
} else {
|
||||
const permutations = generateURLPermutations(url);
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user