mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 17:48:59 +08:00
feat(generateURLPermutations): add tests
This commit is contained in:
parent
1acef8e49b
commit
8e4e49e471
33
apps/api/src/lib/crawl-redis.test.ts
Normal file
33
apps/api/src/lib/crawl-redis.test.ts
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
import { generateURLPermutations } from "./crawl-redis";
|
||||||
|
|
||||||
|
describe("generateURLPermutations", () => {
  it("generates permutations correctly", () => {
    // Whatever spelling of the site is passed in (http/https, with or
    // without "www."), the same four hrefs are expected back.
    const expectedHrefs = [
      "https://firecrawl.dev/",
      "https://www.firecrawl.dev/",
      "http://firecrawl.dev/",
      "http://www.firecrawl.dev/",
    ];

    const inputs = [
      "https://firecrawl.dev",
      "http://firecrawl.dev",
      "https://www.firecrawl.dev",
      "http://www.firecrawl.dev",
    ];

    for (const input of inputs) {
      const hrefs = generateURLPermutations(input).map(permutation => permutation.href);

      expect(hrefs.length).toBe(4);
      for (const href of expectedHrefs) {
        expect(hrefs.includes(href)).toBe(true);
      }
    }
  });
});
|
@ -97,6 +97,37 @@ export function normalizeURL(url: string): string {
|
|||||||
return urlO.href;
|
return urlO.href;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Expands a URL into the variants that should be treated as the same page:
 * the host with and without a leading "www.", each under both "http:" and
 * "https:" when the URL already uses one of those two schemes.
 *
 * The returned URLs are unique by `href` and keep a stable order
 * (www-variant first, "http:" before "https:").
 *
 * @param url - The URL to expand, as a string or a `URL` instance. The input
 *   object is never mutated; all variants are fresh copies.
 * @returns The distinct variants. This is four entries for an ordinary
 *   http(s) domain, two for a non-http(s) scheme, and fewer when a host
 *   variant cannot be formed (see the note in the body).
 * @throws TypeError if `url` is a string that `new URL` cannot parse.
 */
export function generateURLPermutations(url: string | URL): URL[] {
  const urlO = new URL(url);

  // Construct two versions, one with www., one without
  const urlWithWWW = new URL(urlO);
  const urlWithoutWWW = new URL(urlO);
  if (urlO.hostname.startsWith("www.")) {
    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
  } else {
    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
  }

  // Construct more versions for http/https
  const candidates = [urlWithWWW, urlWithoutWWW].flatMap(variant => {
    if (!["http:", "https:"].includes(variant.protocol)) {
      return [variant];
    }

    const urlWithHTTP = new URL(variant);
    const urlWithHTTPS = new URL(variant);
    urlWithHTTP.protocol = "http:";
    urlWithHTTPS.protocol = "https:";

    return [urlWithHTTP, urlWithHTTPS];
  });

  // The hostname setter silently ignores values it cannot parse (e.g.
  // "www.[::1]" for an IPv6 literal, or an empty host), which leaves both host
  // variants identical. Drop the resulting duplicates so callers that compare
  // against `.length` (such as a Redis SADD reply) see only distinct URLs.
  const seenHrefs = new Set<string>();
  return candidates.filter(candidate => {
    if (seenHrefs.has(candidate.href)) {
      return false;
    }
    seenHrefs.add(candidate.href);
    return true;
  });
}
|
||||||
|
|
||||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
||||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||||
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
||||||
@ -110,33 +141,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
|||||||
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
|
if (!sc.crawlerOptions.deduplicateSimilarURLs) {
|
||||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||||
} else {
|
} else {
|
||||||
const urlO = new URL(url);
|
const permutations = generateURLPermutations(url);
|
||||||
|
|
||||||
// Construct two versions, one with www., one without
|
|
||||||
const urlWithWWW = new URL(urlO);
|
|
||||||
const urlWithoutWWW = new URL(urlO);
|
|
||||||
if (urlO.hostname.startsWith("www.")) {
|
|
||||||
urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
|
|
||||||
} else {
|
|
||||||
urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
|
|
||||||
}
|
|
||||||
|
|
||||||
let permutations = [urlWithWWW, urlWithoutWWW];
|
|
||||||
|
|
||||||
// Construct more versions for http/https
|
|
||||||
permutations = permutations.flatMap(urlO => {
|
|
||||||
if (!["http:", "https:"].includes(urlO.protocol)) {
|
|
||||||
return [urlO];
|
|
||||||
}
|
|
||||||
|
|
||||||
const urlWithHTTP = new URL(urlO);
|
|
||||||
const urlWithHTTPS = new URL(urlO);
|
|
||||||
urlWithHTTP.protocol = "http:";
|
|
||||||
urlWithHTTPS.protocol = "https:";
|
|
||||||
|
|
||||||
return [urlWithHTTP, urlWithHTTPS];
|
|
||||||
});
|
|
||||||
|
|
||||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
|
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user