From 6ecf24b85e26283bbaab77c6b58ca5726f69e6a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Fri, 8 Nov 2024 16:22:06 +0100
Subject: [PATCH 1/4] feat(crawl): URL deduplication

---
 apps/api/src/controllers/v1/types.ts  |  1 +
 apps/api/src/lib/crawl-redis.ts       | 47 +++++++++++++++++++++++----
 apps/api/src/services/queue-worker.ts |  6 ++++
 apps/js-sdk/firecrawl/src/index.ts    |  3 ++
 4 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 82b24c22..8a01e502 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -143,6 +143,7 @@ export const scrapeOptions = z.object({
   }).optional(),
   skipTlsVerification: z.boolean().default(false),
   removeBase64Images: z.boolean().default(true),
+  deduplicateSimilarURLs: z.boolean().default(true),
 }).strict(strictMessage)
 
 
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 41cbb07c..f1c1d956 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -90,6 +90,13 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
   return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
 }
 
+export function normalizeURL(url: string): string {
+  const urlO = new URL(url);
+  urlO.search = "";
+  urlO.hash = "";
+  return urlO.href;
+}
+
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -97,16 +104,42 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
     }
   }
 
-  try {
+  url = normalizeURL(url);
+
+  let res: boolean;
+  if (!sc.scrapeOptions.deduplicateSimilarURLs) {
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+  } else {
     const urlO = new URL(url);
-    urlO.search = "";
-    urlO.hash = "";
-    url = urlO.href;
-  } catch (error) {
-    logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
+
+    // Construct two versions, one with www., one without
+    const urlWithWWW = new URL(urlO);
+    const urlWithoutWWW = new URL(urlO);
+    if (urlO.hostname.startsWith("www.")) {
+      urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+    } else {
+      urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+    }
+
+    let permutations = [urlWithWWW, urlWithoutWWW];
+
+    // Construct more versions for http/https
+    permutations = permutations.flatMap(urlO => {
+      if (!["http:", "https:"].includes(urlO.protocol)) {
+        return [urlO];
+      }
+
+      const urlWithHTTP = new URL(urlO);
+      const urlWithHTTPS = new URL(urlO);
+      urlWithHTTP.protocol = "http:";
+      urlWithHTTPS.protocol = "https:";
+
+      return [urlWithHTTP, urlWithHTTPS];
+    });
+
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
   }
 
-  const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
   return res;
 }
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 36928873..9cb8326d 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -23,6 +23,7 @@ import {
   getCrawl,
   getCrawlJobs,
   lockURL,
+  normalizeURL,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
@@ -305,6 +306,11 @@ async function processJob(job: Job & { id: string }, token: string) {
 
   if (job.data.crawl_id) {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+
+    if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
+      logger.debug("Was redirected, locking new URL...");
+      await lockURL(job.data.crawl_id, sc, doc.metadata.url);
+    }
 
     await logJob({
       job_id: job.id as string,
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 7ad5a5f0..5c83170b 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -86,6 +86,9 @@ export interface CrawlScrapeOptions {
     country?: string;
     languages?: string[];
   };
+  skipTlsVerification?: boolean;
+  removeBase64Images?: boolean;
+  deduplicateSimilarURLs?: boolean;
 }
 
 export type Action = {

From dc3a4e27fdf02ccb166f22f364fc9a7b665946a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Fri, 8 Nov 2024 16:25:11 +0100
Subject: [PATCH 2/4] move param to the right place

---
 apps/api/src/controllers/v1/types.ts | 2 +-
 apps/api/src/lib/crawl-redis.ts      | 2 +-
 apps/js-sdk/firecrawl/src/index.ts   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 8a01e502..dd0faff7 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -143,7 +143,6 @@ export const scrapeOptions = z.object({
   }).optional(),
   skipTlsVerification: z.boolean().default(false),
   removeBase64Images: z.boolean().default(true),
-  deduplicateSimilarURLs: z.boolean().default(true),
 }).strict(strictMessage)
 
 
@@ -200,6 +199,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
+  deduplicateSimilarURLs: z.boolean().default(true),
 }).strict(strictMessage);
 
 // export type CrawlerOptions = {
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index f1c1d956..3d918263 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -107,7 +107,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   url = normalizeURL(url);
 
   let res: boolean;
-  if (!sc.scrapeOptions.deduplicateSimilarURLs) {
+  if (!sc.crawlerOptions.deduplicateSimilarURLs) {
     res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   } else {
     const urlO = new URL(url);
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 5c83170b..4f68f303 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -88,7 +88,6 @@ export interface CrawlScrapeOptions {
   };
   skipTlsVerification?: boolean;
   removeBase64Images?: boolean;
-  deduplicateSimilarURLs?: boolean;
 }
 
 export type Action = {
@@ -151,6 +150,7 @@ export interface CrawlParams {
   ignoreSitemap?: boolean;
   scrapeOptions?: CrawlScrapeOptions;
   webhook?: string;
+  deduplicateSimilarURLs?: boolean;
 }
 
 /**

From 1acef8e49b5be18f3acc03357d13f6f36b30e4d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Fri, 8 Nov 2024 17:11:22 +0100
Subject: [PATCH 3/4] fix: converter missing

---
 apps/api/src/controllers/v1/types.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index dd0faff7..a9449431 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -454,6 +454,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     ignoreSitemap: x.ignoreSitemap,
+    deduplicateSimilarURLs: x.deduplicateSimilarURLs,
   };
 }
 
@@ -467,7 +468,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
     allowBackwardLinks: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     ignoreSitemap: x.ignoreSitemap,
-    // TODO: returnOnlyUrls support
+    deduplicateSimilarURLs: x.deduplicateSimilarURLs,
   }),
   internalOptions: {
     v0CrawlOnlyUrls: x.returnOnlyUrls,

From 8e4e49e471eb9cf0fa87a001efbf3b04c99dc395 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Mon, 11 Nov 2024 20:29:17 +0100
Subject: [PATCH 4/4] feat(generateURLPermutations): add tests

---
 apps/api/src/lib/crawl-redis.test.ts | 33 ++++++++++++++++
 apps/api/src/lib/crawl-redis.ts      | 59 +++++++++++++++-------------
 2 files changed, 65 insertions(+), 27 deletions(-)
 create mode 100644 apps/api/src/lib/crawl-redis.test.ts

diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts
new file mode 100644
index 00000000..eb9c81f1
--- /dev/null
+++ b/apps/api/src/lib/crawl-redis.test.ts
@@ -0,0 +1,33 @@
+import { generateURLPermutations } from "./crawl-redis";
+
+describe("generateURLPermutations", () => {
+  it("generates permutations correctly", () => {
+    const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
+    expect(bareHttps.length).toBe(4);
+    expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
+    expect(bareHttp.length).toBe(4);
+    expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
+    expect(wwwHttps.length).toBe(4);
+    expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
+    expect(wwwHttp.length).toBe(4);
+    expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+  })
+});
\ No newline at end of file
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 3d918263..34b164d2 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -97,6 +97,37 @@ export function normalizeURL(url: string): string {
   return urlO.href;
 }
 
+export function generateURLPermutations(url: string | URL): URL[] {
+  const urlO = new URL(url);
+
+  // Construct two versions, one with www., one without
+  const urlWithWWW = new URL(urlO);
+  const urlWithoutWWW = new URL(urlO);
+  if (urlO.hostname.startsWith("www.")) {
+    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+  } else {
+    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+  }
+
+  let permutations = [urlWithWWW, urlWithoutWWW];
+
+  // Construct more versions for http/https
+  permutations = permutations.flatMap(urlO => {
+    if (!["http:", "https:"].includes(urlO.protocol)) {
+      return [urlO];
+    }
+
+    const urlWithHTTP = new URL(urlO);
+    const urlWithHTTPS = new URL(urlO);
+    urlWithHTTP.protocol = "http:";
+    urlWithHTTPS.protocol = "https:";
+
+    return [urlWithHTTP, urlWithHTTPS];
+  });
+
+  return permutations;
+}
+
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -110,33 +141,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (!sc.crawlerOptions.deduplicateSimilarURLs) {
     res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   } else {
-    const urlO = new URL(url);
-
-    // Construct two versions, one with www., one without
-    const urlWithWWW = new URL(urlO);
-    const urlWithoutWWW = new URL(urlO);
-    if (urlO.hostname.startsWith("www.")) {
-      urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
-    } else {
-      urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
-    }
-
-    let permutations = [urlWithWWW, urlWithoutWWW];
-
-    // Construct more versions for http/https
-    permutations = permutations.flatMap(urlO => {
-      if (!["http:", "https:"].includes(urlO.protocol)) {
-        return [urlO];
-      }
-
-      const urlWithHTTP = new URL(urlO);
-      const urlWithHTTPS = new URL(urlO);
-      urlWithHTTP.protocol = "http:";
-      urlWithHTTPS.protocol = "https:";
-
-      return [urlWithHTTP, urlWithHTTPS];
-    });
-
+    const permutations = generateURLPermutations(url);
     res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
   }
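
Taken together, these patches make similar-URL deduplication a crawl-level option: normalizeURL strips the query string and fragment, generateURLPermutations fans the normalized URL out across the http/https and www/non-www variants, and lockURL marks all variants as visited in a single Redis SADD, so a page reachable under several spellings is scraped only once. Because every permutation is marked at once, the queue-worker change in PATCH 1/4 additionally locks the post-redirect URL, so pages later discovered via the redirect target are deduplicated as well. Below is a minimal usage sketch of the new flag through the JS SDK; it assumes the SDK is consumed as the @mendable/firecrawl-js npm package, and the target URL, limit, and API key are illustrative placeholders, not part of the patches.

import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

  // deduplicateSimilarURLs defaults to true (PATCH 2/4 places it in
  // crawlerOptions). Setting it to false makes the crawler treat
  // http/https and www/non-www variants of a page as distinct URLs.
  const result = await app.crawlUrl("https://firecrawl.dev", {
    limit: 10,
    deduplicateSimilarURLs: false,
  });
  console.log(result);
}

main().catch(console.error);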