diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 33dc4afe..f2bb0604 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -203,6 +203,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
+  deduplicateSimilarURLs: z.boolean().default(true),
 }).strict(strictMessage);
 
 // export type CrawlerOptions = {
@@ -457,6 +458,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     ignoreSitemap: x.ignoreSitemap,
+    deduplicateSimilarURLs: x.deduplicateSimilarURLs,
   };
 }
 
@@ -470,7 +472,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
       allowBackwardLinks: x.allowBackwardCrawling,
       allowExternalLinks: x.allowExternalContentLinks,
       ignoreSitemap: x.ignoreSitemap,
-      // TODO: returnOnlyUrls support
+      deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts
new file mode 100644
index 00000000..eb9c81f1
--- /dev/null
+++ b/apps/api/src/lib/crawl-redis.test.ts
@@ -0,0 +1,33 @@
+import { generateURLPermutations } from "./crawl-redis";
+
+describe("generateURLPermutations", () => {
+  it("generates permutations correctly", () => {
+    const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
+    expect(bareHttps.length).toBe(4);
+    expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
+    expect(bareHttp.length).toBe(4);
+    expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
+    expect(wwwHttps.length).toBe(4);
+    expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
+    expect(wwwHttp.length).toBe(4);
+    expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+  })
+});
\ No newline at end of file
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 41cbb07c..34b164d2 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -90,6 +90,44 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
   return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
 }
 
+export function normalizeURL(url: string): string {
+  const urlO = new URL(url);
+  urlO.search = "";
+  urlO.hash = "";
+  return urlO.href;
+}
+
+export function generateURLPermutations(url: string | URL): URL[] {
+  const urlO = new URL(url);
+
+  // Construct two versions, one with www., one without
+  const urlWithWWW = new URL(urlO);
+  const urlWithoutWWW = new URL(urlO);
+  if (urlO.hostname.startsWith("www.")) {
+    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+  } else {
+    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+  }
+
+  let permutations = [urlWithWWW, urlWithoutWWW];
+
+  // Construct more versions for http/https
+  permutations = permutations.flatMap(urlO => {
+    if (!["http:", "https:"].includes(urlO.protocol)) {
+      return [urlO];
+    }
+
+    const urlWithHTTP = new URL(urlO);
+    const urlWithHTTPS = new URL(urlO);
+    urlWithHTTP.protocol = "http:";
+    urlWithHTTPS.protocol = "https:";
+
+    return [urlWithHTTP, urlWithHTTPS];
+  });
+
+  return permutations;
+}
+
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -97,16 +135,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
     }
   }
 
-  try {
-    const urlO = new URL(url);
-    urlO.search = "";
-    urlO.hash = "";
-    url = urlO.href;
-  } catch (error) {
-    logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
+  url = normalizeURL(url);
+
+  let res: boolean;
+  if (!sc.crawlerOptions.deduplicateSimilarURLs) {
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+  } else {
+    const permutations = generateURLPermutations(url);
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
   }
 
-  const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
   return res;
 }
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index c0bab316..e29187c2 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -23,6 +23,7 @@ import {
   getCrawl,
   getCrawlJobs,
   lockURL,
+  normalizeURL,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
@@ -318,6 +319,11 @@ async function processJob(job: Job & { id: string }, token: string) {
 
     if (job.data.crawl_id) {
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+
+      if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
+        logger.debug("Was redirected, locking new URL...");
+        await lockURL(job.data.crawl_id, sc, doc.metadata.url);
+      }
 
       await logJob({
         job_id: job.id as string,
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index bcf9d5b7..45c67cf8 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -86,6 +86,8 @@ export interface CrawlScrapeOptions {
     country?: string;
     languages?: string[];
   };
+  skipTlsVerification?: boolean;
+  removeBase64Images?: boolean;
 }
 
 export type Action = {
@@ -151,6 +153,7 @@ export interface CrawlParams {
   ignoreSitemap?: boolean;
   scrapeOptions?: CrawlScrapeOptions;
   webhook?: string;
+  deduplicateSimilarURLs?: boolean;
 }
 
 /**
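For context, here is a minimal sketch of what the two new helpers produce. The import path assumes the same directory layout as `apps/api/src/lib/crawl-redis.ts`, and the example URL is illustrative only:

```ts
import { normalizeURL, generateURLPermutations } from "./crawl-redis";

// normalizeURL strips the query string and fragment, keeping everything else.
const url = normalizeURL("https://www.firecrawl.dev/pricing?utm_source=x#plans");
// => "https://www.firecrawl.dev/pricing"

// generateURLPermutations crosses www/non-www with http/https, yielding four URLs.
const variants = generateURLPermutations(url).map(u => u.href);
// => (in some order)
//    "http://www.firecrawl.dev/pricing", "https://www.firecrawl.dev/pricing",
//    "http://firecrawl.dev/pricing",     "https://firecrawl.dev/pricing"
```

With `deduplicateSimilarURLs` enabled (the default), `lockURL` adds all four permutations to the `crawl:<id>:visited` set and only treats the URL as newly locked when every permutation was absent (`sadd` returning `permutations.length`), so e.g. `http://firecrawl.dev/x` and `https://www.firecrawl.dev/x` can no longer both be crawled. Passing `deduplicateSimilarURLs: false` in the crawl options restores the previous one-entry-per-exact-URL behavior.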