diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 82b24c22..8a01e502 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -143,6 +143,7 @@ export const scrapeOptions = z.object({
   }).optional(),
   skipTlsVerification: z.boolean().default(false),
   removeBase64Images: z.boolean().default(true),
+  deduplicateSimilarURLs: z.boolean().default(true),
 }).strict(strictMessage)
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 41cbb07c..f1c1d956 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -90,6 +90,13 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
   return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
 }
 
+export function normalizeURL(url: string): string {
+  const urlO = new URL(url);
+  urlO.search = "";
+  urlO.hash = "";
+  return urlO.href;
+}
+
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -97,16 +104,42 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
     }
   }
 
-  try {
+  url = normalizeURL(url);
+
+  let res: boolean;
+  if (!sc.scrapeOptions.deduplicateSimilarURLs) {
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+  } else {
     const urlO = new URL(url);
-    urlO.search = "";
-    urlO.hash = "";
-    url = urlO.href;
-  } catch (error) {
-    logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
+
+    // Construct two versions, one with www., one without
+    const urlWithWWW = new URL(urlO);
+    const urlWithoutWWW = new URL(urlO);
+    if (urlO.hostname.startsWith("www.")) {
+      urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+    } else {
+      urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+    }
+
+    let permutations = [urlWithWWW, urlWithoutWWW];
+
+    // Construct more versions for http/https
+    permutations = permutations.flatMap(urlO => {
+      if (!["http:", "https:"].includes(urlO.protocol)) {
+        return [urlO];
+      }
+
+      const urlWithHTTP = new URL(urlO);
+      const urlWithHTTPS = new URL(urlO);
+      urlWithHTTP.protocol = "http:";
+      urlWithHTTPS.protocol = "https:";
+
+      return [urlWithHTTP, urlWithHTTPS];
+    });
+
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
   }
-  const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
 
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
   return res;
 }
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 36928873..9cb8326d 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -23,6 +23,7 @@ import {
   getCrawl,
   getCrawlJobs,
   lockURL,
+  normalizeURL,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
@@ -305,6 +306,11 @@ async function processJob(job: Job & { id: string }, token: string) {
 
   if (job.data.crawl_id) {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+
+    if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
+      logger.debug("Was redirected, locking new URL...");
+      await lockURL(job.data.crawl_id, sc, doc.metadata.url);
+    }
 
     await logJob({
       job_id: job.id as string,
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 7ad5a5f0..5c83170b 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -86,6 +86,9 @@ export interface CrawlScrapeOptions {
     country?: string;
     languages?: string[];
   };
+  skipTlsVerification?: boolean;
+  removeBase64Images?: boolean;
+  deduplicateSimilarURLs?: boolean;
 }
 
 export type Action = {
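For reviewers, here is a rough standalone sketch of what the new lockURL branch does when deduplicateSimilarURLs is on: every URL expands to its www/non-www and http/https spellings, and SADD-ing the whole set marks all variants visited in one shot. The lock succeeds only if SADD reports that every member was new (=== permutations.length), so if any variant was already visited the page is skipped. The queue-worker change builds on this by re-locking the post-redirect URL, so a redirect target reached later under its own address is also deduplicated. The helper name urlPermutations below is invented for illustration and is not part of this PR:

// Standalone sketch of the permutation logic; runnable under Node (URL is global).
function urlPermutations(url: string): string[] {
  const base = new URL(url);
  base.search = "";
  base.hash = "";

  // www. / bare-host variants
  const withWWW = new URL(base);
  const withoutWWW = new URL(base);
  if (base.hostname.startsWith("www.")) {
    withoutWWW.hostname = base.hostname.slice(4);
  } else {
    withWWW.hostname = "www." + base.hostname;
  }

  // http/https variants of each (non-HTTP schemes pass through unchanged)
  return [withWWW, withoutWWW].flatMap((u) => {
    if (!["http:", "https:"].includes(u.protocol)) return [u.href];
    const http = new URL(u);
    const https = new URL(u);
    http.protocol = "http:";
    https.protocol = "https:";
    return [http.href, https.href];
  });
}

// All four spellings of the same page map to the same set of keys:
console.log(urlPermutations("http://example.com/page?utm=1"));
// [ "http://www.example.com/page", "https://www.example.com/page",
//   "http://example.com/page", "https://example.com/page" ]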
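On the SDK side, the new CrawlScrapeOptions fields surface the existing skipTlsVerification and removeBase64Images options along with the new flag. A minimal usage sketch, assuming the usual v1 crawlUrl entry point (the API key is a placeholder):

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-..." });

// deduplicateSimilarURLs defaults to true server-side; pass false to treat
// http/https and www/non-www variants of a page as distinct URLs.
const result = await app.crawlUrl("https://example.com", {
  scrapeOptions: {
    deduplicateSimilarURLs: false,
  },
});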