From f2e0bfbfe3048d7b52b44e1c472ddf915eff4134 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 3 Jan 2025 23:54:03 -0300
Subject: [PATCH] Nick: url normalization

---
 apps/api/src/lib/canonical-url.ts     | 15 +++++++++++++--
 apps/api/src/services/queue-worker.ts |  4 ++--
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts
index fedea09d..50570293 100644
--- a/apps/api/src/lib/canonical-url.ts
+++ b/apps/api/src/lib/canonical-url.ts
@@ -1,8 +1,19 @@
 export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
+
+export function normalizeUrlOnlyHostname(url: string) {
   try {
     const hostname = new URL(url).hostname;
     return hostname.replace(/^www\./, "");
   } catch (error) {
-    return url.replace(/^https?:\/\//, "").replace(/^www\./, "").split('/')[0];
+    return url
+      .replace(/^https?:\/\//, "")
+      .replace(/^www\./, "")
+      .split("/")[0];
   }
-}
\ No newline at end of file
+}
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 4fb08337..9e6f3d24 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -51,7 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
-import { normalizeUrl } from "../lib/canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
 
 configDotenv();
 
@@ -80,7 +80,7 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
-      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
       // Get all visited URLs from Redis
       const visitedUrls = await redisConnection.smembers(
         "crawl:" + job.data.crawl_id + ":visited",