Nick: url normalization

This commit is contained in:
Nicolas 2025-01-03 23:54:03 -03:00
parent f25c0c6d21
commit f2e0bfbfe3
2 changed files with 15 additions and 4 deletions

View File

@ -1,8 +1,19 @@
/**
 * Normalizes a URL for comparison: strips the http(s) scheme and a
 * leading "www.", and removes a single trailing slash. The path and
 * query (if any) are otherwise preserved.
 */
export function normalizeUrl(url: string) {
  const stripped = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  return stripped.endsWith("/") ? stripped.slice(0, -1) : stripped;
}
export function normalizeUrlOnlyHostname(url: string) {
try {
const hostname = new URL(url).hostname;
return hostname.replace(/^www\./, "");
} catch (error) {
return url.replace(/^https?:\/\//, "").replace(/^www\./, "").split('/')[0];
return url
.replace(/^https?:\/\//, "")
.replace(/^www\./, "")
.split("/")[0];
}
}
}

View File

@ -51,7 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
import { normalizeUrl } from "../lib/canonical-url";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
configDotenv();
@ -80,7 +80,7 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) {
(async () => {
const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
// Get all visited URLs from Redis
const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited",