From 550d6bf7bd75c837bd9acb1985add07423c06455 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 28 May 2025 14:22:08 +0200 Subject: [PATCH] url splits and better url normalization --- .../src/scraper/scrapeURL/engines/index.ts | 3 +- .../scraper/scrapeURL/engines/index/index.ts | 42 ++------------ apps/api/src/services/index.ts | 58 +++++++++++++++++++ 3 files changed, 64 insertions(+), 39 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index c54a6f2b..6ee353a6 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -10,7 +10,8 @@ import { scrapePDF } from "./pdf"; import { scrapeURLWithFetch } from "./fetch"; import { scrapeURLWithPlaywright } from "./playwright"; import { scrapeCache } from "./cache"; -import { scrapeURLWithIndex, useIndex } from "./index/index"; +import { scrapeURLWithIndex } from "./index/index"; +import { useIndex } from "../../../services"; export type Engine = | "fire-engine;chrome-cdp" diff --git a/apps/api/src/scraper/scrapeURL/engines/index/index.ts b/apps/api/src/scraper/scrapeURL/engines/index/index.ts index adec1729..369728ca 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index/index.ts @@ -1,46 +1,10 @@ import { Document } from "../../../../controllers/v1/types"; import { EngineScrapeResult } from ".."; import { Meta } from "../.."; -import { getIndexFromGCS, index_supabase_service, saveIndexToGCS } from "../../../../services"; +import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits } from "../../../../services"; import { EngineError, IndexMissError } from "../../error"; import crypto from "crypto"; -export const useIndex = - process.env.INDEX_SUPABASE_URL !== "" && - process.env.INDEX_SUPABASE_URL !== undefined; - -function 
normalizeURLForIndex(url: string): string { - const urlObj = new URL(url); - urlObj.hash = ""; - urlObj.protocol = "https"; - - if (urlObj.port === "80" || urlObj.port === "443") { - urlObj.port = ""; - } - - if (urlObj.pathname.endsWith("/index.html")) { - urlObj.pathname = urlObj.pathname.slice(0, -10); - } else if (urlObj.pathname.endsWith("/index.php")) { - urlObj.pathname = urlObj.pathname.slice(0, -9); - } else if (urlObj.pathname.endsWith("/index.htm")) { - urlObj.pathname = urlObj.pathname.slice(0, -9); - } else if (urlObj.pathname.endsWith("/index.shtml")) { - urlObj.pathname = urlObj.pathname.slice(0, -11); - } else if (urlObj.pathname.endsWith("/index.xml")) { - urlObj.pathname = urlObj.pathname.slice(0, -9); - } - - if (urlObj.pathname.endsWith("/")) { - urlObj.pathname = urlObj.pathname.slice(0, -1); - } - - return urlObj.toString(); -} - -async function hashURL(url: string): Promise<string> { - return "\\x" + crypto.createHash("sha256").update(url).digest("hex"); -} - export async function sendDocumentToIndex(meta: Meta, document: Document) { if (meta.winnerEngine === "cache" || meta.winnerEngine === "index") { return document; @@ -53,7 +17,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) { const normalizedURL = normalizeURLForIndex(meta.url); const urlHash = await hashURL(normalizedURL); - const urlSplits = []; // TODO + const urlSplits = generateURLSplits(normalizedURL); const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split))); const indexId = crypto.randomUUID(); @@ -82,6 +46,8 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) { url_hash: urlHash, url_splits: urlSplits, url_splits_hash: urlSplitsHash, + original_url: document.metadata.sourceURL ?? meta.url, + resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? 
meta.url, }); if (error) { diff --git a/apps/api/src/services/index.ts b/apps/api/src/services/index.ts index c384d0ad..a5845d4b 100644 --- a/apps/api/src/services/index.ts +++ b/apps/api/src/services/index.ts @@ -2,6 +2,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import { logger } from "../lib/logger"; import { configDotenv } from "dotenv"; import { Storage } from "@google-cloud/storage"; +import crypto from "crypto"; configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on environment variables. @@ -130,3 +131,60 @@ export async function saveIndexToGCS(id: string, doc: { }); } } + +export const useIndex = + process.env.INDEX_SUPABASE_URL !== "" && + process.env.INDEX_SUPABASE_URL !== undefined; + +export function normalizeURLForIndex(url: string): string { + const urlObj = new URL(url); + urlObj.hash = ""; + urlObj.protocol = "https"; + + if (urlObj.port === "80" || urlObj.port === "443") { + urlObj.port = ""; + } + + if (urlObj.hostname.startsWith("www.")) { + urlObj.hostname = urlObj.hostname.slice(4); + } + + if (urlObj.pathname.endsWith("/index.html")) { + urlObj.pathname = urlObj.pathname.slice(0, -10); + } else if (urlObj.pathname.endsWith("/index.php")) { + urlObj.pathname = urlObj.pathname.slice(0, -9); + } else if (urlObj.pathname.endsWith("/index.htm")) { + urlObj.pathname = urlObj.pathname.slice(0, -9); + } else if (urlObj.pathname.endsWith("/index.shtml")) { + urlObj.pathname = urlObj.pathname.slice(0, -11); + } else if (urlObj.pathname.endsWith("/index.xml")) { + urlObj.pathname = urlObj.pathname.slice(0, -9); + } + + if (urlObj.pathname.endsWith("/")) { + urlObj.pathname = urlObj.pathname.slice(0, -1); + } + + return urlObj.toString(); +} + +export async function hashURL(url: string): Promise<string> { + return "\\x" + crypto.createHash("sha256").update(url).digest("hex"); +} + +export function generateURLSplits(url: string): string[] { + const urls: string[] = []; + const urlObj = new 
URL(url); + urlObj.hash = ""; + urlObj.search = ""; + const pathnameParts = urlObj.pathname.split("/"); + + for (let i = 0; i <= pathnameParts.length; i++) { + urlObj.pathname = pathnameParts.slice(0, i).join("/"); + urls.push(urlObj.href); + } + + urls.push(url); + + return [...new Set(urls.map(x => normalizeURLForIndex(x)))]; +} \ No newline at end of file