url splits and better url normalization

This commit is contained in:
Gergő Móricz 2025-05-28 14:22:08 +02:00
parent 1d733d169a
commit 550d6bf7bd
3 changed files with 64 additions and 39 deletions

View File

@ -10,7 +10,8 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";
import { scrapeURLWithIndex, useIndex } from "./index/index";
import { scrapeURLWithIndex } from "./index/index";
import { useIndex } from "../../../services";
export type Engine =
| "fire-engine;chrome-cdp"

View File

@ -1,46 +1,10 @@
import { Document } from "../../../../controllers/v1/types";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { getIndexFromGCS, index_supabase_service, saveIndexToGCS } from "../../../../services";
import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits } from "../../../../services";
import { EngineError, IndexMissError } from "../../error";
import crypto from "crypto";
/**
 * Whether the index is enabled: true only when the INDEX_SUPABASE_URL
 * environment variable is set to a non-empty string.
 */
export const useIndex = (process.env.INDEX_SUPABASE_URL ?? "") !== "";
/**
 * Produces the canonical form of a URL used as an index key.
 *
 * The fragment is dropped, the scheme is forced to https, ports 80 and 443
 * are removed, a trailing index document (index.html / .php / .htm / .shtml /
 * .xml) is stripped, and a single trailing slash is removed from the path.
 *
 * @param url - Absolute URL to normalize.
 * @returns The normalized URL string.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
function normalizeURLForIndex(url: string): string {
  const target = new URL(url);
  target.hash = "";
  target.protocol = "https";

  switch (target.port) {
    case "80":
    case "443":
      target.port = "";
      break;
  }

  // Replace a trailing "/index.<ext>" with "/" so that only the file name is removed.
  const indexDocument = /\/index\.(?:html|php|htm|shtml|xml)$/;
  if (indexDocument.test(target.pathname)) {
    target.pathname = target.pathname.replace(indexDocument, "/");
  }

  // Re-read the pathname: the step above may have left a trailing slash behind.
  if (target.pathname.endsWith("/")) {
    target.pathname = target.pathname.slice(0, -1);
  }

  return target.toString();
}
/**
 * Computes the SHA-256 digest of a URL string.
 *
 * @param url - The string to hash (hashed exactly as given).
 * @returns The lowercase hex digest prefixed with a literal backslash and "x".
 */
async function hashURL(url: string): Promise<string> {
  const hexDigest = crypto.createHash("sha256").update(url).digest("hex");
  return ["\\x", hexDigest].join("");
}
export async function sendDocumentToIndex(meta: Meta, document: Document) {
if (meta.winnerEngine === "cache" || meta.winnerEngine === "index") {
return document;
@ -53,7 +17,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
const normalizedURL = normalizeURLForIndex(meta.url);
const urlHash = await hashURL(normalizedURL);
const urlSplits = []; // TODO
const urlSplits = generateURLSplits(normalizedURL);
const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split)));
const indexId = crypto.randomUUID();
@ -82,6 +46,8 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
url_hash: urlHash,
url_splits: urlSplits,
url_splits_hash: urlSplitsHash,
original_url: document.metadata.sourceURL ?? meta.url,
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
});
if (error) {

View File

@ -2,6 +2,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { logger } from "../lib/logger";
import { configDotenv } from "dotenv";
import { Storage } from "@google-cloud/storage";
import crypto from "crypto";
configDotenv();
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
@ -130,3 +131,60 @@ export async function saveIndexToGCS(id: string, doc: {
});
}
}
/**
 * Whether the index is enabled: true only when the INDEX_SUPABASE_URL
 * environment variable is set to a non-empty string.
 */
export const useIndex = (process.env.INDEX_SUPABASE_URL ?? "") !== "";
/**
 * Produces the canonical form of a URL used as an index key.
 *
 * Steps, in order: drop the fragment, force the scheme to https, remove
 * ports 80 and 443, strip a leading "www." from the hostname, strip a
 * trailing index document (index.html / .php / .htm / .shtml / .xml), and
 * remove a single trailing slash from the path. The query string is kept.
 *
 * @param url - Absolute URL to normalize.
 * @returns The normalized URL string.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
export function normalizeURLForIndex(url: string): string {
  const parsed = new URL(url);
  parsed.hash = "";
  parsed.protocol = "https";

  const hasDefaultPort = ["80", "443"].includes(parsed.port);
  if (hasDefaultPort) {
    parsed.port = "";
  }

  const wwwPrefix = "www.";
  if (parsed.hostname.startsWith(wwwPrefix)) {
    parsed.hostname = parsed.hostname.slice(wwwPrefix.length);
  }

  // Only the file name is cut off; the slash in front of it stays for now.
  const indexFiles = ["index.html", "index.php", "index.htm", "index.shtml", "index.xml"];
  const matchedFile = indexFiles.find((file) => parsed.pathname.endsWith("/" + file));
  if (matchedFile !== undefined) {
    parsed.pathname = parsed.pathname.slice(0, -matchedFile.length);
  }

  // Re-read the pathname: the step above may have left a trailing slash behind.
  if (parsed.pathname.endsWith("/")) {
    parsed.pathname = parsed.pathname.slice(0, -1);
  }

  return parsed.toString();
}
/**
 * Computes the SHA-256 digest of a URL string.
 *
 * @param url - The string to hash (hashed exactly as given).
 * @returns The lowercase hex digest prefixed with a literal backslash and "x".
 */
export async function hashURL(url: string): Promise<string> {
  const hasher = crypto.createHash("sha256");
  hasher.update(url);
  return `\\x${hasher.digest("hex")}`;
}
/**
 * Builds the list of "split" URLs for a given URL: the site root and every
 * cumulative path prefix (with query string and fragment removed), followed
 * by the URL exactly as passed in.
 *
 * Every candidate is run through `normalizeURLForIndex`, and duplicates are
 * dropped while keeping first-seen order.
 *
 * @param url - Absolute URL to split.
 * @returns De-duplicated, normalized URLs ordered from the root towards the full URL.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
export function generateURLSplits(url: string): string[] {
  const base = new URL(url);
  base.hash = "";
  base.search = "";

  // Snapshot the segments up front; `base.pathname` is overwritten below.
  const segments = base.pathname.split("/");
  const candidates = Array.from({ length: segments.length + 1 }, (_, depth) => {
    base.pathname = segments.slice(0, depth).join("/");
    return base.href;
  });
  candidates.push(url);

  const unique = new Set<string>();
  for (const candidate of candidates) {
    unique.add(normalizeURLForIndex(candidate));
  }
  return Array.from(unique);
}