From 7a31306be591f4d27cad23e58e3f6d3e1c57b60e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 30 Dec 2024 20:04:22 -0300 Subject: [PATCH] Nick: url normalization + max metadata size --- apps/api/src/index.ts | 2 +- apps/api/src/lib/extract/index/pinecone.ts | 39 ++++++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index d4769283..20214d72 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -17,7 +17,6 @@ import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; -import { searchSimilarPages } from "./lib/extract/index/pinecone"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -255,3 +254,4 @@ logger.info(`Worker ${process.pid} started`); // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); +// diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts index 603cd38f..14c3bea4 100644 --- a/apps/api/src/lib/extract/index/pinecone.ts +++ b/apps/api/src/lib/extract/index/pinecone.ts @@ -13,6 +13,8 @@ const pinecone = new Pinecone({ const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? 
""; +const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes + export interface PageMetadata { url: string; originUrl: string; @@ -42,40 +44,54 @@ function normalizeUrl(url: string) { return urlO.href; } -export async function indexPage( - document: Document, - originUrl: string, - crawlId?: string, - teamId?: string +export async function indexPage({ + document, + originUrl, + crawlId, + teamId +}: { + document: Document; + originUrl: string; + crawlId?: string; + teamId?: string; +} ) { try { const index = pinecone.index(INDEX_NAME); + // Trim markdown if it's too long + let trimmedMarkdown = document.markdown; + if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) { + trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Heuristic: slice() counts UTF-16 code units, not bytes, so keep half the byte budget; text dominated by 3-byte UTF-8 characters (e.g. CJK) can still exceed MAX_METADATA_SIZE + } + // Create text to embed const textToEmbed = [ document.metadata.title, document.metadata.description, - document.markdown + trimmedMarkdown ].filter(Boolean).join('\n\n'); // Get embedding from OpenAI const embedding = await getEmbedding(textToEmbed); + const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!); + // Prepare metadata const metadata: PageMetadata = { - url: normalizeUrl(document.metadata.sourceURL || document.metadata.url!), + url: normalizedUrl, originUrl: normalizeUrl(originUrl), title: document.metadata.title, description: document.metadata.description, crawlId, teamId, - markdown: document.markdown, + markdown: trimmedMarkdown, timestamp: Date.now() }; // Upsert to Pinecone await index.upsert([{ - id: document.metadata.sourceURL || document.metadata.url!, + id: normalizedUrl, values: embedding, metadata: { ...metadata, @@ -114,10 +130,11 @@ export async function searchSimilarPages( includeMetadata: true }; + const normalizedOriginUrl = originUrl ?
normalizeUrl(originUrl) : undefined; // Add filter if originUrl is provided - if (originUrl) { + if (normalizedOriginUrl) { queryParams.filter = { - [originUrl]: { $contains: normalizeUrl(originUrl) } + originUrl: { $eq: normalizedOriginUrl } }; }