Nick: url normalization + max metadata size

This commit is contained in:
Nicolas 2024-12-30 20:04:22 -03:00
parent bf9d41d0b2
commit 7a31306be5
2 changed files with 29 additions and 12 deletions

View File

@ -17,7 +17,6 @@ import expressWs from "express-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
import { ZodError } from "zod"; import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { searchSimilarPages } from "./lib/extract/index/pinecone";
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -255,3 +254,4 @@ logger.info(`Worker ${process.pid} started`);
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
//

View File

@ -13,6 +13,8 @@ const pinecone = new Pinecone({
const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? ""; const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes
export interface PageMetadata { export interface PageMetadata {
url: string; url: string;
originUrl: string; originUrl: string;
@ -42,40 +44,54 @@ function normalizeUrl(url: string) {
return urlO.href; return urlO.href;
} }
export async function indexPage( export async function indexPage({
document: Document, document,
originUrl: string, originUrl,
crawlId?: string, crawlId,
teamId?: string teamId
}: {
document: Document;
originUrl: string;
crawlId?: string;
teamId?: string;
}
) { ) {
try { try {
const index = pinecone.index(INDEX_NAME); const index = pinecone.index(INDEX_NAME);
// Trim markdown if it's too long
let trimmedMarkdown = document.markdown;
if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
}
// Create text to embed // Create text to embed
const textToEmbed = [ const textToEmbed = [
document.metadata.title, document.metadata.title,
document.metadata.description, document.metadata.description,
document.markdown trimmedMarkdown
].filter(Boolean).join('\n\n'); ].filter(Boolean).join('\n\n');
// Get embedding from OpenAI // Get embedding from OpenAI
const embedding = await getEmbedding(textToEmbed); const embedding = await getEmbedding(textToEmbed);
const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);
// Prepare metadata // Prepare metadata
const metadata: PageMetadata = { const metadata: PageMetadata = {
url: normalizeUrl(document.metadata.sourceURL || document.metadata.url!), url: normalizedUrl,
originUrl: normalizeUrl(originUrl), originUrl: normalizeUrl(originUrl),
title: document.metadata.title, title: document.metadata.title,
description: document.metadata.description, description: document.metadata.description,
crawlId, crawlId,
teamId, teamId,
markdown: document.markdown, markdown: trimmedMarkdown,
timestamp: Date.now() timestamp: Date.now()
}; };
// Upsert to Pinecone // Upsert to Pinecone
await index.upsert([{ await index.upsert([{
id: document.metadata.sourceURL || document.metadata.url!, id: normalizedUrl,
values: embedding, values: embedding,
metadata: { metadata: {
...metadata, ...metadata,
@ -114,10 +130,11 @@ export async function searchSimilarPages(
includeMetadata: true includeMetadata: true
}; };
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
// Add filter if originUrl is provided // Add filter if originUrl is provided
if (originUrl) { if (normalizedOriginUrl) {
queryParams.filter = { queryParams.filter = {
[originUrl]: { $contains: normalizeUrl(originUrl) } originUrl: { $eq: normalizedOriginUrl }
}; };
} }