Merge pull request #1090 from mendableai/nsc/new-re-rank

Re-ranker changes
This commit is contained in:
Nicolas 2025-01-24 19:20:39 -03:00 committed by GitHub
commit fa5544add8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 693 additions and 46 deletions

View File

@ -283,4 +283,574 @@ describe("spreadSchemas", () => {
expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema);
});
// Selecting the root-level array property `pages` leaves nothing for the
// single-answer side; the multi-entity schema is expected to equal the input.
it("should spread pages schema", async () => {
  const pageItemSchema = {
    type: "object",
    properties: {
      title: {
        type: "string",
      },
    },
  };
  const schema = {
    type: "object",
    properties: {
      pages: {
        type: "array",
        items: pageItemSchema,
      },
    },
    required: ["pages"],
  };
  const multiEntityKeys = ["pages"];

  const result = await spreadSchemas(schema, multiEntityKeys);

  expect(result.singleAnswerSchema).toEqual({});
  expect(result.multiEntitySchema).toEqual(schema);
});
// Same schema as the plain `pages` case, but the key is the dot path
// `pages.title`. The expectation is that the whole `pages` property still
// ends up in the multi-entity schema and the single-answer schema is empty.
// The title is distinct from the sibling test so a failure report identifies
// which of the two scenarios broke.
it("should spread pages schema when the key is a nested path", async () => {
  const schema = {
    type: "object",
    properties: {
      pages: {
        type: "array",
        items: {
          type: "object",
          properties: {
            title: {
              type: "string",
            },
          },
        },
      },
    },
    required: ["pages"],
  };
  const keys = ["pages.title"];
  const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
    schema,
    keys,
  );
  expect(singleAnswerSchema).toEqual({});
  expect(multiEntitySchema).toEqual(schema);
});
// A dot-path key that points at an array nested two levels deep: the test
// expects the entire `company` root property to land in the multi-entity schema.
it("should handle deeply nested array properties", async () => {
  const employeeSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      role: { type: "string" },
    },
  };
  const departmentSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      employees: {
        type: "array",
        items: employeeSchema,
      },
    },
  };
  const schema = {
    type: "object",
    properties: {
      company: {
        type: "object",
        properties: {
          name: { type: "string" },
          departments: {
            type: "array",
            items: departmentSchema,
          },
        },
      },
    },
    required: ["company"],
  };
  const multiEntityKeys = ["company.departments.employees"];

  const result = await spreadSchemas(schema, multiEntityKeys);

  expect(result.singleAnswerSchema).toEqual({});
  expect(result.multiEntitySchema).toEqual(schema);
});
// Two dot-path keys, one per root property. Both roots (`user`, `orders`) are
// expected to move, leaving the single-answer schema empty.
it("should handle multiple nested paths", async () => {
  const contactSchema = {
    type: "object",
    properties: {
      email: { type: "string" },
      phone: { type: "string" },
    },
  };
  const orderLineSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      quantity: { type: "number" },
    },
  };
  const userSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      contacts: {
        type: "array",
        items: contactSchema,
      },
    },
  };
  const ordersSchema = {
    type: "array",
    items: {
      type: "object",
      properties: {
        id: { type: "string" },
        items: {
          type: "array",
          items: orderLineSchema,
        },
      },
    },
  };
  const schema = {
    type: "object",
    properties: {
      user: userSchema,
      orders: ordersSchema,
    },
    required: ["user", "orders"],
  };
  const multiEntityKeys = ["user.contacts", "orders.items"];

  const result = await spreadSchemas(schema, multiEntityKeys);

  expect(result.singleAnswerSchema).toEqual({});
  expect(result.multiEntitySchema).toEqual(schema);
});
// Only `sections` is listed as a key, so the test expects `metadata` (and its
// `required` entry) to stay on the single-answer side while `sections` (and
// its `required` entry) moves to the multi-entity side.
it("should handle mixed single and array properties", async () => {
  // Factories return fresh objects on every call so the expected values never
  // share references with the schema handed to spreadSchemas.
  const makeMetadata = () => ({
    type: "object",
    properties: {
      title: { type: "string" },
      description: { type: "string" },
    },
  });
  const makeSections = () => ({
    type: "array",
    items: {
      type: "object",
      properties: {
        title: { type: "string" },
        content: { type: "string" },
      },
    },
  });

  const schema = {
    type: "object",
    properties: {
      metadata: makeMetadata(),
      sections: makeSections(),
    },
    required: ["metadata", "sections"],
  };
  const multiEntityKeys = ["sections"];

  const result = await spreadSchemas(schema, multiEntityKeys);

  const expectedSingle = {
    type: "object",
    properties: {
      metadata: makeMetadata(),
    },
    required: ["metadata"],
  };
  const expectedMulti = {
    type: "object",
    properties: {
      sections: makeSections(),
    },
    required: ["sections"],
  };
  expect(result.singleAnswerSchema).toEqual(expectedSingle);
  expect(result.multiEntitySchema).toEqual(expectedMulti);
});
// With no keys nothing is moved: the single-answer schema is expected to equal
// the input and the multi-entity schema to be an empty object.
it("should handle empty keys array", async () => {
  const noKeys: string[] = [];
  const schema = {
    type: "object",
    properties: {
      name: { type: "string" },
      age: { type: "number" },
    },
    required: ["name"],
  };

  const result = await spreadSchemas(schema, noKeys);

  expect(result.singleAnswerSchema).toEqual(schema);
  expect(result.multiEntitySchema).toEqual({});
});
// The tail of the dot path does not exist in the schema, but its root (`user`)
// does; the test expects the whole `user` property to move regardless.
it("should handle non-existent paths", async () => {
  const userSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
    },
  };
  const schema = {
    type: "object",
    properties: {
      user: userSchema,
    },
  };
  const multiEntityKeys = ["user.nonexistent.path"];

  const result = await spreadSchemas(schema, multiEntityKeys);

  expect(result.singleAnswerSchema).toEqual({});
  expect(result.multiEntitySchema).toEqual(schema);
});
// it("should split nested object and array properties", async () => {
// const schema = {
// type: "object",
// properties: {
// company: {
// type: "object",
// properties: {
// name: { type: "string" },
// address: {
// type: "object",
// properties: {
// street: { type: "string" },
// city: { type: "string" },
// },
// },
// employees: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// position: { type: "string" },
// },
// },
// },
// },
// },
// },
// required: ["company"],
// };
// const keys = ["company.employees"];
// const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
// schema,
// keys,
// );
// expect(singleAnswerSchema).toEqual({
// type: "object",
// properties: {
// company: {
// type: "object",
// properties: {
// name: { type: "string" },
// address: {
// type: "object",
// properties: {
// street: { type: "string" },
// city: { type: "string" },
// },
// },
// },
// },
// },
// required: ["company"],
// });
// expect(multiEntitySchema).toEqual({
// type: "object",
// properties: {
// company: {
// type: "object",
// properties: {
// employees: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// position: { type: "string" },
// },
// },
// },
// },
// },
// },
// required: ["company"],
// });
// });
// it("should handle multiple root level properties with nested paths", async () => {
// const schema = {
// type: "object",
// properties: {
// user: {
// type: "object",
// properties: {
// id: { type: "string" },
// profile: {
// type: "object",
// properties: {
// name: { type: "string" },
// email: { type: "string" },
// },
// },
// posts: {
// type: "array",
// items: {
// type: "object",
// properties: {
// title: { type: "string" },
// content: { type: "string" },
// },
// },
// },
// },
// },
// settings: {
// type: "object",
// properties: {
// theme: { type: "string" },
// notifications: {
// type: "object",
// properties: {
// email: { type: "boolean" },
// push: { type: "boolean" },
// },
// },
// },
// },
// },
// required: ["user", "settings"],
// };
// const keys = ["user.posts", "settings.notifications"];
// const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
// schema,
// keys,
// );
// expect(singleAnswerSchema).toEqual({
// type: "object",
// properties: {
// user: {
// type: "object",
// properties: {
// id: { type: "string" },
// profile: {
// type: "object",
// properties: {
// name: { type: "string" },
// email: { type: "string" },
// },
// },
// },
// },
// settings: {
// type: "object",
// properties: {
// theme: { type: "string" },
// },
// },
// },
// required: ["user", "settings"],
// });
// expect(multiEntitySchema).toEqual({
// type: "object",
// properties: {
// user: {
// type: "object",
// properties: {
// posts: {
// type: "array",
// items: {
// type: "object",
// properties: {
// title: { type: "string" },
// content: { type: "string" },
// },
// },
// },
// },
// },
// settings: {
// type: "object",
// properties: {
// notifications: {
// type: "object",
// properties: {
// email: { type: "boolean" },
// push: { type: "boolean" },
// },
// },
// },
// },
// },
// required: ["user", "settings"],
// });
// });
// it("should handle array properties at different nesting levels", async () => {
// const schema = {
// type: "object",
// properties: {
// categories: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// subcategories: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// products: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// price: { type: "number" },
// },
// },
// },
// },
// },
// },
// },
// },
// },
// featured: {
// type: "object",
// properties: {
// category: { type: "string" },
// items: {
// type: "array",
// items: {
// type: "object",
// properties: {
// id: { type: "string" },
// name: { type: "string" },
// },
// },
// },
// },
// },
// },
// };
// const keys = ["categories.subcategories", "featured.items"];
// const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
// schema,
// keys,
// );
// expect(singleAnswerSchema).toEqual({
// type: "object",
// properties: {
// featured: {
// type: "object",
// properties: {
// category: { type: "string" },
// },
// },
// },
// });
// expect(multiEntitySchema).toEqual({
// type: "object",
// properties: {
// categories: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// subcategories: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// products: {
// type: "array",
// items: {
// type: "object",
// properties: {
// name: { type: "string" },
// price: { type: "number" },
// },
// },
// },
// },
// },
// },
// },
// },
// },
// featured: {
// type: "object",
// properties: {
// items: {
// type: "array",
// items: {
// type: "object",
// properties: {
// id: { type: "string" },
// name: { type: "string" },
// },
// },
// },
// },
// },
// },
// });
// });
});

View File

@ -31,9 +31,14 @@ Return only a concise sentece or 2 focused on the essential data points that the
}
/**
 * Builds the system prompt for the LLM link re-ranker.
 *
 * The prompt asks the model to score every provided URL between 0 and 1, to
 * return all links without omitting any, and to keep them in input order.
 * The superseded single-line prompt that used to precede this return has been
 * dropped; it made the text below unreachable.
 *
 * @returns The system prompt text.
 */
export function buildRerankerSystemPrompt(): string {
  return `You are a relevance expert scoring links from a website the user is trying to extract information from. Analyze the provided URLs and their content
to determine their relevance to the user's query and intent.
For each URL, assign a relevance score between 0 and 1, where 1
means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it.
Always return all the links scored that you are giving. Do not omit links.
Always return the links in the same order they were provided. If the user wants the content from all the links, all links should be scored 1.`;
}
/**
 * Builds the user prompt for the LLM link re-ranker.
 *
 * The superseded prompt (which told the model to return only links scoring
 * 0.6+) has been dropped; it preceded this return and made it unreachable.
 *
 * @param searchQuery - The user's extraction intent, embedded verbatim in quotes.
 * @returns The user prompt text.
 */
export function buildRerankerUserPrompt(searchQuery: string): string {
  return `Given these URLs, rank which ones are relevant to the user's extraction intent: "${searchQuery}".`;
}

View File

@ -1,3 +1,5 @@
import { logger } from "../../../lib/logger";
export async function spreadSchemas(
schema: any,
keys: string[],
@ -6,14 +8,45 @@ export async function spreadSchemas(
multiEntitySchema: any;
}> {
let singleAnswerSchema = { ...schema, properties: { ...schema.properties } };
let multiEntitySchema: any = { type: "object", properties: {} };
let multiEntitySchema: any = {
type: "object",
properties: {},
...(schema.required ? { required: [] } : {})
};
// Helper function to check if a property path exists in schema.
// Walks `path` one segment at a time through `properties`, stepping into
// `items.properties` when the current node is an array with `items`.
// Returns false (rather than throwing) when the schema has no `properties`
// or when the path continues past a node that has no nested properties.
// An empty path is considered to exist.
const hasPropertyPath = (schema: any, path: string[]): boolean => {
  let current = schema?.properties;
  for (const segment of path) {
    // `current` is undefined once we have stepped past a leaf node.
    const node = current?.[segment];
    if (!node) return false;
    current =
      node.type === "array" && node.items
        ? node.items.properties
        : node.properties;
  }
  return true;
};
// Returns the first segment of a dot-separated path, i.e. everything before
// the first "." (or the whole string when it contains no dot).
const getRootProperty = (path: string): string => {
  const firstDot = path.indexOf(".");
  if (firstDot === -1) {
    return path;
  }
  return path.slice(0, firstDot);
};
keys.forEach((key) => {
if (singleAnswerSchema.properties[key]) {
multiEntitySchema.properties[key] = singleAnswerSchema.properties[key];
delete singleAnswerSchema.properties[key];
const rootProperty = getRootProperty(key);
if (singleAnswerSchema.properties[rootProperty]) {
multiEntitySchema.properties[rootProperty] = singleAnswerSchema.properties[rootProperty];
delete singleAnswerSchema.properties[rootProperty];
// Move required field if it exists
if (schema.required?.includes(rootProperty)) {
multiEntitySchema.required.push(rootProperty);
singleAnswerSchema.required = schema.required.filter((k: string) => k !== rootProperty);
}
}
});
// Recursively delete empty properties in singleAnswerSchema
const deleteEmptyProperties = (schema: any) => {
for (const key in schema.properties) {
@ -34,10 +67,14 @@ export async function spreadSchemas(
// If singleAnswerSchema has no properties left, return an empty object
if (Object.keys(singleAnswerSchema.properties).length === 0) {
singleAnswerSchema = {};
} else if (singleAnswerSchema.required?.length === 0) {
delete singleAnswerSchema.required;
}
if (Object.keys(multiEntitySchema.properties).length === 0) {
multiEntitySchema = {};
} else if (multiEntitySchema.required?.length === 0) {
delete multiEntitySchema.required;
}
return {

View File

@ -8,6 +8,7 @@ import { searchSimilarPages } from "./index/pinecone";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildRerankerUserPrompt } from "./build-prompts";
import { buildRerankerSystemPrompt } from "./build-prompts";
import { dumpToFile } from "./helpers/dump-to-file";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
@ -158,24 +159,27 @@ function filterAndProcessLinks(
}
/**
 * Result of an LLM re-ranking run.
 *
 * `mapDocument` was declared twice; only the widened declaration is kept. It
 * is assignable from a plain `MapDocument[]` because both extra fields are
 * optional.
 */
export type RerankerResult = {
  /** Re-ranked links, optionally annotated with the model's score and its stated reason. */
  mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[];
  /** Token count reported for the run. */
  tokensUsed: number;
};
export async function rerankLinksWithLLM(
mappedLinks: MapDocument[],
searchQuery: string,
urlTraces: URLTrace[],
): Promise<RerankerResult> {
/** Options object accepted by `rerankLinksWithLLM`. */
export type RerankerOptions = {
  /** Candidate links to score; the re-ranker splits them into chunks before scoring. */
  links: MapDocument[];
  /** Query text the links are scored against. */
  searchQuery: string;
  /** NOTE(review): presumably per-URL trace records updated during re-ranking — usage not visible here, confirm. */
  urlTraces: URLTrace[];
};
export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
const { links, searchQuery, urlTraces } = options;
const chunkSize = 100;
const chunks: MapDocument[][] = [];
const TIMEOUT_MS = 20000;
const MAX_RETRIES = 2;
let totalTokensUsed = 0;
// Split mappedLinks into chunks of `chunkSize` (100)
for (let i = 0; i < mappedLinks.length; i += chunkSize) {
chunks.push(mappedLinks.slice(i, i + chunkSize));
// Split links into chunks of `chunkSize` (100)
for (let i = 0; i < links.length; i += chunkSize) {
chunks.push(links.slice(i, i + chunkSize));
}
// console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`);
@ -190,8 +194,9 @@ export async function rerankLinksWithLLM(
properties: {
url: { type: "string" },
relevanceScore: { type: "number" },
reason: { type: "string", description: "The reason why you chose the score for this link given the intent." },
},
required: ["url", "relevanceScore"],
required: ["url", "relevanceScore", "reason"],
},
},
},
@ -215,6 +220,7 @@ export async function rerankLinksWithLLM(
setTimeout(() => resolve(null), TIMEOUT_MS);
});
// dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
const completionPromise = generateOpenAICompletions(
logger.child({
method: "rerankLinksWithLLM",
@ -229,7 +235,7 @@ export async function rerankLinksWithLLM(
},
linksContent,
undefined,
true,
true
);
const completion = await Promise.race([
@ -275,10 +281,15 @@ export async function rerankLinksWithLLM(
// Map back to MapDocument format, keeping only relevant links
const relevantLinks = flattenedResults
.map((result) => mappedLinks.find((link) => link.url === result.url))
.filter((link): link is MapDocument => link !== undefined);
.map((result) => {
const link = links.find((link) => link.url === result.url);
if (link) {
return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason };
}
return undefined;
})
.filter((link): link is NonNullable<typeof link> => link !== undefined);
// console.log(`Returning ${relevantLinks.length} relevant links`);
return {
mapDocument: relevantLinks,
tokensUsed: totalTokensUsed,

View File

@ -203,38 +203,62 @@ export async function processUrl(
rephrasedPrompt
});
logger.info("Reranking (pass 1)...");
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
rephrasedPrompt,
urlTraces,
);
mappedLinks = rerankerResult.mapDocument;
let rerankedLinks = mappedLinks;
logger.info("Reranking pass 1 (threshold 0.8)...");
const rerankerResult = await rerankLinksWithLLM({
links: rerankedLinks,
searchQuery: rephrasedPrompt,
urlTraces
});
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.8);
let tokensUsed = rerankerResult.tokensUsed;
logger.info("Reranked! (pass 1)", {
linkCount: mappedLinks.length,
logger.info("Reranked! (threshold 0.8)", {
linkCount: rerankedLinks.length,
});
// 2nd Pass, useful for when the first pass returns too many links
if (mappedLinks.length > 100) {
logger.info("Reranking (pass 2)...");
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
rephrasedPrompt,
urlTraces,
);
mappedLinks = rerankerResult.mapDocument;
tokensUsed += rerankerResult.tokensUsed;
logger.info("Reranked! (pass 2)", {
linkCount: mappedLinks.length,
// lower threshold to 0.6 if no links are found
if (rerankedLinks.length === 0) {
logger.info("No links found. Reranking with threshold 0.6");
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
logger.info("Reranked! (threshold 0.6)", {
linkCount: rerankedLinks.length,
});
}
// dumpToFile(
// "llm-links.txt",
// mappedLinks,
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
// lower threshold to 0.3 if no links are found
if (rerankedLinks.length === 0) {
logger.info("No links found. Reranking with threshold 0.3");
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3);
logger.info("Reranked! (threshold 0.3)", {
linkCount: rerankedLinks.length,
});
}
// 2nd Pass, useful for when the first pass returns too many links
if (rerankedLinks.length > 100) {
logger.info("Reranking pass 2 (> 100 links - threshold 0.6)...");
const secondPassRerankerResult = await rerankLinksWithLLM({
links: rerankedLinks,
searchQuery: rephrasedPrompt,
urlTraces,
});
// why 0.6? average? experimental results?
if (secondPassRerankerResult.mapDocument.length > 0) {
rerankedLinks = secondPassRerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
logger.info("Reranked! (threshold 0.6)", {
linkCount: rerankedLinks.length,
});
}
}
// If no relevant links are found, return the original mapped links
if (rerankedLinks.length === 0) {
logger.info("No links found. Not reranking.");
rerankedLinks = mappedLinks;
}
// Remove title and description from mappedLinks
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
return mappedLinks.map((x) => x.url);