Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-12 07:39:00 +08:00
Merge pull request #1090 from mendableai/nsc/new-re-rank
Re-ranker changes
Commit fa5544add8
@@ -283,4 +283,574 @@ describe("spreadSchemas", () => {
    expect(singleAnswerSchema).toEqual({});
    expect(multiEntitySchema).toEqual(schema);
  });

  it("should spread pages schema", async () => {
    const schema = {
      type: "object",
      properties: {
        pages: {
          type: "array",
          items: {
            type: "object",
            properties: {
              title: {
                type: "string",
              },
            },
          },
        },
      },
      required: ["pages"],
    };

    const keys = ["pages"];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual({});
    expect(multiEntitySchema).toEqual(schema);
  });

  it("should spread pages schema", async () => {
    const schema = {
      type: "object",
      properties: {
        pages: {
          type: "array",
          items: {
            type: "object",
            properties: {
              title: {
                type: "string",
              },
            },
          },
        },
      },
      required: ["pages"],
    };

    const keys = ["pages.title"];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual({});
    expect(multiEntitySchema).toEqual(schema);
  });

  it("should handle deeply nested array properties", async () => {
    const schema = {
      type: "object",
      properties: {
        company: {
          type: "object",
          properties: {
            name: { type: "string" },
            departments: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  name: { type: "string" },
                  employees: {
                    type: "array",
                    items: {
                      type: "object",
                      properties: {
                        name: { type: "string" },
                        role: { type: "string" },
                      },
                    },
                  },
                },
              },
            },
          },
        },
      },
      required: ["company"],
    };

    const keys = ["company.departments.employees"];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual({});
    expect(multiEntitySchema).toEqual(schema);
  });

  it("should handle multiple nested paths", async () => {
    const schema = {
      type: "object",
      properties: {
        user: {
          type: "object",
          properties: {
            name: { type: "string" },
            contacts: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  email: { type: "string" },
                  phone: { type: "string" },
                },
              },
            },
          },
        },
        orders: {
          type: "array",
          items: {
            type: "object",
            properties: {
              id: { type: "string" },
              items: {
                type: "array",
                items: {
                  type: "object",
                  properties: {
                    name: { type: "string" },
                    quantity: { type: "number" },
                  },
                },
              },
            },
          },
        },
      },
      required: ["user", "orders"],
    };

    const keys = ["user.contacts", "orders.items"];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual({});
    expect(multiEntitySchema).toEqual(schema);
  });

  it("should handle mixed single and array properties", async () => {
    const schema = {
      type: "object",
      properties: {
        metadata: {
          type: "object",
          properties: {
            title: { type: "string" },
            description: { type: "string" },
          },
        },
        sections: {
          type: "array",
          items: {
            type: "object",
            properties: {
              title: { type: "string" },
              content: { type: "string" },
            },
          },
        },
      },
      required: ["metadata", "sections"],
    };

    const keys = ["sections"];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual({
      type: "object",
      properties: {
        metadata: {
          type: "object",
          properties: {
            title: { type: "string" },
            description: { type: "string" },
          },
        },
      },
      required: ["metadata"],
    });

    expect(multiEntitySchema).toEqual({
      type: "object",
      properties: {
        sections: {
          type: "array",
          items: {
            type: "object",
            properties: {
              title: { type: "string" },
              content: { type: "string" },
            },
          },
        },
      },
      required: ["sections"],
    });
  });

  it("should handle empty keys array", async () => {
    const schema = {
      type: "object",
      properties: {
        name: { type: "string" },
        age: { type: "number" },
      },
      required: ["name"],
    };

    const keys: string[] = [];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual(schema);
    expect(multiEntitySchema).toEqual({});
  });

  it("should handle non-existent paths", async () => {
    const schema = {
      type: "object",
      properties: {
        user: {
          type: "object",
          properties: {
            name: { type: "string" },
          },
        },
      },
    };

    const keys = ["user.nonexistent.path"];
    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
      schema,
      keys,
    );

    expect(singleAnswerSchema).toEqual({});
    expect(multiEntitySchema).toEqual(schema);
  });

  // it("should split nested object and array properties", async () => {
  //   const schema = {
  //     type: "object",
  //     properties: {
  //       company: {
  //         type: "object",
  //         properties: {
  //           name: { type: "string" },
  //           address: {
  //             type: "object",
  //             properties: {
  //               street: { type: "string" },
  //               city: { type: "string" },
  //             },
  //           },
  //           employees: {
  //             type: "array",
  //             items: {
  //               type: "object",
  //               properties: {
  //                 name: { type: "string" },
  //                 position: { type: "string" },
  //               },
  //             },
  //           },
  //         },
  //       },
  //     },
  //     required: ["company"],
  //   };

  //   const keys = ["company.employees"];
  //   const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
  //     schema,
  //     keys,
  //   );

  //   expect(singleAnswerSchema).toEqual({
  //     type: "object",
  //     properties: {
  //       company: {
  //         type: "object",
  //         properties: {
  //           name: { type: "string" },
  //           address: {
  //             type: "object",
  //             properties: {
  //               street: { type: "string" },
  //               city: { type: "string" },
  //             },
  //           },
  //         },
  //       },
  //     },
  //     required: ["company"],
  //   });

  //   expect(multiEntitySchema).toEqual({
  //     type: "object",
  //     properties: {
  //       company: {
  //         type: "object",
  //         properties: {
  //           employees: {
  //             type: "array",
  //             items: {
  //               type: "object",
  //               properties: {
  //                 name: { type: "string" },
  //                 position: { type: "string" },
  //               },
  //             },
  //           },
  //         },
  //       },
  //     },
  //     required: ["company"],
  //   });
  // });

  // it("should handle multiple root level properties with nested paths", async () => {
  //   const schema = {
  //     type: "object",
  //     properties: {
  //       user: {
  //         type: "object",
  //         properties: {
  //           id: { type: "string" },
  //           profile: {
  //             type: "object",
  //             properties: {
  //               name: { type: "string" },
  //               email: { type: "string" },
  //             },
  //           },
  //           posts: {
  //             type: "array",
  //             items: {
  //               type: "object",
  //               properties: {
  //                 title: { type: "string" },
  //                 content: { type: "string" },
  //               },
  //             },
  //           },
  //         },
  //       },
  //       settings: {
  //         type: "object",
  //         properties: {
  //           theme: { type: "string" },
  //           notifications: {
  //             type: "object",
  //             properties: {
  //               email: { type: "boolean" },
  //               push: { type: "boolean" },
  //             },
  //           },
  //         },
  //       },
  //     },
  //     required: ["user", "settings"],
  //   };

  //   const keys = ["user.posts", "settings.notifications"];
  //   const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
  //     schema,
  //     keys,
  //   );

  //   expect(singleAnswerSchema).toEqual({
  //     type: "object",
  //     properties: {
  //       user: {
  //         type: "object",
  //         properties: {
  //           id: { type: "string" },
  //           profile: {
  //             type: "object",
  //             properties: {
  //               name: { type: "string" },
  //               email: { type: "string" },
  //             },
  //           },
  //         },
  //       },
  //       settings: {
  //         type: "object",
  //         properties: {
  //           theme: { type: "string" },
  //         },
  //       },
  //     },
  //     required: ["user", "settings"],
  //   });

  //   expect(multiEntitySchema).toEqual({
  //     type: "object",
  //     properties: {
  //       user: {
  //         type: "object",
  //         properties: {
  //           posts: {
  //             type: "array",
  //             items: {
  //               type: "object",
  //               properties: {
  //                 title: { type: "string" },
  //                 content: { type: "string" },
  //               },
  //             },
  //           },
  //         },
  //       },
  //       settings: {
  //         type: "object",
  //         properties: {
  //           notifications: {
  //             type: "object",
  //             properties: {
  //               email: { type: "boolean" },
  //               push: { type: "boolean" },
  //             },
  //           },
  //         },
  //       },
  //     },
  //     required: ["user", "settings"],
  //   });
  // });

  // it("should handle array properties at different nesting levels", async () => {
  //   const schema = {
  //     type: "object",
  //     properties: {
  //       categories: {
  //         type: "array",
  //         items: {
  //           type: "object",
  //           properties: {
  //             name: { type: "string" },
  //             subcategories: {
  //               type: "array",
  //               items: {
  //                 type: "object",
  //                 properties: {
  //                   name: { type: "string" },
  //                   products: {
  //                     type: "array",
  //                     items: {
  //                       type: "object",
  //                       properties: {
  //                         name: { type: "string" },
  //                         price: { type: "number" },
  //                       },
  //                     },
  //                   },
  //                 },
  //               },
  //             },
  //           },
  //         },
  //       },
  //       featured: {
  //         type: "object",
  //         properties: {
  //           category: { type: "string" },
  //           items: {
  //             type: "array",
  //             items: {
  //               type: "object",
  //               properties: {
  //                 id: { type: "string" },
  //                 name: { type: "string" },
  //               },
  //             },
  //           },
  //         },
  //       },
  //     },
  //   };

  //   const keys = ["categories.subcategories", "featured.items"];
  //   const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
  //     schema,
  //     keys,
  //   );

  //   expect(singleAnswerSchema).toEqual({
  //     type: "object",
  //     properties: {
  //       featured: {
  //         type: "object",
  //         properties: {
  //           category: { type: "string" },
  //         },
  //       },
  //     },
  //   });

  //   expect(multiEntitySchema).toEqual({
  //     type: "object",
  //     properties: {
  //       categories: {
  //         type: "array",
  //         items: {
  //           type: "object",
  //           properties: {
  //             name: { type: "string" },
  //             subcategories: {
  //               type: "array",
  //               items: {
  //                 type: "object",
  //                 properties: {
  //                   name: { type: "string" },
  //                   products: {
  //                     type: "array",
  //                     items: {
  //                       type: "object",
  //                       properties: {
  //                         name: { type: "string" },
  //                         price: { type: "number" },
  //                       },
  //                     },
  //                   },
  //                 },
  //               },
  //             },
  //           },
  //         },
  //       },
  //       featured: {
  //         type: "object",
  //         properties: {
  //           items: {
  //             type: "array",
  //             items: {
  //               type: "object",
  //               properties: {
  //                 id: { type: "string" },
  //                 name: { type: "string" },
  //               },
  //             },
  //           },
  //         },
  //       },
  //     },
  //   });
  // });
});
@@ -31,9 +31,14 @@ Return only a concise sentece or 2 focused on the essential data points that the
}

export function buildRerankerSystemPrompt(): string {
  return "You are a relevance expert. Analyze the provided URLs and their content to determine their relevance to the user's query and intent. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and 0 means not relevant at all. Only include URLs that are actually relevant to the query.";
  return `You are a relevance expert scoring links from a website the user is trying to extract information from. Analyze the provided URLs and their content
to determine their relevance to the user's query and intent.
For each URL, assign a relevance score between 0 and 1, where 1
means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it.
Always return all the links scored that you are giving. Do not omit links.
Always return the links in the same order they were provided. If the user wants the content from all the links, all links should be scored 1.`;
}

export function buildRerankerUserPrompt(searchQuery: string): string {
  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`;
  return `Given these URLs, rank which ones are relevant to the user's extraction intent: "${searchQuery}".`;
}
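For context, a minimal sketch of how these prompt builders are consumed (the query string below is a made-up example; in this PR the actual wiring goes through generateOpenAICompletions inside rerankLinksWithLLM, shown further down):

import { buildRerankerSystemPrompt, buildRerankerUserPrompt } from "./build-prompts";

// Hypothetical extraction query, for illustration only.
const searchQuery = "extract the pricing of every plan on this site";

// The system prompt carries the scoring rules; the user prompt carries the intent.
const systemPrompt = buildRerankerSystemPrompt();
const userPrompt = buildRerankerUserPrompt(searchQuery);

console.log(systemPrompt);
console.log(userPrompt);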
@@ -1,3 +1,5 @@
import { logger } from "../../../lib/logger";

export async function spreadSchemas(
  schema: any,
  keys: string[],
@@ -6,14 +8,45 @@ export async function spreadSchemas(
  multiEntitySchema: any;
}> {
  let singleAnswerSchema = { ...schema, properties: { ...schema.properties } };
  let multiEntitySchema: any = { type: "object", properties: {} };
  let multiEntitySchema: any = {
    type: "object",
    properties: {},
    ...(schema.required ? { required: [] } : {})
  };

  // Helper function to check if a property path exists in schema
  const hasPropertyPath = (schema: any, path: string[]): boolean => {
    let current = schema.properties;
    for (let i = 0; i < path.length; i++) {
      if (!current[path[i]]) return false;
      if (current[path[i]].type === "array" && current[path[i]].items) {
        current = current[path[i]].items.properties;
      } else {
        current = current[path[i]].properties;
      }
    }
    return true;
  };

  // Helper function to get the root property of a dot path
  const getRootProperty = (path: string): string => {
    return path.split('.')[0];
  };

  keys.forEach((key) => {
    if (singleAnswerSchema.properties[key]) {
      multiEntitySchema.properties[key] = singleAnswerSchema.properties[key];
      delete singleAnswerSchema.properties[key];
    const rootProperty = getRootProperty(key);
    if (singleAnswerSchema.properties[rootProperty]) {
      multiEntitySchema.properties[rootProperty] = singleAnswerSchema.properties[rootProperty];
      delete singleAnswerSchema.properties[rootProperty];

      // Move required field if it exists
      if (schema.required?.includes(rootProperty)) {
        multiEntitySchema.required.push(rootProperty);
        singleAnswerSchema.required = schema.required.filter((k: string) => k !== rootProperty);
      }
    }
  });

  // Recursively delete empty properties in singleAnswerSchema
  const deleteEmptyProperties = (schema: any) => {
    for (const key in schema.properties) {
@@ -34,10 +67,14 @@ export async function spreadSchemas(
  // If singleAnswerSchema has no properties left, return an empty object
  if (Object.keys(singleAnswerSchema.properties).length === 0) {
    singleAnswerSchema = {};
  } else if (singleAnswerSchema.required?.length === 0) {
    delete singleAnswerSchema.required;
  }

  if (Object.keys(multiEntitySchema.properties).length === 0) {
    multiEntitySchema = {};
  } else if (multiEntitySchema.required?.length === 0) {
    delete multiEntitySchema.required;
  }

  return {
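To make the intended split concrete, here is a minimal usage sketch based on the tests above (the import path is an assumption; adjust it to wherever spread-schemas.ts lives):

import { spreadSchemas } from "./spread-schemas"; // path assumed for illustration

async function demo() {
  const schema = {
    type: "object",
    properties: {
      metadata: { type: "object", properties: { title: { type: "string" } } },
      sections: {
        type: "array",
        items: { type: "object", properties: { title: { type: "string" } } },
      },
    },
    required: ["metadata", "sections"],
  };

  // "sections" is a multi-entity (array) key, so it moves to multiEntitySchema
  // together with its entry in `required`; everything else stays single-answer.
  const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, ["sections"]);

  console.log(singleAnswerSchema); // only metadata, required: ["metadata"]
  console.log(multiEntitySchema); // only sections, required: ["sections"]
}

demo();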
@@ -8,6 +8,7 @@ import { searchSimilarPages } from "./index/pinecone";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildRerankerUserPrompt } from "./build-prompts";
import { buildRerankerSystemPrompt } from "./build-prompts";
import { dumpToFile } from "./helpers/dump-to-file";

const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
@@ -158,24 +159,27 @@ function filterAndProcessLinks(
}

export type RerankerResult = {
  mapDocument: MapDocument[];
  mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[];
  tokensUsed: number;
};

export async function rerankLinksWithLLM(
  mappedLinks: MapDocument[],
  searchQuery: string,
  urlTraces: URLTrace[],
): Promise<RerankerResult> {
export type RerankerOptions = {
  links: MapDocument[];
  searchQuery: string;
  urlTraces: URLTrace[];
};

export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
  const { links, searchQuery, urlTraces } = options;
  const chunkSize = 100;
  const chunks: MapDocument[][] = [];
  const TIMEOUT_MS = 20000;
  const MAX_RETRIES = 2;
  let totalTokensUsed = 0;

  // Split mappedLinks into chunks of 200
  for (let i = 0; i < mappedLinks.length; i += chunkSize) {
    chunks.push(mappedLinks.slice(i, i + chunkSize));
  // Split links into chunks of 200
  for (let i = 0; i < links.length; i += chunkSize) {
    chunks.push(links.slice(i, i + chunkSize));
  }

  // console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`);
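Since rerankLinksWithLLM now takes a single options object and returns documents annotated with relevanceScore (and optionally reason), here is a minimal sketch of the new call shape (the import path and the pared-down MapDocument shape used here are assumptions for illustration):

import { rerankLinksWithLLM } from "./reranker"; // path assumed for illustration

async function rerankDemo() {
  // Illustrative input; real MapDocument entries typically also carry title and description.
  const links = [
    { url: "https://example.com/pricing" },
    { url: "https://example.com/blog" },
  ] as any[];

  const { mapDocument, tokensUsed } = await rerankLinksWithLLM({
    links,
    searchQuery: "extract the pricing of every plan",
    urlTraces: [],
  });

  // Each returned document now carries relevanceScore (and optionally reason).
  const relevant = mapDocument.filter((d) => (d.relevanceScore ?? 0) > 0.8);
  console.log(`${relevant.length} links kept, ${tokensUsed} tokens used`);
}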
@@ -190,8 +194,9 @@ export async function rerankLinksWithLLM(
          properties: {
            url: { type: "string" },
            relevanceScore: { type: "number" },
            reason: { type: "string", description: "The reason why you chose the score for this link given the intent." },
          },
          required: ["url", "relevanceScore"],
          required: ["url", "relevanceScore", "reason"],
        },
      },
    },
@@ -215,6 +220,7 @@ export async function rerankLinksWithLLM(
        setTimeout(() => resolve(null), TIMEOUT_MS);
      });

      // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent])
      const completionPromise = generateOpenAICompletions(
        logger.child({
          method: "rerankLinksWithLLM",
@@ -229,7 +235,7 @@ export async function rerankLinksWithLLM(
        },
        linksContent,
        undefined,
        true,
        true
      );

      const completion = await Promise.race([
@@ -275,10 +281,15 @@ export async function rerankLinksWithLLM(

  // Map back to MapDocument format, keeping only relevant links
  const relevantLinks = flattenedResults
    .map((result) => mappedLinks.find((link) => link.url === result.url))
    .filter((link): link is MapDocument => link !== undefined);
    .map((result) => {
      const link = links.find((link) => link.url === result.url);
      if (link) {
        return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason };
      }
      return undefined;
    })
    .filter((link): link is NonNullable<typeof link> => link !== undefined);

  // console.log(`Returning ${relevantLinks.length} relevant links`);
  return {
    mapDocument: relevantLinks,
    tokensUsed: totalTokensUsed,
|
@ -203,38 +203,62 @@ export async function processUrl(
|
||||
rephrasedPrompt
|
||||
});
|
||||
|
||||
logger.info("Reranking (pass 1)...");
|
||||
const rerankerResult = await rerankLinksWithLLM(
|
||||
mappedLinks,
|
||||
rephrasedPrompt,
|
||||
urlTraces,
|
||||
);
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
let rerankedLinks = mappedLinks;
|
||||
logger.info("Reranking pass 1 (threshold 0.8)...");
|
||||
const rerankerResult = await rerankLinksWithLLM({
|
||||
links: rerankedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces
|
||||
});
|
||||
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.8);
|
||||
let tokensUsed = rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 1)", {
|
||||
linkCount: mappedLinks.length,
|
||||
|
||||
logger.info("Reranked! (threshold 0.8)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (mappedLinks.length > 100) {
|
||||
logger.info("Reranking (pass 2)...");
|
||||
const rerankerResult = await rerankLinksWithLLM(
|
||||
mappedLinks,
|
||||
rephrasedPrompt,
|
||||
urlTraces,
|
||||
);
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
tokensUsed += rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 2)", {
|
||||
linkCount: mappedLinks.length,
|
||||
// lower threshold to 0.6 if no links are found
|
||||
if (rerankedLinks.length === 0) {
|
||||
logger.info("No links found. Reranking with threshold 0.6");
|
||||
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
|
||||
logger.info("Reranked! (threshold 0.6)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
}
|
||||
|
||||
// dumpToFile(
|
||||
// "llm-links.txt",
|
||||
// mappedLinks,
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
// lower threshold to 0.3 if no links are found
|
||||
if (rerankedLinks.length === 0) {
|
||||
logger.info("No links found. Reranking with threshold 0.3");
|
||||
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3);
|
||||
logger.info("Reranked! (threshold 0.3)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
}
|
||||
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (rerankedLinks.length > 100) {
|
||||
logger.info("Reranking pass 2 (> 100 links - threshold 0.6)...");
|
||||
const secondPassRerankerResult = await rerankLinksWithLLM({
|
||||
links: rerankedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces,
|
||||
});
|
||||
|
||||
// why 0.6? average? experimental results?
|
||||
if (secondPassRerankerResult.mapDocument.length > 0) {
|
||||
rerankedLinks = secondPassRerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
|
||||
logger.info("Reranked! (threshold 0.6)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// If no relevant links are found, return the original mapped links
|
||||
if (rerankedLinks.length === 0) {
|
||||
logger.info("No links found. Not reranking.");
|
||||
rerankedLinks = mappedLinks;
|
||||
}
|
||||
|
||||
// Remove title and description from mappedLinks
|
||||
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
|
||||
return mappedLinks.map((x) => x.url);
|
||||
|
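The control flow above amounts to a cascading threshold: keep links scoring above 0.8, and if nothing survives, retry the already-scored list at 0.6 and then 0.3 before falling back to the original mapped links. A standalone sketch of that selection pattern (illustrative only, not the repo's code):

type ScoredLink = { url: string; relevanceScore?: number };

// Try thresholds from strictest to loosest and return the first non-empty selection.
function filterByCascadingThresholds(
  links: ScoredLink[],
  thresholds: number[] = [0.8, 0.6, 0.3],
): ScoredLink[] {
  for (const threshold of thresholds) {
    const kept = links.filter((l) => (l.relevanceScore ?? 0) > threshold);
    if (kept.length > 0) return kept;
  }
  // Nothing cleared any threshold: keep everything, as processUrl does.
  return links;
}

// Example: only the 0.6 pass yields a result here.
console.log(
  filterByCascadingThresholds([
    { url: "https://example.com/a", relevanceScore: 0.7 },
    { url: "https://example.com/b", relevanceScore: 0.2 },
  ]),
);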