This commit is contained in:
Nicolas 2024-11-14 15:26:15 -05:00
parent 5056dcd8e9
commit ebe9de2ac5
4 changed files with 36 additions and 20 deletions

View File

@ -28,8 +28,8 @@ configDotenv();
const redis = new Redis(process.env.REDIS_URL!); const redis = new Redis(process.env.REDIS_URL!);
const MAX_EXTRACT_LIMIT = 100; const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 5; const MAX_RANKING_LIMIT = 10;
const SCORE_THRESHOLD = 0.75; const SCORE_THRESHOLD = 0.70;
export async function extractController( export async function extractController(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
@ -64,28 +64,34 @@ export async function extractController(
allowExternalLinks, allowExternalLinks,
origin: req.body.origin, origin: req.body.origin,
limit: req.body.limit, limit: req.body.limit,
ignoreSitemap: false, ignoreSitemap: true,
includeMetadata: true, includeMetadata: true,
includeSubdomains: req.body.includeSubdomains, includeSubdomains: req.body.includeSubdomains,
}); });
let mappedLinks = mapResults.links.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`); let mappedLinks = mapResults.links as MapDocument[];
// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
// Filter by path prefix if present // Filter by path prefix if present
if (pathPrefix) { if (pathPrefix) {
mappedLinks = mappedLinks.filter(x => x.includes(`/${pathPrefix}/`)); mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
} }
if (req.body.prompt) { if (req.body.prompt) {
const linksAndScores = await performRanking(mappedLinks, mapUrl); const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
mappedLinks = linksAndScores mappedLinks = linksAndScores
.filter(x => x.score > SCORE_THRESHOLD) .filter(x => x.score > SCORE_THRESHOLD)
.map(x => x.link.split("url: ")[1].split(",")[0]) .map(x => mappedLinks.find(link => link.url === x.link))
.filter(x => !isUrlBlocked(x)) .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url))
.slice(0, MAX_RANKING_LIMIT); .slice(0, MAX_RANKING_LIMIT);
console.log("linksAndScores", linksAndScores);
console.log("linksAndScores", linksAndScores.length);
} }
return mappedLinks; return mappedLinks.map(x => x.url) as string[];
} else { } else {
// Handle direct URLs without glob pattern // Handle direct URLs without glob pattern
@ -100,6 +106,8 @@ export async function extractController(
const processedUrls = await Promise.all(urlPromises); const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat()); links.push(...processedUrls.flat());
console.log("links", links.length);
console
// Scrape all links in parallel // Scrape all links in parallel
const scrapePromises = links.map(async (url) => { const scrapePromises = links.map(async (url) => {
const origin = req.body.origin || "api"; const origin = req.body.origin || "api";

View File

@ -1,6 +1,7 @@
import { Response } from "express"; import { Response } from "express";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { import {
MapDocument,
mapRequestSchema, mapRequestSchema,
RequestWithAuth, RequestWithAuth,
scrapeOptions, scrapeOptions,
@ -129,7 +130,7 @@ export async function getMapResults({
}); });
} }
let mapResults = allResults let mapResults : MapDocument[] = allResults
.flat() .flat()
.filter((result) => result !== null && result !== undefined); .filter((result) => result !== null && result !== undefined);

View File

@ -40,24 +40,33 @@ const textToVector = (searchQuery: string, text: string): number[] => {
}); });
}; };
async function performRanking(links: string[], searchQuery: string) { async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
try { try {
// Generate embeddings for the search query // Generate embeddings for the search query
const queryEmbedding = await getEmbedding(searchQuery); const queryEmbedding = await getEmbedding(searchQuery);
// Generate embeddings for each link and calculate similarity // Generate embeddings for each link and calculate similarity
const linksAndScores = await Promise.all(links.map(async (link) => { const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
const linkEmbedding = await getEmbedding(link); const linkEmbedding = await getEmbedding(linkWithContext);
// console.log("linkEmbedding", linkEmbedding); // console.log("linkEmbedding", linkEmbedding);
// const linkVector = textToVector(searchQuery, link); // const linkVector = textToVector(searchQuery, linkWithContext);
const score = cosineSimilarity(queryEmbedding, linkEmbedding); const score = cosineSimilarity(queryEmbedding, linkEmbedding);
// console.log("score", score); // console.log("score", score);
return { link, score }; return {
link: links[index], // Use corresponding link from links array
linkWithContext,
score,
originalIndex: index // Store original position
};
})); }));
// Sort links based on similarity scores // Sort links based on similarity scores while preserving original order for equal scores
linksAndScores.sort((a, b) => b.score - a.score); linksAndScores.sort((a, b) => {
const scoreDiff = b.score - a.score;
// If scores are equal, maintain original order
return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
});
return linksAndScores; return linksAndScores;
} catch (error) { } catch (error) {

View File

@ -29,8 +29,6 @@ export async function fireEngineMap(
page: options.page ?? 1, page: options.page ?? 1,
}); });
console.log("data", data);
if (!process.env.FIRE_ENGINE_BETA_URL) { if (!process.env.FIRE_ENGINE_BETA_URL) {
console.warn( console.warn(
"(v1/map Beta) Results might differ from cloud offering currently." "(v1/map Beta) Results might differ from cloud offering currently."