diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 71b1f9eb..0a94289c 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -28,8 +28,8 @@ configDotenv(); const redis = new Redis(process.env.REDIS_URL!); const MAX_EXTRACT_LIMIT = 100; -const MAX_RANKING_LIMIT = 5; -const SCORE_THRESHOLD = 0.75; +const MAX_RANKING_LIMIT = 10; +const SCORE_THRESHOLD = 0.70; export async function extractController( req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, @@ -64,28 +64,34 @@ export async function extractController( allowExternalLinks, origin: req.body.origin, limit: req.body.limit, - ignoreSitemap: false, + ignoreSitemap: true, includeMetadata: true, includeSubdomains: req.body.includeSubdomains, }); - let mappedLinks = mapResults.links.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`); + let mappedLinks = mapResults.links as MapDocument[]; + // Limit number of links to MAX_EXTRACT_LIMIT + mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); + + let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`); // Filter by path prefix if present if (pathPrefix) { - mappedLinks = mappedLinks.filter(x => x.includes(`/${pathPrefix}/`)); + mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`)); } if (req.body.prompt) { - const linksAndScores = await performRanking(mappedLinks, mapUrl); + const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl); mappedLinks = linksAndScores .filter(x => x.score > SCORE_THRESHOLD) - .map(x => x.link.split("url: ")[1].split(",")[0]) - .filter(x => !isUrlBlocked(x)) + .map(x => mappedLinks.find(link => link.url === x.link)) + .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)) .slice(0, MAX_RANKING_LIMIT); + console.log("linksAndScores", linksAndScores); + console.log("linksAndScores", linksAndScores.length); } - return mappedLinks; + return mappedLinks.map(x => x.url) as string[]; } else { // Handle direct URLs without glob pattern @@ -100,6 +106,8 @@ export async function extractController( const processedUrls = await Promise.all(urlPromises); links.push(...processedUrls.flat()); + console.log("links", links.length); + console // Scrape all links in parallel const scrapePromises = links.map(async (url) => { const origin = req.body.origin || "api"; diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index f2e9453a..ba7be01f 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,6 +1,7 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { + MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions, @@ -86,7 +87,7 @@ export async function getMapResults({ ? `${search} ${urlWithoutWww}` : search ? `${search} site:${urlWithoutWww}` : `site:${url}`; - + const resultsPerPage = 100; const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); @@ -129,7 +130,7 @@ export async function getMapResults({ }); } - let mapResults = allResults + let mapResults : MapDocument[] = allResults .flat() .filter((result) => result !== null && result !== undefined); diff --git a/apps/api/src/lib/ranker.ts b/apps/api/src/lib/ranker.ts index 7cd39820..9a200f49 100644 --- a/apps/api/src/lib/ranker.ts +++ b/apps/api/src/lib/ranker.ts @@ -40,24 +40,33 @@ const textToVector = (searchQuery: string, text: string): number[] => { }); }; -async function performRanking(links: string[], searchQuery: string) { +async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) { try { // Generate embeddings for the search query const queryEmbedding = await getEmbedding(searchQuery); // Generate embeddings for each link and calculate similarity - const linksAndScores = await Promise.all(links.map(async (link) => { - const linkEmbedding = await getEmbedding(link); + const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => { + const linkEmbedding = await getEmbedding(linkWithContext); // console.log("linkEmbedding", linkEmbedding); - // const linkVector = textToVector(searchQuery, link); + // const linkVector = textToVector(searchQuery, linkWithContext); const score = cosineSimilarity(queryEmbedding, linkEmbedding); // console.log("score", score); - return { link, score }; + return { + link: links[index], // Use corresponding link from links array + linkWithContext, + score, + originalIndex: index // Store original position + }; })); - // Sort links based on similarity scores - linksAndScores.sort((a, b) => b.score - a.score); + // Sort links based on similarity scores while preserving original order for equal scores + linksAndScores.sort((a, b) => { + const scoreDiff = b.score - a.score; + // If scores are equal, maintain original order + return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff; + }); return linksAndScores; } catch (error) { diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index 0b82478e..1eb2419f 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -29,8 +29,6 @@ export async function fireEngineMap( page: options.page ?? 1, }); - console.log("data", data); - if (!process.env.FIRE_ENGINE_BETA_URL) { console.warn( "(v1/map Beta) Results might differ from cloud offering currently."