mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 16:20:48 +08:00
Nick:
This commit is contained in:
parent
5056dcd8e9
commit
ebe9de2ac5
@ -28,8 +28,8 @@ configDotenv();
|
|||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
|
|
||||||
const MAX_EXTRACT_LIMIT = 100;
|
const MAX_EXTRACT_LIMIT = 100;
|
||||||
const MAX_RANKING_LIMIT = 5;
|
const MAX_RANKING_LIMIT = 10;
|
||||||
const SCORE_THRESHOLD = 0.75;
|
const SCORE_THRESHOLD = 0.70;
|
||||||
|
|
||||||
export async function extractController(
|
export async function extractController(
|
||||||
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
|
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
|
||||||
@ -64,28 +64,34 @@ export async function extractController(
|
|||||||
allowExternalLinks,
|
allowExternalLinks,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
limit: req.body.limit,
|
limit: req.body.limit,
|
||||||
ignoreSitemap: false,
|
ignoreSitemap: true,
|
||||||
includeMetadata: true,
|
includeMetadata: true,
|
||||||
includeSubdomains: req.body.includeSubdomains,
|
includeSubdomains: req.body.includeSubdomains,
|
||||||
});
|
});
|
||||||
|
|
||||||
let mappedLinks = mapResults.links.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
|
let mappedLinks = mapResults.links as MapDocument[];
|
||||||
|
// Limit number of links to MAX_EXTRACT_LIMIT
|
||||||
|
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
||||||
|
|
||||||
|
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
|
||||||
|
|
||||||
// Filter by path prefix if present
|
// Filter by path prefix if present
|
||||||
if (pathPrefix) {
|
if (pathPrefix) {
|
||||||
mappedLinks = mappedLinks.filter(x => x.includes(`/${pathPrefix}/`));
|
mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (req.body.prompt) {
|
if (req.body.prompt) {
|
||||||
const linksAndScores = await performRanking(mappedLinks, mapUrl);
|
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
||||||
mappedLinks = linksAndScores
|
mappedLinks = linksAndScores
|
||||||
.filter(x => x.score > SCORE_THRESHOLD)
|
.filter(x => x.score > SCORE_THRESHOLD)
|
||||||
.map(x => x.link.split("url: ")[1].split(",")[0])
|
.map(x => mappedLinks.find(link => link.url === x.link))
|
||||||
.filter(x => !isUrlBlocked(x))
|
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url))
|
||||||
.slice(0, MAX_RANKING_LIMIT);
|
.slice(0, MAX_RANKING_LIMIT);
|
||||||
|
console.log("linksAndScores", linksAndScores);
|
||||||
|
console.log("linksAndScores", linksAndScores.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mappedLinks;
|
return mappedLinks.map(x => x.url) as string[];
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// Handle direct URLs without glob pattern
|
// Handle direct URLs without glob pattern
|
||||||
@ -100,6 +106,8 @@ export async function extractController(
|
|||||||
const processedUrls = await Promise.all(urlPromises);
|
const processedUrls = await Promise.all(urlPromises);
|
||||||
links.push(...processedUrls.flat());
|
links.push(...processedUrls.flat());
|
||||||
|
|
||||||
|
console.log("links", links.length);
|
||||||
|
console
|
||||||
// Scrape all links in parallel
|
// Scrape all links in parallel
|
||||||
const scrapePromises = links.map(async (url) => {
|
const scrapePromises = links.map(async (url) => {
|
||||||
const origin = req.body.origin || "api";
|
const origin = req.body.origin || "api";
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import {
|
import {
|
||||||
|
MapDocument,
|
||||||
mapRequestSchema,
|
mapRequestSchema,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
@ -129,7 +130,7 @@ export async function getMapResults({
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let mapResults = allResults
|
let mapResults : MapDocument[] = allResults
|
||||||
.flat()
|
.flat()
|
||||||
.filter((result) => result !== null && result !== undefined);
|
.filter((result) => result !== null && result !== undefined);
|
||||||
|
|
||||||
|
@ -40,24 +40,33 @@ const textToVector = (searchQuery: string, text: string): number[] => {
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
async function performRanking(links: string[], searchQuery: string) {
|
async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
|
||||||
try {
|
try {
|
||||||
// Generate embeddings for the search query
|
// Generate embeddings for the search query
|
||||||
const queryEmbedding = await getEmbedding(searchQuery);
|
const queryEmbedding = await getEmbedding(searchQuery);
|
||||||
|
|
||||||
// Generate embeddings for each link and calculate similarity
|
// Generate embeddings for each link and calculate similarity
|
||||||
const linksAndScores = await Promise.all(links.map(async (link) => {
|
const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
|
||||||
const linkEmbedding = await getEmbedding(link);
|
const linkEmbedding = await getEmbedding(linkWithContext);
|
||||||
|
|
||||||
// console.log("linkEmbedding", linkEmbedding);
|
// console.log("linkEmbedding", linkEmbedding);
|
||||||
// const linkVector = textToVector(searchQuery, link);
|
// const linkVector = textToVector(searchQuery, linkWithContext);
|
||||||
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
||||||
// console.log("score", score);
|
// console.log("score", score);
|
||||||
return { link, score };
|
return {
|
||||||
|
link: links[index], // Use corresponding link from links array
|
||||||
|
linkWithContext,
|
||||||
|
score,
|
||||||
|
originalIndex: index // Store original position
|
||||||
|
};
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Sort links based on similarity scores
|
// Sort links based on similarity scores while preserving original order for equal scores
|
||||||
linksAndScores.sort((a, b) => b.score - a.score);
|
linksAndScores.sort((a, b) => {
|
||||||
|
const scoreDiff = b.score - a.score;
|
||||||
|
// If scores are equal, maintain original order
|
||||||
|
return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
|
||||||
|
});
|
||||||
|
|
||||||
return linksAndScores;
|
return linksAndScores;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
@ -29,8 +29,6 @@ export async function fireEngineMap(
|
|||||||
page: options.page ?? 1,
|
page: options.page ?? 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log("data", data);
|
|
||||||
|
|
||||||
if (!process.env.FIRE_ENGINE_BETA_URL) {
|
if (!process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
console.warn(
|
console.warn(
|
||||||
"(v1/map Beta) Results might differ from cloud offering currently."
|
"(v1/map Beta) Results might differ from cloud offering currently."
|
||||||
|
Loading…
x
Reference in New Issue
Block a user