mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-06 07:47:12 +08:00
Update extract.ts
This commit is contained in:
parent
a18614cd00
commit
2513efc971
@ -30,7 +30,9 @@ const redis = new Redis(process.env.REDIS_URL!);
|
|||||||
|
|
||||||
const MAX_EXTRACT_LIMIT = 100;
|
const MAX_EXTRACT_LIMIT = 100;
|
||||||
const MAX_RANKING_LIMIT = 10;
|
const MAX_RANKING_LIMIT = 10;
|
||||||
const SCORE_THRESHOLD = 0.75;
|
const INITIAL_SCORE_THRESHOLD = 0.75;
|
||||||
|
const FALLBACK_SCORE_THRESHOLD = 0.5;
|
||||||
|
const MIN_REQUIRED_LINKS = 3;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts data from the provided URLs based on the request parameters.
|
* Extracts data from the provided URLs based on the request parameters.
|
||||||
@ -94,19 +96,28 @@ export async function extractController(
|
|||||||
|
|
||||||
if (req.body.prompt) {
|
if (req.body.prompt) {
|
||||||
// Get similarity scores between the search query and each link's context
|
// Get similarity scores between the search query and each link's context
|
||||||
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
||||||
|
|
||||||
mappedLinks = linksAndScores
|
// First try with high threshold
|
||||||
// Only keep links that have a similarity score above the threshold
|
let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);
|
||||||
.filter(x => x.score > SCORE_THRESHOLD)
|
|
||||||
// Map back to the original link objects
|
// If we don't have enough high-quality links, try with lower threshold
|
||||||
.map(x => mappedLinks.find(link => link.url === x.link))
|
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
|
||||||
// Remove any undefined links, links without URLs, and blocked URLs
|
logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
|
||||||
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url))
|
filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);
|
||||||
// Limit the number of results
|
|
||||||
.slice(0, MAX_RANKING_LIMIT);
|
if (filteredLinks.length === 0) {
|
||||||
|
// If still no results, take top N results regardless of score
|
||||||
|
logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
|
||||||
|
filteredLinks = linksAndScores
|
||||||
|
.sort((a, b) => b.score - a.score)
|
||||||
|
.slice(0, MIN_REQUIRED_LINKS)
|
||||||
|
.map(x => mappedLinks.find(link => link.url === x.link))
|
||||||
|
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: handle case where no links are returned
|
mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mappedLinks.map(x => x.url) as string[];
|
return mappedLinks.map(x => x.url) as string[];
|
||||||
@ -124,8 +135,14 @@ export async function extractController(
|
|||||||
const processedUrls = await Promise.all(urlPromises);
|
const processedUrls = await Promise.all(urlPromises);
|
||||||
links.push(...processedUrls.flat());
|
links.push(...processedUrls.flat());
|
||||||
|
|
||||||
// console.log("links", links.length);
|
if (links.length === 0) {
|
||||||
// Scrape all links in parallel
|
return res.status(400).json({
|
||||||
|
success: false,
|
||||||
|
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scrape all links in parallel with retries
|
||||||
const scrapePromises = links.map(async (url) => {
|
const scrapePromises = links.map(async (url) => {
|
||||||
const origin = req.body.origin || "api";
|
const origin = req.body.origin || "api";
|
||||||
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
|
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
|
||||||
@ -210,9 +227,6 @@ export async function extractController(
|
|||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
// console.log("completions.extract", completions.extract);
|
|
||||||
|
|
||||||
let data: any;
|
let data: any;
|
||||||
let warning = completions.warning ?? "";
|
let warning = completions.warning ?? "";
|
||||||
try {
|
try {
|
||||||
@ -244,4 +258,15 @@ export async function extractController(
|
|||||||
scrape_id: id,
|
scrape_id: id,
|
||||||
warning: warning
|
warning: warning
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Selects the map documents whose ranking score is strictly above `threshold`.
 *
 * Each scored entry is matched back to its document by URL (`link` against
 * `MapDocument.url`); when several documents share a URL, the first one in
 * `mappedLinks` wins. Scored entries with no matching document, and documents
 * whose URL is rejected by `isUrlBlocked`, are dropped. The output follows the
 * order of `linksAndScores`, and neither input array is modified.
 *
 * @param mappedLinks - Candidate documents to select from.
 * @param linksAndScores - Scored entries, one per ranked link.
 * @param threshold - Exclusive lower bound on `score`.
 * @returns The matching documents, in `linksAndScores` order.
 */
function filterAndProcessLinks(
  mappedLinks: MapDocument[],
  linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
  threshold: number
): MapDocument[] {
  // Index documents by URL once so every scored link resolves in O(1)
  // instead of rescanning `mappedLinks` for each entry.
  const docsByUrl = new Map<string, MapDocument>();
  for (const doc of mappedLinks) {
    // Keep only the first document per URL, mirroring Array.prototype.find.
    if (typeof doc.url === "string" && !docsByUrl.has(doc.url)) {
      docsByUrl.set(doc.url, doc);
    }
  }

  return linksAndScores
    .filter(x => x.score > threshold)
    .map(x => docsByUrl.get(x.link))
    .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
|
Loading…
x
Reference in New Issue
Block a user