mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 23:39:01 +08:00
Nick: better filtering for URLs that should be scraped
This commit is contained in:
parent
3b6edef9fa
commit
ac187452c3
@ -26,6 +26,7 @@ import { getMapResults } from "./map";
|
||||
import { buildDocument } from "../../lib/extract/build-document";
|
||||
import { generateBasicCompletion } from "../../lib/LLM-extraction";
|
||||
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
|
||||
import { removeDuplicateUrls } from "../../lib/validateUrl";
|
||||
|
||||
configDotenv();
|
||||
const redis = new Redis(process.env.REDIS_URL!);
|
||||
@ -88,8 +89,22 @@ export async function extractController(
|
||||
includeSubdomains: req.body.includeSubdomains,
|
||||
});
|
||||
|
||||
let mappedLinks = mapResults.links as MapDocument[];
|
||||
let mappedLinks = mapResults.mapResults as MapDocument[];
|
||||
|
||||
// Remove duplicates between mapResults.links and mappedLinks
|
||||
const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links];
|
||||
const uniqueUrls = removeDuplicateUrls(allUrls);
|
||||
|
||||
// Only add URLs from mapResults.links that aren't already in mappedLinks
|
||||
const existingUrls = new Set(mappedLinks.map(m => m.url));
|
||||
const newUrls = uniqueUrls.filter(url => !existingUrls.has(url));
|
||||
|
||||
mappedLinks = [
|
||||
...mappedLinks,
|
||||
...newUrls.map(url => ({ url, title: "", description: "" }))
|
||||
];
|
||||
|
||||
|
||||
if (mappedLinks.length === 0) {
|
||||
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
||||
}
|
||||
@ -102,6 +117,7 @@ export async function extractController(
|
||||
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||
);
|
||||
|
||||
|
||||
if (req.body.prompt) {
|
||||
let searchQuery =
|
||||
req.body.prompt && allowExternalLinks
|
||||
|
@ -32,10 +32,11 @@ const MAX_FIRE_ENGINE_RESULTS = 500;
|
||||
|
||||
interface MapResult {
|
||||
success: boolean;
|
||||
links: string[] | any[];
|
||||
links: string[];
|
||||
scrape_id?: string;
|
||||
job_id: string;
|
||||
time_taken: number;
|
||||
mapResults: MapDocument[];
|
||||
}
|
||||
|
||||
export async function getMapResults({
|
||||
@ -215,7 +216,8 @@ export async function getMapResults({
|
||||
|
||||
return {
|
||||
success: true,
|
||||
links: includeMetadata ? mapResults : linksToReturn,
|
||||
links: linksToReturn,
|
||||
mapResults: mapResults,
|
||||
scrape_id: origin?.includes("website") ? id : undefined,
|
||||
job_id: id,
|
||||
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
||||
|
@ -4,11 +4,13 @@ export function buildRefrasedPrompt(prompt: string, url: string): string {
|
||||
Original prompt: "${prompt}"
|
||||
|
||||
Provide a rephrased search query that:
|
||||
1. Maintains the core intent of the original prompt
|
||||
1. Maintains the core intent of the original prompt with ONLY the keywords
|
||||
2. Uses relevant keywords
|
||||
3. Is optimized for search engine results
|
||||
4. Is concise and focused
|
||||
5. Short is better than long
|
||||
6. It is a search engine, not a chatbot
|
||||
7. Concise
|
||||
|
||||
Return only the rephrased search query, without any explanation or additional text.`;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user