mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 19:35:55 +08:00
Nick: better filtering for urls that should be scraped
This commit is contained in:
parent
3b6edef9fa
commit
ac187452c3
@ -26,6 +26,7 @@ import { getMapResults } from "./map";
|
|||||||
import { buildDocument } from "../../lib/extract/build-document";
|
import { buildDocument } from "../../lib/extract/build-document";
|
||||||
import { generateBasicCompletion } from "../../lib/LLM-extraction";
|
import { generateBasicCompletion } from "../../lib/LLM-extraction";
|
||||||
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
|
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
|
||||||
|
import { removeDuplicateUrls } from "../../lib/validateUrl";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
@ -88,7 +89,21 @@ export async function extractController(
|
|||||||
includeSubdomains: req.body.includeSubdomains,
|
includeSubdomains: req.body.includeSubdomains,
|
||||||
});
|
});
|
||||||
|
|
||||||
let mappedLinks = mapResults.links as MapDocument[];
|
let mappedLinks = mapResults.mapResults as MapDocument[];
|
||||||
|
|
||||||
|
// Remove duplicates between mapResults.links and mappedLinks
|
||||||
|
const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links];
|
||||||
|
const uniqueUrls = removeDuplicateUrls(allUrls);
|
||||||
|
|
||||||
|
// Only add URLs from mapResults.links that aren't already in mappedLinks
|
||||||
|
const existingUrls = new Set(mappedLinks.map(m => m.url));
|
||||||
|
const newUrls = uniqueUrls.filter(url => !existingUrls.has(url));
|
||||||
|
|
||||||
|
mappedLinks = [
|
||||||
|
...mappedLinks,
|
||||||
|
...newUrls.map(url => ({ url, title: "", description: "" }))
|
||||||
|
];
|
||||||
|
|
||||||
|
|
||||||
if (mappedLinks.length === 0) {
|
if (mappedLinks.length === 0) {
|
||||||
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
||||||
@ -102,6 +117,7 @@ export async function extractController(
|
|||||||
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
if (req.body.prompt) {
|
if (req.body.prompt) {
|
||||||
let searchQuery =
|
let searchQuery =
|
||||||
req.body.prompt && allowExternalLinks
|
req.body.prompt && allowExternalLinks
|
||||||
|
@ -32,10 +32,11 @@ const MAX_FIRE_ENGINE_RESULTS = 500;
|
|||||||
|
|
||||||
interface MapResult {
|
interface MapResult {
|
||||||
success: boolean;
|
success: boolean;
|
||||||
links: string[] | any[];
|
links: string[];
|
||||||
scrape_id?: string;
|
scrape_id?: string;
|
||||||
job_id: string;
|
job_id: string;
|
||||||
time_taken: number;
|
time_taken: number;
|
||||||
|
mapResults: MapDocument[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getMapResults({
|
export async function getMapResults({
|
||||||
@ -215,7 +216,8 @@ export async function getMapResults({
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
links: includeMetadata ? mapResults : linksToReturn,
|
links: linksToReturn,
|
||||||
|
mapResults: mapResults,
|
||||||
scrape_id: origin?.includes("website") ? id : undefined,
|
scrape_id: origin?.includes("website") ? id : undefined,
|
||||||
job_id: id,
|
job_id: id,
|
||||||
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
||||||
|
@ -4,11 +4,13 @@ export function buildRefrasedPrompt(prompt: string, url: string): string {
|
|||||||
Original prompt: "${prompt}"
|
Original prompt: "${prompt}"
|
||||||
|
|
||||||
Provide a rephrased search query that:
|
Provide a rephrased search query that:
|
||||||
1. Maintains the core intent of the original prompt
|
1. Maintains the core intent of the original prompt with ONLY the keywords
|
||||||
2. Uses relevant keywords
|
2. Uses relevant keywords
|
||||||
3. Is optimized for search engine results
|
3. Is optimized for search engine results
|
||||||
4. Is concise and focused
|
4. Is concise and focused
|
||||||
5. Short is better than long
|
5. Short is better than long
|
||||||
|
6. It is a search engine, not a chatbot
|
||||||
|
7. Concise
|
||||||
|
|
||||||
Return only the rephrased search query, without any explanation or additional text.`;
|
Return only the rephrased search query, without any explanation or additional text.`;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user