Nick: better filtering for URLs that should be scraped

This commit is contained in:
Nicolas 2024-12-17 17:34:55 -03:00
parent 3b6edef9fa
commit ac187452c3
3 changed files with 24 additions and 4 deletions

View File

@ -26,6 +26,7 @@ import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
import { generateBasicCompletion } from "../../lib/LLM-extraction";
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
import { removeDuplicateUrls } from "../../lib/validateUrl";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@ -88,8 +89,22 @@ export async function extractController(
includeSubdomains: req.body.includeSubdomains,
});
let mappedLinks = mapResults.links as MapDocument[];
let mappedLinks = mapResults.mapResults as MapDocument[];
// Remove duplicates between mapResults.links and mappedLinks
const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links];
const uniqueUrls = removeDuplicateUrls(allUrls);
// Only add URLs from mapResults.links that aren't already in mappedLinks
const existingUrls = new Set(mappedLinks.map(m => m.url));
const newUrls = uniqueUrls.filter(url => !existingUrls.has(url));
mappedLinks = [
...mappedLinks,
...newUrls.map(url => ({ url, title: "", description: "" }))
];
if (mappedLinks.length === 0) {
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
@ -102,6 +117,7 @@ export async function extractController(
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
);
if (req.body.prompt) {
let searchQuery =
req.body.prompt && allowExternalLinks

View File

@ -32,10 +32,11 @@ const MAX_FIRE_ENGINE_RESULTS = 500;
/**
 * Shape of the result object built by getMapResults (presumably its declared
 * return type; the function signature is not fully visible here — confirm).
 */
interface MapResult {
success: boolean;
// NOTE(review): `links` is declared twice below. This looks like the removed
// and the added version of the same line shown side by side in a diff view —
// confirm that only the `string[]` declaration exists in the real file.
links: string[] | any[];
links: string[];
// Set only when the request origin includes "website"; otherwise undefined.
scrape_id?: string;
job_id: string;
// Expressed in seconds: the producer divides a millisecond difference by 1000.
time_taken: number;
// MapDocument entries returned alongside the plain `links` list; the extract
// controller reads `url`, `title` and `description` from each entry.
mapResults: MapDocument[];
}
export async function getMapResults({
@ -215,7 +216,8 @@ export async function getMapResults({
return {
success: true,
links: includeMetadata ? mapResults : linksToReturn,
links: linksToReturn,
mapResults: mapResults,
scrape_id: origin?.includes("website") ? id : undefined,
job_id: id,
time_taken: (new Date().getTime() - Date.now()) / 1000,

View File

@ -4,11 +4,13 @@ export function buildRefrasedPrompt(prompt: string, url: string): string {
Original prompt: "${prompt}"
Provide a rephrased search query that:
1. Maintains the core intent of the original prompt
1. Maintains the core intent of the original prompt with ONLY the keywords
2. Uses relevant keywords
3. Is optimized for search engine results
4. Is concise and focused
5. Short is better than long
6. It is a search engine, not a chatbot
7. Concise
Return only the rephrased search query, without any explanation or additional text.`;
}