From ac187452c3b73d647dfba11b8ff2531d582eda02 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 17:34:55 -0300 Subject: [PATCH] Nick: better filtering for urls that should be scraped --- apps/api/src/controllers/v1/extract.ts | 18 +++++++++++++++++- apps/api/src/controllers/v1/map.ts | 6 ++++-- apps/api/src/lib/extract/build-prompts.ts | 4 +++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index f3e94b77..c0e06a2d 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -26,6 +26,7 @@ import { getMapResults } from "./map"; import { buildDocument } from "../../lib/extract/build-document"; import { generateBasicCompletion } from "../../lib/LLM-extraction"; import { buildRefrasedPrompt } from "../../lib/extract/build-prompts"; +import { removeDuplicateUrls } from "../../lib/validateUrl"; configDotenv(); const redis = new Redis(process.env.REDIS_URL!); @@ -88,8 +89,22 @@ export async function extractController( includeSubdomains: req.body.includeSubdomains, }); - let mappedLinks = mapResults.links as MapDocument[]; + let mappedLinks = mapResults.mapResults as MapDocument[]; + // Remove duplicates between mapResults.links and mappedLinks + const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links]; + const uniqueUrls = removeDuplicateUrls(allUrls); + + // Only add URLs from mapResults.links that aren't already in mappedLinks + const existingUrls = new Set(mappedLinks.map(m => m.url)); + const newUrls = uniqueUrls.filter(url => !existingUrls.has(url)); + + mappedLinks = [ + ...mappedLinks, + ...newUrls.map(url => ({ url, title: "", description: "" })) + ]; + + if (mappedLinks.length === 0) { mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } @@ -102,6 +117,7 @@ export async function extractController( `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); + if (req.body.prompt) { let searchQuery = req.body.prompt && allowExternalLinks diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 39393313..27a926fc 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -32,10 +32,11 @@ const MAX_FIRE_ENGINE_RESULTS = 500; interface MapResult { success: boolean; - links: string[] | any[]; + links: string[]; scrape_id?: string; job_id: string; time_taken: number; + mapResults: MapDocument[]; } export async function getMapResults({ @@ -215,7 +216,8 @@ export async function getMapResults({ return { success: true, - links: includeMetadata ? mapResults : linksToReturn, + links: linksToReturn, + mapResults: mapResults, scrape_id: origin?.includes("website") ? id : undefined, job_id: id, time_taken: (new Date().getTime() - Date.now()) / 1000, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 1ab117c2..f554eadc 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -4,11 +4,13 @@ export function buildRefrasedPrompt(prompt: string, url: string): string { Original prompt: "${prompt}" Provide a rephrased search query that: -1. Maintains the core intent of the original prompt +1. Maintains the core intent of the original prompt with ONLY the keywords 2. Uses relevant keywords 3. Is optimized for search engine results 4. Is concise and focused 5. Short is better than long +6. It is a search engine, not a chatbot +7. Concise Return only the rephrased search query, without any explanation or additional text.`; }