diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index f3e94b77..c0e06a2d 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -26,6 +26,7 @@ import { getMapResults } from "./map"; import { buildDocument } from "../../lib/extract/build-document"; import { generateBasicCompletion } from "../../lib/LLM-extraction"; import { buildRefrasedPrompt } from "../../lib/extract/build-prompts"; +import { removeDuplicateUrls } from "../../lib/validateUrl"; configDotenv(); const redis = new Redis(process.env.REDIS_URL!); @@ -88,8 +89,22 @@ export async function extractController( includeSubdomains: req.body.includeSubdomains, }); - let mappedLinks = mapResults.links as MapDocument[]; + let mappedLinks = mapResults.mapResults as MapDocument[]; + // Remove duplicates between mapResults.links and mappedLinks + const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links]; + const uniqueUrls = removeDuplicateUrls(allUrls); + + // Only add URLs from mapResults.links that aren't already in mappedLinks + const existingUrls = new Set(mappedLinks.map(m => m.url)); + const newUrls = uniqueUrls.filter(url => !existingUrls.has(url)); + + mappedLinks = [ + ...mappedLinks, + ...newUrls.map(url => ({ url, title: "", description: "" })) + ]; + + if (mappedLinks.length === 0) { mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } @@ -102,6 +117,7 @@ export async function extractController( `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); + if (req.body.prompt) { let searchQuery = req.body.prompt && allowExternalLinks diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 39393313..27a926fc 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -32,10 +32,11 @@ const MAX_FIRE_ENGINE_RESULTS = 500; interface MapResult { success: boolean; - links: string[] | any[]; + links: string[]; scrape_id?: string; job_id: string; time_taken: number; + mapResults: MapDocument[]; } export async function getMapResults({ @@ -215,7 +216,8 @@ export async function getMapResults({ return { success: true, - links: includeMetadata ? mapResults : linksToReturn, + links: linksToReturn, + mapResults: mapResults, scrape_id: origin?.includes("website") ? id : undefined, job_id: id, time_taken: (new Date().getTime() - Date.now()) / 1000, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 1ab117c2..f554eadc 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -4,11 +4,13 @@ export function buildRefrasedPrompt(prompt: string, url: string): string { Original prompt: "${prompt}" Provide a rephrased search query that: -1. Maintains the core intent of the original prompt +1. Maintains the core intent of the original prompt with ONLY the keywords 2. Uses relevant keywords 3. Is optimized for search engine results 4. Is concise and focused 5. Short is better than long +6. It is a search engine, not a chatbot +7. Concise Return only the rephrased search query, without any explanation or additional text.`; }