From 513f61a2d1966ea22d6f219ab6d70937bdfafc22 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Sun, 19 Jan 2025 12:33:44 -0300
Subject: [PATCH] Nick: map improvements

---
 apps/api/src/controllers/v1/map.ts            |  2 +
 .../src/scraper/WebScraper/sitemap-index.ts   | 34 +++++++---
 .../src/services/indexing/crawl-maps-index.ts | 62 +++++++++++++++++++
 apps/api/src/services/queue-worker.ts         | 56 ++---------------
 4 files changed, 92 insertions(+), 62 deletions(-)
 create mode 100644 apps/api/src/services/indexing/crawl-maps-index.ts

diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index c8487bcd..6f8c0b15 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -226,6 +226,8 @@ export async function getMapResults({
     ? links
     : links.slice(0, limit);
 
+  //
+
   return {
     success: true,
     links: linksToReturn,
diff --git a/apps/api/src/scraper/WebScraper/sitemap-index.ts b/apps/api/src/scraper/WebScraper/sitemap-index.ts
index 7e60276e..dec76b11 100644
--- a/apps/api/src/scraper/WebScraper/sitemap-index.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap-index.ts
@@ -12,19 +12,33 @@ import { withAuth } from "../../lib/withAuth";
 async function querySitemapIndexFunction(url: string) {
   const originUrl = normalizeUrlOnlyHostname(url);
 
-  const { data, error } = await supabase_service
-    .from("crawl_maps")
-    .select("urls")
-    .eq("origin_url", originUrl);
+  for (let attempt = 1; attempt <= 3; attempt++) {
+    try {
+      const { data, error } = await supabase_service
+        .from("crawl_maps")
+        .select("urls")
+        .eq("origin_url", originUrl);
 
-  if (error) {
-    logger.error("(sitemap-index) Error querying the index", { error });
-    return [];
+      if (error) {
+        throw error;
+      }
+
+      const allUrls = data.map((entry) => entry.urls).flat();
+      return allUrls;
+
+    } catch (error) {
+      logger.error("(sitemap-index) Error querying the index", {
+        error,
+        attempt
+      });
+
+      if (attempt === 3) {
+        return [];
+      }
+    }
   }
 
-  const allUrls = data.map((entry) => entry.urls).flat();
-
-  return allUrls;
+  return [];
 }
 
 export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
diff --git a/apps/api/src/services/indexing/crawl-maps-index.ts b/apps/api/src/services/indexing/crawl-maps-index.ts
new file mode 100644
index 00000000..d5a08547
--- /dev/null
+++ b/apps/api/src/services/indexing/crawl-maps-index.ts
@@ -0,0 +1,62 @@
+import { logger } from "../../lib/logger";
+import {
+  normalizeUrl,
+  normalizeUrlOnlyHostname,
+} from "../../lib/canonical-url";
+import { supabase_service } from "../supabase";
+
+export async function saveCrawlMap(originUrl: string, visitedUrls: string[]) {
+  originUrl = normalizeUrlOnlyHostname(originUrl);
+  // Fire and forget the upload to Supabase
+  try {
+    // Standardize URLs to canonical form (https, no www)
+    const standardizedUrls = [
+      ...new Set(
+        visitedUrls.map((url) => {
+          return normalizeUrl(url);
+        }),
+      ),
+    ];
+    // First check if entry exists for this origin URL
+    const { data: existingMap } = await supabase_service
+      .from("crawl_maps")
+      .select("urls")
+      .eq("origin_url", originUrl)
+      .single();
+
+    if (existingMap) {
+      // Merge URLs, removing duplicates
+      const mergedUrls = [
+        ...new Set([...existingMap.urls, ...standardizedUrls]),
+      ];
+
+      const { error } = await supabase_service
+        .from("crawl_maps")
+        .update({
+          urls: mergedUrls,
+          num_urls: mergedUrls.length,
+          updated_at: new Date().toISOString(),
+        })
+        .eq("origin_url", originUrl);
+
+      if (error) {
+        logger.error("Failed to update crawl map", { error });
+      }
+    } else {
+      // Insert new entry if none exists
+      const { error } = await supabase_service.from("crawl_maps").insert({
+        origin_url: originUrl,
+        urls: standardizedUrls,
+        num_urls: standardizedUrls.length,
+        created_at: new Date().toISOString(),
+        updated_at: new Date().toISOString(),
+      });
+
+      if (error) {
+        logger.error("Failed to save crawl map", { error });
+      }
+    }
+  } catch (error) {
+    logger.error("Error saving crawl map", { error });
+  }
+}
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index de8f5567..ee24e115 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -61,6 +61,7 @@ import { supabase_service } from "../services/supabase";
 import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
 import { saveExtract, updateExtract } from "../lib/extract/extract-redis";
 import { billTeam } from "./billing/credit_billing";
+import { saveCrawlMap } from "./indexing/crawl-maps-index";
 
 configDotenv();
 
@@ -102,58 +103,9 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
       job.data.crawlerOptions !== null &&
       originUrl
     ) {
-      // Fire and forget the upload to Supabase
-      try {
-        // Standardize URLs to canonical form (https, no www)
-        const standardizedUrls = [
-          ...new Set(
-            visitedUrls.map((url) => {
-              return normalizeUrl(url);
-            }),
-          ),
-        ];
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from("crawl_maps")
-          .select("urls")
-          .eq("origin_url", originUrl)
-          .single();
-
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [
-            ...new Set([...existingMap.urls, ...standardizedUrls]),
-          ];
-
-          const { error } = await supabase_service
-            .from("crawl_maps")
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString(),
-            })
-            .eq("origin_url", originUrl);
-
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service.from("crawl_maps").insert({
-            origin_url: originUrl,
-            urls: standardizedUrls,
-            num_urls: standardizedUrls.length,
-            created_at: new Date().toISOString(),
-            updated_at: new Date().toISOString(),
-          });
-
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
-          }
-        }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
-      }
+      saveCrawlMap(originUrl, visitedUrls).catch((e) => {
+        _logger.error("Error saving crawl map", { error: e });
+      });
     }
   })();