Nick: fixed

Nicolas 2025-01-03 22:50:53 -03:00
parent a4f7c38834
commit c655c6859f
2 changed files with 63 additions and 41 deletions

View File

@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
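
For reference, a quick usage sketch of the new helper (illustration only, not part of the commit; the relative import path is assumed): it strips the scheme, a leading "www.", and a single trailing slash, so URL variants of the same page collapse to one canonical key before being stored in the crawl map.

import { normalizeUrl } from "./canonical-url"; // path assumed for illustration

// All three variants collapse to the same canonical key.
console.log(normalizeUrl("https://www.example.com/"));  // "example.com"
console.log(normalizeUrl("http://example.com"));        // "example.com"
console.log(normalizeUrl("https://example.com/docs/")); // "example.com/docs"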

View File

@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 
 configDotenv();
@@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
-    // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
-
-    // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
-      try {
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
-          .single();
-
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
-
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
-            })
-            .eq('origin_url', sc.originUrl);
-
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
-            });
-
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
-          }
-        }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
-      }
-    }
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);