Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 11:19:02 +08:00)
commit a4f7c38834
parent 8df1c67961

    Nick: fixed
@@ -300,8 +300,11 @@ export function buildFallbackList(meta: Meta): {
 }[] {
 
   if (meta.internalOptions.useCache !== true) {
-    engines.splice(engines.indexOf("cache"), 1);
-  }else{
+    const cacheIndex = engines.indexOf("cache");
+    if (cacheIndex !== -1) {
+      engines.splice(cacheIndex, 1);
+    }
+  } else {
     meta.logger.debug("Cache engine enabled by useCache option");
   }
   const prioritySum = [...meta.featureFlags].reduce(
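Why the extra guard matters: Array.prototype.splice(-1, 1) removes the last element, so the old one-liner silently dropped whichever engine happened to be last whenever "cache" was not in the list. A minimal standalone sketch of the pitfall and the guarded fix (the engine names here are illustrative, not the real fallback list):

// Buggy pattern (the code being replaced): "cache" is absent, indexOf
// returns -1, and splice(-1, 1) deletes the last entry instead of nothing.
const engines: string[] = ["fetch", "playwright", "pdf"];

const buggy = [...engines];
buggy.splice(buggy.indexOf("cache"), 1);
console.log(buggy); // ["fetch", "playwright"] -- "pdf" was silently lost

// Guarded pattern (the fix in this commit): only splice on a real hit.
const fixed = [...engines];
const cacheIndex = fixed.indexOf("cache");
if (cacheIndex !== -1) {
  fixed.splice(cacheIndex, 1);
}
console.log(fixed); // ["fetch", "playwright", "pdf"] -- untouched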
@@ -84,18 +84,43 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
   if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
     try {
-      const { error } = await supabase_service
+      // First check if entry exists for this origin URL
+      const { data: existingMap } = await supabase_service
         .from('crawl_maps')
-        .insert({
-          crawl_id: job.data.crawl_id,
-          team_id: job.data.team_id,
-          origin_url: sc.originUrl,
-          urls: visitedUrls,
-          created_at: new Date().toISOString()
-        });
+        .select('urls')
+        .eq('origin_url', sc.originUrl)
+        .single();
 
-      if (error) {
-        _logger.error("Failed to save crawl map", { error });
+      if (existingMap) {
+        // Merge URLs, removing duplicates
+        const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
+
+        const { error } = await supabase_service
+          .from('crawl_maps')
+          .update({
+            urls: mergedUrls,
+            num_urls: mergedUrls.length,
+            updated_at: new Date().toISOString()
+          })
+          .eq('origin_url', sc.originUrl);
+
+        if (error) {
+          _logger.error("Failed to update crawl map", { error });
+        }
+      } else {
+        // Insert new entry if none exists
+        const { error } = await supabase_service
+          .from('crawl_maps')
+          .insert({
+            origin_url: sc.originUrl,
+            urls: visitedUrls,
+            num_urls: visitedUrls.length,
+            created_at: new Date().toISOString()
+          });
+
+        if (error) {
+          _logger.error("Failed to save crawl map", { error });
+        }
       }
     } catch (error) {
       _logger.error("Error saving crawl map", { error });
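Two notes on this new flow. First, it is a non-atomic read-then-write: two crawls for the same origin_url finishing concurrently can both read the old row, and one merge can overwrite the other. Second, supabase-js .single() reports an error when zero rows match, which this code discards by destructuring only data; .maybeSingle() is the zero-or-one-row variant that returns data: null without an error. A minimal sketch of the same check-merge-write flow, assuming supabase-js v2 and SUPABASE_URL/SUPABASE_KEY env vars; saveCrawlMap is a hypothetical wrapper, not a function from this codebase:

import { createClient } from "@supabase/supabase-js";

const supabase = createClient(process.env.SUPABASE_URL!, process.env.SUPABASE_KEY!);

// Hypothetical helper mirroring the committed logic, with the read
// error surfaced instead of silently dropped.
async function saveCrawlMap(originUrl: string, visitedUrls: string[]) {
  // maybeSingle() yields data: null (no error) when no row matches.
  const { data: existingMap, error: readError } = await supabase
    .from("crawl_maps")
    .select("urls")
    .eq("origin_url", originUrl)
    .maybeSingle();
  if (readError) throw readError;

  if (existingMap) {
    // Merge and dedupe; Set keeps first occurrence, so existing URLs win.
    const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
    const { error } = await supabase
      .from("crawl_maps")
      .update({
        urls: mergedUrls,
        num_urls: mergedUrls.length,
        updated_at: new Date().toISOString(),
      })
      .eq("origin_url", originUrl);
    if (error) throw error;
  } else {
    const { error } = await supabase
      .from("crawl_maps")
      .insert({
        origin_url: originUrl,
        urls: visitedUrls,
        num_urls: visitedUrls.length,
        created_at: new Date().toISOString(),
      });
    if (error) throw error;
  }
}

Closing the race would take a server-side merge (e.g. a Postgres function called via .rpc()) or a unique constraint on origin_url with retry; the sketch keeps the commit's two-step shape for clarity.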
@@ -802,9 +827,10 @@ async function processJob(job: Job & { id: string }, token: string) {
             newJobId: jobId,
           });
         } else {
-          logger.debug("Could not lock URL " + JSON.stringify(link), {
-            url: link,
-          });
+          // TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
+          // logger.debug("Could not lock URL " + JSON.stringify(link), {
+          //   url: link,
+          // });
         }
       }
     }
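On the TODO in this hunk: an alternative to commenting the log out entirely is to sample it, so lock contention still leaves a trace without flooding the logs. A hedged sketch of that idea, not what the commit does; only logger.debug is known to exist in this codebase, and makeSampledLogger is illustrative:

// Wraps a debug function so only every Nth call is emitted.
function makeSampledLogger(debug: (msg: string, meta?: object) => void, every: number) {
  let n = 0;
  return (msg: string, meta?: object) => {
    // Logs the 1st call, then every `every`th call after that.
    if (n++ % every === 0) {
      debug(`${msg} (sampled 1/${every})`, meta);
    }
  };
}

// Usage: log roughly 1 in 100 lock failures.
// const debugLockMiss = makeSampledLogger(logger.debug.bind(logger), 100);
// debugLockMiss("Could not lock URL " + JSON.stringify(link), { url: link });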