From bafcc008bc0137b1f548a73523e89ae57f1f78a3 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 3 Jan 2025 13:27:00 -0300
Subject: [PATCH 01/18] [SDK] fixed none and undefined on response

---
 apps/js-sdk/firecrawl/package.json     |  2 +-
 apps/js-sdk/firecrawl/src/index.ts     | 46 ++++++++++++++++++++------
 apps/python-sdk/firecrawl/__init__.py  |  2 +-
 apps/python-sdk/firecrawl/firecrawl.py | 38 +++++++++++++++------
 4 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 9aab848a..46f85308 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.11.0",
+  "version": "1.11.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index af9dbc75..6b89960e 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -571,17 +571,30 @@ export default class FirecrawlApp {
             allData = data;
           }
         }
-        return ({
+
+        let resp: CrawlStatusResponse | ErrorResponse = {
           success: response.data.success,
           status: response.data.status,
           total: response.data.total,
           completed: response.data.completed,
           creditsUsed: response.data.creditsUsed,
           expiresAt: new Date(response.data.expiresAt),
-          next: response.data.next,
-          data: allData,
-          error: response.data.error,
-        })
+          data: allData
+        }
+
+        if (!response.data.success && response.data.error) {
+          resp = {
+            ...resp,
+            success: false,
+            error: response.data.error
+          } as ErrorResponse;
+        }
+
+        if (response.data.next) {
+          (resp as CrawlStatusResponse).next = response.data.next;
+        }
+        
+        return resp;
       } else {
         this.handleError(response, "check crawl status");
       }
@@ -805,17 +818,30 @@ export default class FirecrawlApp {
             allData = data;
           }
         }
-        return ({
+
+        let resp: BatchScrapeStatusResponse | ErrorResponse = {
           success: response.data.success,
           status: response.data.status,
           total: response.data.total,
           completed: response.data.completed,
           creditsUsed: response.data.creditsUsed,
           expiresAt: new Date(response.data.expiresAt),
-          next: response.data.next,
-          data: allData,
-          error: response.data.error,
-        })
+          data: allData
+        }
+
+        if (!response.data.success && response.data.error) {
+          resp = {
+            ...resp,
+            success: false,
+            error: response.data.error
+          } as ErrorResponse;
+        }
+
+        if (response.data.next) {
+          (resp as BatchScrapeStatusResponse).next = response.data.next;
+        }
+        
+        return resp;
       } else {
         this.handleError(response, "check batch scrape status");
       }
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index d4d246e9..5528b3b2 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.8.0"
+__version__ = "1.8.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 271a13f0..8eb7acee 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -266,17 +266,25 @@ class FirecrawlApp:
                             logger.error(f"Error during pagination request: {e}")
                             break
                     status_data['data'] = data
-                    
-            return {
-                'success': True,
+
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check crawl status')
@@ -476,16 +484,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data
 
-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check batch scrape status')

From 55dad5ea13da577e86122fb832b8534627d1f03c Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 3 Jan 2025 13:56:39 -0300
Subject: [PATCH 02/18] fixed empty data with next causing infinite loop

---
 apps/js-sdk/firecrawl/src/index.ts     | 12 ++++++++++++
 apps/python-sdk/firecrawl/firecrawl.py |  9 +++++++++
 2 files changed, 21 insertions(+)

diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 6b89960e..687325d3 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -565,6 +565,10 @@ export default class FirecrawlApp {
           if ("data" in statusData) {
             let data = statusData.data;
             while (typeof statusData === 'object' && 'next' in statusData) {
+              if (data.length === 0) {
+                console.warn("Expected 'data' is missing.")
+                break
+              }
               statusData = (await this.getRequest(statusData.next, headers)).data;
               data = data.concat(statusData.data);
             }
@@ -812,6 +816,10 @@ export default class FirecrawlApp {
           if ("data" in statusData) {
             let data = statusData.data;
             while (typeof statusData === 'object' && 'next' in statusData) {
+              if (data.length === 0) {
+                console.warn("Expected 'data' is missing.")
+                break
+              }
               statusData = (await this.getRequest(statusData.next, headers)).data;
               data = data.concat(statusData.data);
             }
@@ -995,6 +1003,10 @@ export default class FirecrawlApp {
               if ("data" in statusData) {
                 let data = statusData.data;
                 while (typeof statusData === 'object' && 'next' in statusData) {
+                  if (data.length === 0) {
+                    console.warn("Expected 'data' is missing.")
+                    break
+                  }
                   statusResponse = await this.getRequest(statusData.next, headers);
                   statusData = statusResponse.data;
                   data = data.concat(statusData.data);
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 8eb7acee..812f7bd1 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -250,6 +250,9 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            logger.warning("Expected 'data' is missing.")
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -467,6 +470,9 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            logger.warning("Expected 'data' is missing.")
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -685,6 +691,9 @@ class FirecrawlApp:
                     if 'data' in status_data:
                         data = status_data['data']
                         while 'next' in status_data:
+                          if len(status_data['data']) == 0:
+                              logger.warning("Expected 'data' is missing.")
+                              break
                           status_response = self._get_request(status_data['next'], headers)
                           status_data = status_response.json()
                           data.extend(status_data.get('data', []))

From 12cd9f083ca5658519dbf2296c4711cc47407fd1 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 3 Jan 2025 17:12:30 -0300
Subject: [PATCH 03/18] removed warnings

---
 apps/js-sdk/firecrawl/src/index.ts | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index a3038778..474eea83 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -566,7 +566,6 @@ export default class FirecrawlApp {
             let data = statusData.data;
             while (typeof statusData === 'object' && 'next' in statusData) {
               if (data.length === 0) {
-                console.warn("Expected 'data' is missing.")
                 break
               }
               statusData = (await this.getRequest(statusData.next, headers)).data;
@@ -817,7 +816,6 @@ export default class FirecrawlApp {
             let data = statusData.data;
             while (typeof statusData === 'object' && 'next' in statusData) {
               if (data.length === 0) {
-                console.warn("Expected 'data' is missing.")
                 break
               }
               statusData = (await this.getRequest(statusData.next, headers)).data;
@@ -1006,7 +1004,6 @@ export default class FirecrawlApp {
                 let data = statusData.data;
                 while (typeof statusData === 'object' && 'next' in statusData) {
                   if (data.length === 0) {
-                    console.warn("Expected 'data' is missing.")
                     break
                   }
                   statusResponse = await this.getRequest(statusData.next, headers);

From a54a5dbb4510c641b111106a963874a45c441511 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 3 Jan 2025 17:13:34 -0300
Subject: [PATCH 04/18] removed warnings

---
 apps/python-sdk/firecrawl/firecrawl.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 812f7bd1..d3216405 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -251,7 +251,6 @@ class FirecrawlApp:
                     data = status_data['data']
                     while 'next' in status_data:
                         if len(status_data['data']) == 0:
-                            logger.warning("Expected 'data' is missing.")
                             break
                         next_url = status_data.get('next')
                         if not next_url:
@@ -471,7 +470,6 @@ class FirecrawlApp:
                     data = status_data['data']
                     while 'next' in status_data:
                         if len(status_data['data']) == 0:
-                            logger.warning("Expected 'data' is missing.")
                             break
                         next_url = status_data.get('next')
                         if not next_url:
@@ -692,7 +690,6 @@ class FirecrawlApp:
                         data = status_data['data']
                         while 'next' in status_data:
                           if len(status_data['data']) == 0:
-                              logger.warning("Expected 'data' is missing.")
                               break
                           status_response = self._get_request(status_data['next'], headers)
                           status_data = status_response.json()

From 6b2e1cbb281362405c4b8729e25eae169ec13851 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 21:19:40 -0300
Subject: [PATCH 05/18] Nick: cache /extract scrapes

---
 apps/api/src/lib/extract/document-scraper.ts    | 15 ++++++++++-----
 apps/api/src/scraper/scrapeURL/engines/index.ts |  6 ++++++
 apps/api/src/scraper/scrapeURL/index.ts         |  2 +-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts
index 04194b0b..91d515df 100644
--- a/apps/api/src/lib/extract/document-scraper.ts
+++ b/apps/api/src/lib/extract/document-scraper.ts
@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
   timeout: number;
 }
 
-export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
+export async function scrapeDocument(
+  options: ScrapeDocumentOptions,
+  urlTraces: URLTrace[],
+): Promise<Document | null> {
   const trace = urlTraces.find((t) => t.url === options.url);
   if (trace) {
-    trace.status = 'scraped';
+    trace.status = "scraped";
     trace.timing.scrapedAt = new Date().toISOString();
   }
 
@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
         mode: "single_urls",
         team_id: options.teamId,
         scrapeOptions: scrapeOptions.parse({}),
-        internalOptions: {},
+        internalOptions: {
+          useCache: true,
+        },
         plan: options.plan,
         origin: options.origin,
         is_scrape: true,
@@ -61,9 +66,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
   } catch (error) {
     logger.error(`Error in scrapeDocument: ${error}`);
     if (trace) {
-      trace.status = 'error';
+      trace.status = "error";
       trace.error = error.message;
     }
     return null;
   }
-} 
\ No newline at end of file
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts
index bf51ac94..956fc3ab 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -298,6 +298,12 @@ export function buildFallbackList(meta: Meta): {
   engine: Engine;
   unsupportedFeatures: Set<FeatureFlag>;
 }[] {
+
+  if (meta.internalOptions.useCache !== true) {
+    engines.splice(engines.indexOf("cache"), 1);
+  }else{
+    meta.logger.debug("Cache engine enabled by useCache option");
+  }
   const prioritySum = [...meta.featureFlags].reduce(
     (a, x) => a + featureFlagOptions[x].priority,
     0,
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 549ce9d1..b13f7d9a 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -151,7 +151,7 @@ export type InternalOptions = {
 
   v0CrawlOnlyUrls?: boolean;
   v0DisableJsDom?: boolean;
-
+  useCache?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
 };

From 432b4106789d495769da3804228b915522f42fa5 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 21:26:05 -0300
Subject: [PATCH 06/18] Update queue-worker.ts

---
 apps/api/src/services/queue-worker.ts | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index f6a033cb..8408cc61 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -531,16 +531,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
     document.markdown &&
     job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
   ) {
-    indexPage({
-      document: document,
-      originUrl: job.data.crawl_id
-        ? (await getCrawl(job.data.crawl_id))?.originUrl!
-        : document.metadata.sourceURL!,
-      crawlId: job.data.crawl_id,
-      teamId: job.data.team_id,
-    }).catch((error) => {
-      _logger.error("Error indexing page", { error });
-    });
+    // indexPage({
+    //   document: document,
+    //   originUrl: job.data.crawl_id
+    //     ? (await getCrawl(job.data.crawl_id))?.originUrl!
+    //     : document.metadata.sourceURL!,
+    //   crawlId: job.data.crawl_id,
+    //   teamId: job.data.team_id,
+    // }).catch((error) => {
+    //   _logger.error("Error indexing page", { error });
+    // });
   }
 }
 

From 499479c85e9da40a86e3c2ef83eaf1f924682ae5 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 21:28:52 -0300
Subject: [PATCH 07/18] Update url-processor.ts

---
 apps/api/src/lib/extract/url-processor.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts
index af250fcd..a5027fa9 100644
--- a/apps/api/src/lib/extract/url-processor.ts
+++ b/apps/api/src/lib/extract/url-processor.ts
@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
     });
 
     // retry if only one url is returned
-    if (uniqueUrls.length === 1)  {
+    if (uniqueUrls.length <= 1)  {
       const retryMapResults = await getMapResults({
         url: baseUrl,
         teamId: options.teamId,

From 8df1c67961dded611cfe18c9a1c304852d428c9d Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 21:48:28 -0300
Subject: [PATCH 08/18] Update queue-worker.ts

---
 apps/api/src/services/queue-worker.ts | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 8408cc61..4ea3ff84 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -50,6 +50,7 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
+import { supabase_service } from "../services/supabase";
 
 configDotenv();
 
@@ -77,6 +78,30 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    // Get all visited URLs from Redis
+    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
+    
+    // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
+      try {
+        const { error } = await supabase_service
+          .from('crawl_maps')
+          .insert({
+            crawl_id: job.data.crawl_id,
+            team_id: job.data.team_id,
+            origin_url: sc.originUrl,
+            urls: visitedUrls,
+            created_at: new Date().toISOString()
+          });
+          
+        if (error) {
+          _logger.error("Failed to save crawl map", { error });
+        }
+      } catch (error) {
+        _logger.error("Error saving crawl map", { error });
+      }
+    }
+
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
 

From a4f7c38834426c441d7da0221b7f467195cd2350 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 22:15:23 -0300
Subject: [PATCH 09/18] Nick: fixed

---
 .../src/scraper/scrapeURL/engines/index.ts    |  7 ++-
 apps/api/src/services/queue-worker.ts         | 52 ++++++++++++++-----
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts
index 956fc3ab..e452f7fa 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -300,8 +300,11 @@ export function buildFallbackList(meta: Meta): {
 }[] {
 
   if (meta.internalOptions.useCache !== true) {
-    engines.splice(engines.indexOf("cache"), 1);
-  }else{
+    const cacheIndex = engines.indexOf("cache");
+    if (cacheIndex !== -1) {
+      engines.splice(cacheIndex, 1);
+    }
+  } else {
     meta.logger.debug("Cache engine enabled by useCache option");
   }
   const prioritySum = [...meta.featureFlags].reduce(
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 4ea3ff84..f6ff96a5 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -84,18 +84,43 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
     // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
     if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
       try {
-        const { error } = await supabase_service
+        // First check if entry exists for this origin URL
+        const { data: existingMap } = await supabase_service
           .from('crawl_maps')
-          .insert({
-            crawl_id: job.data.crawl_id,
-            team_id: job.data.team_id,
-            origin_url: sc.originUrl,
-            urls: visitedUrls,
-            created_at: new Date().toISOString()
-          });
+          .select('urls')
+          .eq('origin_url', sc.originUrl)
+          .single();
+
+        if (existingMap) {
+          // Merge URLs, removing duplicates
+          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
           
-        if (error) {
-          _logger.error("Failed to save crawl map", { error });
+          const { error } = await supabase_service
+            .from('crawl_maps')
+            .update({
+              urls: mergedUrls,
+              num_urls: mergedUrls.length,
+              updated_at: new Date().toISOString()
+            })
+            .eq('origin_url', sc.originUrl);
+
+          if (error) {
+            _logger.error("Failed to update crawl map", { error });
+          }
+        } else {
+          // Insert new entry if none exists
+          const { error } = await supabase_service
+            .from('crawl_maps')
+            .insert({
+              origin_url: sc.originUrl,
+              urls: visitedUrls,
+              num_urls: visitedUrls.length,
+              created_at: new Date().toISOString()
+            });
+
+          if (error) {
+            _logger.error("Failed to save crawl map", { error });
+          }
         }
       } catch (error) {
         _logger.error("Error saving crawl map", { error });
@@ -802,9 +827,10 @@ async function processJob(job: Job & { id: string }, token: string) {
                 newJobId: jobId,
               });
             } else {
-              logger.debug("Could not lock URL " + JSON.stringify(link), {
-                url: link,
-              });
+              // TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
+              // logger.debug("Could not lock URL " + JSON.stringify(link), {
+              //   url: link,
+              // });
             }
           }
         }

From c655c6859f256b10cb1a4cdd9d4e039940dea89a Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 22:50:53 -0300
Subject: [PATCH 10/18] Nick: fixed

---
 apps/api/src/lib/canonical-url.ts     |  7 ++
 apps/api/src/services/queue-worker.ts | 97 ++++++++++++++++-----------
 2 files changed, 63 insertions(+), 41 deletions(-)
 create mode 100644 apps/api/src/lib/canonical-url.ts

diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts
new file mode 100644
index 00000000..cbb33f8b
--- /dev/null
+++ b/apps/api/src/lib/canonical-url.ts
@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
\ No newline at end of file
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index f6ff96a5..4fb08337 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 
 configDotenv();
 
@@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
-    // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
-    
-    // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
-      try {
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
-          .single();
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Normalize URLs via normalizeUrl (strips the protocol and a leading "www.")
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
 
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
-          
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
-            })
-            .eq('origin_url', sc.originUrl);
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
 
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
             });
 
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
           }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
         }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
       }
-    }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);

From 05e845a9711a4e84cedae95de0b58f3964dfcfbf Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 22:55:38 -0300
Subject: [PATCH 11/18] Update cache.ts

---
 apps/api/src/scraper/scrapeURL/transformers/cache.ts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
index 523a8419..4005059f 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
@@ -3,6 +3,10 @@ import { Meta } from "..";
 import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
 
 export function saveToCache(meta: Meta, document: Document): Document {
+  if (meta.internalOptions.useCache !== true) {
+    return document;
+  }
+
   if (
     document.metadata.statusCode! < 200 ||
     document.metadata.statusCode! >= 300

From aef040b41e14d67abcc1bdfb751c20d93a3275de Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 23:07:15 -0300
Subject: [PATCH 12/18] Nick: from cache fixes

---
 apps/api/src/scraper/scrapeURL/engines/cache/index.ts | 3 +++
 apps/api/src/scraper/scrapeURL/index.ts               | 1 +
 apps/api/src/scraper/scrapeURL/transformers/cache.ts  | 6 ++++++
 3 files changed, 10 insertions(+)

diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
index f48806fd..c0451df4 100644
--- a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
   const entry = await getEntryFromCache(key);
   if (entry === null) throw new EngineError("Cache missed");
 
+  // Set fromCache flag to indicate this document was retrieved from cache
+  meta.internalOptions.fromCache = true;
+
   return {
     url: entry.url,
     html: entry.html,
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index b13f7d9a..7f4a76e4 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -154,6 +154,7 @@ export type InternalOptions = {
   useCache?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
+  fromCache?: boolean; // Indicates if the document was retrieved from cache
 };
 
 export type EngineResultsTracker = {
diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
index 4005059f..f2d7bcf4 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
@@ -19,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
     );
   }
 
+  // If the document was retrieved from cache, we don't need to save it
+  if (meta.internalOptions.fromCache) {
+    return document;
+  }
+
+
   const key = cacheKey(meta.url, meta.options, meta.internalOptions);
 
   if (key !== null) {

From f25c0c6d216c3242b114d5fdada4b67a74d4e08c Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 23:16:33 -0300
Subject: [PATCH 13/18] Nick: added canonical tests

---
 apps/api/src/lib/canonical-url.test.ts | 39 ++++++++++++++++++++++++++
 apps/api/src/lib/canonical-url.ts      |  9 +++---
 2 files changed, 44 insertions(+), 4 deletions(-)
 create mode 100644 apps/api/src/lib/canonical-url.test.ts

diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts
new file mode 100644
index 00000000..0a2c3acd
--- /dev/null
+++ b/apps/api/src/lib/canonical-url.test.ts
@@ -0,0 +1,39 @@
+import { normalizeUrl } from './canonical-url';
+
+describe('normalizeUrl', () => {
+  it('should remove protocol and www from URL', () => {
+    const url = 'https://www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should remove only protocol if www is not present', () => {
+    const url = 'https://example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol', () => {
+    const url = 'www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol and www', () => {
+    const url = 'example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with paths', () => {
+    const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle invalid URLs gracefully', () => {
+    const url = 'not a valid url';
+    const expected = 'not a valid url';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+});
diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts
index cbb33f8b..fedea09d 100644
--- a/apps/api/src/lib/canonical-url.ts
+++ b/apps/api/src/lib/canonical-url.ts
@@ -1,7 +1,8 @@
 export function normalizeUrl(url: string) {
-  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
-  if (url.endsWith("/")) {
-    url = url.slice(0, -1);
+  try {
+    const hostname = new URL(url).hostname;
+    return hostname.replace(/^www\./, "");
+  } catch (error) {
+    return url.replace(/^https?:\/\//, "").replace(/^www\./, "").split('/')[0];
   }
-  return url;
 }
\ No newline at end of file

From f2e0bfbfe3048d7b52b44e1c472ddf915eff4134 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 23:54:03 -0300
Subject: [PATCH 14/18] Nick: url normalization

---
 apps/api/src/lib/canonical-url.ts     | 15 +++++++++++++--
 apps/api/src/services/queue-worker.ts |  4 ++--
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts
index fedea09d..50570293 100644
--- a/apps/api/src/lib/canonical-url.ts
+++ b/apps/api/src/lib/canonical-url.ts
@@ -1,8 +1,19 @@
 export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
+
+export function normalizeUrlOnlyHostname(url: string) {
   try {
     const hostname = new URL(url).hostname;
     return hostname.replace(/^www\./, "");
   } catch (error) {
-    return url.replace(/^https?:\/\//, "").replace(/^www\./, "").split('/')[0];
+    return url
+      .replace(/^https?:\/\//, "")
+      .replace(/^www\./, "")
+      .split("/")[0];
   }
-}
\ No newline at end of file
+}
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 4fb08337..9e6f3d24 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -51,7 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
-import { normalizeUrl } from "../lib/canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
 
 configDotenv();
 
@@ -80,7 +80,7 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
-      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
       // Get all visited URLs from Redis
       const visitedUrls = await redisConnection.smembers(
         "crawl:" + job.data.crawl_id + ":visited",

From d48ddb88200ed474144df3aa43eb0be305597658 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 3 Jan 2025 23:55:05 -0300
Subject: [PATCH 15/18] Update canonical-url.test.ts

---
 apps/api/src/lib/canonical-url.test.ts | 54 +++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts
index 0a2c3acd..65171642 100644
--- a/apps/api/src/lib/canonical-url.test.ts
+++ b/apps/api/src/lib/canonical-url.test.ts
@@ -1,4 +1,44 @@
-import { normalizeUrl } from './canonical-url';
+import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
+
+describe('normalizeUrlOnlyHostname', () => {
+  it('should remove protocol and www from URL', () => {
+    const url = 'https://www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should remove only protocol if www is not present', () => {
+    const url = 'https://example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol', () => {
+    const url = 'www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol and www', () => {
+    const url = 'example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs with paths', () => {
+    const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle invalid URLs gracefully', () => {
+    const url = 'not a valid url';
+    const expected = 'not a valid url';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+});
+
+
 
 describe('normalizeUrl', () => {
   it('should remove protocol and www from URL', () => {
@@ -27,10 +67,22 @@ describe('normalizeUrl', () => {
 
   it('should handle URLs with paths', () => {
     const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com/path/to/resource';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with trailing slash', () => {
+    const url = 'https://www.example.com/';
     const expected = 'example.com';
     expect(normalizeUrl(url)).toBe(expected);
   });
 
+  it('should handle URLs with trailing slash and path', () => {
+    const url = 'https://www.example.com/path/';
+    const expected = 'example.com/path';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
   it('should handle invalid URLs gracefully', () => {
     const url = 'not a valid url';
     const expected = 'not a valid url';

From b92a4eb79b04d090ccb8322db1af9a95b838b819 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= <mo.geryy@gmail.com>
Date: Sat, 4 Jan 2025 16:59:35 +0100
Subject: [PATCH 16/18] fix(queue-worker): only do redirect handling logic on
 crawls, not batch scrape

---
 apps/api/src/controllers/v1/batch-scrape.ts | 2 +-
 apps/api/src/services/queue-worker.ts       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts
index 19ce3ba0..21c9745c 100644
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@@ -61,7 +61,7 @@ export async function batchScrapeController(
   }
 
   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: urls,
+    urlsLength: urls.length,
     appendToId: req.body.appendToId,
     account: req.account,
   });
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 9e6f3d24..a48c798b 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -710,7 +710,8 @@ async function processJob(job: Job & { id: string }, token: string) {
         doc.metadata.url !== undefined &&
         doc.metadata.sourceURL !== undefined &&
         normalizeURL(doc.metadata.url, sc) !==
-          normalizeURL(doc.metadata.sourceURL, sc)
+          normalizeURL(doc.metadata.sourceURL, sc) &&
+        job.data.crawlerOptions !== null // redirect handling applies only to crawls, not batch scrapes
       ) {
         const crawler = crawlToCrawler(job.data.crawl_id, sc);
         if (

From 461842fe8c1e71388809165f8efff12a8d781500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= <mo.geryy@gmail.com>
Date: Sat, 4 Jan 2025 17:24:33 +0100
Subject: [PATCH 17/18] fix(v1/crawl-status): handle job's returnvalue being
 explicitly null (db race)

---
 apps/api/src/controllers/v1/crawl-status.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index 1aec86c8..ce3831f2 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -157,10 +157,10 @@ export async function crawlStatusController(
           continue;
         }
 
-        if (job.returnvalue === undefined) {
+        if (job.returnvalue === undefined || job.returnvalue === null) {
           logger.warn(
             "Job was considered done, but returnvalue is undefined!",
-            { jobId: job.id, state },
+            { jobId: job.id, state, returnvalue: job.returnvalue },
           );
           continue;
         }

From 736c3675b66a52e4fe4fdf2097c2b1e8820cdda9 Mon Sep 17 00:00:00 2001
From: Kirill <k.melkozerov@gmail.com>
Date: Sun, 5 Jan 2025 17:07:14 +0400
Subject: [PATCH 18/18] use new agent generation instead of expired one

---
 apps/playwright-service-ts/api.ts       | 4 ++--
 apps/playwright-service-ts/package.json | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts
index eacb35ff..3b024347 100644
--- a/apps/playwright-service-ts/api.ts
+++ b/apps/playwright-service-ts/api.ts
@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
 import bodyParser from 'body-parser';
 import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
 import dotenv from 'dotenv';
-import randomUseragent from 'random-useragent';
+import UserAgent from 'user-agents';
 import { getError } from './helpers/get_error';
 
 dotenv.config();
@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
     ]
   });
 
-  const userAgent = randomUseragent.getRandom();
+  const userAgent = new UserAgent().toString();
   const viewport = { width: 1280, height: 800 };
 
   const contextOptions: any = {
diff --git a/apps/playwright-service-ts/package.json b/apps/playwright-service-ts/package.json
index fe15209f..af1c10be 100644
--- a/apps/playwright-service-ts/package.json
+++ b/apps/playwright-service-ts/package.json
@@ -16,12 +16,12 @@
     "dotenv": "^16.4.5",
     "express": "^4.19.2",
     "playwright": "^1.45.0",
-    "random-useragent": "^0.5.0"
+    "user-agents": "^1.1.410"
   },
   "devDependencies": {
     "@types/express": "^4.17.21",
     "@types/node": "^20.14.9",
-    "@types/random-useragent": "^0.3.3",
+    "@types/user-agents": "^1.0.4",
     "ts-node": "^10.9.2",
     "typescript": "^5.5.2"
   }