Merge branch 'main' into nsc/extract-queue

Nicolas 2025-01-06 13:01:15 -03:00
commit bb27594443
17 changed files with 313 additions and 51 deletions

View File

@@ -61,7 +61,7 @@ export async function batchScrapeController(
}
logger.debug("Batch scrape " + id + " starting", {
urlsLength: urls,
urlsLength: urls.length,
appendToId: req.body.appendToId,
account: req.account,
});

View File

@@ -157,10 +157,10 @@ export async function crawlStatusController(
continue;
}
if (job.returnvalue === undefined) {
if (job.returnvalue === undefined || job.returnvalue === null) {
logger.warn(
"Job was considered done, but returnvalue is undefined!",
{ jobId: job.id, state },
{ jobId: job.id, state, returnvalue: job.returnvalue },
);
continue;
}

View File

@@ -0,0 +1,91 @@
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
describe('normalizeUrlOnlyHostname', () => {
it('should remove protocol and www from URL', () => {
const url = 'https://www.example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should remove only protocol if www is not present', () => {
const url = 'https://example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs without protocol', () => {
const url = 'www.example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs without protocol and www', () => {
const url = 'example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs with paths', () => {
const url = 'https://www.example.com/path/to/resource';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle invalid URLs gracefully', () => {
const url = 'not a valid url';
const expected = 'not a valid url';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
});
describe('normalizeUrl', () => {
it('should remove protocol and www from URL', () => {
const url = 'https://www.example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should remove only protocol if www is not present', () => {
const url = 'https://example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs without protocol', () => {
const url = 'www.example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs without protocol and www', () => {
const url = 'example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with paths', () => {
const url = 'https://www.example.com/path/to/resource';
const expected = 'example.com/path/to/resource';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with trailing slash', () => {
const url = 'https://www.example.com/';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with trailing slash and path', () => {
const url = 'https://www.example.com/path/';
const expected = 'example.com/path';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle invalid URLs gracefully', () => {
const url = 'not a valid url';
const expected = 'not a valid url';
expect(normalizeUrl(url)).toBe(expected);
});
});

View File

@@ -0,0 +1,19 @@
export function normalizeUrl(url: string) {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
}
export function normalizeUrlOnlyHostname(url: string) {
try {
const hostname = new URL(url).hostname;
return hostname.replace(/^www\./, "");
} catch (error) {
return url
.replace(/^https?:\/\//, "")
.replace(/^www\./, "")
.split("/")[0];
}
}
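For reference, a small usage sketch of the two helpers above. The expected outputs follow the tests added in this commit; the relative import path mirrors the test file and is otherwise assumed.
import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
// normalizeUrl strips the protocol, a leading "www.", and a trailing slash, but keeps the path
normalizeUrl("https://www.example.com/path/"); // "example.com/path"
// normalizeUrlOnlyHostname reduces the URL to its bare hostname
normalizeUrlOnlyHostname("https://www.example.com/path/to/resource"); // "example.com"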

View File

@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
timeout: number;
}
export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
export async function scrapeDocument(
options: ScrapeDocumentOptions,
urlTraces: URLTrace[],
): Promise<Document | null> {
const trace = urlTraces.find((t) => t.url === options.url);
if (trace) {
trace.status = 'scraped';
trace.status = "scraped";
trace.timing.scrapedAt = new Date().toISOString();
}
@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
mode: "single_urls",
team_id: options.teamId,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
internalOptions: {
useCache: true,
},
plan: options.plan,
origin: options.origin,
is_scrape: true,
@@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
} catch (error) {
logger.error(`Error in scrapeDocument: ${error}`);
if (trace) {
trace.status = 'error';
trace.status = "error";
trace.error = error.message;
}
return null;

View File

@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
});
// retry if only one url is returned
if (uniqueUrls.length === 1) {
if (uniqueUrls.length <= 1) {
const retryMapResults = await getMapResults({
url: baseUrl,
teamId: options.teamId,

View File

@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
const entry = await getEntryFromCache(key);
if (entry === null) throw new EngineError("Cache missed");
// Set fromCache flag to indicate this document was retrieved from cache
meta.internalOptions.fromCache = true;
return {
url: entry.url,
html: entry.html,

View File

@@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
if (meta.internalOptions.useCache !== true) {
const cacheIndex = engines.indexOf("cache");
if (cacheIndex !== -1) {
engines.splice(cacheIndex, 1);
}
} else {
meta.logger.debug("Cache engine enabled by useCache option");
}
const prioritySum = [...meta.featureFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0,

View File

@@ -151,9 +151,10 @@ export type InternalOptions = {
v0CrawlOnlyUrls?: boolean;
v0DisableJsDom?: boolean;
useCache?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
fromCache?: boolean; // Indicates if the document was retrieved from cache
};
export type EngineResultsTracker = {
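A minimal sketch of how the two new flags are intended to interact, pieced together from the other hunks in this commit; the caller shape here is assumed and is not part of the diff.
// Opting a scrape into the cache: buildFallbackList keeps the "cache" engine
// in its engine list only when useCache is true.
const internalOptions = { useCache: true };
// On a cache hit, scrapeCache() sets internalOptions.fromCache = true, and
// saveToCache() then returns the document early instead of re-writing it.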

View File

@@ -3,6 +3,10 @@ import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document {
if (meta.internalOptions.useCache !== true) {
return document;
}
if (
document.metadata.statusCode! < 200 ||
document.metadata.statusCode! >= 300
@@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
);
}
// If the document was retrieved from cache, we don't need to save it
if (meta.internalOptions.fromCache) {
return document;
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {

View File

@@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types";
import { performExtraction } from "../lib/extract/extraction-service";
import { supabase_service } from "../services/supabase";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
configDotenv();
@@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) {
(async () => {
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
// Get all visited URLs from Redis
const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited",
);
// Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
// Fire and forget the upload to Supabase
try {
// Standardize URLs to canonical form (https, no www)
const standardizedUrls = [
...new Set(
visitedUrls.map((url) => {
return normalizeUrl(url);
}),
),
];
// First check if entry exists for this origin URL
const { data: existingMap } = await supabase_service
.from("crawl_maps")
.select("urls")
.eq("origin_url", originUrl)
.single();
if (existingMap) {
// Merge URLs, removing duplicates
const mergedUrls = [
...new Set([...existingMap.urls, ...standardizedUrls]),
];
const { error } = await supabase_service
.from("crawl_maps")
.update({
urls: mergedUrls,
num_urls: mergedUrls.length,
updated_at: new Date().toISOString(),
})
.eq("origin_url", originUrl);
if (error) {
_logger.error("Failed to update crawl map", { error });
}
} else {
// Insert new entry if none exists
const { error } = await supabase_service.from("crawl_maps").insert({
origin_url: originUrl,
urls: standardizedUrls,
num_urls: standardizedUrls.length,
created_at: new Date().toISOString(),
updated_at: new Date().toISOString(),
});
if (error) {
_logger.error("Failed to save crawl map", { error });
}
}
} catch (error) {
_logger.error("Error saving crawl map", { error });
}
}
})();
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
@@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
document.markdown &&
job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
) {
indexPage({
document: document,
originUrl: job.data.crawl_id
? (await getCrawl(job.data.crawl_id))?.originUrl!
: document.metadata.sourceURL!,
crawlId: job.data.crawl_id,
teamId: job.data.team_id,
}).catch((error) => {
_logger.error("Error indexing page", { error });
});
// indexPage({
// document: document,
// originUrl: job.data.crawl_id
// ? (await getCrawl(job.data.crawl_id))?.originUrl!
// : document.metadata.sourceURL!,
// crawlId: job.data.crawl_id,
// teamId: job.data.team_id,
// }).catch((error) => {
// _logger.error("Error indexing page", { error });
// });
}
}
@@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) {
doc.metadata.url !== undefined &&
doc.metadata.sourceURL !== undefined &&
normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc)
normalizeURL(doc.metadata.sourceURL, sc) &&
job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
if (
@@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) {
newJobId: jobId,
});
} else {
logger.debug("Could not lock URL " + JSON.stringify(link), {
url: link,
});
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
}
}
}

View File

@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.11.2",
"version": "1.11.3",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -565,23 +565,39 @@ export default class FirecrawlApp {
if ("data" in statusData) {
let data = statusData.data;
while (typeof statusData === 'object' && 'next' in statusData) {
if (data.length === 0) {
break
}
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
let resp: CrawlStatusResponse | ErrorResponse = {
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
data: allData
}
if (!response.data.success && response.data.error) {
resp = {
...resp,
success: false,
error: response.data.error
} as ErrorResponse;
}
if (response.data.next) {
(resp as CrawlStatusResponse).next = response.data.next;
}
return resp;
} else {
this.handleError(response, "check crawl status");
}
@@ -799,23 +815,39 @@ export default class FirecrawlApp {
if ("data" in statusData) {
let data = statusData.data;
while (typeof statusData === 'object' && 'next' in statusData) {
if (data.length === 0) {
break
}
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
let resp: BatchScrapeStatusResponse | ErrorResponse = {
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
data: allData
}
if (!response.data.success && response.data.error) {
resp = {
...resp,
success: false,
error: response.data.error
} as ErrorResponse;
}
if (response.data.next) {
(resp as BatchScrapeStatusResponse).next = response.data.next;
}
return resp;
} else {
this.handleError(response, "check batch scrape status");
}
@@ -971,6 +1003,9 @@ export default class FirecrawlApp {
if ("data" in statusData) {
let data = statusData.data;
while (typeof statusData === 'object' && 'next' in statusData) {
if (data.length === 0) {
break
}
statusResponse = await this.getRequest(statusData.next, headers);
statusData = statusResponse.data;
data = data.concat(statusData.data);
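A hedged usage sketch of the status check that these pagination guards protect; the response fields match the diff above, and the SDK entry points (FirecrawlApp constructor, checkCrawlStatus) are the published ones, with the API key and crawl id as placeholders.
import FirecrawlApp from "@mendable/firecrawl-js";
const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });
async function printCrawlStatus(crawlId: string) {
  const status = await app.checkCrawlStatus(crawlId);
  if (status.success) {
    // Paginated pages are already merged into status.data by the loop shown above.
    console.log(status.status, `${status.completed}/${status.total}`, status.data.length);
  } else {
    console.error(status.error);
  }
}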

View File

@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
import bodyParser from 'body-parser';
import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
import dotenv from 'dotenv';
import randomUseragent from 'random-useragent';
import UserAgent from 'user-agents';
import { getError } from './helpers/get_error';
dotenv.config();
@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
]
});
const userAgent = randomUseragent.getRandom();
const userAgent = new UserAgent().toString();
const viewport = { width: 1280, height: 800 };
const contextOptions: any = {

View File

@@ -16,12 +16,12 @@
"dotenv": "^16.4.5",
"express": "^4.19.2",
"playwright": "^1.45.0",
"random-useragent": "^0.5.0"
"user-agents": "^1.1.410"
},
"devDependencies": {
"@types/express": "^4.17.21",
"@types/node": "^20.14.9",
"@types/random-useragent": "^0.3.3",
"@types/user-agents": "^1.0.4",
"ts-node": "^10.9.2",
"typescript": "^5.5.2"
}

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.8.0"
__version__ = "1.8.1"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -250,6 +250,8 @@ class FirecrawlApp:
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
next_url = status_data.get('next')
if not next_url:
logger.warning("Expected 'next' URL is missing.")
@@ -267,16 +269,24 @@ class FirecrawlApp:
break
status_data['data'] = data
return {
'success': True,
response = {
'status': status_data.get('status'),
'total': status_data.get('total'),
'completed': status_data.get('completed'),
'creditsUsed': status_data.get('creditsUsed'),
'expiresAt': status_data.get('expiresAt'),
'data': status_data.get('data'),
'error': status_data.get('error'),
'next': status_data.get('next', None)
'data': status_data.get('data')
}
if 'error' in status_data:
response['error'] = status_data['error']
if 'next' in status_data:
response['next'] = status_data['next']
return {
'success': False if 'error' in status_data else True,
**response
}
else:
self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
next_url = status_data.get('next')
if not next_url:
logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
break
status_data['data'] = data
return {
'success': True,
response = {
'status': status_data.get('status'),
'total': status_data.get('total'),
'completed': status_data.get('completed'),
'creditsUsed': status_data.get('creditsUsed'),
'expiresAt': status_data.get('expiresAt'),
'data': status_data.get('data'),
'error': status_data.get('error'),
'next': status_data.get('next', None)
'data': status_data.get('data')
}
if 'error' in status_data:
response['error'] = status_data['error']
if 'next' in status_data:
response['next'] = status_data['next']
return {
'success': False if 'error' in status_data else True,
**response
}
else:
self._handle_error(response, 'check batch scrape status')
@@ -669,6 +689,8 @@ class FirecrawlApp:
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
status_response = self._get_request(status_data['next'], headers)
status_data = status_response.json()
data.extend(status_data.get('data', []))