Merge branch 'main' into nsc/extract-queue

Nicolas 2025-01-06 13:01:15 -03:00
commit bb27594443
17 changed files with 313 additions and 51 deletions

View File

@@ -61,7 +61,7 @@ export async function batchScrapeController(
   }

   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: urls,
+    urlsLength: urls.length,
     appendToId: req.body.appendToId,
     account: req.account,
   });

View File

@@ -157,10 +157,10 @@ export async function crawlStatusController(
       continue;
     }

-    if (job.returnvalue === undefined) {
+    if (job.returnvalue === undefined || job.returnvalue === null) {
       logger.warn(
         "Job was considered done, but returnvalue is undefined!",
-        { jobId: job.id, state },
+        { jobId: job.id, state, returnvalue: job.returnvalue },
       );
       continue;
     }

View File

@@ -0,0 +1,91 @@
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';

describe('normalizeUrlOnlyHostname', () => {
  it('should remove protocol and www from URL', () => {
    const url = 'https://www.example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should remove only protocol if www is not present', () => {
    const url = 'https://example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle URLs without protocol', () => {
    const url = 'www.example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle URLs without protocol and www', () => {
    const url = 'example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle URLs with paths', () => {
    const url = 'https://www.example.com/path/to/resource';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle invalid URLs gracefully', () => {
    const url = 'not a valid url';
    const expected = 'not a valid url';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });
});

describe('normalizeUrl', () => {
  it('should remove protocol and www from URL', () => {
    const url = 'https://www.example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should remove only protocol if www is not present', () => {
    const url = 'https://example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs without protocol', () => {
    const url = 'www.example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs without protocol and www', () => {
    const url = 'example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs with paths', () => {
    const url = 'https://www.example.com/path/to/resource';
    const expected = 'example.com/path/to/resource';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs with trailing slash', () => {
    const url = 'https://www.example.com/';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs with trailing slash and path', () => {
    const url = 'https://www.example.com/path/';
    const expected = 'example.com/path';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle invalid URLs gracefully', () => {
    const url = 'not a valid url';
    const expected = 'not a valid url';
    expect(normalizeUrl(url)).toBe(expected);
  });
});

View File

@@ -0,0 +1,19 @@
export function normalizeUrl(url: string) {
  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }
  return url;
}

export function normalizeUrlOnlyHostname(url: string) {
  try {
    const hostname = new URL(url).hostname;
    return hostname.replace(/^www\./, "");
  } catch (error) {
    return url
      .replace(/^https?:\/\//, "")
      .replace(/^www\./, "")
      .split("/")[0];
  }
}
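
For quick reference, a usage sketch of the two helpers (behaviour taken from the tests above): normalizeUrl keeps the path and only strips the protocol, leading www, and any trailing slash, while normalizeUrlOnlyHostname reduces the URL to its bare hostname.

import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";

// Keeps the path: protocol, leading "www." and the trailing slash are stripped
normalizeUrl("https://www.example.com/path/"); // "example.com/path"

// Reduces to the hostname only, even when a path is present
normalizeUrlOnlyHostname("https://www.example.com/path/to/resource"); // "example.com"

// Invalid input falls through unchanged (the hostname variant also drops anything after "/")
normalizeUrl("not a valid url"); // "not a valid url"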

View File

@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
   timeout: number;
 }

-export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
+export async function scrapeDocument(
+  options: ScrapeDocumentOptions,
+  urlTraces: URLTrace[],
+): Promise<Document | null> {
   const trace = urlTraces.find((t) => t.url === options.url);
   if (trace) {
-    trace.status = 'scraped';
+    trace.status = "scraped";
     trace.timing.scrapedAt = new Date().toISOString();
   }
@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
       mode: "single_urls",
       team_id: options.teamId,
       scrapeOptions: scrapeOptions.parse({}),
-      internalOptions: {},
+      internalOptions: {
+        useCache: true,
+      },
       plan: options.plan,
       origin: options.origin,
       is_scrape: true,
@@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
   } catch (error) {
     logger.error(`Error in scrapeDocument: ${error}`);
     if (trace) {
-      trace.status = 'error';
+      trace.status = "error";
       trace.error = error.message;
     }
     return null;

View File

@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
   });

   // retry if only one url is returned
-  if (uniqueUrls.length === 1) {
+  if (uniqueUrls.length <= 1) {
     const retryMapResults = await getMapResults({
       url: baseUrl,
       teamId: options.teamId,

View File

@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
   const entry = await getEntryFromCache(key);
   if (entry === null) throw new EngineError("Cache missed");

+  // Set fromCache flag to indicate this document was retrieved from cache
+  meta.internalOptions.fromCache = true;
+
   return {
     url: entry.url,
     html: entry.html,

View File

@@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
   engine: Engine;
   unsupportedFeatures: Set<FeatureFlag>;
 }[] {
+  if (meta.internalOptions.useCache !== true) {
+    const cacheIndex = engines.indexOf("cache");
+    if (cacheIndex !== -1) {
+      engines.splice(cacheIndex, 1);
+    }
+  } else {
+    meta.logger.debug("Cache engine enabled by useCache option");
+  }
+
   const prioritySum = [...meta.featureFlags].reduce(
     (a, x) => a + featureFlagOptions[x].priority,
     0,

View File

@@ -151,9 +151,10 @@ export type InternalOptions = {
   v0CrawlOnlyUrls?: boolean;
   v0DisableJsDom?: boolean;
+  useCache?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
+  fromCache?: boolean; // Indicates if the document was retrieved from cache
 };

 export type EngineResultsTracker = {

View File

@ -3,6 +3,10 @@ import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache"; import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document { export function saveToCache(meta: Meta, document: Document): Document {
if (meta.internalOptions.useCache !== true) {
return document;
}
if ( if (
document.metadata.statusCode! < 200 || document.metadata.statusCode! < 200 ||
document.metadata.statusCode! >= 300 document.metadata.statusCode! >= 300
@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
); );
} }
// If the document was retrieved from cache, we don't need to save it
if (meta.internalOptions.fromCache) {
return document;
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions); const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) { if (key !== null) {
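
Taken together, the cache changes above gate everything on two internal flags: useCache (set by the caller, as scrapeDocument does above) decides whether the cache engine is tried at all and whether results are written back, and fromCache (set by the cache engine on a hit) prevents re-saving a document that was just read from the cache. A simplified, self-contained sketch of that gating logic; the Meta shape is reduced to the relevant fields, and the real checks live in buildFallbackList, scrapeCache, and saveToCache:

// Simplified sketch -- not the actual Meta type, just the fields used for cache gating
type CacheMeta = { internalOptions: { useCache?: boolean; fromCache?: boolean } };

function cacheEngineEnabled(meta: CacheMeta): boolean {
  // Mirrors buildFallbackList: the "cache" engine is dropped unless useCache is explicitly true
  return meta.internalOptions.useCache === true;
}

function shouldWriteToCache(meta: CacheMeta, statusCode: number): boolean {
  // Mirrors saveToCache: skip when caching wasn't requested, the response wasn't a 2xx,
  // or the document itself came out of the cache (fromCache is set by scrapeCache on a hit)
  if (meta.internalOptions.useCache !== true) return false;
  if (statusCode < 200 || statusCode >= 300) return false;
  if (meta.internalOptions.fromCache) return false;
  return true;
}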

View File

@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { indexPage } from "../lib/extract/index/pinecone"; import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types"; import { Document } from "../controllers/v1/types";
import { performExtraction } from "../lib/extract/extraction-service"; import { performExtraction } from "../lib/extract/extraction-service";
import { supabase_service } from "../services/supabase";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
configDotenv(); configDotenv();
@@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
+
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
document.markdown && document.markdown &&
job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
) { ) {
indexPage({ // indexPage({
document: document, // document: document,
originUrl: job.data.crawl_id // originUrl: job.data.crawl_id
? (await getCrawl(job.data.crawl_id))?.originUrl! // ? (await getCrawl(job.data.crawl_id))?.originUrl!
: document.metadata.sourceURL!, // : document.metadata.sourceURL!,
crawlId: job.data.crawl_id, // crawlId: job.data.crawl_id,
teamId: job.data.team_id, // teamId: job.data.team_id,
}).catch((error) => { // }).catch((error) => {
_logger.error("Error indexing page", { error }); // _logger.error("Error indexing page", { error });
}); // });
} }
} }
@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) {
doc.metadata.url !== undefined && doc.metadata.url !== undefined &&
doc.metadata.sourceURL !== undefined && doc.metadata.sourceURL !== undefined &&
normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc) normalizeURL(doc.metadata.sourceURL, sc) &&
job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
) { ) {
const crawler = crawlToCrawler(job.data.crawl_id, sc); const crawler = crawlToCrawler(job.data.crawl_id, sc);
if ( if (
@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) {
newJobId: jobId, newJobId: jobId,
}); });
} else { } else {
logger.debug("Could not lock URL " + JSON.stringify(link), { // TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
url: link, // logger.debug("Could not lock URL " + JSON.stringify(link), {
}); // url: link,
// });
} }
} }
} }

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "1.11.2", "version": "1.11.3",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",

View File

@@ -565,23 +565,39 @@ export default class FirecrawlApp {
         if ("data" in statusData) {
           let data = statusData.data;
           while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
             statusData = (await this.getRequest(statusData.next, headers)).data;
             data = data.concat(statusData.data);
           }
           allData = data;
         }
       }
-      return ({
+
+      let resp: CrawlStatusResponse | ErrorResponse = {
         success: response.data.success,
         status: response.data.status,
         total: response.data.total,
         completed: response.data.completed,
         creditsUsed: response.data.creditsUsed,
         expiresAt: new Date(response.data.expiresAt),
-        next: response.data.next,
-        data: allData,
-        error: response.data.error,
-      })
+        data: allData
+      }
+
+      if (!response.data.success && response.data.error) {
+        resp = {
+          ...resp,
+          success: false,
+          error: response.data.error
+        } as ErrorResponse;
+      }
+
+      if (response.data.next) {
+        (resp as CrawlStatusResponse).next = response.data.next;
+      }
+
+      return resp;
     } else {
       this.handleError(response, "check crawl status");
     }
@@ -799,23 +815,39 @@ export default class FirecrawlApp {
         if ("data" in statusData) {
           let data = statusData.data;
           while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
             statusData = (await this.getRequest(statusData.next, headers)).data;
             data = data.concat(statusData.data);
           }
           allData = data;
         }
       }
-      return ({
+
+      let resp: BatchScrapeStatusResponse | ErrorResponse = {
         success: response.data.success,
         status: response.data.status,
         total: response.data.total,
         completed: response.data.completed,
         creditsUsed: response.data.creditsUsed,
         expiresAt: new Date(response.data.expiresAt),
-        next: response.data.next,
-        data: allData,
-        error: response.data.error,
-      })
+        data: allData
+      }
+
+      if (!response.data.success && response.data.error) {
+        resp = {
+          ...resp,
+          success: false,
+          error: response.data.error
+        } as ErrorResponse;
+      }
+
+      if (response.data.next) {
+        (resp as BatchScrapeStatusResponse).next = response.data.next;
+      }
+
+      return resp;
     } else {
       this.handleError(response, "check batch scrape status");
     }
@@ -971,6 +1003,9 @@ export default class FirecrawlApp {
       if ("data" in statusData) {
         let data = statusData.data;
         while (typeof statusData === 'object' && 'next' in statusData) {
+          if (data.length === 0) {
+            break
+          }
           statusResponse = await this.getRequest(statusData.next, headers);
           statusData = statusResponse.data;
           data = data.concat(statusData.data);
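
With this change, error and next are only present on the SDK response when they actually apply, so callers should branch on success before reading them. A hedged usage sketch, assuming the usual FirecrawlApp.checkCrawlStatus entry point and that CrawlStatusResponse and ErrorResponse are exported as in the diff:

import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from "@mendable/firecrawl-js";

async function printCrawlStatus(crawlId: string) {
  const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
  const status: CrawlStatusResponse | ErrorResponse = await app.checkCrawlStatus(crawlId);

  if (!status.success) {
    // ErrorResponse branch: `error` is set only when the request failed
    console.error("Crawl failed:", status.error);
    return;
  }

  console.log(`${status.completed}/${status.total} pages (${status.status})`);
  if (status.next) {
    // `next` is only included when more paginated results are available
    console.log("More results at:", status.next);
  }
}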

View File

@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
 import bodyParser from 'body-parser';
 import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
 import dotenv from 'dotenv';
-import randomUseragent from 'random-useragent';
+import UserAgent from 'user-agents';
 import { getError } from './helpers/get_error';

 dotenv.config();

@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
     ]
   });

-  const userAgent = randomUseragent.getRandom();
+  const userAgent = new UserAgent().toString();
   const viewport = { width: 1280, height: 800 };

   const contextOptions: any = {

View File

@ -16,12 +16,12 @@
"dotenv": "^16.4.5", "dotenv": "^16.4.5",
"express": "^4.19.2", "express": "^4.19.2",
"playwright": "^1.45.0", "playwright": "^1.45.0",
"random-useragent": "^0.5.0" "user-agents": "^1.1.410"
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/node": "^20.14.9", "@types/node": "^20.14.9",
"@types/random-useragent": "^0.3.3", "@types/user-agents": "^1.0.4",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.5.2" "typescript": "^5.5.2"
} }

View File

@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.8.0"
+__version__ = "1.8.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -250,6 +250,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -267,16 +269,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data

-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data

-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check batch scrape status')
@@ -669,6 +689,8 @@ class FirecrawlApp:
             if 'data' in status_data:
                 data = status_data['data']
                 while 'next' in status_data:
+                    if len(status_data['data']) == 0:
+                        break
                     status_response = self._get_request(status_data['next'], headers)
                     status_data = status_response.json()
                     data.extend(status_data.get('data', []))