From bafcc008bc0137b1f548a73523e89ae57f1f78a3 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:27:00 -0300 Subject: [PATCH 01/18] [SDK] fixed none and undefined on response --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 46 ++++++++++++++++++++------ apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 38 +++++++++++++++------ 4 files changed, 65 insertions(+), 23 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 9aab848a..46f85308 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.11.0", + "version": "1.11.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index af9dbc75..6b89960e 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -571,17 +571,30 @@ export default class FirecrawlApp { allData = data; } } - return ({ + + let resp: CrawlStatusResponse | ErrorResponse = { success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), - next: response.data.next, - data: allData, - error: response.data.error, - }) + data: allData + } + + if (!response.data.success && response.data.error) { + resp = { + ...resp, + success: false, + error: response.data.error + } as ErrorResponse; + } + + if (response.data.next) { + (resp as CrawlStatusResponse).next = response.data.next; + } + + return resp; } else { this.handleError(response, "check crawl status"); } @@ -805,17 +818,30 @@ export default class FirecrawlApp { allData = data; } } - return ({ + + let resp: BatchScrapeStatusResponse | ErrorResponse = { success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), - next: response.data.next, - data: allData, - error: response.data.error, - }) + data: allData + } + + if (!response.data.success && response.data.error) { + resp = { + ...resp, + success: false, + error: response.data.error + } as ErrorResponse; + } + + if (response.data.next) { + (resp as BatchScrapeStatusResponse).next = response.data.next; + } + + return resp; } else { this.handleError(response, "check batch scrape status"); } diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index d4d246e9..5528b3b2 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.8.0" +__version__ = "1.8.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 271a13f0..8eb7acee 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -266,17 +266,25 @@ class FirecrawlApp: logger.error(f"Error during pagination request: {e}") break status_data['data'] = data - - return { - 'success': True, + + response = { 'status': status_data.get('status'), 'total': status_data.get('total'), 'completed': status_data.get('completed'), 'creditsUsed': status_data.get('creditsUsed'), 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data'), - 'error': status_data.get('error'), - 'next': status_data.get('next', None) + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response } else: self._handle_error(response, 'check crawl status') @@ -476,16 +484,24 @@ class FirecrawlApp: break status_data['data'] = data - return { - 'success': True, + response = { 'status': status_data.get('status'), 'total': status_data.get('total'), 'completed': status_data.get('completed'), 'creditsUsed': status_data.get('creditsUsed'), 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data'), - 'error': status_data.get('error'), - 'next': status_data.get('next', None) + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response } else: self._handle_error(response, 'check batch scrape status') From 55dad5ea13da577e86122fb832b8534627d1f03c Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:56:39 -0300 Subject: [PATCH 02/18] fixed empty data with next causing infinite loop --- apps/js-sdk/firecrawl/src/index.ts | 12 ++++++++++++ apps/python-sdk/firecrawl/firecrawl.py | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 6b89960e..687325d3 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -565,6 +565,10 @@ export default class FirecrawlApp { if ("data" in statusData) { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { + if (data.length === 0) { + console.warn("Expected 'data' is missing.") + break + } statusData = (await this.getRequest(statusData.next, headers)).data; data = data.concat(statusData.data); } @@ -812,6 +816,10 @@ export default class FirecrawlApp { if ("data" in statusData) { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { + if (data.length === 0) { + console.warn("Expected 'data' is missing.") + break + } statusData = (await this.getRequest(statusData.next, headers)).data; data = data.concat(statusData.data); } @@ -995,6 +1003,10 @@ export default class FirecrawlApp { if ("data" in statusData) { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { + if (data.length === 0) { + console.warn("Expected 'data' is missing.") + break + } statusResponse = await this.getRequest(statusData.next, headers); statusData = statusResponse.data; data = data.concat(statusData.data); diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 8eb7acee..812f7bd1 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -250,6 +250,9 @@ class FirecrawlApp: if 'data' in status_data: data = status_data['data'] while 'next' in status_data: + if len(status_data['data']) == 0: + logger.warning("Expected 'data' is missing.") + break next_url = status_data.get('next') if not next_url: logger.warning("Expected 'next' URL is missing.") @@ -467,6 +470,9 @@ class FirecrawlApp: if 'data' in status_data: data = status_data['data'] while 'next' in status_data: + if len(status_data['data']) == 0: + logger.warning("Expected 'data' is missing.") + break next_url = status_data.get('next') if not next_url: logger.warning("Expected 'next' URL is missing.") @@ -685,6 +691,9 @@ class FirecrawlApp: if 'data' in status_data: data = status_data['data'] while 'next' in status_data: + if len(status_data['data']) == 0: + logger.warning("Expected 'data' is missing.") + break status_response = self._get_request(status_data['next'], headers) status_data = status_response.json() data.extend(status_data.get('data', [])) From 12cd9f083ca5658519dbf2296c4711cc47407fd1 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 3 Jan 2025 17:12:30 -0300 Subject: [PATCH 03/18] removed warnings --- apps/js-sdk/firecrawl/src/index.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index a3038778..474eea83 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -566,7 +566,6 @@ export default class FirecrawlApp { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { if (data.length === 0) { - console.warn("Expected 'data' is missing.") break } statusData = (await this.getRequest(statusData.next, headers)).data; @@ -817,7 +816,6 @@ export default class FirecrawlApp { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { if (data.length === 0) { - console.warn("Expected 'data' is missing.") break } statusData = (await this.getRequest(statusData.next, headers)).data; @@ -1006,7 +1004,6 @@ export default class FirecrawlApp { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { if (data.length === 0) { - console.warn("Expected 'data' is missing.") break } statusResponse = await this.getRequest(statusData.next, headers); From a54a5dbb4510c641b111106a963874a45c441511 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 3 Jan 2025 17:13:34 -0300 Subject: [PATCH 04/18] removed warnings --- apps/python-sdk/firecrawl/firecrawl.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 812f7bd1..d3216405 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -251,7 +251,6 @@ class FirecrawlApp: data = status_data['data'] while 'next' in status_data: if len(status_data['data']) == 0: - logger.warning("Expected 'data' is missing.") break next_url = status_data.get('next') if not next_url: @@ -471,7 +470,6 @@ class FirecrawlApp: data = status_data['data'] while 'next' in status_data: if len(status_data['data']) == 0: - logger.warning("Expected 'data' is missing.") break next_url = status_data.get('next') if not next_url: @@ -692,7 +690,6 @@ class FirecrawlApp: data = status_data['data'] while 'next' in status_data: if len(status_data['data']) == 0: - logger.warning("Expected 'data' is missing.") break status_response = self._get_request(status_data['next'], headers) status_data = status_response.json() From 6b2e1cbb281362405c4b8729e25eae169ec13851 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:19:40 -0300 Subject: [PATCH 05/18] Nick: cache /extract scrapes --- apps/api/src/lib/extract/document-scraper.ts | 15 ++++++++++----- apps/api/src/scraper/scrapeURL/engines/index.ts | 6 ++++++ apps/api/src/scraper/scrapeURL/index.ts | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 04194b0b..91d515df 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -14,10 +14,13 @@ interface ScrapeDocumentOptions { timeout: number; } -export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise { +export async function scrapeDocument( + options: ScrapeDocumentOptions, + urlTraces: URLTrace[], +): Promise { const trace = urlTraces.find((t) => t.url === options.url); if (trace) { - trace.status = 'scraped'; + trace.status = "scraped"; trace.timing.scrapedAt = new Date().toISOString(); } @@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: mode: "single_urls", team_id: options.teamId, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { + useCache: true, + }, plan: options.plan, origin: options.origin, is_scrape: true, @@ -61,9 +66,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: } catch (error) { logger.error(`Error in scrapeDocument: ${error}`); if (trace) { - trace.status = 'error'; + trace.status = "error"; trace.error = error.message; } return null; } -} \ No newline at end of file +} diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index bf51ac94..956fc3ab 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -298,6 +298,12 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { + + if (meta.internalOptions.useCache !== true) { + engines.splice(engines.indexOf("cache"), 1); + }else{ + meta.logger.debug("Cache engine enabled by useCache option"); + } const prioritySum = [...meta.featureFlags].reduce( (a, x) => a + featureFlagOptions[x].priority, 0, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 549ce9d1..b13f7d9a 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -151,7 +151,7 @@ export type InternalOptions = { v0CrawlOnlyUrls?: boolean; v0DisableJsDom?: boolean; - + useCache?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine isBackgroundIndex?: boolean; }; From 432b4106789d495769da3804228b915522f42fa5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:26:05 -0300 Subject: [PATCH 06/18] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f6a033cb..8408cc61 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -531,16 +531,16 @@ async function indexJob(job: Job & { id: string }, document: Document) { document.markdown && job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! ) { - indexPage({ - document: document, - originUrl: job.data.crawl_id - ? (await getCrawl(job.data.crawl_id))?.originUrl! - : document.metadata.sourceURL!, - crawlId: job.data.crawl_id, - teamId: job.data.team_id, - }).catch((error) => { - _logger.error("Error indexing page", { error }); - }); + // indexPage({ + // document: document, + // originUrl: job.data.crawl_id + // ? (await getCrawl(job.data.crawl_id))?.originUrl! + // : document.metadata.sourceURL!, + // crawlId: job.data.crawl_id, + // teamId: job.data.team_id, + // }).catch((error) => { + // _logger.error("Error indexing page", { error }); + // }); } } From 499479c85e9da40a86e3c2ef83eaf1f924682ae5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:28:52 -0300 Subject: [PATCH 07/18] Update url-processor.ts --- apps/api/src/lib/extract/url-processor.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index af250fcd..a5027fa9 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); // retry if only one url is returned - if (uniqueUrls.length === 1) { + if (uniqueUrls.length <= 1) { const retryMapResults = await getMapResults({ url: baseUrl, teamId: options.teamId, From 8df1c67961dded611cfe18c9a1c304852d428c9d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 21:48:28 -0300 Subject: [PATCH 08/18] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 8408cc61..4ea3ff84 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -50,6 +50,7 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; +import { supabase_service } from "../services/supabase"; configDotenv(); @@ -77,6 +78,30 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { + // Get all visited URLs from Redis + const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited"); + + // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) + if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) { + try { + const { error } = await supabase_service + .from('crawl_maps') + .insert({ + crawl_id: job.data.crawl_id, + team_id: job.data.team_id, + origin_url: sc.originUrl, + urls: visitedUrls, + created_at: new Date().toISOString() + }); + + if (error) { + _logger.error("Failed to save crawl map", { error }); + } + } catch (error) { + _logger.error("Error saving crawl map", { error }); + } + } + if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); From a4f7c38834426c441d7da0221b7f467195cd2350 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 22:15:23 -0300 Subject: [PATCH 09/18] Nick: fixed --- .../src/scraper/scrapeURL/engines/index.ts | 7 ++- apps/api/src/services/queue-worker.ts | 52 ++++++++++++++----- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 956fc3ab..e452f7fa 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -300,8 +300,11 @@ export function buildFallbackList(meta: Meta): { }[] { if (meta.internalOptions.useCache !== true) { - engines.splice(engines.indexOf("cache"), 1); - }else{ + const cacheIndex = engines.indexOf("cache"); + if (cacheIndex !== -1) { + engines.splice(cacheIndex, 1); + } + } else { meta.logger.debug("Cache engine enabled by useCache option"); } const prioritySum = [...meta.featureFlags].reduce( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 4ea3ff84..f6ff96a5 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -84,18 +84,43 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) { try { - const { error } = await supabase_service + // First check if entry exists for this origin URL + const { data: existingMap } = await supabase_service .from('crawl_maps') - .insert({ - crawl_id: job.data.crawl_id, - team_id: job.data.team_id, - origin_url: sc.originUrl, - urls: visitedUrls, - created_at: new Date().toISOString() - }); + .select('urls') + .eq('origin_url', sc.originUrl) + .single(); + + if (existingMap) { + // Merge URLs, removing duplicates + const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])]; - if (error) { - _logger.error("Failed to save crawl map", { error }); + const { error } = await supabase_service + .from('crawl_maps') + .update({ + urls: mergedUrls, + num_urls: mergedUrls.length, + updated_at: new Date().toISOString() + }) + .eq('origin_url', sc.originUrl); + + if (error) { + _logger.error("Failed to update crawl map", { error }); + } + } else { + // Insert new entry if none exists + const { error } = await supabase_service + .from('crawl_maps') + .insert({ + origin_url: sc.originUrl, + urls: visitedUrls, + num_urls: visitedUrls.length, + created_at: new Date().toISOString() + }); + + if (error) { + _logger.error("Failed to save crawl map", { error }); + } } } catch (error) { _logger.error("Error saving crawl map", { error }); @@ -802,9 +827,10 @@ async function processJob(job: Job & { id: string }, token: string) { newJobId: jobId, }); } else { - logger.debug("Could not lock URL " + JSON.stringify(link), { - url: link, - }); + // TODO: removed this, ok? too many 'not useful' logs (?) Mogery! + // logger.debug("Could not lock URL " + JSON.stringify(link), { + // url: link, + // }); } } } From c655c6859f256b10cb1a4cdd9d4e039940dea89a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 22:50:53 -0300 Subject: [PATCH 10/18] Nick: fixed --- apps/api/src/lib/canonical-url.ts | 7 ++ apps/api/src/services/queue-worker.ts | 97 ++++++++++++++++----------- 2 files changed, 63 insertions(+), 41 deletions(-) create mode 100644 apps/api/src/lib/canonical-url.ts diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts new file mode 100644 index 00000000..cbb33f8b --- /dev/null +++ b/apps/api/src/lib/canonical-url.ts @@ -0,0 +1,7 @@ +export function normalizeUrl(url: string) { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; +} \ No newline at end of file diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f6ff96a5..4fb08337 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; import { supabase_service } from "../services/supabase"; +import { normalizeUrl } from "../lib/canonical-url"; configDotenv(); @@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { - // Get all visited URLs from Redis - const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited"); - - // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) - if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) { - try { - // First check if entry exists for this origin URL - const { data: existingMap } = await supabase_service - .from('crawl_maps') - .select('urls') - .eq('origin_url', sc.originUrl) - .single(); + (async () => { + const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined; + // Get all visited URLs from Redis + const visitedUrls = await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited", + ); + // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) + if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) { + // Fire and forget the upload to Supabase + try { + // Standardize URLs to canonical form (https, no www) + const standardizedUrls = [ + ...new Set( + visitedUrls.map((url) => { + return normalizeUrl(url); + }), + ), + ]; + // First check if entry exists for this origin URL + const { data: existingMap } = await supabase_service + .from("crawl_maps") + .select("urls") + .eq("origin_url", originUrl) + .single(); - if (existingMap) { - // Merge URLs, removing duplicates - const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])]; - - const { error } = await supabase_service - .from('crawl_maps') - .update({ - urls: mergedUrls, - num_urls: mergedUrls.length, - updated_at: new Date().toISOString() - }) - .eq('origin_url', sc.originUrl); + if (existingMap) { + // Merge URLs, removing duplicates + const mergedUrls = [ + ...new Set([...existingMap.urls, ...standardizedUrls]), + ]; - if (error) { - _logger.error("Failed to update crawl map", { error }); - } - } else { - // Insert new entry if none exists - const { error } = await supabase_service - .from('crawl_maps') - .insert({ - origin_url: sc.originUrl, - urls: visitedUrls, - num_urls: visitedUrls.length, - created_at: new Date().toISOString() + const { error } = await supabase_service + .from("crawl_maps") + .update({ + urls: mergedUrls, + num_urls: mergedUrls.length, + updated_at: new Date().toISOString(), + }) + .eq("origin_url", originUrl); + + if (error) { + _logger.error("Failed to update crawl map", { error }); + } + } else { + // Insert new entry if none exists + const { error } = await supabase_service.from("crawl_maps").insert({ + origin_url: originUrl, + urls: standardizedUrls, + num_urls: standardizedUrls.length, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), }); - if (error) { - _logger.error("Failed to save crawl map", { error }); + if (error) { + _logger.error("Failed to save crawl map", { error }); + } } + } catch (error) { + _logger.error("Error saving crawl map", { error }); } - } catch (error) { - _logger.error("Error saving crawl map", { error }); } - } + })(); if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); From 05e845a9711a4e84cedae95de0b58f3964dfcfbf Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 22:55:38 -0300 Subject: [PATCH 11/18] Update cache.ts --- apps/api/src/scraper/scrapeURL/transformers/cache.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts index 523a8419..4005059f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts @@ -3,6 +3,10 @@ import { Meta } from ".."; import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache"; export function saveToCache(meta: Meta, document: Document): Document { + if (meta.internalOptions.useCache !== true) { + return document; + } + if ( document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300 From aef040b41e14d67abcc1bdfb751c20d93a3275de Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 23:07:15 -0300 Subject: [PATCH 12/18] Nick: from cache fixes --- apps/api/src/scraper/scrapeURL/engines/cache/index.ts | 3 +++ apps/api/src/scraper/scrapeURL/index.ts | 1 + apps/api/src/scraper/scrapeURL/transformers/cache.ts | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts index f48806fd..c0451df4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts @@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise { const entry = await getEntryFromCache(key); if (entry === null) throw new EngineError("Cache missed"); + // Set fromCache flag to indicate this document was retrieved from cache + meta.internalOptions.fromCache = true; + return { url: entry.url, html: entry.html, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index b13f7d9a..7f4a76e4 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -154,6 +154,7 @@ export type InternalOptions = { useCache?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine isBackgroundIndex?: boolean; + fromCache?: boolean; // Indicates if the document was retrieved from cache }; export type EngineResultsTracker = { diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts index 4005059f..f2d7bcf4 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts @@ -19,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document { ); } + // If the document was retrieved from cache, we don't need to save it + if (meta.internalOptions.fromCache) { + return document; + } + + const key = cacheKey(meta.url, meta.options, meta.internalOptions); if (key !== null) { From f25c0c6d216c3242b114d5fdada4b67a74d4e08c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 23:16:33 -0300 Subject: [PATCH 13/18] Nick: added canonical tests --- apps/api/src/lib/canonical-url.test.ts | 39 ++++++++++++++++++++++++++ apps/api/src/lib/canonical-url.ts | 9 +++--- 2 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 apps/api/src/lib/canonical-url.test.ts diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts new file mode 100644 index 00000000..0a2c3acd --- /dev/null +++ b/apps/api/src/lib/canonical-url.test.ts @@ -0,0 +1,39 @@ +import { normalizeUrl } from './canonical-url'; + +describe('normalizeUrl', () => { + it('should remove protocol and www from URL', () => { + const url = 'https://www.example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should remove only protocol if www is not present', () => { + const url = 'https://example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs without protocol', () => { + const url = 'www.example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs without protocol and www', () => { + const url = 'example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs with paths', () => { + const url = 'https://www.example.com/path/to/resource'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle invalid URLs gracefully', () => { + const url = 'not a valid url'; + const expected = 'not a valid url'; + expect(normalizeUrl(url)).toBe(expected); + }); +}); diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts index cbb33f8b..fedea09d 100644 --- a/apps/api/src/lib/canonical-url.ts +++ b/apps/api/src/lib/canonical-url.ts @@ -1,7 +1,8 @@ export function normalizeUrl(url: string) { - url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); - if (url.endsWith("/")) { - url = url.slice(0, -1); + try { + const hostname = new URL(url).hostname; + return hostname.replace(/^www\./, ""); + } catch (error) { + return url.replace(/^https?:\/\//, "").replace(/^www\./, "").split('/')[0]; } - return url; } \ No newline at end of file From f2e0bfbfe3048d7b52b44e1c472ddf915eff4134 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 23:54:03 -0300 Subject: [PATCH 14/18] Nick: url normalization --- apps/api/src/lib/canonical-url.ts | 15 +++++++++++++-- apps/api/src/services/queue-worker.ts | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts index fedea09d..50570293 100644 --- a/apps/api/src/lib/canonical-url.ts +++ b/apps/api/src/lib/canonical-url.ts @@ -1,8 +1,19 @@ export function normalizeUrl(url: string) { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; +} + +export function normalizeUrlOnlyHostname(url: string) { try { const hostname = new URL(url).hostname; return hostname.replace(/^www\./, ""); } catch (error) { - return url.replace(/^https?:\/\//, "").replace(/^www\./, "").split('/')[0]; + return url + .replace(/^https?:\/\//, "") + .replace(/^www\./, "") + .split("/")[0]; } -} \ No newline at end of file +} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 4fb08337..9e6f3d24 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -51,7 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; import { supabase_service } from "../services/supabase"; -import { normalizeUrl } from "../lib/canonical-url"; +import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url"; configDotenv(); @@ -80,7 +80,7 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { (async () => { - const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined; + const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined; // Get all visited URLs from Redis const visitedUrls = await redisConnection.smembers( "crawl:" + job.data.crawl_id + ":visited", From d48ddb88200ed474144df3aa43eb0be305597658 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 23:55:05 -0300 Subject: [PATCH 15/18] Update canonical-url.test.ts --- apps/api/src/lib/canonical-url.test.ts | 54 +++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts index 0a2c3acd..65171642 100644 --- a/apps/api/src/lib/canonical-url.test.ts +++ b/apps/api/src/lib/canonical-url.test.ts @@ -1,4 +1,44 @@ -import { normalizeUrl } from './canonical-url'; +import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url'; + +describe('normalizeUrlOnlyHostname', () => { + it('should remove protocol and www from URL', () => { + const url = 'https://www.example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should remove only protocol if www is not present', () => { + const url = 'https://example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle URLs without protocol', () => { + const url = 'www.example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle URLs without protocol and www', () => { + const url = 'example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle URLs with paths', () => { + const url = 'https://www.example.com/path/to/resource'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle invalid URLs gracefully', () => { + const url = 'not a valid url'; + const expected = 'not a valid url'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); +}); + + describe('normalizeUrl', () => { it('should remove protocol and www from URL', () => { @@ -27,10 +67,22 @@ describe('normalizeUrl', () => { it('should handle URLs with paths', () => { const url = 'https://www.example.com/path/to/resource'; + const expected = 'example.com/path/to/resource'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs with trailing slash', () => { + const url = 'https://www.example.com/'; const expected = 'example.com'; expect(normalizeUrl(url)).toBe(expected); }); + it('should handle URLs with trailing slash and path', () => { + const url = 'https://www.example.com/path/'; + const expected = 'example.com/path'; + expect(normalizeUrl(url)).toBe(expected); + }); + it('should handle invalid URLs gracefully', () => { const url = 'not a valid url'; const expected = 'not a valid url'; From b92a4eb79b04d090ccb8322db1af9a95b838b819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 4 Jan 2025 16:59:35 +0100 Subject: [PATCH 16/18] fix(queue-worker): only do redirect handling logic on crawls, not batch scrape --- apps/api/src/controllers/v1/batch-scrape.ts | 2 +- apps/api/src/services/queue-worker.ts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 19ce3ba0..21c9745c 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -61,7 +61,7 @@ export async function batchScrapeController( } logger.debug("Batch scrape " + id + " starting", { - urlsLength: urls, + urlsLength: urls.length, appendToId: req.body.appendToId, account: req.account, }); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 9e6f3d24..a48c798b 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -710,7 +710,8 @@ async function processJob(job: Job & { id: string }, token: string) { doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== - normalizeURL(doc.metadata.sourceURL, sc) + normalizeURL(doc.metadata.sourceURL, sc) && + job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape ) { const crawler = crawlToCrawler(job.data.crawl_id, sc); if ( From 461842fe8c1e71388809165f8efff12a8d781500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 4 Jan 2025 17:24:33 +0100 Subject: [PATCH 17/18] fix(v1/crawl-status): handle job's returnvalue being explicitly null (db race) --- apps/api/src/controllers/v1/crawl-status.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 1aec86c8..ce3831f2 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -157,10 +157,10 @@ export async function crawlStatusController( continue; } - if (job.returnvalue === undefined) { + if (job.returnvalue === undefined || job.returnvalue === null) { logger.warn( "Job was considered done, but returnvalue is undefined!", - { jobId: job.id, state }, + { jobId: job.id, state, returnvalue: job.returnvalue }, ); continue; } From 736c3675b66a52e4fe4fdf2097c2b1e8820cdda9 Mon Sep 17 00:00:00 2001 From: Kirill Date: Sun, 5 Jan 2025 17:07:14 +0400 Subject: [PATCH 18/18] use new agent generation instead of expired one --- apps/playwright-service-ts/api.ts | 4 ++-- apps/playwright-service-ts/package.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index eacb35ff..3b024347 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -2,7 +2,7 @@ import express, { Request, Response } from 'express'; import bodyParser from 'body-parser'; import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright'; import dotenv from 'dotenv'; -import randomUseragent from 'random-useragent'; +import UserAgent from 'user-agents'; import { getError } from './helpers/get_error'; dotenv.config(); @@ -60,7 +60,7 @@ const initializeBrowser = async () => { ] }); - const userAgent = randomUseragent.getRandom(); + const userAgent = new UserAgent().toString(); const viewport = { width: 1280, height: 800 }; const contextOptions: any = { diff --git a/apps/playwright-service-ts/package.json b/apps/playwright-service-ts/package.json index fe15209f..af1c10be 100644 --- a/apps/playwright-service-ts/package.json +++ b/apps/playwright-service-ts/package.json @@ -16,12 +16,12 @@ "dotenv": "^16.4.5", "express": "^4.19.2", "playwright": "^1.45.0", - "random-useragent": "^0.5.0" + "user-agents": "^1.1.410" }, "devDependencies": { "@types/express": "^4.17.21", "@types/node": "^20.14.9", - "@types/random-useragent": "^0.3.3", + "@types/user-agents": "^1.0.4", "ts-node": "^10.9.2", "typescript": "^5.5.2" }