From 3604f2a3ae8bcb50c95bfd9c5bb11da79396bea1 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 21 Jan 2025 16:57:45 -0300
Subject: [PATCH] Nick: misc improvements

---
 apps/api/src/controllers/v1/map.ts             | 16 +++++++---
 .../src/scraper/WebScraper/sitemap-index.ts    | 17 +++++++----
 .../src/services/indexing/crawl-maps-index.ts  | 30 ++++++++++++++-----
 apps/python-sdk/firecrawl/__init__.py          |  2 +-
 apps/python-sdk/firecrawl/firecrawl.py         |  5 ++++
 5 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index da12980c..a50357d1 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -156,13 +156,21 @@ export async function getMapResults({
   }
 
   // Parallelize sitemap index query with search results
-  const [sitemapIndexUrls, ...searchResults] = await Promise.all([
+  const [sitemapIndexResult, ...searchResults] = await Promise.all([
     querySitemapIndex(url),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
-  // Only query sitemap if index has less than 100 links
-  if (!ignoreSitemap && sitemapIndexUrls.length < 100) {
+  const twoDaysAgo = new Date();
+  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
+
+
+  // If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
+  if (
+    !ignoreSitemap &&
+    (sitemapIndexResult.urls.length < 100 ||
+      new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
+  ) {
     await crawler.tryGetSitemap(urls => {
       links.push(...urls);
     }, true, false, 30000);
@@ -197,7 +205,7 @@ export async function getMapResults({
   }
 
   // Add sitemap-index URLs
-  links.push(...sitemapIndexUrls);
+  links.push(...sitemapIndexResult.urls);
 
   // Perform cosine similarity between the search query and the list of links
   if (search) {
diff --git a/apps/api/src/scraper/WebScraper/sitemap-index.ts b/apps/api/src/scraper/WebScraper/sitemap-index.ts
index 4a97f408..75d2532c 100644
--- a/apps/api/src/scraper/WebScraper/sitemap-index.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap-index.ts
@@ -16,15 +16,20 @@ async function querySitemapIndexFunction(url: string) {
     try {
       const { data, error } = await supabase_service
         .from("crawl_maps")
-        .select("urls")
-        .eq("origin_url", originUrl);
+        .select("urls, updated_at")
+        .eq("origin_url", originUrl)
+        .order("updated_at", { ascending: false });
 
       if (error) {
         throw error;
       }
 
+      if (!data || data.length === 0) {
+        return { urls: [], lastUpdated: new Date(0) };
+      }
+
       const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
-      return allUrls;
+      return { urls: allUrls, lastUpdated: data[0].updated_at };
 
     } catch (error) {
       logger.error("(sitemap-index) Error querying the index", {
@@ -33,12 +38,12 @@
       });
 
       if (attempt === 3) {
-        return [];
+        return { urls: [], lastUpdated: new Date(0) };
       }
     }
   }
 
-  return [];
+  return { urls: [], lastUpdated: new Date(0) };
 }
 
-export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
+export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
diff --git a/apps/api/src/services/indexing/crawl-maps-index.ts b/apps/api/src/services/indexing/crawl-maps-index.ts
index fb6388f5..35e377ea 100644
--- a/apps/api/src/services/indexing/crawl-maps-index.ts
+++ b/apps/api/src/services/indexing/crawl-maps-index.ts
@@ -91,7 +91,16 @@ async function processBatch() {
   const inserts: CrawlMapRecord[] = [];
   const duplicatesToDelete: string[] = [];
 
+  // Track processed origins to avoid duplicates in the same batch
+  const processedOrigins = new Set();
+
   for (const op of operations) {
+    // Skip if we've already processed this origin in this batch
+    if (processedOrigins.has(op.originUrl)) {
+      continue;
+    }
+    processedOrigins.add(op.originUrl);
+
     const existingForOrigin = mapsByOrigin.get(op.originUrl) || [];
 
     if (existingForOrigin.length > 0) {
@@ -110,7 +119,7 @@
       ];
 
       updates.push({
-        id: mostRecent.id, // Add id to ensure we update the correct record
+        id: mostRecent.id,
         origin_url: op.originUrl,
         urls: mergedUrls,
         num_urls: mergedUrls.length,
@@ -156,14 +165,19 @@
     logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
       origins: updates.map((u) => u.origin_url),
     });
-    const { error: updateError } = await supabase_service
-      .from("crawl_maps")
-      .upsert(updates);
+
+    // Process updates one at a time to avoid conflicts
+    for (const update of updates) {
+      const { error: updateError } = await supabase_service
+        .from("crawl_maps")
+        .upsert(update);
 
-    if (updateError) {
-      logger.error("Failed to batch update crawl maps", {
-        error: updateError,
-      });
+      if (updateError) {
+        logger.error("Failed to update crawl map", {
+          error: updateError,
+          origin: update.origin_url
+        });
+      }
     }
   }
 
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 83d1909b..1e6964fa 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp  # noqa
 
-__version__ = "1.10.1"
+__version__ = "1.10.2"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index b0c09334..04e90c39 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -51,6 +51,11 @@ class FirecrawlApp:
         schema_: Optional[Any] = pydantic.Field(None, alias='schema')
         system_prompt: Optional[str] = None
         allow_external_links: Optional[bool] = False
+        enable_web_search: Optional[bool] = False
+        # Just for backwards compatibility
+        enableWebSearch: Optional[bool] = False
+
+
 
     class ExtractResponse(pydantic.BaseModel):
         """
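
Usage sketch (not part of this patch): how the new `enable_web_search` field added to the Python SDK's extract params in 1.10.2 could be passed. The `FirecrawlApp.extract(urls, params)` call shape and the `prompt` key are assumed from existing SDK usage rather than taken from this diff.

```python
from firecrawl import FirecrawlApp

# Hypothetical API key placeholder; substitute a real key.
app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Pass the new snake_case flag; the camelCase `enableWebSearch` alias is kept
# only for backwards compatibility with older callers.
result = app.extract(
    ["https://example.com"],
    {
        "prompt": "List the product names mentioned on this page.",
        "enable_web_search": True,
    },
)
print(result)
```

Supporting both spellings lets existing integrations that send `enableWebSearch` keep working while new code adopts the snake_case name.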