Nick: misc improvements
This commit is contained in:
parent ac0d10c451
commit 3604f2a3ae
@@ -156,13 +156,21 @@ export async function getMapResults({
   }
 
   // Parallelize sitemap index query with search results
-  const [sitemapIndexUrls, ...searchResults] = await Promise.all([
+  const [sitemapIndexResult, ...searchResults] = await Promise.all([
     querySitemapIndex(url),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
-  // Only query sitemap if index has less than 100 links
-  if (!ignoreSitemap && sitemapIndexUrls.length < 100) {
+  const twoDaysAgo = new Date();
+  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
+
+
+  // If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
+  if (
+    !ignoreSitemap &&
+    (sitemapIndexResult.urls.length < 100 ||
+      new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
+  ) {
     await crawler.tryGetSitemap(urls => {
       links.push(...urls);
     }, true, false, 30000);
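The refresh rule above is the point of this hunk: the cached sitemap index is reused only while it is both large enough and fresh enough. A minimal standalone sketch of that predicate (the SitemapIndexResult shape is assumed from the new return value of querySitemapIndex, not taken verbatim from the codebase):

interface SitemapIndexResult {
  urls: string[];
  lastUpdated: Date | string;
}

// Mirrors the condition in the diff: refetch when the index holds fewer
// than 100 URLs or was last updated more than two days ago.
function shouldFetchFreshSitemap(
  result: SitemapIndexResult,
  ignoreSitemap: boolean,
  now: Date = new Date(),
): boolean {
  const twoDaysAgo = new Date(now);
  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
  return (
    !ignoreSitemap &&
    (result.urls.length < 100 || new Date(result.lastUpdated) < twoDaysAgo)
  );
}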
@@ -197,7 +205,7 @@ export async function getMapResults({
   }
 
   // Add sitemap-index URLs
-  links.push(...sitemapIndexUrls);
+  links.push(...sitemapIndexResult.urls);
 
   // Perform cosine similarity between the search query and the list of links
   if (search) {
@@ -16,15 +16,20 @@ async function querySitemapIndexFunction(url: string) {
   try {
     const { data, error } = await supabase_service
       .from("crawl_maps")
-      .select("urls")
-      .eq("origin_url", originUrl);
+      .select("urls, updated_at")
+      .eq("origin_url", originUrl)
+      .order("updated_at", { ascending: false });
 
     if (error) {
       throw error;
     }
 
+    if (!data || data.length === 0) {
+      return { urls: [], lastUpdated: new Date(0) };
+    }
+
     const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
-    return allUrls;
+    return { urls: allUrls, lastUpdated: data[0].updated_at };
 
   } catch (error) {
     logger.error("(sitemap-index) Error querying the index", {
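Ordering by updated_at descending is what makes data[0].updated_at the freshest timestamp, while the Set flattens and deduplicates URLs across every row for the origin. A reduced sketch of that merge, with the row shape assumed from the new select and normalizeUrl stubbed for illustration:

interface CrawlMapRow {
  urls: string[];
  updated_at: string;
}

// Stand-in for the codebase's normalizeUrl helper.
const normalizeUrl = (u: string): string => u.replace(/\/+$/, "");

function mergeCrawlMapRows(rows: CrawlMapRow[]): { urls: string[]; lastUpdated: Date } {
  if (rows.length === 0) {
    return { urls: [], lastUpdated: new Date(0) };
  }
  // Flatten every row's URL list, normalize, and deduplicate via a Set.
  const urls = [...new Set(rows.flatMap((row) => row.urls).map(normalizeUrl))];
  // Rows arrive newest-first, so the first row carries the latest timestamp.
  return { urls, lastUpdated: new Date(rows[0].updated_at) };
}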
@@ -33,12 +38,12 @@ async function querySitemapIndexFunction(url: string) {
       });
 
       if (attempt === 3) {
-        return [];
+        return { urls: [], lastUpdated: new Date(0) };
       }
     }
   }
 
-  return [];
+  return { urls: [], lastUpdated: new Date(0) };
 }
 
-export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
+export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
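With this change every exit path, including the withAuth fallback, returns the same { urls, lastUpdated } shape, so callers such as getMapResults can read .urls and .lastUpdated without narrowing an array-or-object union. A hypothetical helper (not in the diff) that would keep those three literals in sync:

interface SitemapIndexResult {
  urls: string[];
  lastUpdated: Date;
}

// The Unix epoch as lastUpdated guarantees the staleness check treats an
// empty result as stale, forcing a fresh sitemap fetch.
const emptySitemapIndexResult = (): SitemapIndexResult => ({
  urls: [],
  lastUpdated: new Date(0),
});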
@@ -91,7 +91,16 @@ async function processBatch() {
   const inserts: CrawlMapRecord[] = [];
   const duplicatesToDelete: string[] = [];
 
+  // Track processed origins to avoid duplicates in the same batch
+  const processedOrigins = new Set<string>();
+
   for (const op of operations) {
+    // Skip if we've already processed this origin in this batch
+    if (processedOrigins.has(op.originUrl)) {
+      continue;
+    }
+    processedOrigins.add(op.originUrl);
+
     const existingForOrigin = mapsByOrigin.get(op.originUrl) || [];
 
     if (existingForOrigin.length > 0) {
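The Set gives first-wins deduplication within a batch: if two queued operations target the same origin, only the first is applied and the rest are skipped. The same idea isolated into a pure function (MapOperation is an assumed shape for a queued entry):

interface MapOperation {
  originUrl: string;
  urls: string[];
}

// First-wins dedup, mirroring the continue in the loop above: later
// operations for an origin already seen in this batch are dropped.
function dedupeByOrigin(operations: MapOperation[]): MapOperation[] {
  const seen = new Set<string>();
  const kept: MapOperation[] = [];
  for (const op of operations) {
    if (seen.has(op.originUrl)) {
      continue;
    }
    seen.add(op.originUrl);
    kept.push(op);
  }
  return kept;
}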
@@ -110,7 +119,7 @@ async function processBatch() {
       ];
 
       updates.push({
-        id: mostRecent.id, // Add id to ensure we update the correct record
+        id: mostRecent.id,
         origin_url: op.originUrl,
         urls: mergedUrls,
         num_urls: mergedUrls.length,
@@ -156,14 +165,19 @@ async function processBatch() {
     logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
       origins: updates.map((u) => u.origin_url),
     });
-    const { error: updateError } = await supabase_service
-      .from("crawl_maps")
-      .upsert(updates);
 
-    if (updateError) {
-      logger.error("Failed to batch update crawl maps", {
-        error: updateError,
-      });
+    // Process updates one at a time to avoid conflicts
+    for (const update of updates) {
+      const { error: updateError } = await supabase_service
+        .from("crawl_maps")
+        .upsert(update);
+
+      if (updateError) {
+        logger.error("Failed to update crawl map", {
+          error: updateError,
+          origin: update.origin_url
+        });
+      }
     }
   }
 
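Switching from one bulk .upsert(updates) to a per-record loop trades a round trip per row for isolation: a single conflicting row no longer fails the whole batch, and the error log can name the origin that failed. The pattern in isolation (the upsert parameter is a stand-in for the supabase_service call, not its real signature):

interface CrawlMapUpdate {
  id: string;
  origin_url: string;
  urls: string[];
  num_urls: number;
}

// Upserts records one at a time, collecting the origins that failed
// instead of aborting on the first error.
async function updateSequentially(
  updates: CrawlMapUpdate[],
  upsert: (u: CrawlMapUpdate) => Promise<{ error: Error | null }>,
): Promise<string[]> {
  const failedOrigins: string[] = [];
  for (const update of updates) {
    const { error } = await upsert(update);
    if (error) {
      failedOrigins.push(update.origin_url);
    }
  }
  return failedOrigins;
}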
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.10.1"
+__version__ = "1.10.2"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -51,6 +51,11 @@ class FirecrawlApp:
         schema_: Optional[Any] = pydantic.Field(None, alias='schema')
         system_prompt: Optional[str] = None
         allow_external_links: Optional[bool] = False
+        enable_web_search: Optional[bool] = False
+        # Just for backwards compatibility
+        enableWebSearch: Optional[bool] = False
+
+
 
     class ExtractResponse(pydantic.BaseModel):
         """