Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-16 16:15:59 +08:00
cleanup
This commit is contained in:
parent 014a99ef91
commit da9a9b0d19
@@ -105,7 +105,7 @@ describe("Scrape tests", () => {
const response1 = await scrape({
url,
maxAge: 120000,
dontStoreInCache: true,
storeInCache: false,
});

expect(response1.metadata.cacheState).toBe("miss");
@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
import Redis from "ioredis";
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
import { getIndexQueue } from "../../services/queue-service";
import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex as globalUseIndex } from "../../services/index";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@@ -59,7 +59,7 @@ export async function getMapResults({
mock,
filterByPath = true,
flags,
ignoreIndex = false,
useIndex = true,
}: {
url: string;
search?: string;
@@ -75,7 +75,7 @@ export async function getMapResults({
mock?: string;
filterByPath?: boolean;
flags: TeamFlags;
ignoreIndex?: boolean;
useIndex?: boolean;
}): Promise<MapResult> {
const id = uuidv4();
let links: string[] = [url];
@@ -172,7 +172,7 @@ export async function getMapResults({
// Parallelize sitemap index query with search results
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
querySitemapIndex(url, abort),
useIndex && !ignoreIndex && process.env.FIRECRAWL_INDEX_WRITE_ONLY !== "true" ? (
globalUseIndex && useIndex && process.env.FIRECRAWL_INDEX_WRITE_ONLY !== "true" ? (
index_supabase_service
.from("index")
.select("resolved_url")
@@ -352,7 +352,7 @@ export async function mapController(
mock: req.body.useMock,
filterByPath: req.body.filterByPath !== false,
flags: req.acuc?.flags ?? null,
ignoreIndex: req.body.ignoreIndex,
useIndex: req.body.useIndex,
}),
...(req.body.timeout !== undefined ? [
new Promise((resolve, reject) => setTimeout(() => {
@@ -310,7 +310,7 @@ const baseScrapeOptions = z
blockAds: z.boolean().default(true),
proxy: z.enum(["basic", "stealth", "auto"]).optional(),
maxAge: z.number().int().gte(0).safe().default(0),
dontStoreInCache: z.boolean().default(false),
storeInCache: z.boolean().default(true),
})
.strict(strictMessage);

@@ -658,7 +658,7 @@ export const mapRequestSchema = crawlerOptions
timeout: z.number().positive().finite().optional(),
useMock: z.string().optional(),
filterByPath: z.boolean().default(true),
ignoreIndex: z.boolean().default(false),
useIndex: z.boolean().default(true),
})
.strict(strictMessage);

@@ -87,7 +87,7 @@ import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
import { RateLimiterMode } from "../types";
import { redisEvictConnection } from "./redis";
import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index";
import { val } from "node_modules/cheerio/lib/api/attributes";
import { WebCrawler } from "../scraper/WebScraper/crawler";

configDotenv();

@@ -912,6 +912,38 @@ const workerFun = async (
}
};

async function kickoffGetIndexLinks(sc: StoredCrawl, crawler: WebCrawler, url: string) {
const trimmedURL = new URL(url);
trimmedURL.search = "";

const urlSplits = generateURLSplits(trimmedURL.href).map(x => hashURL(x));

const index = (sc.crawlerOptions.ignoreSitemap || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true" || !useIndex)
? []
: sc.crawlerOptions.allowBackwardCrawling
? (await index_supabase_service
.from("index")
.select("resolved_url")
.eq("url_split_0_hash", urlSplits[0])
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
.limit(sc.crawlerOptions.limit ?? 100)).data ?? []
: (await index_supabase_service
.from("index")
.select("resolved_url")
.eq("url_split_" + (urlSplits.length - 1) + "_hash", urlSplits[urlSplits.length - 1])
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
.limit(sc.crawlerOptions.limit ?? 100)).data ?? [];

const validIndexLinks = crawler.filterLinks(
[...new Set(index.map(x => x.resolved_url))].filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
sc.crawlerOptions.limit ?? 100,
sc.crawlerOptions.maxDepth ?? 10,
false,
);

return validIndexLinks;
}

async function processKickoffJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
@@ -1029,37 +1061,11 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
});
}

const trimmedURL = new URL(job.data.url);
trimmedURL.search = "";
const indexLinks = await kickoffGetIndexLinks(sc, crawler, job.data.url);

const urlSplits = generateURLSplits(trimmedURL.href).map(x => hashURL(x));

const index = (sc.crawlerOptions.ignoreSitemap || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true" || !useIndex)
? []
: sc.crawlerOptions.allowBackwardCrawling
? (await index_supabase_service
.from("index")
.select("resolved_url")
.eq("url_split_0_hash", urlSplits[0])
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
.limit(sc.crawlerOptions.limit ?? 100)).data ?? []
: (await index_supabase_service
.from("index")
.select("resolved_url")
.eq("url_split_" + (urlSplits.length - 1) + "_hash", urlSplits[urlSplits.length - 1])
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
.limit(sc.crawlerOptions.limit ?? 100)).data ?? [];

const validIndexLinks = crawler.filterLinks(
[...new Set(index.map(x => x.resolved_url))].filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
sc.crawlerOptions.limit ?? 100,
sc.crawlerOptions.maxDepth ?? 10,
false,
);

if (validIndexLinks.length > 0) {
logger.debug("Using index links of length " + validIndexLinks.length, {
indexLinksLength: validIndexLinks.length,
if (indexLinks.length > 0) {
logger.debug("Using index links of length " + indexLinks.length, {
indexLinksLength: indexLinks.length,
});

let jobPriority = await getJobPriority({
@@ -1068,7 +1074,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
});
logger.debug("Using job priority " + jobPriority, { jobPriority });

const jobs = validIndexLinks.map((url) => {
const jobs = indexLinks.map((url) => {
const uuid = uuidv4();
return {
name: uuid,
@@ -120,7 +120,7 @@ export interface CrawlScrapeOptions {
removeBase64Images?: boolean;
blockAds?: boolean;
proxy?: "basic" | "stealth" | "auto";
dontStoreInCache?: boolean;
storeInCache?: boolean;
}

export type Action = {
@@ -1,539 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "51331c01",
"metadata": {},
"source": [
"# Index Benchmark"
]
},
{
"cell_type": "markdown",
"id": "9cb10752",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "7928e2c9",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "a64ec8a0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from firecrawl import FirecrawlApp, ScrapeOptions\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from datetime import datetime\n",
"import statistics\n",
"import requests\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "bc7ce797",
"metadata": {},
"outputs": [],
"source": [
"STAGING_API_URL = 'http://default-firecrawl-app-staging-service:3002/' # os.getenv(\"STAGING_API_URL\") or None\n",
"\n",
"if STAGING_API_URL is None:\n",
" raise ValueError(\"STAGING_API_URL is not set\")\n",
" \n",
"app = FirecrawlApp(api_url=STAGING_API_URL, api_key='no-auth')"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "a3db6548",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'http://default-firecrawl-app-staging-service:3002/'"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"STAGING_API_URL"
]
},
{
"cell_type": "code",
"execution_count": 141,
"id": "440f7c2d",
"metadata": {},
"outputs": [],
"source": [
"scrape_urls=[\n",
" 'https://news.ycombinator.com', # - Hacker News (simple, fast-loading)\n",
" 'https://httpbin.org', # - HTTP testing service (very reliable)\n",
" 'https://example.com', # - Standard test domain (minimal content)\n",
" 'https://github.com/microsoft/vscode', # - GitHub repo page (structured content)\n",
" 'https://stackoverflow.com/questions', # - Stack Overflow questions page\n",
" 'https://www.wikipedia.org', # - Wikipedia main page (rich content)\n",
" 'https://jsonplaceholder.typicode.com', # - Fake API for testing\n",
" 'https://httpstat.us/200', # - HTTP status testing (minimal response)\n",
" 'https://www.reddit.com/r/programming', # - Reddit programming subreddit\n",
" 'https://docs.python.org/3/' # - Python documentation (structured docs)\n",
"]\n",
"\n",
"\n",
"crawl_urls = [\n",
" \"https://www.pcbgogo.com/*\", # 7825\n",
" \"https://github.com/Uniswap/v4-core/*\", # 7353\n",
" \"http://arcep.fr/actualites/*\", # 9764\n",
" \"https://www.synapticure.com/*\", # 7746\n",
" \"https://www.elecrow.com/*\", # 8025\n",
" \"https://idfcfirstbank.com/*\", # 9912\n",
" \"https://www.todaytix.com/*\", # 7532\n",
" \"https://www.wheel-size.com/size/*\", # 7102\n",
" \"http://www.drymerge.com/*\", # 8422\n",
" \"https://telegramindex.org/*\" # 5335\n",
"]\n",
"\n",
"\n",
"map_urls = []\n",
"for i in crawl_urls:\n",
" map_urls.append(i.replace('*',''))"
]
},
{
"cell_type": "markdown",
"id": "e54e6677",
"metadata": {},
"source": [
"## Scrape"
]
},
{
"cell_type": "markdown",
"id": "3fed4cb6",
"metadata": {},
"source": [
"Hypothesis: Indexed scrapes are faster"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "fb052d01",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing 1/10: https://news.ycombinator.com\n",
" No cache: 2.39s\n",
" Cached: 0.93s\n",
"Testing 2/10: https://httpbin.org\n",
" No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
" Cached: 4.79s\n",
"Testing 3/10: https://example.com\n",
" No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
" Cached: 17.33s\n",
"Testing 4/10: https://github.com/microsoft/vscode\n",
" No cache: 2.81s\n",
" Cached: 1.06s\n",
"Testing 5/10: https://stackoverflow.com/questions\n",
" No cache: 2.92s\n",
" Cached: 1.16s\n",
"Testing 6/10: https://www.wikipedia.org\n",
" No cache: 3.53s\n",
" Cached: 0.78s\n",
"Testing 7/10: https://jsonplaceholder.typicode.com\n",
" No cache: 3.54s\n",
" Cached: 0.80s\n",
"Testing 8/10: https://httpstat.us/200\n",
" No cache: 10.79s\n",
" Cached: 0.54s\n",
"Testing 9/10: https://www.reddit.com/r/programming\n",
" No cache: 4.53s\n",
" Cached: 0.79s\n",
"Testing 10/10: https://docs.python.org/3/\n",
" No cache: 3.95s\n",
" Cached: 0.53s\n"
]
}
],
"source": [
"scrape_times_no_cache = []\n",
"scrape_times_cached = []\n",
"\n",
"for i, url in enumerate(scrape_urls): # Test first 5 URLs\n",
" print(f\"Testing {i+1}/{len(scrape_urls)}: {url}\")\n",
" \n",
" # No cache (maxAge=1)\n",
" try:\n",
" start = datetime.now()\n",
" doc = app.scrape_url(url, maxAge=1)\n",
" no_cache_time = (datetime.now() - start).total_seconds()\n",
" scrape_times_no_cache.append(no_cache_time)\n",
" print(f\" No cache: {no_cache_time:.2f}s\")\n",
" except Exception as e:\n",
" print(f\" No cache: FAILED - {e}\")\n",
" scrape_times_no_cache.append(None)\n",
" \n",
" # Cached (maxAge=100000)\n",
" try:\n",
" start = datetime.now()\n",
" doc = app.scrape_url(url, maxAge=100000)\n",
" cached_time = (datetime.now() - start).total_seconds()\n",
" scrape_times_cached.append(cached_time)\n",
" print(f\" Cached: {cached_time:.2f}s\")\n",
" except Exception as e:\n",
" print(f\" Cached: FAILED - {e}\")\n",
" scrape_times_cached.append(None)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "7dce8a83",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SCRAPE RESULTS:\n",
"Average no cache: 4.31s\n",
"Average cached: 2.87s\n",
"Speedup: 1.5x faster with cache\n",
"Time saved: 1.44s per request\n"
]
}
],
"source": [
"# Calculate averages\n",
"valid_no_cache = [t for t in scrape_times_no_cache if t is not None]\n",
"valid_cached = [t for t in scrape_times_cached if t is not None]\n",
"\n",
"if valid_no_cache and valid_cached:\n",
" avg_no_cache = statistics.mean(valid_no_cache)\n",
" avg_cached = statistics.mean(valid_cached)\n",
" speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
" \n",
" print(\"SCRAPE RESULTS:\")\n",
" print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
" print(f\"Average cached: {avg_cached:.2f}s\")\n",
" print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
" print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per request\")"
]
},
{
"cell_type": "markdown",
"id": "cc682ba4",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"id": "df801504",
"metadata": {},
"source": [
"## Crawl"
]
},
{
"cell_type": "markdown",
"id": "658374d7",
"metadata": {},
"source": [
"--- for now used to improve map "
]
},
{
"cell_type": "markdown",
"id": "5628c39d",
"metadata": {},
"source": [
"Hypothesis: Indexed crawls are faster"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "1482e163",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Crawling 1/10: https://www.pcbgogo.com/*\n",
"Crawling 2/10: https://github.com/Uniswap/v4-core/*\n",
"Crawling 3/10: http://arcep.fr/actualites/*\n",
"Crawling 4/10: https://www.synapticure.com/*\n",
"Crawling 5/10: https://www.elecrow.com/*\n",
"Crawling 6/10: https://idfcfirstbank.com/*\n",
"Crawling 7/10: https://www.todaytix.com/*\n",
"Crawling 8/10: https://www.wheel-size.com/size/*\n",
"Crawling 9/10: http://www.drymerge.com/*\n",
"h - Crawl FAILED - HTTPConnectionPool(host='default-firecrawl-app-staging-service', port=3002): Max retries exceeded with url: /v1/crawl/31485d6f-ceeb-458b-b010-f5efc3286a0e (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1195ac0d0>: Failed to establish a new connection: [Errno 65] No route to host'))\n",
"Crawling 10/10: https://telegramindex.org/*\n"
]
}
],
"source": [
"crawl_times_no_cache = []\n",
"crawl_times_cached = []\n",
"\n",
"for i, url in enumerate(crawl_urls):\n",
" try:\n",
" print(f\"Crawling {i+1}/{len(crawl_urls)}: {url}\")\n",
" result = app.crawl_url(url)\n",
" except Exception as e:\n",
" print(f\"{url[0]} - Crawl FAILED - {e}\")"
]
},
{
"cell_type": "markdown",
"id": "7eb27685",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"id": "abe3f30e",
"metadata": {},
"source": [
"## Map"
]
},
{
"cell_type": "markdown",
"id": "683c74da",
"metadata": {},
"source": [
"Hypothesis: Indexed Map should get more urls after crawl"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "1450bf4c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'http://default-firecrawl-app-staging-service:3002/'"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"STAGING_API_URL"
]
},
{
"cell_type": "code",
"execution_count": 144,
"id": "f8d79207",
"metadata": {},
"outputs": [],
"source": [
"def map_request(url, ignore_index):\n",
" \"\"\"\n",
" Make a map request and return the links\n",
" \"\"\"\n",
" payload = {\"url\": url, \"ignoreIndex\": ignore_index, \"limit\": 10000}\n",
" headers = {'Content-Type': 'application/json', \"Authorization\": \"Bearer no-auth\"}\n",
" response = requests.post(STAGING_API_URL + \"v1/map\", headers=headers, json=payload)\n",
" \n",
" if response.status_code == 200:\n",
" data = response.json()\n",
" return data.get('links', [])\n",
" else:\n",
" return []"
]
},
{
"cell_type": "code",
"execution_count": 147,
"id": "a74da0a5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing 1/10: https://www.pcbgogo.com/\n",
" No index: 1.44s, 664 URLs\n",
" With index: 1.70s, 664 URLs\n",
"Testing 2/10: https://github.com/Uniswap/v4-core/\n",
" No index: 0.72s, 235 URLs\n",
" With index: 0.75s, 235 URLs\n",
"Testing 3/10: http://arcep.fr/actualites/\n",
" No index: 6.23s, 8059 URLs\n",
" With index: 6.71s, 9048 URLs\n",
"Testing 4/10: https://www.synapticure.com/\n",
" No index: 0.72s, 365 URLs\n",
" With index: 0.94s, 365 URLs\n",
"Testing 5/10: https://www.elecrow.com/\n",
" No index: 4.03s, 2747 URLs\n",
" With index: 4.13s, 2747 URLs\n",
"Testing 6/10: https://idfcfirstbank.com/\n",
" No index: 2.62s, 279 URLs\n",
" With index: 4.02s, 279 URLs\n",
"Testing 7/10: https://www.todaytix.com/\n",
" No index: 4.53s, 8933 URLs\n",
" With index: 3.63s, 8933 URLs\n",
"Testing 8/10: https://www.wheel-size.com/size/\n",
" No index: 10.89s, 10000 URLs\n",
" With index: 10.73s, 10000 URLs\n",
"Testing 9/10: http://www.drymerge.com/\n",
" No index: 32.94s, 3602 URLs\n",
" With index: 37.07s, 3602 URLs\n",
"Testing 10/10: https://telegramindex.org/\n",
" No index: 0.98s, 2 URLs\n",
" With index: 1.19s, 2 URLs\n"
]
}
],
"source": [
"map_times_no_cache = []\n",
"map_times_cached = []\n",
"map_url_counts_no_cache = []\n",
"map_url_counts_cached = []\n",
"\n",
"for i, url in enumerate(map_urls):\n",
" print(f\"Testing {i+1}/{len(map_urls)}: {url}\")\n",
" \n",
" # No index (ignoreIndex=True)\n",
" start = datetime.now()\n",
" links_no_index = map_request(url, True)\n",
" time_no_index = (datetime.now() - start).total_seconds()\n",
" \n",
" map_times_no_cache.append(time_no_index)\n",
" map_url_counts_no_cache.append(len(links_no_index))\n",
" print(f\" No index: {time_no_index:.2f}s, {len(links_no_index)} URLs\")\n",
" \n",
" # With index (ignoreIndex=False)\n",
" start = datetime.now()\n",
" links_indexed = map_request(url, False)\n",
" time_indexed = (datetime.now() - start).total_seconds()\n",
" \n",
" map_times_cached.append(time_indexed)\n",
" map_url_counts_cached.append(len(links_indexed))\n",
" print(f\" With index: {time_indexed:.2f}s, {len(links_indexed)} URLs\")"
]
},
{
"cell_type": "code",
"execution_count": 149,
"id": "2fa88f5d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAP RESULTS:\n",
"Average time (no cache): 6.51s\n",
"Average time (cached): 7.09s\n",
"Time speedup: 0.92x faster with cache\n",
"Average URLs found (no cache): 3488.6\n",
"Average URLs found (cached): 3587.5\n",
"URL difference: +98.9 URLs with cache\n",
"URL percentage: 102.8% of no-cache results\n",
"✅ Cache finds MORE URLs\n"
]
}
],
"source": [
"# Calculate averages\n",
"avg_time_no_cache = statistics.mean(map_times_no_cache)\n",
"avg_time_cached = statistics.mean(map_times_cached)\n",
"avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)\n",
"avg_urls_cached = statistics.mean(map_url_counts_cached)\n",
"\n",
"time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0\n",
"url_difference = avg_urls_cached - avg_urls_no_cache\n",
"url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0\n",
"\n",
"print(\"MAP RESULTS:\")\n",
"print(f\"Average time (no cache): {avg_time_no_cache:.2f}s\")\n",
"print(f\"Average time (cached): {avg_time_cached:.2f}s\")\n",
"print(f\"Time speedup: {time_speedup:.2f}x faster with cache\")\n",
"print(f\"Average URLs found (no cache): {avg_urls_no_cache:.1f}\")\n",
"print(f\"Average URLs found (cached): {avg_urls_cached:.1f}\")\n",
"print(f\"URL difference: {url_difference:+.1f} URLs with cache\")\n",
"print(f\"URL percentage: {url_percentage:.1f}% of no-cache results\")\n",
"\n",
"if url_difference > 0:\n",
" print(\"✅ Cache finds MORE URLs\")\n",
"elif url_difference < 0:\n",
" print(\"⚠️ Cache finds FEWER URLs\")\n",
"else:\n",
" print(\"➡️ Cache finds SAME number of URLs\")"
]
},
{
"cell_type": "markdown",
"id": "e5ee2116",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}