import os

# --- Target deployment --------------------------------------------------------
# Fallback used when STAGING_API_URL is not supplied through the environment
# (or through the .env file read by load_dotenv() in the imports cell).
# NOTE(review): this looks like an in-cluster service name that will not resolve
# from a developer machine - confirm before relying on the default.
DEFAULT_STAGING_API_URL = "http://default-firecrawl-app-staging-service:3002/"

# Prefer the environment so the notebook can be pointed at another deployment
# without editing code; `or` also treats an empty variable as "not set".
STAGING_API_URL = os.getenv("STAGING_API_URL") or DEFAULT_STAGING_API_URL

if not STAGING_API_URL.startswith(("http://", "https://")):
    raise ValueError(f"STAGING_API_URL must be an http(s) URL, got {STAGING_API_URL!r}")

# The map cell builds endpoints as STAGING_API_URL + "v1/map", so keep exactly
# one trailing slash regardless of how the variable was written.
STAGING_API_URL = STAGING_API_URL.rstrip("/") + "/"


# --- Benchmark targets --------------------------------------------------------
# Crawl patterns: the trailing "*" asks for everything below the given path.
# NOTE(review): the trailing numbers are carried over unchanged from the
# original notebook; they look like page counts per site - confirm.
crawl_urls = [
    "https://www.pcbgogo.com/*",  # 7825
    "https://github.com/Uniswap/v4-core/*",  # 7353
    "http://arcep.fr/actualites/*",  # 9764
    "https://www.synapticure.com/*",  # 7746
    "https://www.elecrow.com/*",  # 8025
    "https://idfcfirstbank.com/*",  # 9912
    "https://www.todaytix.com/*",  # 7532
    "https://www.wheel-size.com/size/*",  # 7102
    "http://www.drymerge.com/*",  # 8422
    "https://telegramindex.org/*",  # 5335
]

# The map requests use the plain base URL. Only the trailing wildcard is
# dropped, so an asterisk elsewhere in a URL would be left untouched.
map_urls = [
    pattern[:-1] if pattern.endswith("*") else pattern
    for pattern in crawl_urls
]
cache: 2.39s\n", + " Cached: 0.93s\n", "Testing 2/10: https://httpbin.org\n", - " No cache: 11.61s\n", - " Cached: 0.59s\n", + " No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n", + " Cached: 4.79s\n", "Testing 3/10: https://example.com\n", " No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n", - " Cached: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n", + " Cached: 17.33s\n", "Testing 4/10: https://github.com/microsoft/vscode\n", - " No cache: 15.73s\n", - " Cached: 1.12s\n", + " No cache: 2.81s\n", + " Cached: 1.06s\n", "Testing 5/10: https://stackoverflow.com/questions\n", - " No cache: 2.74s\n", - " Cached: 0.98s\n", + " No cache: 2.92s\n", + " Cached: 1.16s\n", "Testing 6/10: https://www.wikipedia.org\n", - " No cache: 3.84s\n", - " Cached: 0.58s\n", + " No cache: 3.53s\n", + " Cached: 0.78s\n", "Testing 7/10: https://jsonplaceholder.typicode.com\n", - " No cache: 4.09s\n", - " Cached: 0.81s\n", + " No cache: 3.54s\n", + " Cached: 0.80s\n", "Testing 8/10: https://httpstat.us/200\n", - " No cache: 10.84s\n", - " Cached: 0.56s\n", + " No cache: 10.79s\n", + " Cached: 0.54s\n", "Testing 9/10: https://www.reddit.com/r/programming\n", - " No cache: 3.63s\n", - " Cached: 0.92s\n", + " No cache: 4.53s\n", + " Cached: 0.79s\n", "Testing 10/10: https://docs.python.org/3/\n", - " No cache: 1.60s\n", - " Cached: 0.62s\n" + " No cache: 3.95s\n", + " Cached: 0.53s\n" ] } ], @@ -195,8 +217,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "id": "ef9cf1cb", + "execution_count": 86, + "id": "7dce8a83", "metadata": {}, "outputs": [ { @@ -204,10 +226,10 @@ "output_type": "stream", "text": [ "SCRAPE RESULTS:\n", - "Average no cache: 6.25s\n", - "Average cached: 0.80s\n", - "Speedup: 7.8x faster with 
# Crawl every target once. Per the section notes, the crawl is currently only
# used so that the Map comparison further down can draw on previously crawled
# pages; no crawl timings are collected here.
crawl_results = {}   # crawl pattern -> object returned by app.crawl_url
crawl_failures = {}  # crawl pattern -> error text for crawls that raised

for position, url in enumerate(crawl_urls, start=1):
    print(f"Crawling {position}/{len(crawl_urls)}: {url}")
    try:
        crawl_results[url] = app.crawl_url(url)
    except Exception as e:
        # Best effort: one unreachable site must not abort the remaining crawls.
        crawl_failures[url] = str(e)
        # `url` is a plain string, so print it whole; indexing it with [0]
        # (a leftover from when entries were lists) printed only "h".
        print(f"{url} - Crawl FAILED - {e}")

# Make failures impossible to miss: a site whose crawl failed cannot support or
# refute the "map finds more URLs after crawl" hypothesis.
if crawl_failures:
    print(f"{len(crawl_failures)}/{len(crawl_urls)} crawls failed:")
    for failed_url in crawl_failures:
        print(f"  {failed_url}")
def map_request(url, ignore_index, limit=10000, base_url=None, timeout=120, session=None):
    """
    Make a map request and return the links.

    Args:
        url: Site URL to map.
        ignore_index: Forwarded as the ``ignoreIndex`` field of the request body.
        limit: Forwarded as the ``limit`` field (10000 was previously hard-coded).
        base_url: API root to call. Defaults to the notebook-level
            ``STAGING_API_URL``; pass another root to compare deployments.
        timeout: Seconds handed to ``post`` so a stalled server cannot hang the
            notebook indefinitely.
        session: Optional object with a requests-compatible ``post`` method
            (for example a ``requests.Session``). Defaults to the ``requests``
            module.

    Returns:
        The list under the ``links`` key of the JSON response. An empty list is
        returned when the key is missing or null, or when the response status
        is not 200 (in which case a warning line is printed).

    Raises:
        Whatever ``post`` or ``response.json()`` raise (connection errors,
        timeouts, invalid JSON); these are deliberately not swallowed.
    """
    api_root = STAGING_API_URL if base_url is None else base_url
    # Join with exactly one slash so the root works with or without a trailing "/".
    endpoint = api_root.rstrip("/") + "/v1/map"
    http = requests if session is None else session

    payload = {"url": url, "ignoreIndex": ignore_index, "limit": limit}
    headers = {"Content-Type": "application/json", "Authorization": "Bearer no-auth"}
    response = http.post(endpoint, headers=headers, json=payload, timeout=timeout)

    if response.status_code != 200:
        # Still best effort (callers expect a list), but say so: a silent []
        # is indistinguishable from "the site has no URLs" in the benchmark.
        print(f"  map request for {url} failed with HTTP {response.status_code}")
        return []

    # `or []` also covers an explicit null, which .get's default would not.
    return response.json().get("links") or []
def timed_map_request(url, ignore_index):
    """Run one map_request call and return ``(elapsed_seconds, links)``."""
    started = datetime.now()
    links = map_request(url, ignore_index)
    elapsed = (datetime.now() - started).total_seconds()
    return elapsed, links


# --- Run the benchmark --------------------------------------------------------
# Parallel lists, one entry per URL in map_urls. "no_cache" holds the
# ignoreIndex=True runs, "cached" holds the ignoreIndex=False runs.
map_times_no_cache = []
map_times_cached = []
map_url_counts_no_cache = []
map_url_counts_cached = []

for i, url in enumerate(map_urls):
    print(f"Testing {i+1}/{len(map_urls)}: {url}")

    # No index (ignoreIndex=True)
    time_no_index, links_no_index = timed_map_request(url, True)
    map_times_no_cache.append(time_no_index)
    map_url_counts_no_cache.append(len(links_no_index))
    print(f"  No index: {time_no_index:.2f}s, {len(links_no_index)} URLs")

    # With index (ignoreIndex=False)
    time_indexed, links_indexed = timed_map_request(url, False)
    map_times_cached.append(time_indexed)
    map_url_counts_cached.append(len(links_indexed))
    print(f"  With index: {time_indexed:.2f}s, {len(links_indexed)} URLs")


# --- Summarise ----------------------------------------------------------------
if not map_times_no_cache or not map_times_cached:
    # statistics.mean raises StatisticsError on empty input.
    print("MAP RESULTS: nothing was benchmarked (map_urls is empty)")
else:
    avg_time_no_cache = statistics.mean(map_times_no_cache)
    avg_time_cached = statistics.mean(map_times_cached)
    avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)
    avg_urls_cached = statistics.mean(map_url_counts_cached)

    time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0
    url_difference = avg_urls_cached - avg_urls_no_cache
    url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0
    # Averages can be dominated by a single large site, so also count how many
    # sites actually returned more URLs with the index enabled.
    sites_with_more_urls = sum(
        indexed > plain
        for plain, indexed in zip(map_url_counts_no_cache, map_url_counts_cached)
    )

    print("MAP RESULTS:")
    print(f"Average time (no cache): {avg_time_no_cache:.2f}s")
    print(f"Average time (cached): {avg_time_cached:.2f}s")
    # A ratio below 1 means the cached run was slower; say so instead of
    # reporting e.g. "0.92x faster".
    if time_speedup >= 1:
        print(f"Time speedup: {time_speedup:.2f}x faster with cache")
    elif time_speedup > 0:
        print(f"Time speedup: {1 / time_speedup:.2f}x slower with cache")
    else:
        print("Time speedup: n/a")
    print(f"Average URLs found (no cache): {avg_urls_no_cache:.1f}")
    print(f"Average URLs found (cached): {avg_urls_cached:.1f}")
    print(f"URL difference: {url_difference:+.1f} URLs with cache")
    print(f"URL percentage: {url_percentage:.1f}% of no-cache results")
    print(f"Sites with more URLs when cached: {sites_with_more_urls}/{len(map_url_counts_cached)}")

    if url_difference > 0:
        print("✅ Cache finds MORE URLs")
    elif url_difference < 0:
        print("⚠️ Cache finds FEWER URLs")
    else:
        print("➡️ Cache finds SAME number of URLs")