diff --git a/apps/test-suite/index-benchmark/run.ipynb b/apps/test-suite/index-benchmark/run.ipynb
new file mode 100644
index 00000000..7b8441fe
--- /dev/null
+++ b/apps/test-suite/index-benchmark/run.ipynb
@@ -0,0 +1,447 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "74281828",
+   "metadata": {},
+   "source": [
+    "# Index Benchmark"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "099ea204",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "4eae0ba5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from firecrawl import FirecrawlApp, ScrapeOptions\n",
+    "import os\n",
+    "from dotenv import load_dotenv\n",
+    "from datetime import datetime\n",
+    "import statistics\n",
+    "\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "cb1f50fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "STAGING_API_URL = os.getenv(\"STAGING_API_URL\") or None\n",
+    "\n",
+    "if STAGING_API_URL is None:\n",
+    "    raise ValueError(\"STAGING_API_URL is not set\")\n",
+    "\n",
+    "app = FirecrawlApp(api_url=STAGING_API_URL, api_key='no-auth')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "038c04b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scrape_urls = [\n",
+    "    'https://news.ycombinator.com',  # Hacker News (simple, fast-loading)\n",
+    "    'https://httpbin.org',  # HTTP testing service (very reliable)\n",
+    "    'https://example.com',  # standard test domain (minimal content)\n",
+    "    'https://github.com/microsoft/vscode',  # GitHub repo page (structured content)\n",
+    "    'https://stackoverflow.com/questions',  # Stack Overflow questions page\n",
+    "    'https://www.wikipedia.org',  # Wikipedia main page (rich content)\n",
+    "    'https://jsonplaceholder.typicode.com',  # fake API for testing\n",
+    "    'https://httpstat.us/200',  # HTTP status testing (minimal response)\n",
+    "    'https://www.reddit.com/r/programming',  # Reddit programming subreddit\n",
+    "    'https://docs.python.org/3/'  # Python documentation (structured docs)\n",
+    "]\n",
+    "\n",
+    "# Each entry is a single-item list holding a site pattern; the test\n",
+    "# loops below derive the base URL from it.\n",
+    "crawl_urls = [\n",
+    "    [\"docs.python.org/3/tutorial/*\"],\n",
+    "    [\"httpbin.org/*\"],\n",
+    "    [\"example.com/*\"],\n",
+    "    [\"jsonplaceholder.typicode.com/*\"],\n",
+    "    [\"httpstat.us/*\"],\n",
+    "    [\"postman-echo.com/*\"],\n",
+    "    [\"docs.github.com/*\"],\n",
+    "    [\"news.ycombinator.com/*\"],\n",
+    "    [\"dev.to/*\"],\n",
+    "    [\"reqres.in/*\"]\n",
+    "]\n",
+    "\n",
+    "map_urls = crawl_urls"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d83baad5",
+   "metadata": {},
+   "source": [
+    "## Scrape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b83fa3c0",
+   "metadata": {},
+   "source": [
+    "Hypothesis: indexed scrapes are faster."
+   ]
+  },
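+  {
+   "cell_type": "markdown",
+   "id": "1a2b3c4d",
+   "metadata": {},
+   "source": [
+    "Quick sanity check of the cache mechanics before the full loop, assuming `maxAge` is interpreted in milliseconds (per the Firecrawl docs): `maxAge=1` effectively forces a fresh scrape, while `maxAge=100000` (100s) lets a recent index entry be served. The cell below is a sketch of that pattern on a single URL."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b3c4d5e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: scrape the same URL twice and compare timings.\n",
+    "# maxAge=1 (ms) should bypass the index; maxAge=100000 should let the\n",
+    "# second request be served from the index and return much faster.\n",
+    "probe_url = 'https://news.ycombinator.com'\n",
+    "\n",
+    "start = datetime.now()\n",
+    "app.scrape_url(probe_url, maxAge=1)\n",
+    "fresh = (datetime.now() - start).total_seconds()\n",
+    "\n",
+    "start = datetime.now()\n",
+    "app.scrape_url(probe_url, maxAge=100000)\n",
+    "cached = (datetime.now() - start).total_seconds()\n",
+    "\n",
+    "print(f\"fresh: {fresh:.2f}s, cached: {cached:.2f}s\")"
+   ]
+  },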
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "d846b0f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Testing 1/10: https://news.ycombinator.com\n",
+      " No cache: 2.15s\n",
+      " Cached: 0.98s\n",
+      "Testing 2/10: https://httpbin.org\n",
+      " No cache: 11.61s\n",
+      " Cached: 0.59s\n",
+      "Testing 3/10: https://example.com\n",
+      " No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
+      " Cached: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
+      "Testing 4/10: https://github.com/microsoft/vscode\n",
+      " No cache: 15.73s\n",
+      " Cached: 1.12s\n",
+      "Testing 5/10: https://stackoverflow.com/questions\n",
+      " No cache: 2.74s\n",
+      " Cached: 0.98s\n",
+      "Testing 6/10: https://www.wikipedia.org\n",
+      " No cache: 3.84s\n",
+      " Cached: 0.58s\n",
+      "Testing 7/10: https://jsonplaceholder.typicode.com\n",
+      " No cache: 4.09s\n",
+      " Cached: 0.81s\n",
+      "Testing 8/10: https://httpstat.us/200\n",
+      " No cache: 10.84s\n",
+      " Cached: 0.56s\n",
+      "Testing 9/10: https://www.reddit.com/r/programming\n",
+      " No cache: 3.63s\n",
+      " Cached: 0.92s\n",
+      "Testing 10/10: https://docs.python.org/3/\n",
+      " No cache: 1.60s\n",
+      " Cached: 0.62s\n"
+     ]
+    }
+   ],
+   "source": [
+    "scrape_times_no_cache = []\n",
+    "scrape_times_cached = []\n",
+    "\n",
+    "for i, url in enumerate(scrape_urls):\n",
+    "    print(f\"Testing {i+1}/{len(scrape_urls)}: {url}\")\n",
+    "\n",
+    "    # No cache (maxAge=1 ms effectively bypasses the index)\n",
+    "    try:\n",
+    "        start = datetime.now()\n",
+    "        doc = app.scrape_url(url, maxAge=1)\n",
+    "        no_cache_time = (datetime.now() - start).total_seconds()\n",
+    "        scrape_times_no_cache.append(no_cache_time)\n",
+    "        print(f\" No cache: {no_cache_time:.2f}s\")\n",
+    "    except Exception as e:\n",
+    "        print(f\" No cache: FAILED - {e}\")\n",
+    "        scrape_times_no_cache.append(None)\n",
+    "\n",
+    "    # Cached (maxAge=100000 ms accepts index entries up to 100s old)\n",
+    "    try:\n",
+    "        start = datetime.now()\n",
+    "        doc = app.scrape_url(url, maxAge=100000)\n",
+    "        cached_time = (datetime.now() - start).total_seconds()\n",
+    "        scrape_times_cached.append(cached_time)\n",
+    "        print(f\" Cached: {cached_time:.2f}s\")\n",
+    "    except Exception as e:\n",
+    "        print(f\" Cached: FAILED - {e}\")\n",
+    "        scrape_times_cached.append(None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "ef9cf1cb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SCRAPE RESULTS:\n",
+      "Average no cache: 6.25s\n",
+      "Average cached: 0.80s\n",
+      "Speedup: 7.8x faster with cache\n",
+      "Time saved: 5.45s per request\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Average only the successful runs\n",
+    "valid_no_cache = [t for t in scrape_times_no_cache if t is not None]\n",
+    "valid_cached = [t for t in scrape_times_cached if t is not None]\n",
+    "\n",
+    "if valid_no_cache and valid_cached:\n",
+    "    avg_no_cache = statistics.mean(valid_no_cache)\n",
+    "    avg_cached = statistics.mean(valid_cached)\n",
+    "    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
+    "\n",
+    "    print(\"SCRAPE RESULTS:\")\n",
+    "    print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
+    "    print(f\"Average cached: {avg_cached:.2f}s\")\n",
+    "    print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
+    "    print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per request\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7d75182",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "195d0b0b",
+   "metadata": {},
+   "source": [
+    "## Crawl"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "520b9c80",
+   "metadata": {},
+   "source": [
+    "Note: for now the index is mainly used to improve `/map`, so crawl itself may not benefit directly."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a1961ef",
+   "metadata": {},
+   "source": [
+    "Hypothesis: indexed crawls are faster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9877e32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I think this one only improves /map for now..."
+   ]
+  },
+ "metadata": {}, + "outputs": [], + "source": [ + "# I think this one only improves /map for now..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2a6c751", + "metadata": {}, + "outputs": [], + "source": [ + "crawl_times_no_cache = []\n", + "crawl_times_cached = []\n", + "\n", + "for i, url in enumerate(crawl_urls):\n", + " print(f\"Testing {i+1}/{len(crawl_urls)}: {url}\")\n", + " \n", + " # No cache\n", + " try:\n", + " start = datetime.now()\n", + " result = app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=1), limit=5)\n", + " no_cache_time = (datetime.now() - start).total_seconds()\n", + " crawl_times_no_cache.append(no_cache_time)\n", + " print(f\" No cache: {no_cache_time:.2f}s\")\n", + " except Exception as e:\n", + " print(f\" No cache: FAILED - {e}\")\n", + " crawl_times_no_cache.append(None)\n", + " \n", + " # Cached\n", + " try:\n", + " start = datetime.now()\n", + " result = app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=100000), limit=5)\n", + " cached_time = (datetime.now() - start).total_seconds()\n", + " crawl_times_cached.append(cached_time)\n", + " print(f\" Cached: {cached_time:.2f}s\")\n", + " except Exception as e:\n", + " print(f\" Cached: FAILED - {e}\")\n", + " crawl_times_cached.append(None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c96dddc6", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate averages\n", + "valid_no_cache = [t for t in crawl_times_no_cache if t is not None]\n", + "valid_cached = [t for t in crawl_times_cached if t is not None]\n", + "\n", + "if valid_no_cache and valid_cached:\n", + " avg_no_cache = statistics.mean(valid_no_cache)\n", + " avg_cached = statistics.mean(valid_cached)\n", + " speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n", + " \n", + " print(\"CRAWL RESULTS:\")\n", + " print(f\"Average no cache: {avg_no_cache:.2f}s\")\n", + " print(f\"Average cached: {avg_cached:.2f}s\")\n", + " print(f\"Speedup: {speedup:.1f}x faster with cache\")\n", + " print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per crawl\")" + ] + }, + { + "cell_type": "markdown", + "id": "c9c5d28f", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "290752bb", + "metadata": {}, + "source": [ + "## Map" + ] + }, + { + "cell_type": "markdown", + "id": "efe67e8e", + "metadata": {}, + "source": [ + "Hypothesis: Indexed Map should get more urls" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "eb9ac2cb", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: use crawl urls, compare with prod" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9211270", + "metadata": {}, + "outputs": [], + "source": [ + "map_times = []\n", + "map_url_counts = []\n", + "\n", + "for i, url in enumerate(map_urls):\n", + " print(f\"Testing {i+1}/{len(map_urls)}: {url}\")\n", + " \n", + " try:\n", + " start = datetime.now()\n", + " result = app.map_url(url)\n", + " map_time = (datetime.now() - start).total_seconds()\n", + " url_count = len(result.links) if hasattr(result, 'links') else 0\n", + " \n", + " map_times.append(map_time)\n", + " map_url_counts.append(url_count)\n", + " \n", + " print(f\" Time: {map_time:.2f}s\")\n", + " print(f\" URLs found: {url_count}\")\n", + " except Exception as e:\n", + " print(f\" FAILED - {e}\")\n", + " map_times.append(None)\n", + " map_url_counts.append(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b782e07b", + "metadata": {}, + "outputs": [], 
+ "source": [ + "# Calculate averages\n", + "valid_times = [t for t in map_times if t is not None]\n", + "valid_counts = [c for c in map_url_counts if c > 0]\n", + "\n", + "if valid_times:\n", + " avg_time = statistics.mean(valid_times)\n", + " avg_urls = statistics.mean(valid_counts) if valid_counts else 0\n", + " \n", + " print(\"MAP RESULTS:\")\n", + " print(f\"Average time: {avg_time:.2f}s\")\n", + " print(f\"Average URLs found: {avg_urls:.1f}\")\n", + " print(f\"URLs per second: {avg_urls/avg_time:.1f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e3cf02db", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}