feat/added benchmark for scrapes
This commit is contained in:
parent 99d3db743d
commit 22c7685239
447 apps/test-suite/index-benchmark/run.ipynb (new file)
@@ -0,0 +1,447 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "74281828",
   "metadata": {},
   "source": [
    "# Index Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d152695",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "099ea204",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "4eae0ba5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from firecrawl import FirecrawlApp, ScrapeOptions\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from datetime import datetime\n",
    "import statistics\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "cb1f50fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "STAGING_API_URL = os.getenv(\"STAGING_API_URL\") or None\n",
    "\n",
    "if STAGING_API_URL is None:\n",
    "    raise ValueError(\"STAGING_API_URL is not set\")\n",
    "\n",
    "app = FirecrawlApp(api_url=STAGING_API_URL, api_key='no-auth')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "038c04b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "scrape_urls=[\n",
    "    'https://news.ycombinator.com', # - Hacker News (simple, fast-loading)\n",
    "    'https://httpbin.org', # - HTTP testing service (very reliable)\n",
    "    'https://example.com', # - Standard test domain (minimal content)\n",
    "    'https://github.com/microsoft/vscode', # - GitHub repo page (structured content)\n",
    "    'https://stackoverflow.com/questions', # - Stack Overflow questions page\n",
    "    'https://www.wikipedia.org', # - Wikipedia main page (rich content)\n",
    "    'https://jsonplaceholder.typicode.com', # - Fake API for testing\n",
    "    'https://httpstat.us/200', # - HTTP status testing (minimal response)\n",
    "    'https://www.reddit.com/r/programming', # - Reddit programming subreddit\n",
    "    'https://docs.python.org/3/' # - Python documentation (structured docs)\n",
    "]\n",
    "\n",
    "\n",
    "crawl_urls = [\n",
    "    [\"docs.python.org/3/tutorial/*\"],\n",
    "    [\"httpbin.org/*\"],\n",
    "    [\"example.com/*\"],\n",
    "    [\"jsonplaceholder.typicode.com/*\"],\n",
    "    [\"httpstat.us/*\"],\n",
    "    [\"postman-echo.com/*\"],\n",
    "    [\"docs.github.com/*\"],\n",
    "    [\"news.ycombinator.com/*\"],\n",
    "    [\"dev.to/*\"],\n",
    "    [\"reqres.in/*\"]\n",
    "]\n",
    "\n",
    "\n",
    "map_urls=crawl_urls"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d83baad5",
   "metadata": {},
   "source": [
    "## Scrape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b83fa3c0",
   "metadata": {},
   "source": [
    "Hypothesis: Indexed scrapes are faster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "d846b0f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing 1/10: https://news.ycombinator.com\n",
      "  No cache: 2.15s\n",
      "  Cached: 0.98s\n",
      "Testing 2/10: https://httpbin.org\n",
      "  No cache: 11.61s\n",
      "  Cached: 0.59s\n",
      "Testing 3/10: https://example.com\n",
      "  No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
      "  Cached: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
      "Testing 4/10: https://github.com/microsoft/vscode\n",
      "  No cache: 15.73s\n",
      "  Cached: 1.12s\n",
      "Testing 5/10: https://stackoverflow.com/questions\n",
      "  No cache: 2.74s\n",
      "  Cached: 0.98s\n",
      "Testing 6/10: https://www.wikipedia.org\n",
      "  No cache: 3.84s\n",
      "  Cached: 0.58s\n",
      "Testing 7/10: https://jsonplaceholder.typicode.com\n",
      "  No cache: 4.09s\n",
      "  Cached: 0.81s\n",
      "Testing 8/10: https://httpstat.us/200\n",
      "  No cache: 10.84s\n",
      "  Cached: 0.56s\n",
      "Testing 9/10: https://www.reddit.com/r/programming\n",
      "  No cache: 3.63s\n",
      "  Cached: 0.92s\n",
      "Testing 10/10: https://docs.python.org/3/\n",
      "  No cache: 1.60s\n",
      "  Cached: 0.62s\n"
     ]
    }
   ],
   "source": [
    "scrape_times_no_cache = []\n",
    "scrape_times_cached = []\n",
    "\n",
    "for i, url in enumerate(scrape_urls):  # Test every scrape URL\n",
    "    print(f\"Testing {i+1}/{len(scrape_urls)}: {url}\")\n",
    "\n",
    "    # No cache (maxAge=1)\n",
    "    try:\n",
    "        start = datetime.now()\n",
    "        doc = app.scrape_url(url, maxAge=1)\n",
    "        no_cache_time = (datetime.now() - start).total_seconds()\n",
    "        scrape_times_no_cache.append(no_cache_time)\n",
    "        print(f\"  No cache: {no_cache_time:.2f}s\")\n",
    "    except Exception as e:\n",
    "        print(f\"  No cache: FAILED - {e}\")\n",
    "        scrape_times_no_cache.append(None)\n",
    "\n",
    "    # Cached (maxAge=100000)\n",
    "    try:\n",
    "        start = datetime.now()\n",
    "        doc = app.scrape_url(url, maxAge=100000)\n",
    "        cached_time = (datetime.now() - start).total_seconds()\n",
    "        scrape_times_cached.append(cached_time)\n",
    "        print(f\"  Cached: {cached_time:.2f}s\")\n",
    "    except Exception as e:\n",
    "        print(f\"  Cached: FAILED - {e}\")\n",
    "        scrape_times_cached.append(None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "ef9cf1cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SCRAPE RESULTS:\n",
      "Average no cache: 6.25s\n",
      "Average cached: 0.80s\n",
      "Speedup: 7.8x faster with cache\n",
      "Time saved: 5.45s per request\n"
     ]
    }
   ],
   "source": [
    "# Calculate averages\n",
    "valid_no_cache = [t for t in scrape_times_no_cache if t is not None]\n",
    "valid_cached = [t for t in scrape_times_cached if t is not None]\n",
    "\n",
    "if valid_no_cache and valid_cached:\n",
    "    avg_no_cache = statistics.mean(valid_no_cache)\n",
    "    avg_cached = statistics.mean(valid_cached)\n",
    "    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
    "\n",
    "    print(\"SCRAPE RESULTS:\")\n",
    "    print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
    "    print(f\"Average cached: {avg_cached:.2f}s\")\n",
    "    print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
    "    print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per request\")"
   ]
  },
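  {
   "cell_type": "markdown",
   "id": "scrape-speedup-note",
   "metadata": {},
   "source": [
    "A small follow-up sketch (not run): per-URL speedups and their median, which are less skewed by a single slow no-cache request (e.g. httpbin.org and httpstat.us above) than the plain averages. It only uses the lists collected by the benchmark cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "scrape-speedup-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: per-URL speedups and their median, as a robustness check on the averages above.\n",
    "pairs = [\n",
    "    (url, no_cache, cached)\n",
    "    for url, no_cache, cached in zip(scrape_urls, scrape_times_no_cache, scrape_times_cached)\n",
    "    if no_cache is not None and cached is not None and cached > 0\n",
    "]\n",
    "speedups = [no_cache / cached for _, no_cache, cached in pairs]\n",
    "\n",
    "for (url, no_cache, cached), s in zip(pairs, speedups):\n",
    "    print(f\"{url}: {no_cache:.2f}s -> {cached:.2f}s ({s:.1f}x)\")\n",
    "\n",
    "if speedups:\n",
    "    print(f\"Median speedup: {statistics.median(speedups):.1f}x\")"
   ]
  },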
  {
   "cell_type": "markdown",
   "id": "f7d75182",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "195d0b0b",
   "metadata": {},
   "source": [
    "## Crawl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "520b9c80",
   "metadata": {},
   "source": [
    "--- for now, the crawl index is only used to improve /map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a1961ef",
   "metadata": {},
   "source": [
    "Hypothesis: Indexed crawls are faster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9877e32",
   "metadata": {},
   "outputs": [],
   "source": [
    "# I think this one only improves /map for now..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2a6c751",
   "metadata": {},
   "outputs": [],
   "source": [
    "crawl_times_no_cache = []\n",
    "crawl_times_cached = []\n",
    "\n",
    "for i, url in enumerate(crawl_urls):\n",
    "    print(f\"Testing {i+1}/{len(crawl_urls)}: {url}\")\n",
    "\n",
    "    # No cache\n",
    "    try:\n",
    "        start = datetime.now()\n",
    "        result = app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=1), limit=5)\n",
    "        no_cache_time = (datetime.now() - start).total_seconds()\n",
    "        crawl_times_no_cache.append(no_cache_time)\n",
    "        print(f\"  No cache: {no_cache_time:.2f}s\")\n",
    "    except Exception as e:\n",
    "        print(f\"  No cache: FAILED - {e}\")\n",
    "        crawl_times_no_cache.append(None)\n",
    "\n",
    "    # Cached\n",
    "    try:\n",
    "        start = datetime.now()\n",
    "        result = app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=100000), limit=5)\n",
    "        cached_time = (datetime.now() - start).total_seconds()\n",
    "        crawl_times_cached.append(cached_time)\n",
    "        print(f\"  Cached: {cached_time:.2f}s\")\n",
    "    except Exception as e:\n",
    "        print(f\"  Cached: FAILED - {e}\")\n",
    "        crawl_times_cached.append(None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c96dddc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate averages\n",
    "valid_no_cache = [t for t in crawl_times_no_cache if t is not None]\n",
    "valid_cached = [t for t in crawl_times_cached if t is not None]\n",
    "\n",
    "if valid_no_cache and valid_cached:\n",
    "    avg_no_cache = statistics.mean(valid_no_cache)\n",
    "    avg_cached = statistics.mean(valid_cached)\n",
    "    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
    "\n",
    "    print(\"CRAWL RESULTS:\")\n",
    "    print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
    "    print(f\"Average cached: {avg_cached:.2f}s\")\n",
    "    print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
    "    print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per crawl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9c5d28f",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "290752bb",
   "metadata": {},
   "source": [
    "## Map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efe67e8e",
   "metadata": {},
   "source": [
    "Hypothesis: Indexed Map should return more URLs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "eb9ac2cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: use crawl urls, compare with prod (see the sketch after the timing cells below)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9211270",
   "metadata": {},
   "outputs": [],
   "source": [
    "map_times = []\n",
    "map_url_counts = []\n",
    "\n",
    "for i, url in enumerate(map_urls):\n",
    "    print(f\"Testing {i+1}/{len(map_urls)}: {url}\")\n",
    "\n",
    "    try:\n",
    "        start = datetime.now()\n",
    "        result = app.map_url(url)\n",
    "        map_time = (datetime.now() - start).total_seconds()\n",
    "        url_count = len(result.links) if hasattr(result, 'links') else 0\n",
    "\n",
    "        map_times.append(map_time)\n",
    "        map_url_counts.append(url_count)\n",
    "\n",
    "        print(f\"  Time: {map_time:.2f}s\")\n",
    "        print(f\"  URLs found: {url_count}\")\n",
    "    except Exception as e:\n",
    "        print(f\"  FAILED - {e}\")\n",
    "        map_times.append(None)\n",
    "        map_url_counts.append(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b782e07b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate averages\n",
    "valid_times = [t for t in map_times if t is not None]\n",
    "valid_counts = [c for c in map_url_counts if c > 0]\n",
    "\n",
    "if valid_times:\n",
    "    avg_time = statistics.mean(valid_times)\n",
    "    avg_urls = statistics.mean(valid_counts) if valid_counts else 0\n",
    "\n",
    "    print(\"MAP RESULTS:\")\n",
    "    print(f\"Average time: {avg_time:.2f}s\")\n",
    "    print(f\"Average URLs found: {avg_urls:.1f}\")\n",
    "    print(f\"URLs per second: {avg_urls/avg_time:.1f}\")"
   ]
  },
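  {
   "cell_type": "markdown",
   "id": "map-prod-compare-note",
   "metadata": {},
   "source": [
    "A minimal sketch of the prod comparison described in the TODO above: map the same targets against a production instance and compare URL counts with the indexed staging results. `PROD_API_URL` and `PROD_API_KEY` are assumed placeholder environment variables, not part of this repo's config; adjust them to however prod credentials are actually provided."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "map-prod-compare-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: compare indexed staging map coverage with prod for the same targets.\n",
    "# PROD_API_URL / PROD_API_KEY are placeholder env vars, not defined elsewhere in this repo.\n",
    "PROD_API_URL = os.getenv(\"PROD_API_URL\")\n",
    "PROD_API_KEY = os.getenv(\"PROD_API_KEY\", \"no-auth\")\n",
    "\n",
    "if PROD_API_URL is None:\n",
    "    print(\"PROD_API_URL is not set, skipping prod comparison\")\n",
    "else:\n",
    "    prod_app = FirecrawlApp(api_url=PROD_API_URL, api_key=PROD_API_KEY)\n",
    "    for i, url in enumerate(map_urls):\n",
    "        print(f\"Comparing {i+1}/{len(map_urls)}: {url}\")\n",
    "        try:\n",
    "            staging_result = app.map_url(url)\n",
    "            prod_result = prod_app.map_url(url)\n",
    "            staging_count = len(staging_result.links) if hasattr(staging_result, 'links') else 0\n",
    "            prod_count = len(prod_result.links) if hasattr(prod_result, 'links') else 0\n",
    "            print(f\"  staging (indexed): {staging_count} URLs, prod: {prod_count} URLs\")\n",
    "        except Exception as e:\n",
    "            print(f\"  FAILED - {e}\")"
   ]
  },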
  {
   "cell_type": "markdown",
   "id": "e3cf02db",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}