mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 18:45:54 +08:00
map benchmarks
This commit is contained in:
parent
8b864345e3
commit
014a99ef91
@ -2,23 +2,21 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "74281828",
|
"id": "51331c01",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Index Benchmark"
|
"# Index Benchmark"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": null,
|
"id": "9cb10752",
|
||||||
"id": "6d152695",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
"source": []
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "099ea204",
|
"id": "7928e2c9",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"---"
|
"---"
|
||||||
@ -26,8 +24,8 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 70,
|
"execution_count": 104,
|
||||||
"id": "4eae0ba5",
|
"id": "a64ec8a0",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -36,7 +34,7 @@
|
|||||||
"True"
|
"True"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 70,
|
"execution_count": 104,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -47,18 +45,19 @@
|
|||||||
"from dotenv import load_dotenv\n",
|
"from dotenv import load_dotenv\n",
|
||||||
"from datetime import datetime\n",
|
"from datetime import datetime\n",
|
||||||
"import statistics\n",
|
"import statistics\n",
|
||||||
|
"import requests\n",
|
||||||
"\n",
|
"\n",
|
||||||
"load_dotenv()"
|
"load_dotenv()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 73,
|
"execution_count": 82,
|
||||||
"id": "cb1f50fe",
|
"id": "bc7ce797",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"STAGING_API_URL = os.getenv(\"STAGING_API_URL\") or None\n",
|
"STAGING_API_URL = 'http://default-firecrawl-app-staging-service:3002/' # os.getenv(\"STAGING_API_URL\") or None\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if STAGING_API_URL is None:\n",
|
"if STAGING_API_URL is None:\n",
|
||||||
" raise ValueError(\"STAGING_API_URL is not set\")\n",
|
" raise ValueError(\"STAGING_API_URL is not set\")\n",
|
||||||
@ -68,8 +67,29 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 66,
|
"execution_count": 83,
|
||||||
"id": "038c04b8",
|
"id": "a3db6548",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'http://default-firecrawl-app-staging-service:3002/'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 83,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"STAGING_API_URL"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 141,
|
||||||
|
"id": "440f7c2d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -88,25 +108,27 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"crawl_urls = [\n",
|
"crawl_urls = [\n",
|
||||||
" [\"docs.python.org/3/tutorial/*\"],\n",
|
" \"https://www.pcbgogo.com/*\", # 7825\n",
|
||||||
" [\"httpbin.org/*\"],\n",
|
" \"https://github.com/Uniswap/v4-core/*\", # 7353\n",
|
||||||
" [\"example.com/*\"],\n",
|
" \"http://arcep.fr/actualites/*\", # 9764\n",
|
||||||
" [\"jsonplaceholder.typicode.com/*\"],\n",
|
" \"https://www.synapticure.com/*\", # 7746\n",
|
||||||
" [\"httpstat.us/*\"],\n",
|
" \"https://www.elecrow.com/*\", # 8025\n",
|
||||||
" [\"postman-echo.com/*\"],\n",
|
" \"https://idfcfirstbank.com/*\", # 9912\n",
|
||||||
" [\"docs.github.com/*\"],\n",
|
" \"https://www.todaytix.com/*\", # 7532\n",
|
||||||
" [\"news.ycombinator.com/*\"],\n",
|
" \"https://www.wheel-size.com/size/*\", # 7102\n",
|
||||||
" [\"dev.to/*\"],\n",
|
" \"http://www.drymerge.com/*\", # 8422\n",
|
||||||
" [\"reqres.in/*\"]\n",
|
" \"https://telegramindex.org/*\" # 5335\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"map_urls=crawl_urls"
|
"map_urls = []\n",
|
||||||
|
"for i in crawl_urls:\n",
|
||||||
|
" map_urls.append(i.replace('*',''))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "d83baad5",
|
"id": "e54e6677",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Scrape"
|
"## Scrape"
|
||||||
@ -114,7 +136,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "b83fa3c0",
|
"id": "3fed4cb6",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Hypothesis: Indexed scrapes are faster"
|
"Hypothesis: Indexed scrapes are faster"
|
||||||
@ -122,8 +144,8 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 67,
|
"execution_count": 85,
|
||||||
"id": "d846b0f1",
|
"id": "fb052d01",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -131,35 +153,35 @@
|
|||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Testing 1/10: https://news.ycombinator.com\n",
|
"Testing 1/10: https://news.ycombinator.com\n",
|
||||||
" No cache: 2.15s\n",
|
" No cache: 2.39s\n",
|
||||||
" Cached: 0.98s\n",
|
" Cached: 0.93s\n",
|
||||||
"Testing 2/10: https://httpbin.org\n",
|
"Testing 2/10: https://httpbin.org\n",
|
||||||
" No cache: 11.61s\n",
|
" No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
|
||||||
" Cached: 0.59s\n",
|
" Cached: 4.79s\n",
|
||||||
"Testing 3/10: https://example.com\n",
|
"Testing 3/10: https://example.com\n",
|
||||||
" No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
|
" No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
|
||||||
" Cached: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n",
|
" Cached: 17.33s\n",
|
||||||
"Testing 4/10: https://github.com/microsoft/vscode\n",
|
"Testing 4/10: https://github.com/microsoft/vscode\n",
|
||||||
" No cache: 15.73s\n",
|
" No cache: 2.81s\n",
|
||||||
" Cached: 1.12s\n",
|
" Cached: 1.06s\n",
|
||||||
"Testing 5/10: https://stackoverflow.com/questions\n",
|
"Testing 5/10: https://stackoverflow.com/questions\n",
|
||||||
" No cache: 2.74s\n",
|
" No cache: 2.92s\n",
|
||||||
" Cached: 0.98s\n",
|
" Cached: 1.16s\n",
|
||||||
"Testing 6/10: https://www.wikipedia.org\n",
|
"Testing 6/10: https://www.wikipedia.org\n",
|
||||||
" No cache: 3.84s\n",
|
" No cache: 3.53s\n",
|
||||||
" Cached: 0.58s\n",
|
" Cached: 0.78s\n",
|
||||||
"Testing 7/10: https://jsonplaceholder.typicode.com\n",
|
"Testing 7/10: https://jsonplaceholder.typicode.com\n",
|
||||||
" No cache: 4.09s\n",
|
" No cache: 3.54s\n",
|
||||||
" Cached: 0.81s\n",
|
" Cached: 0.80s\n",
|
||||||
"Testing 8/10: https://httpstat.us/200\n",
|
"Testing 8/10: https://httpstat.us/200\n",
|
||||||
" No cache: 10.84s\n",
|
" No cache: 10.79s\n",
|
||||||
" Cached: 0.56s\n",
|
" Cached: 0.54s\n",
|
||||||
"Testing 9/10: https://www.reddit.com/r/programming\n",
|
"Testing 9/10: https://www.reddit.com/r/programming\n",
|
||||||
" No cache: 3.63s\n",
|
" No cache: 4.53s\n",
|
||||||
" Cached: 0.92s\n",
|
" Cached: 0.79s\n",
|
||||||
"Testing 10/10: https://docs.python.org/3/\n",
|
"Testing 10/10: https://docs.python.org/3/\n",
|
||||||
" No cache: 1.60s\n",
|
" No cache: 3.95s\n",
|
||||||
" Cached: 0.62s\n"
|
" Cached: 0.53s\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -195,8 +217,8 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 71,
|
"execution_count": 86,
|
||||||
"id": "ef9cf1cb",
|
"id": "7dce8a83",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -204,10 +226,10 @@
|
|||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"SCRAPE RESULTS:\n",
|
"SCRAPE RESULTS:\n",
|
||||||
"Average no cache: 6.25s\n",
|
"Average no cache: 4.31s\n",
|
||||||
"Average cached: 0.80s\n",
|
"Average cached: 2.87s\n",
|
||||||
"Speedup: 7.8x faster with cache\n",
|
"Speedup: 1.5x faster with cache\n",
|
||||||
"Time saved: 5.45s per request\n"
|
"Time saved: 1.44s per request\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -230,7 +252,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "f7d75182",
|
"id": "cc682ba4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"---"
|
"---"
|
||||||
@ -238,7 +260,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "195d0b0b",
|
"id": "df801504",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Crawl"
|
"## Crawl"
|
||||||
@ -246,7 +268,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "520b9c80",
|
"id": "658374d7",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"--- for now used to improve map "
|
"--- for now used to improve map "
|
||||||
@ -254,7 +276,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "9a1961ef",
|
"id": "5628c39d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Hypothesis: Indexed crawls are faster"
|
"Hypothesis: Indexed crawls are faster"
|
||||||
@ -262,76 +284,43 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 142,
|
||||||
"id": "a9877e32",
|
"id": "1482e163",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
"source": [
|
{
|
||||||
"# I think this one only improves /map for now..."
|
"name": "stdout",
|
||||||
]
|
"output_type": "stream",
|
||||||
},
|
"text": [
|
||||||
{
|
"Crawling 1/10: https://www.pcbgogo.com/*\n",
|
||||||
"cell_type": "code",
|
"Crawling 2/10: https://github.com/Uniswap/v4-core/*\n",
|
||||||
"execution_count": null,
|
"Crawling 3/10: http://arcep.fr/actualites/*\n",
|
||||||
"id": "f2a6c751",
|
"Crawling 4/10: https://www.synapticure.com/*\n",
|
||||||
"metadata": {},
|
"Crawling 5/10: https://www.elecrow.com/*\n",
|
||||||
"outputs": [],
|
"Crawling 6/10: https://idfcfirstbank.com/*\n",
|
||||||
|
"Crawling 7/10: https://www.todaytix.com/*\n",
|
||||||
|
"Crawling 8/10: https://www.wheel-size.com/size/*\n",
|
||||||
|
"Crawling 9/10: http://www.drymerge.com/*\n",
|
||||||
|
"h - Crawl FAILED - HTTPConnectionPool(host='default-firecrawl-app-staging-service', port=3002): Max retries exceeded with url: /v1/crawl/31485d6f-ceeb-458b-b010-f5efc3286a0e (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1195ac0d0>: Failed to establish a new connection: [Errno 65] No route to host'))\n",
|
||||||
|
"Crawling 10/10: https://telegramindex.org/*\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"crawl_times_no_cache = []\n",
|
"crawl_times_no_cache = []\n",
|
||||||
"crawl_times_cached = []\n",
|
"crawl_times_cached = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for i, url in enumerate(crawl_urls):\n",
|
"for i, url in enumerate(crawl_urls):\n",
|
||||||
" print(f\"Testing {i+1}/{len(crawl_urls)}: {url}\")\n",
|
|
||||||
" \n",
|
|
||||||
" # No cache\n",
|
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" start = datetime.now()\n",
|
" print(f\"Crawling {i+1}/{len(crawl_urls)}: {url}\")\n",
|
||||||
" result = app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=1), limit=5)\n",
|
" result = app.crawl_url(url)\n",
|
||||||
" no_cache_time = (datetime.now() - start).total_seconds()\n",
|
|
||||||
" crawl_times_no_cache.append(no_cache_time)\n",
|
|
||||||
" print(f\" No cache: {no_cache_time:.2f}s\")\n",
|
|
||||||
" except Exception as e:\n",
|
" except Exception as e:\n",
|
||||||
" print(f\" No cache: FAILED - {e}\")\n",
|
" print(f\"{url[0]} - Crawl FAILED - {e}\")"
|
||||||
" crawl_times_no_cache.append(None)\n",
|
|
||||||
" \n",
|
|
||||||
" # Cached\n",
|
|
||||||
" try:\n",
|
|
||||||
" start = datetime.now()\n",
|
|
||||||
" result = app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=100000), limit=5)\n",
|
|
||||||
" cached_time = (datetime.now() - start).total_seconds()\n",
|
|
||||||
" crawl_times_cached.append(cached_time)\n",
|
|
||||||
" print(f\" Cached: {cached_time:.2f}s\")\n",
|
|
||||||
" except Exception as e:\n",
|
|
||||||
" print(f\" Cached: FAILED - {e}\")\n",
|
|
||||||
" crawl_times_cached.append(None)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c96dddc6",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Calculate averages\n",
|
|
||||||
"valid_no_cache = [t for t in crawl_times_no_cache if t is not None]\n",
|
|
||||||
"valid_cached = [t for t in crawl_times_cached if t is not None]\n",
|
|
||||||
"\n",
|
|
||||||
"if valid_no_cache and valid_cached:\n",
|
|
||||||
" avg_no_cache = statistics.mean(valid_no_cache)\n",
|
|
||||||
" avg_cached = statistics.mean(valid_cached)\n",
|
|
||||||
" speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
|
|
||||||
" \n",
|
|
||||||
" print(\"CRAWL RESULTS:\")\n",
|
|
||||||
" print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
|
|
||||||
" print(f\"Average cached: {avg_cached:.2f}s\")\n",
|
|
||||||
" print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
|
|
||||||
" print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per crawl\")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "c9c5d28f",
|
"id": "7eb27685",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"---"
|
"---"
|
||||||
@ -339,7 +328,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "290752bb",
|
"id": "abe3f30e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Map"
|
"## Map"
|
||||||
@ -347,76 +336,179 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "efe67e8e",
|
"id": "683c74da",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Hypothesis: Indexed Map should get more urls"
|
"Hypothesis: Indexed Map should get more urls after crawl"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 72,
|
"execution_count": 143,
|
||||||
"id": "eb9ac2cb",
|
"id": "1450bf4c",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'http://default-firecrawl-app-staging-service:3002/'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 143,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# TODO: use crawl urls, compare with prod"
|
"STAGING_API_URL"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 144,
|
||||||
"id": "c9211270",
|
"id": "f8d79207",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"map_times = []\n",
|
"def map_request(url, ignore_index):\n",
|
||||||
"map_url_counts = []\n",
|
" \"\"\"\n",
|
||||||
|
" Make a map request and return the links\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" payload = {\"url\": url, \"ignoreIndex\": ignore_index, \"limit\": 10000}\n",
|
||||||
|
" headers = {'Content-Type': 'application/json', \"Authorization\": \"Bearer no-auth\"}\n",
|
||||||
|
" response = requests.post(STAGING_API_URL + \"v1/map\", headers=headers, json=payload)\n",
|
||||||
|
" \n",
|
||||||
|
" if response.status_code == 200:\n",
|
||||||
|
" data = response.json()\n",
|
||||||
|
" return data.get('links', [])\n",
|
||||||
|
" else:\n",
|
||||||
|
" return []"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 147,
|
||||||
|
"id": "a74da0a5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Testing 1/10: https://www.pcbgogo.com/\n",
|
||||||
|
" No index: 1.44s, 664 URLs\n",
|
||||||
|
" With index: 1.70s, 664 URLs\n",
|
||||||
|
"Testing 2/10: https://github.com/Uniswap/v4-core/\n",
|
||||||
|
" No index: 0.72s, 235 URLs\n",
|
||||||
|
" With index: 0.75s, 235 URLs\n",
|
||||||
|
"Testing 3/10: http://arcep.fr/actualites/\n",
|
||||||
|
" No index: 6.23s, 8059 URLs\n",
|
||||||
|
" With index: 6.71s, 9048 URLs\n",
|
||||||
|
"Testing 4/10: https://www.synapticure.com/\n",
|
||||||
|
" No index: 0.72s, 365 URLs\n",
|
||||||
|
" With index: 0.94s, 365 URLs\n",
|
||||||
|
"Testing 5/10: https://www.elecrow.com/\n",
|
||||||
|
" No index: 4.03s, 2747 URLs\n",
|
||||||
|
" With index: 4.13s, 2747 URLs\n",
|
||||||
|
"Testing 6/10: https://idfcfirstbank.com/\n",
|
||||||
|
" No index: 2.62s, 279 URLs\n",
|
||||||
|
" With index: 4.02s, 279 URLs\n",
|
||||||
|
"Testing 7/10: https://www.todaytix.com/\n",
|
||||||
|
" No index: 4.53s, 8933 URLs\n",
|
||||||
|
" With index: 3.63s, 8933 URLs\n",
|
||||||
|
"Testing 8/10: https://www.wheel-size.com/size/\n",
|
||||||
|
" No index: 10.89s, 10000 URLs\n",
|
||||||
|
" With index: 10.73s, 10000 URLs\n",
|
||||||
|
"Testing 9/10: http://www.drymerge.com/\n",
|
||||||
|
" No index: 32.94s, 3602 URLs\n",
|
||||||
|
" With index: 37.07s, 3602 URLs\n",
|
||||||
|
"Testing 10/10: https://telegramindex.org/\n",
|
||||||
|
" No index: 0.98s, 2 URLs\n",
|
||||||
|
" With index: 1.19s, 2 URLs\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"map_times_no_cache = []\n",
|
||||||
|
"map_times_cached = []\n",
|
||||||
|
"map_url_counts_no_cache = []\n",
|
||||||
|
"map_url_counts_cached = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for i, url in enumerate(map_urls):\n",
|
"for i, url in enumerate(map_urls):\n",
|
||||||
" print(f\"Testing {i+1}/{len(map_urls)}: {url}\")\n",
|
" print(f\"Testing {i+1}/{len(map_urls)}: {url}\")\n",
|
||||||
" \n",
|
" \n",
|
||||||
" try:\n",
|
" # No index (ignoreIndex=True)\n",
|
||||||
" start = datetime.now()\n",
|
" start = datetime.now()\n",
|
||||||
" result = app.map_url(url)\n",
|
" links_no_index = map_request(url, True)\n",
|
||||||
" map_time = (datetime.now() - start).total_seconds()\n",
|
" time_no_index = (datetime.now() - start).total_seconds()\n",
|
||||||
" url_count = len(result.links) if hasattr(result, 'links') else 0\n",
|
" \n",
|
||||||
" \n",
|
" map_times_no_cache.append(time_no_index)\n",
|
||||||
" map_times.append(map_time)\n",
|
" map_url_counts_no_cache.append(len(links_no_index))\n",
|
||||||
" map_url_counts.append(url_count)\n",
|
" print(f\" No index: {time_no_index:.2f}s, {len(links_no_index)} URLs\")\n",
|
||||||
" \n",
|
" \n",
|
||||||
" print(f\" Time: {map_time:.2f}s\")\n",
|
" # With index (ignoreIndex=False)\n",
|
||||||
" print(f\" URLs found: {url_count}\")\n",
|
" start = datetime.now()\n",
|
||||||
" except Exception as e:\n",
|
" links_indexed = map_request(url, False)\n",
|
||||||
" print(f\" FAILED - {e}\")\n",
|
" time_indexed = (datetime.now() - start).total_seconds()\n",
|
||||||
" map_times.append(None)\n",
|
" \n",
|
||||||
" map_url_counts.append(0)"
|
" map_times_cached.append(time_indexed)\n",
|
||||||
|
" map_url_counts_cached.append(len(links_indexed))\n",
|
||||||
|
" print(f\" With index: {time_indexed:.2f}s, {len(links_indexed)} URLs\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 149,
|
||||||
"id": "b782e07b",
|
"id": "2fa88f5d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"MAP RESULTS:\n",
|
||||||
|
"Average time (no cache): 6.51s\n",
|
||||||
|
"Average time (cached): 7.09s\n",
|
||||||
|
"Time speedup: 0.92x faster with cache\n",
|
||||||
|
"Average URLs found (no cache): 3488.6\n",
|
||||||
|
"Average URLs found (cached): 3587.5\n",
|
||||||
|
"URL difference: +98.9 URLs with cache\n",
|
||||||
|
"URL percentage: 102.8% of no-cache results\n",
|
||||||
|
"✅ Cache finds MORE URLs\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Calculate averages\n",
|
"# Calculate averages\n",
|
||||||
"valid_times = [t for t in map_times if t is not None]\n",
|
"avg_time_no_cache = statistics.mean(map_times_no_cache)\n",
|
||||||
"valid_counts = [c for c in map_url_counts if c > 0]\n",
|
"avg_time_cached = statistics.mean(map_times_cached)\n",
|
||||||
|
"avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)\n",
|
||||||
|
"avg_urls_cached = statistics.mean(map_url_counts_cached)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if valid_times:\n",
|
"time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0\n",
|
||||||
" avg_time = statistics.mean(valid_times)\n",
|
"url_difference = avg_urls_cached - avg_urls_no_cache\n",
|
||||||
" avg_urls = statistics.mean(valid_counts) if valid_counts else 0\n",
|
"url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0\n",
|
||||||
" \n",
|
"\n",
|
||||||
" print(\"MAP RESULTS:\")\n",
|
"print(\"MAP RESULTS:\")\n",
|
||||||
" print(f\"Average time: {avg_time:.2f}s\")\n",
|
"print(f\"Average time (no cache): {avg_time_no_cache:.2f}s\")\n",
|
||||||
" print(f\"Average URLs found: {avg_urls:.1f}\")\n",
|
"print(f\"Average time (cached): {avg_time_cached:.2f}s\")\n",
|
||||||
" print(f\"URLs per second: {avg_urls/avg_time:.1f}\")"
|
"print(f\"Time speedup: {time_speedup:.2f}x faster with cache\")\n",
|
||||||
|
"print(f\"Average URLs found (no cache): {avg_urls_no_cache:.1f}\")\n",
|
||||||
|
"print(f\"Average URLs found (cached): {avg_urls_cached:.1f}\")\n",
|
||||||
|
"print(f\"URL difference: {url_difference:+.1f} URLs with cache\")\n",
|
||||||
|
"print(f\"URL percentage: {url_percentage:.1f}% of no-cache results\")\n",
|
||||||
|
"\n",
|
||||||
|
"if url_difference > 0:\n",
|
||||||
|
" print(\"✅ Cache finds MORE URLs\")\n",
|
||||||
|
"elif url_difference < 0:\n",
|
||||||
|
" print(\"⚠️ Cache finds FEWER URLs\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"➡️ Cache finds SAME number of URLs\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "e3cf02db",
|
"id": "e5ee2116",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"---"
|
"---"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user