From da9a9b0d19a0e09fb2a5b046abd5b833516fa797 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Tue, 3 Jun 2025 16:07:59 +0200
Subject: [PATCH] cleanup: rename cache/index flags to positive forms, extract
 kickoff index lookup

Rename two negated options to positive forms: the scrape option
dontStoreInCache becomes storeInCache (default true) across the API
schema, tests, and JS SDK, and the map option ignoreIndex becomes
useIndex (default true). In map.ts, the imported global useIndex flag
is aliased to globalUseIndex so the new parameter does not shadow it.

Also extract the inline index lookup in processKickoffJob into a
kickoffGetIndexLinks helper, replace a stray cheerio import with the
WebCrawler import the helper needs, and delete the one-off index
benchmark notebook.
---
 apps/api/src/__tests__/snips/scrape.test.ts |   2 +-
 apps/api/src/controllers/v1/map.ts          |  10 +-
 apps/api/src/controllers/v1/types.ts        |   4 +-
 apps/api/src/services/queue-worker.ts       |  70 +--
 apps/js-sdk/firecrawl/src/index.ts          |   2 +-
 apps/test-suite/index-benchmark/run.ipynb   | 539 --------------------
 6 files changed, 47 insertions(+), 580 deletions(-)
 delete mode 100644 apps/test-suite/index-benchmark/run.ipynb

diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index bebac9e0..06920f18 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -105,7 +105,7 @@ describe("Scrape tests", () => {
     const response1 = await scrape({
       url,
       maxAge: 120000,
-      dontStoreInCache: true,
+      storeInCache: false,
     });
 
     expect(response1.metadata.cacheState).toBe("miss");
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 54df0c7b..e5f83e2d 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
 import Redis from "ioredis";
 import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
 import { getIndexQueue } from "../../services/queue-service";
-import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
+import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex as globalUseIndex } from "../../services/index";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@@ -59,7 +59,7 @@ export async function getMapResults({
   mock,
   filterByPath = true,
   flags,
-  ignoreIndex = false,
+  useIndex = true,
 }: {
   url: string;
   search?: string;
@@ -75,7 +75,7 @@
   mock?: string;
   filterByPath?: boolean;
   flags: TeamFlags;
-  ignoreIndex?: boolean;
+  useIndex?: boolean;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
@@ -172,7 +172,7 @@
   // Parallelize sitemap index query with search results
   const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
     querySitemapIndex(url, abort),
-    useIndex && !ignoreIndex && process.env.FIRECRAWL_INDEX_WRITE_ONLY !== "true" ? (
+    globalUseIndex && useIndex && process.env.FIRECRAWL_INDEX_WRITE_ONLY !== "true" ? (
       index_supabase_service
         .from("index")
         .select("resolved_url")
@@ -352,7 +352,7 @@ export async function mapController(
       mock: req.body.useMock,
       filterByPath: req.body.filterByPath !== false,
       flags: req.acuc?.flags ?? null,
-      ignoreIndex: req.body.ignoreIndex,
+      useIndex: req.body.useIndex,
     }),
     ...(req.body.timeout !== undefined ? 
[ new Promise((resolve, reject) => setTimeout(() => { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 9019270c..762e7b45 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -310,7 +310,7 @@ const baseScrapeOptions = z blockAds: z.boolean().default(true), proxy: z.enum(["basic", "stealth", "auto"]).optional(), maxAge: z.number().int().gte(0).safe().default(0), - dontStoreInCache: z.boolean().default(false), + storeInCache: z.boolean().default(true), }) .strict(strictMessage); @@ -658,7 +658,7 @@ export const mapRequestSchema = crawlerOptions timeout: z.number().positive().finite().optional(), useMock: z.string().optional(), filterByPath: z.boolean().default(true), - ignoreIndex: z.boolean().default(false), + useIndex: z.boolean().default(true), }) .strict(strictMessage); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 9aa507e3..75ed92c1 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -87,7 +87,7 @@ import { robustFetch } from "../scraper/scrapeURL/lib/fetch"; import { RateLimiterMode } from "../types"; import { redisEvictConnection } from "./redis"; import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index"; -import { val } from "node_modules/cheerio/lib/api/attributes"; +import { WebCrawler } from "../scraper/WebScraper/crawler"; configDotenv(); @@ -912,6 +912,38 @@ const workerFun = async ( } }; +async function kickoffGetIndexLinks(sc: StoredCrawl, crawler: WebCrawler, url: string) { + const trimmedURL = new URL(url); + trimmedURL.search = ""; + + const urlSplits = generateURLSplits(trimmedURL.href).map(x => hashURL(x)); + + const index = (sc.crawlerOptions.ignoreSitemap || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true" || !useIndex) + ? [] + : sc.crawlerOptions.allowBackwardCrawling + ? (await index_supabase_service + .from("index") + .select("resolved_url") + .eq("url_split_0_hash", urlSplits[0]) + .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString()) + .limit(sc.crawlerOptions.limit ?? 100)).data ?? [] + : (await index_supabase_service + .from("index") + .select("resolved_url") + .eq("url_split_" + (urlSplits.length - 1) + "_hash", urlSplits[urlSplits.length - 1]) + .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString()) + .limit(sc.crawlerOptions.limit ?? 100)).data ?? []; + + const validIndexLinks = crawler.filterLinks( + [...new Set(index.map(x => x.resolved_url))].filter(x => crawler.filterURL(x, trimmedURL.href) !== null), + sc.crawlerOptions.limit ?? 100, + sc.crawlerOptions.maxDepth ?? 10, + false, + ); + + return validIndexLinks; +} + async function processKickoffJob(job: Job & { id: string }, token: string) { const logger = _logger.child({ module: "queue-worker", @@ -1029,37 +1061,11 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { }); } - const trimmedURL = new URL(job.data.url); - trimmedURL.search = ""; + const indexLinks = await kickoffGetIndexLinks(sc, crawler, job.data.url); - const urlSplits = generateURLSplits(trimmedURL.href).map(x => hashURL(x)); - - const index = (sc.crawlerOptions.ignoreSitemap || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true" || !useIndex) - ? [] - : sc.crawlerOptions.allowBackwardCrawling - ? 
(await index_supabase_service - .from("index") - .select("resolved_url") - .eq("url_split_0_hash", urlSplits[0]) - .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString()) - .limit(sc.crawlerOptions.limit ?? 100)).data ?? [] - : (await index_supabase_service - .from("index") - .select("resolved_url") - .eq("url_split_" + (urlSplits.length - 1) + "_hash", urlSplits[urlSplits.length - 1]) - .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString()) - .limit(sc.crawlerOptions.limit ?? 100)).data ?? []; - - const validIndexLinks = crawler.filterLinks( - [...new Set(index.map(x => x.resolved_url))].filter(x => crawler.filterURL(x, trimmedURL.href) !== null), - sc.crawlerOptions.limit ?? 100, - sc.crawlerOptions.maxDepth ?? 10, - false, - ); - - if (validIndexLinks.length > 0) { - logger.debug("Using index links of length " + validIndexLinks.length, { - indexLinksLength: validIndexLinks.length, + if (indexLinks.length > 0) { + logger.debug("Using index links of length " + indexLinks.length, { + indexLinksLength: indexLinks.length, }); let jobPriority = await getJobPriority({ @@ -1068,7 +1074,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { }); logger.debug("Using job priority " + jobPriority, { jobPriority }); - const jobs = validIndexLinks.map((url) => { + const jobs = indexLinks.map((url) => { const uuid = uuidv4(); return { name: uuid, diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 9c1fc8e8..86e5a91c 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -120,7 +120,7 @@ export interface CrawlScrapeOptions { removeBase64Images?: boolean; blockAds?: boolean; proxy?: "basic" | "stealth" | "auto"; - dontStoreInCache?: boolean; + storeInCache?: boolean; } export type Action = { diff --git a/apps/test-suite/index-benchmark/run.ipynb b/apps/test-suite/index-benchmark/run.ipynb deleted file mode 100644 index dbf65bc3..00000000 --- a/apps/test-suite/index-benchmark/run.ipynb +++ /dev/null @@ -1,539 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "51331c01", - "metadata": {}, - "source": [ - "# Index Benchmark" - ] - }, - { - "cell_type": "markdown", - "id": "9cb10752", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "7928e2c9", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "id": "a64ec8a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from firecrawl import FirecrawlApp, ScrapeOptions\n", - "import os\n", - "from dotenv import load_dotenv\n", - "from datetime import datetime\n", - "import statistics\n", - "import requests\n", - "\n", - "load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "id": "bc7ce797", - "metadata": {}, - "outputs": [], - "source": [ - "STAGING_API_URL = 'http://default-firecrawl-app-staging-service:3002/' # os.getenv(\"STAGING_API_URL\") or None\n", - "\n", - "if STAGING_API_URL is None:\n", - " raise ValueError(\"STAGING_API_URL is not set\")\n", - " \n", - "app = FirecrawlApp(api_url=STAGING_API_URL, api_key='no-auth')" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "id": "a3db6548", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://default-firecrawl-app-staging-service:3002/'" - ] - }, - "execution_count": 83, 
- "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "STAGING_API_URL" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "id": "440f7c2d", - "metadata": {}, - "outputs": [], - "source": [ - "scrape_urls=[\n", - " 'https://news.ycombinator.com', # - Hacker News (simple, fast-loading)\n", - " 'https://httpbin.org', # - HTTP testing service (very reliable)\n", - " 'https://example.com', # - Standard test domain (minimal content)\n", - " 'https://github.com/microsoft/vscode', # - GitHub repo page (structured content)\n", - " 'https://stackoverflow.com/questions', # - Stack Overflow questions page\n", - " 'https://www.wikipedia.org', # - Wikipedia main page (rich content)\n", - " 'https://jsonplaceholder.typicode.com', # - Fake API for testing\n", - " 'https://httpstat.us/200', # - HTTP status testing (minimal response)\n", - " 'https://www.reddit.com/r/programming', # - Reddit programming subreddit\n", - " 'https://docs.python.org/3/' # - Python documentation (structured docs)\n", - "]\n", - "\n", - "\n", - "crawl_urls = [\n", - " \"https://www.pcbgogo.com/*\", # 7825\n", - " \"https://github.com/Uniswap/v4-core/*\", # 7353\n", - " \"http://arcep.fr/actualites/*\", # 9764\n", - " \"https://www.synapticure.com/*\", # 7746\n", - " \"https://www.elecrow.com/*\", # 8025\n", - " \"https://idfcfirstbank.com/*\", # 9912\n", - " \"https://www.todaytix.com/*\", # 7532\n", - " \"https://www.wheel-size.com/size/*\", # 7102\n", - " \"http://www.drymerge.com/*\", # 8422\n", - " \"https://telegramindex.org/*\" # 5335\n", - "]\n", - "\n", - "\n", - "map_urls = []\n", - "for i in crawl_urls:\n", - " map_urls.append(i.replace('*',''))" - ] - }, - { - "cell_type": "markdown", - "id": "e54e6677", - "metadata": {}, - "source": [ - "## Scrape" - ] - }, - { - "cell_type": "markdown", - "id": "3fed4cb6", - "metadata": {}, - "source": [ - "Hypothesis: Indexed scrapes are faster" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "fb052d01", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing 1/10: https://news.ycombinator.com\n", - " No cache: 2.39s\n", - " Cached: 0.93s\n", - "Testing 2/10: https://httpbin.org\n", - " No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.\n", - " Cached: 4.79s\n", - "Testing 3/10: https://example.com\n", - " No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. 
Request timed out - No additional error details provided.\n", - " Cached: 17.33s\n", - "Testing 4/10: https://github.com/microsoft/vscode\n", - " No cache: 2.81s\n", - " Cached: 1.06s\n", - "Testing 5/10: https://stackoverflow.com/questions\n", - " No cache: 2.92s\n", - " Cached: 1.16s\n", - "Testing 6/10: https://www.wikipedia.org\n", - " No cache: 3.53s\n", - " Cached: 0.78s\n", - "Testing 7/10: https://jsonplaceholder.typicode.com\n", - " No cache: 3.54s\n", - " Cached: 0.80s\n", - "Testing 8/10: https://httpstat.us/200\n", - " No cache: 10.79s\n", - " Cached: 0.54s\n", - "Testing 9/10: https://www.reddit.com/r/programming\n", - " No cache: 4.53s\n", - " Cached: 0.79s\n", - "Testing 10/10: https://docs.python.org/3/\n", - " No cache: 3.95s\n", - " Cached: 0.53s\n" - ] - } - ], - "source": [ - "scrape_times_no_cache = []\n", - "scrape_times_cached = []\n", - "\n", - "for i, url in enumerate(scrape_urls): # Test first 5 URLs\n", - " print(f\"Testing {i+1}/{len(scrape_urls)}: {url}\")\n", - " \n", - " # No cache (maxAge=1)\n", - " try:\n", - " start = datetime.now()\n", - " doc = app.scrape_url(url, maxAge=1)\n", - " no_cache_time = (datetime.now() - start).total_seconds()\n", - " scrape_times_no_cache.append(no_cache_time)\n", - " print(f\" No cache: {no_cache_time:.2f}s\")\n", - " except Exception as e:\n", - " print(f\" No cache: FAILED - {e}\")\n", - " scrape_times_no_cache.append(None)\n", - " \n", - " # Cached (maxAge=100000)\n", - " try:\n", - " start = datetime.now()\n", - " doc = app.scrape_url(url, maxAge=100000)\n", - " cached_time = (datetime.now() - start).total_seconds()\n", - " scrape_times_cached.append(cached_time)\n", - " print(f\" Cached: {cached_time:.2f}s\")\n", - " except Exception as e:\n", - " print(f\" Cached: FAILED - {e}\")\n", - " scrape_times_cached.append(None)" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "7dce8a83", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SCRAPE RESULTS:\n", - "Average no cache: 4.31s\n", - "Average cached: 2.87s\n", - "Speedup: 1.5x faster with cache\n", - "Time saved: 1.44s per request\n" - ] - } - ], - "source": [ - "# Calculate averages\n", - "valid_no_cache = [t for t in scrape_times_no_cache if t is not None]\n", - "valid_cached = [t for t in scrape_times_cached if t is not None]\n", - "\n", - "if valid_no_cache and valid_cached:\n", - " avg_no_cache = statistics.mean(valid_no_cache)\n", - " avg_cached = statistics.mean(valid_cached)\n", - " speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n", - " \n", - " print(\"SCRAPE RESULTS:\")\n", - " print(f\"Average no cache: {avg_no_cache:.2f}s\")\n", - " print(f\"Average cached: {avg_cached:.2f}s\")\n", - " print(f\"Speedup: {speedup:.1f}x faster with cache\")\n", - " print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per request\")" - ] - }, - { - "cell_type": "markdown", - "id": "cc682ba4", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "df801504", - "metadata": {}, - "source": [ - "## Crawl" - ] - }, - { - "cell_type": "markdown", - "id": "658374d7", - "metadata": {}, - "source": [ - "--- for now used to improve map " - ] - }, - { - "cell_type": "markdown", - "id": "5628c39d", - "metadata": {}, - "source": [ - "Hypothesis: Indexed crawls are faster" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "1482e163", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Crawling 1/10: 
https://www.pcbgogo.com/*\n", - "Crawling 2/10: https://github.com/Uniswap/v4-core/*\n", - "Crawling 3/10: http://arcep.fr/actualites/*\n", - "Crawling 4/10: https://www.synapticure.com/*\n", - "Crawling 5/10: https://www.elecrow.com/*\n", - "Crawling 6/10: https://idfcfirstbank.com/*\n", - "Crawling 7/10: https://www.todaytix.com/*\n", - "Crawling 8/10: https://www.wheel-size.com/size/*\n", - "Crawling 9/10: http://www.drymerge.com/*\n", - "h - Crawl FAILED - HTTPConnectionPool(host='default-firecrawl-app-staging-service', port=3002): Max retries exceeded with url: /v1/crawl/31485d6f-ceeb-458b-b010-f5efc3286a0e (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 65] No route to host'))\n", - "Crawling 10/10: https://telegramindex.org/*\n" - ] - } - ], - "source": [ - "crawl_times_no_cache = []\n", - "crawl_times_cached = []\n", - "\n", - "for i, url in enumerate(crawl_urls):\n", - " try:\n", - " print(f\"Crawling {i+1}/{len(crawl_urls)}: {url}\")\n", - " result = app.crawl_url(url)\n", - " except Exception as e:\n", - " print(f\"{url[0]} - Crawl FAILED - {e}\")" - ] - }, - { - "cell_type": "markdown", - "id": "7eb27685", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "abe3f30e", - "metadata": {}, - "source": [ - "## Map" - ] - }, - { - "cell_type": "markdown", - "id": "683c74da", - "metadata": {}, - "source": [ - "Hypothesis: Indexed Map should get more urls after crawl" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "id": "1450bf4c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://default-firecrawl-app-staging-service:3002/'" - ] - }, - "execution_count": 143, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "STAGING_API_URL" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "id": "f8d79207", - "metadata": {}, - "outputs": [], - "source": [ - "def map_request(url, ignore_index):\n", - " \"\"\"\n", - " Make a map request and return the links\n", - " \"\"\"\n", - " payload = {\"url\": url, \"ignoreIndex\": ignore_index, \"limit\": 10000}\n", - " headers = {'Content-Type': 'application/json', \"Authorization\": \"Bearer no-auth\"}\n", - " response = requests.post(STAGING_API_URL + \"v1/map\", headers=headers, json=payload)\n", - " \n", - " if response.status_code == 200:\n", - " data = response.json()\n", - " return data.get('links', [])\n", - " else:\n", - " return []" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "id": "a74da0a5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing 1/10: https://www.pcbgogo.com/\n", - " No index: 1.44s, 664 URLs\n", - " With index: 1.70s, 664 URLs\n", - "Testing 2/10: https://github.com/Uniswap/v4-core/\n", - " No index: 0.72s, 235 URLs\n", - " With index: 0.75s, 235 URLs\n", - "Testing 3/10: http://arcep.fr/actualites/\n", - " No index: 6.23s, 8059 URLs\n", - " With index: 6.71s, 9048 URLs\n", - "Testing 4/10: https://www.synapticure.com/\n", - " No index: 0.72s, 365 URLs\n", - " With index: 0.94s, 365 URLs\n", - "Testing 5/10: https://www.elecrow.com/\n", - " No index: 4.03s, 2747 URLs\n", - " With index: 4.13s, 2747 URLs\n", - "Testing 6/10: https://idfcfirstbank.com/\n", - " No index: 2.62s, 279 URLs\n", - " With index: 4.02s, 279 URLs\n", - "Testing 7/10: https://www.todaytix.com/\n", - " No index: 4.53s, 8933 URLs\n", - " With index: 3.63s, 8933 URLs\n", - "Testing 8/10: https://www.wheel-size.com/size/\n", - " No index: 
10.89s, 10000 URLs\n", - " With index: 10.73s, 10000 URLs\n", - "Testing 9/10: http://www.drymerge.com/\n", - " No index: 32.94s, 3602 URLs\n", - " With index: 37.07s, 3602 URLs\n", - "Testing 10/10: https://telegramindex.org/\n", - " No index: 0.98s, 2 URLs\n", - " With index: 1.19s, 2 URLs\n" - ] - } - ], - "source": [ - "map_times_no_cache = []\n", - "map_times_cached = []\n", - "map_url_counts_no_cache = []\n", - "map_url_counts_cached = []\n", - "\n", - "for i, url in enumerate(map_urls):\n", - " print(f\"Testing {i+1}/{len(map_urls)}: {url}\")\n", - " \n", - " # No index (ignoreIndex=True)\n", - " start = datetime.now()\n", - " links_no_index = map_request(url, True)\n", - " time_no_index = (datetime.now() - start).total_seconds()\n", - " \n", - " map_times_no_cache.append(time_no_index)\n", - " map_url_counts_no_cache.append(len(links_no_index))\n", - " print(f\" No index: {time_no_index:.2f}s, {len(links_no_index)} URLs\")\n", - " \n", - " # With index (ignoreIndex=False)\n", - " start = datetime.now()\n", - " links_indexed = map_request(url, False)\n", - " time_indexed = (datetime.now() - start).total_seconds()\n", - " \n", - " map_times_cached.append(time_indexed)\n", - " map_url_counts_cached.append(len(links_indexed))\n", - " print(f\" With index: {time_indexed:.2f}s, {len(links_indexed)} URLs\")" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "id": "2fa88f5d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MAP RESULTS:\n", - "Average time (no cache): 6.51s\n", - "Average time (cached): 7.09s\n", - "Time speedup: 0.92x faster with cache\n", - "Average URLs found (no cache): 3488.6\n", - "Average URLs found (cached): 3587.5\n", - "URL difference: +98.9 URLs with cache\n", - "URL percentage: 102.8% of no-cache results\n", - "✅ Cache finds MORE URLs\n" - ] - } - ], - "source": [ - "# Calculate averages\n", - "avg_time_no_cache = statistics.mean(map_times_no_cache)\n", - "avg_time_cached = statistics.mean(map_times_cached)\n", - "avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)\n", - "avg_urls_cached = statistics.mean(map_url_counts_cached)\n", - "\n", - "time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0\n", - "url_difference = avg_urls_cached - avg_urls_no_cache\n", - "url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0\n", - "\n", - "print(\"MAP RESULTS:\")\n", - "print(f\"Average time (no cache): {avg_time_no_cache:.2f}s\")\n", - "print(f\"Average time (cached): {avg_time_cached:.2f}s\")\n", - "print(f\"Time speedup: {time_speedup:.2f}x faster with cache\")\n", - "print(f\"Average URLs found (no cache): {avg_urls_no_cache:.1f}\")\n", - "print(f\"Average URLs found (cached): {avg_urls_cached:.1f}\")\n", - "print(f\"URL difference: {url_difference:+.1f} URLs with cache\")\n", - "print(f\"URL percentage: {url_percentage:.1f}% of no-cache results\")\n", - "\n", - "if url_difference > 0:\n", - " print(\"✅ Cache finds MORE URLs\")\n", - "elif url_difference < 0:\n", - " print(\"⚠️ Cache finds FEWER URLs\")\n", - "else:\n", - " print(\"➡️ Cache finds SAME number of URLs\")" - ] - }, - { - "cell_type": "markdown", - "id": "e5ee2116", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}