diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 0de1c53c..a7a47d05 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
 import Redis from "ioredis";
 import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
 import { getIndexQueue } from "../../services/queue-service";
-import { generateURLSplits, hashURL, index_supabase_service, useIndex as globalUseIndex } from "../../services/index";
+import { queryIndexAtSplitLevel } from "../../services/index";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@@ -45,25 +45,11 @@ interface MapResult {
 }
 
 async function queryIndex(url: string, limit: number, useIndex: boolean): Promise<string[]> {
-  if (!globalUseIndex || !useIndex || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true") {
+  if (!useIndex) {
     return [];
   }
 
-  const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
-
-  const { data, error } = await index_supabase_service
-    .from("index")
-    .select("resolved_url")
-    .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
-    .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
-    .limit(limit)
-
-  if (error) {
-    logger.warn("Error querying index", { error });
-    return [];
-  }
-
-  return (data ?? []).map((x) => x.resolved_url);
+  return await queryIndexAtSplitLevel(url, limit);
 }
 
 export async function getMapResults({
diff --git a/apps/api/src/services/index.ts b/apps/api/src/services/index.ts
index 85b686d0..df7969de 100644
--- a/apps/api/src/services/index.ts
+++ b/apps/api/src/services/index.ts
@@ -214,3 +214,28 @@ export async function processIndexInsertJobs() {
 export async function getIndexInsertQueueLength(): Promise<number> {
   return await redisEvictConnection.llen(INDEX_INSERT_QUEUE_KEY) ?? 0;
 }
+
+export async function queryIndexAtSplitLevel(url: string, limit: number): Promise<string[]> {
+  if (!useIndex || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true") {
+    return [];
+  }
+
+  const urlObj = new URL(url);
+  urlObj.search = "";
+
+  const urlSplitsHash = generateURLSplits(urlObj.href).map(x => hashURL(x));
+
+  const { data, error } = await index_supabase_service
+    .from("index")
+    .select("resolved_url")
+    .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
+    .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
+    .limit(limit)
+
+  if (error) {
+    logger.warn("Error querying index", { error, url, limit });
+    return [];
+  }
+
+  return [...new Set((data ?? []).map((x) => x.resolved_url))];
+}
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 791a945f..2a1bdf66 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -86,7 +86,7 @@ import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
 import { RateLimiterMode } from "../types";
 import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
 import { redisEvictConnection } from "./redis";
-import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index";
+import { generateURLSplits, queryIndexAtSplitLevel } from "./index";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import type { Logger } from "winston";
 
@@ -916,29 +916,20 @@ const workerFun = async (
 };
 
 async function kickoffGetIndexLinks(sc: StoredCrawl, crawler: WebCrawler, url: string) {
+  if (sc.crawlerOptions.ignoreSitemap) {
+    return [];
+  }
+
   const trimmedURL = new URL(url);
   trimmedURL.search = "";
 
-  const urlSplits = generateURLSplits(trimmedURL.href).map(x => hashURL(x));
+  const index = await queryIndexAtSplitLevel(
+    sc.crawlerOptions.allowBackwardCrawling ? generateURLSplits(trimmedURL.href)[0] : trimmedURL.href,
+    sc.crawlerOptions.limit ?? 100,
+  );
 
-  const index = (sc.crawlerOptions.ignoreSitemap || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true" || !useIndex)
-    ? []
-    : sc.crawlerOptions.allowBackwardCrawling
-      ? (await index_supabase_service
-        .from("index")
-        .select("resolved_url")
-        .eq("url_split_0_hash", urlSplits[0])
-        .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
-        .limit(sc.crawlerOptions.limit ?? 100)).data ?? []
-      : (await index_supabase_service
-        .from("index")
-        .select("resolved_url")
-        .eq("url_split_" + (urlSplits.length - 1) + "_hash", urlSplits[urlSplits.length - 1])
-        .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
-        .limit(sc.crawlerOptions.limit ?? 100)).data ?? [];
-
   const validIndexLinks = crawler.filterLinks(
-    [...new Set(index.map(x => x.resolved_url))].filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
+    index.filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
     sc.crawlerOptions.limit ?? 100,
     sc.crawlerOptions.maxDepth ?? 10,
     false,