diff --git a/apps/api/src/__tests__/queue-concurrency-integration.test.ts b/apps/api/src/__tests__/queue-concurrency-integration.test.ts index fdc21eda..650a8bb1 100644 --- a/apps/api/src/__tests__/queue-concurrency-integration.test.ts +++ b/apps/api/src/__tests__/queue-concurrency-integration.test.ts @@ -65,6 +65,7 @@ describe("Queue Concurrency Integration", () => { fastMode: false, blockAds: true, maxAge: 0, + dontStoreInCache: false, }; beforeEach(() => { diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 7b1d3540..4c54f251 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -25,7 +25,7 @@ import { logger } from "../../lib/logger"; import Redis from "ioredis"; import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index"; import { getIndexQueue } from "../../services/queue-service"; -import { hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index"; +import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index"; configDotenv(); const redis = new Redis(process.env.REDIS_URL!); @@ -165,6 +165,8 @@ export async function getMapResults({ await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours } + const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x)); + // Parallelize sitemap index query with search results const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([ querySitemapIndex(url, abort), @@ -172,7 +174,7 @@ export async function getMapResults({ index_supabase_service .from("index") .select("resolved_url") - .overlaps("url_splits_hash", [await hashURL(normalizeURLForIndex(url))]) + .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1]) .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString()) .limit(limit) ) : Promise.resolve({ data: [], error: null }), diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index 46c9317e..2d2b5f91 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -132,6 +132,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { fastMode: false, blockAds: false, maxAge: 0, + dontStoreInCache: false, }, }, logger, costTracking, acuc?.flags ?? null); return response.length > 0 ? response : []; diff --git a/apps/api/src/scraper/scrapeURL/engines/index/index.ts b/apps/api/src/scraper/scrapeURL/engines/index/index.ts index 26b24e31..7a756a17 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index/index.ts @@ -68,6 +68,10 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) { location_country: meta.options.location?.country ?? null, location_languages: meta.options.location?.languages ?? null, status: document.metadata.statusCode, + ...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({ + ...a, + [`url_split_${i}_hash`]: x, + }), {})), }); if (error) { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 6653a543..a4a06dcc 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -1242,8 +1242,6 @@ async function processJob(job: Job & { id: string }, token: string) { ); } - indexJob(job, doc); - logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, true);