feat: use url split columns

Gergő Móricz 2025-05-30 13:56:28 +02:00
parent 2fd31174fb
commit 96c753f9a9
5 changed files with 10 additions and 4 deletions

View File

@@ -65,6 +65,7 @@ describe("Queue Concurrency Integration", () => {
fastMode: false,
blockAds: true,
maxAge: 0,
+ dontStoreInCache: false,
};
beforeEach(() => {
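
For orientation, a hedged sketch of the options fixture this test hunk updates. The type name is invented for illustration, and treating dontStoreInCache as a newly threaded scrape option is an assumption; only the field names and defaults come from the diff, and the real options object likely has more fields than the hunk shows.

// Hypothetical shape of the scrape-options fixture (illustration only).
type ScrapeOptionsFixture = {
  fastMode: boolean;
  blockAds: boolean;
  maxAge: number;
  dontStoreInCache: boolean; // new default added to fixtures in this commit
};

const fixture: ScrapeOptionsFixture = {
  fastMode: false,
  blockAds: true,
  maxAge: 0,
  dontStoreInCache: false,
};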

View File

@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
import Redis from "ioredis";
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
import { getIndexQueue } from "../../services/queue-service";
- import { hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
+ import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@@ -165,6 +165,8 @@ export async function getMapResults({
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
}
+ const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
// Parallelize sitemap index query with search results
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
querySitemapIndex(url, abort),
@@ -172,7 +174,7 @@
index_supabase_service
.from("index")
.select("resolved_url")
- .overlaps("url_splits_hash", [await hashURL(normalizeURLForIndex(url))])
+ .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
.limit(limit)
) : Promise.resolve({ data: [], error: null }),
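
For context on the hunk above, a hedged sketch of how the new per-depth columns appear to be queried. It assumes generateURLSplits returns one URL per path depth (shallowest first, full URL last) and that hashURL is synchronous here, as the un-awaited mapped call above suggests; neither is confirmed by the diff.

// Hypothetical illustration. If generateURLSplits("https://example.com/a/b")
// yields ["https://example.com/", "https://example.com/a", "https://example.com/a/b"],
// then hashing each split gives one value per depth, and the deepest entry
// (the full URL) sits at index urlSplitsHash.length - 1.
const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
const depth = urlSplitsHash.length - 1;

// The map query then pins the exact URL with a scalar equality filter on the
// matching depth column, replacing the old array-overlap on url_splits_hash.
const query = index_supabase_service
  .from("index")
  .select("resolved_url")
  .eq(`url_split_${depth}_hash`, urlSplitsHash[depth]);

Plausibly, the point of the change is that an equality filter on a scalar column can be served by an ordinary per-column btree index, whereas the previous .overlaps filter on an array column generally needs a GIN index; the commit message ("use url split columns") is consistent with that reading, though the diff itself does not say.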

View File

@@ -132,6 +132,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
fastMode: false,
blockAds: false,
maxAge: 0,
+ dontStoreInCache: false,
},
}, logger, costTracking, acuc?.flags ?? null);
return response.length > 0 ? response : [];

View File

@@ -68,6 +68,10 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
location_country: meta.options.location?.country ?? null,
location_languages: meta.options.location?.languages ?? null,
status: document.metadata.statusCode,
+ ...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({
+   ...a,
+   [`url_split_${i}_hash`]: x,
+ }), {})),
});
if (error) {
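
The spread-reduce in the hunk above builds one insert field per split depth. A minimal, self-contained reproduction of just that expression, with made-up inputs:

// With hypothetical hashes for three splits...
const urlSplitsHash = ["h0", "h1", "h2"];

// ...the reduce expands to:
// { url_split_0_hash: "h0", url_split_1_hash: "h1", url_split_2_hash: "h2" }
// slice(0, 10) caps the object at ten fields, url_split_0_hash through
// url_split_9_hash, so splits past the tenth are simply dropped; this
// suggests the index table defines exactly ten such columns.
const splitColumns = urlSplitsHash.slice(0, 10).reduce(
  (a, x, i) => ({
    ...a,
    [`url_split_${i}_hash`]: x,
  }),
  {} as Record<string, string>,
);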

View File

@@ -1242,8 +1242,6 @@ async function processJob(job: Job & { id: string }, token: string) {
);
}
indexJob(job, doc);
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, true);