feat: use url split columns

Gergő Móricz 2025-05-30 13:56:28 +02:00
parent 2fd31174fb
commit 96c753f9a9
5 changed files with 10 additions and 4 deletions

View File

@@ -65,6 +65,7 @@ describe("Queue Concurrency Integration", () => {
fastMode: false,
blockAds: true,
maxAge: 0,
+ dontStoreInCache: false,
};
beforeEach(() => {
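
For orientation, a hedged sketch of the options fixture this test hunk updates. The type name is invented for illustration, and treating dontStoreInCache as a newly threaded scrape option is an assumption; only the field names and defaults come from the diff, and the real options object likely has more fields than the hunk shows.

// Hypothetical shape of the scrape-options fixture (illustration only).
type ScrapeOptionsFixture = {
  fastMode: boolean;
  blockAds: boolean;
  maxAge: number;
  dontStoreInCache: boolean; // new default added to fixtures in this commit
};

const fixture: ScrapeOptionsFixture = {
  fastMode: false,
  blockAds: true,
  maxAge: 0,
  dontStoreInCache: false,
};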

View File

@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
import Redis from "ioredis";
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
import { getIndexQueue } from "../../services/queue-service";
- import { hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
+ import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@@ -165,6 +165,8 @@ export async function getMapResults({
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
}
+ const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
// Parallelize sitemap index query with search results
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
querySitemapIndex(url, abort),
@@ -172,7 +174,7 @@
index_supabase_service
.from("index")
.select("resolved_url")
- .overlaps("url_splits_hash", [await hashURL(normalizeURLForIndex(url))])
+ .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
.limit(limit)
) : Promise.resolve({ data: [], error: null }),
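
For context on the hunk above, a hedged sketch of how the new per-depth columns appear to be queried. It assumes generateURLSplits returns one URL per path depth (shallowest first, full URL last) and that hashURL is synchronous here, as the un-awaited mapped call above suggests; neither is confirmed by the diff.

// Hypothetical illustration. If generateURLSplits("https://example.com/a/b")
// yields ["https://example.com/", "https://example.com/a", "https://example.com/a/b"],
// then hashing each split gives one value per depth, and the deepest entry
// (the full URL) sits at index urlSplitsHash.length - 1.
const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
const depth = urlSplitsHash.length - 1;

// The map query then pins the exact URL with a scalar equality filter on the
// matching depth column, replacing the old array-overlap on url_splits_hash.
const query = index_supabase_service
  .from("index")
  .select("resolved_url")
  .eq(`url_split_${depth}_hash`, urlSplitsHash[depth]);

Plausibly, the point of the change is that an equality filter on a scalar column can be served by an ordinary per-column btree index, whereas the previous .overlaps filter on an array column generally needs a GIN index; the commit message ("use url split columns") is consistent with that reading, though the diff itself does not say.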

View File

@@ -132,6 +132,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
fastMode: false,
blockAds: false,
maxAge: 0,
+ dontStoreInCache: false,
},
}, logger, costTracking, acuc?.flags ?? null);
return response.length > 0 ? response : [];

View File

@@ -68,6 +68,10 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
location_country: meta.options.location?.country ?? null,
location_languages: meta.options.location?.languages ?? null,
status: document.metadata.statusCode,
+ ...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({
+   ...a,
+   [`url_split_${i}_hash`]: x,
+ }), {})),
});
if (error) {
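
The spread-reduce in the hunk above builds one insert field per split depth. A minimal, self-contained reproduction of just that expression, with made-up inputs:

// With hypothetical hashes for three splits...
const urlSplitsHash = ["h0", "h1", "h2"];

// ...the reduce expands to:
// { url_split_0_hash: "h0", url_split_1_hash: "h1", url_split_2_hash: "h2" }
// slice(0, 10) caps the object at ten fields, url_split_0_hash through
// url_split_9_hash, so splits past the tenth are simply dropped; this
// suggests the index table defines exactly ten such columns.
const splitColumns = urlSplitsHash.slice(0, 10).reduce(
  (a, x, i) => ({
    ...a,
    [`url_split_${i}_hash`]: x,
  }),
  {} as Record<string, string>,
);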

View File

@@ -1242,8 +1242,6 @@ async function processJob(job: Job & { id: string }, token: string) {
);
}
indexJob(job, doc);
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, true);