Nick: fixed

Nicolas 2025-01-03 22:50:53 -03:00
parent a4f7c38834
commit c655c6859f
2 changed files with 63 additions and 41 deletions

View File

@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
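
For reference, a quick usage sketch of the new helper (illustration only, not part of the commit; the relative import path is assumed): it strips the scheme, a leading "www.", and a single trailing slash, so URL variants of the same page collapse to one canonical key before being stored in the crawl map.

import { normalizeUrl } from "./canonical-url"; // path assumed for illustration

// All three variants collapse to the same canonical key.
console.log(normalizeUrl("https://www.example.com/"));  // "example.com"
console.log(normalizeUrl("http://example.com"));        // "example.com"
console.log(normalizeUrl("https://example.com/docs/")); // "example.com/docs"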

View File

@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 
 configDotenv();
@@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
-    // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
-
-    // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
-      try {
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
-          .single();
-
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
-
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
-            })
-            .eq('origin_url', sc.originUrl);
-
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
-            });
-
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
-          }
-        }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
-      }
-    }
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);