Nick: map improvements

Nicolas 2025-01-19 12:33:44 -03:00
parent c19af6ef42
commit 513f61a2d1
4 changed files with 92 additions and 62 deletions

View File

@@ -226,6 +226,8 @@ export async function getMapResults({
? links
: links.slice(0, limit);
//
return {
success: true,
links: linksToReturn,

View File

@@ -12,19 +12,33 @@ import { withAuth } from "../../lib/withAuth";
async function querySitemapIndexFunction(url: string) {
  const originUrl = normalizeUrlOnlyHostname(url);

  for (let attempt = 1; attempt <= 3; attempt++) {
    try {
      const { data, error } = await supabase_service
        .from("crawl_maps")
        .select("urls")
        .eq("origin_url", originUrl);

      if (error) {
        throw error;
      }

      const allUrls = data.map((entry) => entry.urls).flat();
      return allUrls;
    } catch (error) {
      logger.error("(sitemap-index) Error querying the index", {
        error,
        attempt,
      });
      if (attempt === 3) {
        return [];
      }
    }
  }

  return [];
}
export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
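For reference, the three-attempt loop above is a plain bounded-retry pattern: retry on any thrown error, log each failure, and fall back to an empty result after the last attempt. A minimal self-contained sketch of the same idea as a generic helper (the withRetries name and signature are illustrative, not part of this commit):

// Illustrative sketch only: a generic bounded-retry helper mirroring the
// 3-attempt loop in querySitemapIndexFunction above.
async function withRetries<T>(
  fn: () => Promise<T>,
  fallback: T,
  maxAttempts = 3,
): Promise<T> {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // The real code logs through the project logger; console keeps the
      // sketch dependency-free.
      console.error("attempt failed", { error, attempt });
      if (attempt === maxAttempts) {
        return fallback;
      }
    }
  }
  return fallback; // not reachable, but required for the return type
}

// Hypothetical usage (fetchUrls is a placeholder for the Supabase query):
// const urls = await withRetries(() => fetchUrls("example.com"), []);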

View File

@@ -0,0 +1,62 @@
import { logger } from "../../lib/logger";
import {
  normalizeUrl,
  normalizeUrlOnlyHostname,
} from "../../lib/canonical-url";
import { supabase_service } from "../supabase";

export async function saveCrawlMap(originUrl: string, visitedUrls: string[]) {
  originUrl = normalizeUrlOnlyHostname(originUrl);
  // Fire and forget the upload to Supabase
  try {
    // Standardize URLs to canonical form (https, no www)
    const standardizedUrls = [
      ...new Set(
        visitedUrls.map((url) => {
          return normalizeUrl(url);
        }),
      ),
    ];

    // First check if entry exists for this origin URL
    const { data: existingMap } = await supabase_service
      .from("crawl_maps")
      .select("urls")
      .eq("origin_url", originUrl)
      .single();

    if (existingMap) {
      // Merge URLs, removing duplicates
      const mergedUrls = [
        ...new Set([...existingMap.urls, ...standardizedUrls]),
      ];

      const { error } = await supabase_service
        .from("crawl_maps")
        .update({
          urls: mergedUrls,
          num_urls: mergedUrls.length,
          updated_at: new Date().toISOString(),
        })
        .eq("origin_url", originUrl);

      if (error) {
        logger.error("Failed to update crawl map", { error });
      }
    } else {
      // Insert new entry if none exists
      const { error } = await supabase_service.from("crawl_maps").insert({
        origin_url: originUrl,
        urls: standardizedUrls,
        num_urls: standardizedUrls.length,
        created_at: new Date().toISOString(),
        updated_at: new Date().toISOString(),
      });

      if (error) {
        logger.error("Failed to save crawl map", { error });
      }
    }
  } catch (error) {
    logger.error("Error saving crawl map", { error });
  }
}
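For illustration, the update path above reduces to a set union over normalized URLs: the existing row's urls and the newly standardized URLs are merged and de-duplicated before num_urls is recomputed. A standalone sketch of just that merge step (mergeCrawlUrls is a hypothetical helper, not part of this commit):

// Illustrative sketch only: de-duplicating union of two URL lists, as done
// when an existing crawl_maps row is found for the origin.
function mergeCrawlUrls(existing: string[], incoming: string[]): string[] {
  return [...new Set([...existing, ...incoming])];
}

// Example: duplicates collapse and first-seen order is preserved.
// mergeCrawlUrls(
//   ["https://example.com/a", "https://example.com/b"],
//   ["https://example.com/b", "https://example.com/c"],
// );
// -> ["https://example.com/a", "https://example.com/b", "https://example.com/c"]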

View File

@@ -61,6 +61,7 @@ import { supabase_service } from "../services/supabase";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
import { saveExtract, updateExtract } from "../lib/extract/extract-redis";
import { billTeam } from "./billing/credit_billing";
import { saveCrawlMap } from "./indexing/crawl-maps-index";
configDotenv();
@@ -102,58 +103,9 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
    job.data.crawlerOptions !== null &&
    originUrl
  ) {
    saveCrawlMap(originUrl, visitedUrls).catch((e) => {
      _logger.error("Error saving crawl map", { error: e });
    });
  }
})();
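Note that the new call site does not await saveCrawlMap; the .catch keeps a failed Supabase write from becoming an unhandled promise rejection while finishCrawlIfNeeded moves on. A minimal sketch of this fire-and-forget pattern, with illustrative names:

// Illustrative sketch only: start an async task without awaiting it, but
// always attach an error handler so rejections are logged, not lost.
function fireAndForget(task: () => Promise<unknown>, label: string): void {
  task().catch((error) => {
    console.error(`${label} failed`, { error });
  });
}

// Hypothetical usage at the call site shown above:
// fireAndForget(() => saveCrawlMap(originUrl, visitedUrls), "saveCrawlMap");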