Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-05 11:10:40 +08:00)
Nick: fixed crawl maps index dedup

commit c7b219169b
parent 720a429115
@@ -1,5 +1,5 @@
 import { logger } from "../../lib/logger";
-import { normalizeUrlOnlyHostname } from "../../lib/canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../../lib/canonical-url";
 import { supabase_service } from "../../services/supabase";
 
 /**
@@ -23,7 +23,7 @@ async function querySitemapIndexFunction(url: string) {
       throw error;
     }
 
-    const allUrls = data.map((entry) => entry.urls).flat();
+    const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
     return allUrls;
 
   } catch (error) {
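
The net effect of these two hunks: sitemap-index URLs are now run through normalizeUrl before being deduplicated with a Set, so trivial variants of the same page collapse to one entry. A minimal sketch of the pattern, with a stand-in normalizeUrl (the real one lives in ../../lib/canonical-url; the rules below, dropping the fragment and trailing slash, are assumptions for illustration only):

// Stand-in for lib/canonical-url's normalizeUrl (assumed behavior, not a copy).
function normalizeUrl(raw: string): string {
  const u = new URL(raw);
  u.hash = ""; // drop #fragment
  let s = u.toString();
  if (s.endsWith("/")) s = s.slice(0, -1); // drop trailing slash
  return s;
}

const data = [
  { urls: ["https://example.com/a", "https://example.com/a/"] },
  { urls: ["https://example.com/a#top", "https://example.com/b"] },
];

// Same shape as the patched line: flatten, normalize, then Set-dedup.
const allUrls = [
  ...new Set(data.map((entry) => entry.urls).flat().map((url) => normalizeUrl(url))),
];
console.log(allUrls); // ["https://example.com/a", "https://example.com/b"]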
@@ -18,6 +18,15 @@ interface CrawlMapOperation {
   timestamp: string;
 }
 
+interface CrawlMapRecord {
+  id?: string;
+  origin_url: string;
+  urls: string[];
+  num_urls: number;
+  updated_at: string;
+  created_at?: string;
+}
+
 async function acquireLock(): Promise<boolean> {
   const redis = redisConnection;
   // Set lock with NX (only if it doesn't exist) and PX (millisecond expiry)
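
CrawlMapRecord moves to module scope (the copy previously declared inside processBatch is removed in the next hunk) and gains an optional id, so one type now describes both update and insert payloads. A hypothetical illustration using the interface added above; the id and URLs are invented:

// An update targets an existing row, so it carries the row's id.
const update: CrawlMapRecord = {
  id: "existing-row-id", // hypothetical id of the most recent row for this origin
  origin_url: "https://example.com",
  urls: ["https://example.com/a", "https://example.com/b"],
  num_urls: 2,
  updated_at: new Date().toISOString(),
};

// An insert has no id yet and sets created_at as well.
const insert: CrawlMapRecord = {
  origin_url: "https://example.org",
  urls: ["https://example.org/x"],
  num_urls: 1,
  created_at: new Date().toISOString(),
  updated_at: new Date().toISOString(),
};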
@@ -65,51 +74,83 @@ async function processBatch() {
     const origins = operations.map((op) => op.originUrl);
     const { data: existingMaps } = await supabase_service
       .from("crawl_maps")
-      .select("origin_url, urls")
-      .in("origin_url", origins);
+      .select("id, origin_url, urls, updated_at")
+      .in("origin_url", origins)
+      .order("updated_at", { ascending: false });
 
-    const existingMapsByOrigin = new Map(
-      existingMaps?.map((map) => [map.origin_url, map.urls]) || [],
-    );
-
-    // Prepare updates and inserts
-    interface CrawlMapRecord {
-      origin_url: string;
-      urls: string[];
-      num_urls: number;
-      updated_at: string;
-      created_at?: string;
-    }
+    // Group maps by origin and handle duplicates
+    const mapsByOrigin = new Map<string, any[]>();
+    existingMaps?.forEach((map) => {
+      const maps = mapsByOrigin.get(map.origin_url) || [];
+      maps.push(map);
+      mapsByOrigin.set(map.origin_url, maps);
+    });
 
+    // Handle duplicates and prepare updates
     const updates: CrawlMapRecord[] = [];
     const inserts: CrawlMapRecord[] = [];
+    const duplicatesToDelete: string[] = [];
 
     for (const op of operations) {
-      const existingUrls = existingMapsByOrigin.get(op.originUrl);
+      const existingForOrigin = mapsByOrigin.get(op.originUrl) || [];
 
-      if (existingUrls) {
-        // Merge URLs for update
+      if (existingForOrigin.length > 0) {
+        // Keep most recent entry and mark others for deletion
+        const [mostRecent, ...duplicates] = existingForOrigin;
+        if (duplicates.length > 0) {
+          duplicatesToDelete.push(...duplicates.map(d => d.id));
+        }
+
+        // Merge and deduplicate URLs
         const mergedUrls = [
-          ...new Set([...existingUrls, ...op.standardizedUrls]),
+          ...new Set([
+            ...mostRecent.urls,
+            ...op.standardizedUrls.map(url => normalizeUrl(url))
+          ])
         ];
 
         updates.push({
+          id: mostRecent.id, // Add id to ensure we update the correct record
          origin_url: op.originUrl,
           urls: mergedUrls,
           num_urls: mergedUrls.length,
           updated_at: op.timestamp,
         });
       } else {
-        // Prepare insert
+        // Prepare insert with deduplicated URLs
+        const deduplicatedUrls = [...new Set(op.standardizedUrls.map(url => normalizeUrl(url)))];
         inserts.push({
           origin_url: op.originUrl,
-          urls: op.standardizedUrls,
-          num_urls: op.standardizedUrls.length,
+          urls: deduplicatedUrls,
+          num_urls: deduplicatedUrls.length,
           created_at: op.timestamp,
           updated_at: op.timestamp,
         });
       }
     }
 
+    // Delete duplicate entries
+    if (duplicatesToDelete.length > 0) {
+      logger.info(`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`);
+
+      // Delete in batches of 100
+      for (let i = 0; i < duplicatesToDelete.length; i += 100) {
+        const batch = duplicatesToDelete.slice(i, i + 100);
+        const { error: deleteError } = await supabase_service
+          .from("crawl_maps")
+          .delete()
+          .in("id", batch);
+
+        if (deleteError) {
+          logger.error(`Failed to delete batch ${i/100 + 1} of duplicate crawl maps`, {
+            error: deleteError,
+            batchSize: batch.length,
+            startIndex: i
+          });
+        }
+      }
+    }
+
     // Execute batch operations
     if (updates.length > 0) {
       logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
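
Because the select is now ordered by updated_at descending, the first row in each origin's group is the newest, so destructuring [mostRecent, ...duplicates] cleanly splits the keeper from the rows to delete. A self-contained sketch of that selection, with invented rows that arrive pre-sorted the way the new .order() clause guarantees:

// Rows are invented; order within the array mimics .order("updated_at", { ascending: false }).
interface Row {
  id: string;
  origin_url: string;
  updated_at: string;
}

const rows: Row[] = [
  { id: "3", origin_url: "https://example.com", updated_at: "2025-01-03T00:00:00Z" },
  { id: "1", origin_url: "https://example.com", updated_at: "2025-01-01T00:00:00Z" },
  { id: "2", origin_url: "https://example.org", updated_at: "2025-01-02T00:00:00Z" },
];

// Group rows by origin, preserving the sorted order within each group.
const byOrigin = new Map<string, Row[]>();
for (const row of rows) {
  const group = byOrigin.get(row.origin_url) ?? [];
  group.push(row);
  byOrigin.set(row.origin_url, group);
}

// The first entry of each group is the newest; the rest are duplicates.
for (const [origin, group] of byOrigin) {
  const [mostRecent, ...duplicates] = group;
  console.log(origin, "keep", mostRecent.id, "delete", duplicates.map((d) => d.id));
}
// https://example.com keep 3 delete ["1"]
// https://example.org keep 2 delete []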
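
The deletion loop caps each .in("id", ...) call at 100 ids, keeping individual Supabase requests small. The same pattern in isolation, with a hypothetical deleteBatch standing in for the real Supabase call:

// Hypothetical stand-in for supabase_service.from("crawl_maps").delete().in("id", ids).
async function deleteBatch(ids: string[]): Promise<void> {
  console.log(`deleting ${ids.length} rows`);
}

async function deleteInBatches(ids: string[], batchSize = 100): Promise<void> {
  for (let i = 0; i < ids.length; i += batchSize) {
    // slice is clamped at the array end, so the last batch may be smaller
    await deleteBatch(ids.slice(i, i + batchSize));
  }
}

// 250 ids produce batches of 100, 100, and 50.
deleteInBatches(Array.from({ length: 250 }, (_, i) => String(i)));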