Nick: misc improvements

Nicolas 2025-01-21 16:57:45 -03:00
parent ac0d10c451
commit 3604f2a3ae
5 changed files with 51 additions and 19 deletions

View File

@@ -156,13 +156,21 @@ export async function getMapResults({
   }

   // Parallelize sitemap index query with search results
-  const [sitemapIndexUrls, ...searchResults] = await Promise.all([
+  const [sitemapIndexResult, ...searchResults] = await Promise.all([
     querySitemapIndex(url),
     ...(cachedResult ? [] : pagePromises),
   ]);

-  // Only query sitemap if index has less than 100 links
-  if (!ignoreSitemap && sitemapIndexUrls.length < 100) {
+  const twoDaysAgo = new Date();
+  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
+
+  // If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
+  if (
+    !ignoreSitemap &&
+    (sitemapIndexResult.urls.length < 100 ||
+      new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
+  ) {
     await crawler.tryGetSitemap(urls => {
       links.push(...urls);
     }, true, false, 30000);

@@ -197,7 +205,7 @@ export async function getMapResults({
   }

   // Add sitemap-index URLs
-  links.push(...sitemapIndexUrls);
+  links.push(...sitemapIndexResult.urls);

   // Perform cosine similarity between the search query and the list of links
   if (search) {
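The map controller now treats the sitemap index as a result object and only refetches the sitemap when the cached index is small (fewer than 100 URLs) or more than two days old. A minimal standalone sketch of that predicate follows; the SitemapIndexResult type and shouldRefetchSitemap helper are assumptions inferred from the diff above, not exports of the repo.

// Illustrative only: type and helper names are assumptions based on the diff,
// not the repo's actual exports.
interface SitemapIndexResult {
  urls: string[];
  lastUpdated: Date | string;
}

function shouldRefetchSitemap(
  result: SitemapIndexResult,
  ignoreSitemap: boolean,
  now: Date = new Date(),
): boolean {
  if (ignoreSitemap) return false;
  const twoDaysAgo = new Date(now);
  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
  // Refetch when the cached index has few URLs or is older than two days.
  return result.urls.length < 100 || new Date(result.lastUpdated) < twoDaysAgo;
}

Keeping the "small or stale" decision in one predicate like this is just a way to read the controller's intent at a glance; the actual change inlines the same condition.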

View File

@@ -16,15 +16,20 @@ async function querySitemapIndexFunction(url: string) {
     try {
       const { data, error } = await supabase_service
         .from("crawl_maps")
-        .select("urls")
-        .eq("origin_url", originUrl);
+        .select("urls, updated_at")
+        .eq("origin_url", originUrl)
+        .order("updated_at", { ascending: false });

       if (error) {
         throw error;
       }

+      if (!data || data.length === 0) {
+        return { urls: [], lastUpdated: new Date(0) };
+      }
+
       const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
-      return allUrls;
+      return { urls: allUrls, lastUpdated: data[0].updated_at };
     } catch (error) {
       logger.error("(sitemap-index) Error querying the index", {
@@ -33,12 +38,12 @@ async function querySitemapIndexFunction(url: string) {
       });

       if (attempt === 3) {
-        return [];
+        return { urls: [], lastUpdated: new Date(0) };
       }
     }
   }

-  return [];
+  return { urls: [], lastUpdated: new Date(0) };
 }

-export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
+export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
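Every exit path of querySitemapIndexFunction, including the error and withAuth fallbacks, now returns the same { urls, lastUpdated } shape, with new Date(0) as the sentinel for "no cached data". A hedged sketch of how a caller might use that shape; the SitemapIndex type and hasFreshIndex helper are illustrative assumptions, not code from the repo.

// Assumed shape, inferred from the return values in the diff above.
type SitemapIndex = { urls: string[]; lastUpdated: Date | string };

// Hypothetical caller: decides whether the cached index is usable.
function hasFreshIndex(index: SitemapIndex, maxAgeMs: number): boolean {
  const updated = new Date(index.lastUpdated).getTime();
  // new Date(0) (the epoch) is the "no data / error" sentinel used above,
  // so it always fails the freshness check and forces a refetch.
  return index.urls.length > 0 && Date.now() - updated < maxAgeMs;
}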

View File

@@ -91,7 +91,16 @@ async function processBatch() {
   const inserts: CrawlMapRecord[] = [];
   const duplicatesToDelete: string[] = [];

+  // Track processed origins to avoid duplicates in the same batch
+  const processedOrigins = new Set<string>();
+
   for (const op of operations) {
+    // Skip if we've already processed this origin in this batch
+    if (processedOrigins.has(op.originUrl)) {
+      continue;
+    }
+    processedOrigins.add(op.originUrl);
+
     const existingForOrigin = mapsByOrigin.get(op.originUrl) || [];

     if (existingForOrigin.length > 0) {
@@ -110,7 +119,7 @@ async function processBatch() {
       ];

       updates.push({
-        id: mostRecent.id, // Add id to ensure we update the correct record
+        id: mostRecent.id,
         origin_url: op.originUrl,
         urls: mergedUrls,
         num_urls: mergedUrls.length,
@@ -156,14 +165,19 @@ async function processBatch() {
     logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
       origins: updates.map((u) => u.origin_url),
     });

-    const { error: updateError } = await supabase_service
-      .from("crawl_maps")
-      .upsert(updates);
-
-    if (updateError) {
-      logger.error("Failed to batch update crawl maps", {
-        error: updateError,
-      });
-    }
+    // Process updates one at a time to avoid conflicts
+    for (const update of updates) {
+      const { error: updateError } = await supabase_service
+        .from("crawl_maps")
+        .upsert(update);
+
+      if (updateError) {
+        logger.error("Failed to update crawl map", {
+          error: updateError,
+          origin: update.origin_url
+        });
+      }
+    }
   }
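Two behavioural changes land in processBatch: each origin is handled at most once per batch, and updates are upserted one row at a time instead of as a single batch call, trading one round trip per origin for per-row error reporting. A minimal sketch of that dedup-then-sequential pattern; CrawlMapUpdate and upsertRow are hypothetical stand-ins for the repo's record type and its Supabase call.

// Sketch only: the record type and upsertRow are stand-ins, kept abstract on purpose.
type CrawlMapUpdate = {
  id: string;
  origin_url: string;
  urls: string[];
  num_urls: number;
};

async function applyUpdatesSequentially(
  updates: CrawlMapUpdate[],
  upsertRow: (row: CrawlMapUpdate) => Promise<{ error: unknown | null }>,
): Promise<void> {
  const processedOrigins = new Set<string>();
  for (const update of updates) {
    // Mirror the batch dedup: only the first update per origin is applied.
    if (processedOrigins.has(update.origin_url)) continue;
    processedOrigins.add(update.origin_url);

    const { error } = await upsertRow(update);
    if (error) {
      // Per-row failures are reported with their origin instead of aborting the batch.
      console.error("Failed to update crawl map", { error, origin: update.origin_url });
    }
  }
}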

View File

@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp  # noqa

-__version__ = "1.10.1"
+__version__ = "1.10.2"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -51,6 +51,11 @@ class FirecrawlApp:
         schema_: Optional[Any] = pydantic.Field(None, alias='schema')
         system_prompt: Optional[str] = None
         allow_external_links: Optional[bool] = False
+        enable_web_search: Optional[bool] = False
+        # Just for backwards compatibility
+        enableWebSearch: Optional[bool] = False

     class ExtractResponse(pydantic.BaseModel):
         """