Nick: misc improvements

commit 3604f2a3ae
parent ac0d10c451
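The diff below touches four areas: the map controller now parallelizes the sitemap-index query and refreshes the sitemap when the stored index is small or stale; querySitemapIndexFunction returns the index URLs together with their last update time; the crawl-map batch writer deduplicates origins within a batch and upserts updates one row at a time; and the Python SDK bumps its version and adds an enable_web_search extract option.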
@@ -156,13 +156,21 @@ export async function getMapResults({
   }
 
   // Parallelize sitemap index query with search results
-  const [sitemapIndexUrls, ...searchResults] = await Promise.all([
+  const [sitemapIndexResult, ...searchResults] = await Promise.all([
     querySitemapIndex(url),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
-  // Only query sitemap if index has less than 100 links
-  if (!ignoreSitemap && sitemapIndexUrls.length < 100) {
+  const twoDaysAgo = new Date();
+  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
+
+
+  // If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
+  if (
+    !ignoreSitemap &&
+    (sitemapIndexResult.urls.length < 100 ||
+      new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
+  ) {
     await crawler.tryGetSitemap(urls => {
       links.push(...urls);
     }, true, false, 30000);
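The behavioral change in this hunk is that a stored sitemap index is no longer trusted indefinitely: it is refetched either when it holds fewer than 100 URLs or when its last update is more than two days old. A minimal sketch of that condition in isolation, assuming a { urls, lastUpdated } result shape (the type and helper names below are illustrative, not part of the diff):

// Illustrative helper; SitemapIndexResult and shouldRefetchSitemap are assumed names.
interface SitemapIndexResult {
  urls: string[];
  lastUpdated: Date | string;
}

function shouldRefetchSitemap(result: SitemapIndexResult, ignoreSitemap: boolean): boolean {
  const twoDaysAgo = new Date();
  twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
  // Refetch when the stored index is small (<100 URLs) or stale (>2 days old).
  return (
    !ignoreSitemap &&
    (result.urls.length < 100 || new Date(result.lastUpdated) < twoDaysAgo)
  );
}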
@@ -197,7 +205,7 @@ export async function getMapResults({
   }
 
   // Add sitemap-index URLs
-  links.push(...sitemapIndexUrls);
+  links.push(...sitemapIndexResult.urls);
 
   // Perform cosine similarity between the search query and the list of links
   if (search) {
@@ -16,15 +16,20 @@ async function querySitemapIndexFunction(url: string) {
    try {
      const { data, error } = await supabase_service
        .from("crawl_maps")
-        .select("urls")
-        .eq("origin_url", originUrl);
+        .select("urls, updated_at")
+        .eq("origin_url", originUrl)
+        .order("updated_at", { ascending: false });
 
      if (error) {
        throw error;
      }
 
+      if (!data || data.length === 0) {
+        return { urls: [], lastUpdated: new Date(0) };
+      }
+
      const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
-      return allUrls;
+      return { urls: allUrls, lastUpdated: data[0].updated_at };
 
    } catch (error) {
      logger.error("(sitemap-index) Error querying the index", {
@@ -33,12 +38,12 @@ async function querySitemapIndexFunction(url: string) {
      });
 
      if (attempt === 3) {
-        return [];
+        return { urls: [], lastUpdated: new Date(0) };
      }
    }
  }
 
-  return [];
+  return { urls: [], lastUpdated: new Date(0) };
 }
 
-export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
+export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
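With these changes every exit path of querySitemapIndexFunction, including the withAuth fallback, resolves to the same shape, so callers such as getMapResults can read urls and lastUpdated without guarding against a bare array. A sketch of that shape as seen by callers (the diff declares no explicit type; the annotation below is assumed):

// Assumed shape of the resolved value; not declared explicitly in the diff.
type SitemapIndexResult = {
  urls: string[];              // deduplicated, normalized URLs from all matching crawl_maps rows
  lastUpdated: Date | string;  // updated_at of the most recent row
};

// new Date(0) acts as a "never updated" sentinel, so any staleness check treats it as out of date.
const emptySitemapIndexResult: SitemapIndexResult = { urls: [], lastUpdated: new Date(0) };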
@@ -91,7 +91,16 @@ async function processBatch() {
   const inserts: CrawlMapRecord[] = [];
   const duplicatesToDelete: string[] = [];
 
+  // Track processed origins to avoid duplicates in the same batch
+  const processedOrigins = new Set<string>();
+
   for (const op of operations) {
+    // Skip if we've already processed this origin in this batch
+    if (processedOrigins.has(op.originUrl)) {
+      continue;
+    }
+    processedOrigins.add(op.originUrl);
+
     const existingForOrigin = mapsByOrigin.get(op.originUrl) || [];
 
     if (existingForOrigin.length > 0) {
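The new Set prevents two operations for the same origin from being applied in one batch, which previously could produce conflicting rows for that origin. A simplified sketch of the pattern on its own (Operation and dedupeByOrigin are illustrative names; the real loop also merges against existing rows):

// Simplified illustration of the per-batch dedupe added above.
type Operation = { originUrl: string; urls: string[] };

function dedupeByOrigin(operations: Operation[]): Operation[] {
  const processedOrigins = new Set<string>();
  const kept: Operation[] = [];
  for (const op of operations) {
    // Only the first operation per origin in a batch is processed; later ones are skipped.
    if (processedOrigins.has(op.originUrl)) continue;
    processedOrigins.add(op.originUrl);
    kept.push(op);
  }
  return kept;
}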
@@ -110,7 +119,7 @@ async function processBatch() {
      ];
 
      updates.push({
-        id: mostRecent.id, // Add id to ensure we update the correct record
+        id: mostRecent.id,
        origin_url: op.originUrl,
        urls: mergedUrls,
        num_urls: mergedUrls.length,
@@ -156,14 +165,19 @@ async function processBatch() {
    logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
      origins: updates.map((u) => u.origin_url),
    });
-    const { error: updateError } = await supabase_service
-      .from("crawl_maps")
-      .upsert(updates);
 
-    if (updateError) {
-      logger.error("Failed to batch update crawl maps", {
-        error: updateError,
-      });
+    // Process updates one at a time to avoid conflicts
+    for (const update of updates) {
+      const { error: updateError } = await supabase_service
+        .from("crawl_maps")
+        .upsert(update);
+
+      if (updateError) {
+        logger.error("Failed to update crawl map", {
+          error: updateError,
+          origin: update.origin_url
+        });
+      }
    }
  }
 
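Instead of a single bulk upsert(updates), each row is now upserted and awaited individually, so one failing or conflicting row no longer aborts the whole batch, and failures can be logged per origin. A standalone sketch of the pattern against supabase-js v2 (client setup and environment variable names are placeholders, not taken from this diff):

// Sketch only: serializes writes, trading throughput for per-row error isolation.
import { createClient } from "@supabase/supabase-js";

const supabase = createClient(process.env.SUPABASE_URL!, process.env.SUPABASE_SERVICE_TOKEN!);

type CrawlMapUpdate = { id?: string; origin_url: string; urls: string[]; num_urls: number };

async function upsertCrawlMapsOneAtATime(updates: CrawlMapUpdate[]) {
  for (const update of updates) {
    const { error } = await supabase.from("crawl_maps").upsert(update);
    if (error) {
      console.error("Failed to update crawl map", { error, origin: update.origin_url });
    }
  }
}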
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.10.1"
+__version__ = "1.10.2"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -51,6 +51,11 @@ class FirecrawlApp:
         schema_: Optional[Any] = pydantic.Field(None, alias='schema')
         system_prompt: Optional[str] = None
         allow_external_links: Optional[bool] = False
+        enable_web_search: Optional[bool] = False
+        # Just for backwards compatibility
+        enableWebSearch: Optional[bool] = False
+
+
 
     class ExtractResponse(pydantic.BaseModel):
         """
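On the SDK side, the new snake_case enable_web_search field sits alongside the camelCase enableWebSearch it supersedes, so older callers keep working. A hedged usage sketch, assuming the SDK's extract(urls, params) helper that these ExtractParams fields feed into (the exact call signature is not shown in this diff):

# Illustrative only; the extract() signature is assumed, and the API key is a placeholder.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.extract(
    ["https://example.com"],
    {
        "prompt": "List the product names mentioned on this page",
        "enable_web_search": True,   # new snake_case field
        # "enableWebSearch": True,   # still accepted for backwards compatibility
    },
)
print(result)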