mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-17 09:05:55 +08:00
clean up on map
This commit is contained in:
parent
7426e54e6c
commit
39dd721781
@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
|
|||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
||||||
import { getIndexQueue } from "../../services/queue-service";
|
import { getIndexQueue } from "../../services/queue-service";
|
||||||
import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex as globalUseIndex } from "../../services/index";
|
import { generateURLSplits, hashURL, index_supabase_service, useIndex as globalUseIndex } from "../../services/index";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
@ -44,6 +44,28 @@ interface MapResult {
|
|||||||
mapResults: MapDocument[];
|
mapResults: MapDocument[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetches `resolved_url` values from the "index" table for rows whose hash
 * column for the last URL split of `url` matches, restricted to rows created
 * within the last two days.
 *
 * The lookup is best-effort: a query error is logged as a warning and yields
 * an empty list instead of throwing.
 *
 * @param url - URL whose splits (via `generateURLSplits`) are hashed to build the filter.
 * @param limit - Maximum number of rows requested from the index.
 * @param useIndex - Per-call switch; when false the index is not queried.
 * @returns The `resolved_url` of every matching row, or `[]` when the index is
 *   disabled, the URL yields no splits, the query fails, or no data comes back.
 */
async function queryIndex(url: string, limit: number, useIndex: boolean): Promise<string[]> {
  // Reads are skipped when the index is disabled globally, disabled for this
  // call, or the deployment runs the index in write-only mode.
  if (!globalUseIndex || !useIndex || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true") {
    return [];
  }

  const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));

  // With no splits the filter below would target "url_split_-1_hash" with an
  // undefined value; skip the round-trip (and the misleading warning) entirely.
  const lastSplitIndex = urlSplitsHash.length - 1;
  if (lastSplitIndex < 0) {
    return [];
  }

  const { data, error } = await index_supabase_service
    .from("index")
    .select("resolved_url")
    // Match on the hash column that corresponds to the last (deepest-numbered) split.
    .eq("url_split_" + lastSplitIndex + "_hash", urlSplitsHash[lastSplitIndex])
    // Only consider rows created in the last 2 days.
    .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
    .limit(limit);

  if (error) {
    logger.warn("Error querying index", { error });
    return [];
  }

  return (data ?? []).map((x) => x.resolved_url);
}
|
||||||
|
|
||||||
export async function getMapResults({
|
export async function getMapResults({
|
||||||
url,
|
url,
|
||||||
search,
|
search,
|
||||||
@ -167,26 +189,15 @@ export async function getMapResults({
|
|||||||
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
|
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
|
||||||
}
|
}
|
||||||
|
|
||||||
const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
|
|
||||||
|
|
||||||
// Parallelize sitemap index query with search results
|
// Parallelize sitemap index query with search results
|
||||||
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
|
const [sitemapIndexResult, indexResults, ...searchResults] = await Promise.all([
|
||||||
querySitemapIndex(url, abort),
|
querySitemapIndex(url, abort),
|
||||||
globalUseIndex && useIndex && process.env.FIRECRAWL_INDEX_WRITE_ONLY !== "true" ? (
|
queryIndex(url, limit, useIndex),
|
||||||
index_supabase_service
|
|
||||||
.from("index")
|
|
||||||
.select("resolved_url")
|
|
||||||
.eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
|
|
||||||
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
|
|
||||||
.limit(limit)
|
|
||||||
) : Promise.resolve({ data: [], error: null }),
|
|
||||||
...(cachedResult ? [] : pagePromises),
|
...(cachedResult ? [] : pagePromises),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (indexError) {
|
if (indexResults.length > 0) {
|
||||||
logger.warn("Error querying index", { error: indexError });
|
links.push(...indexResults);
|
||||||
} else if (indexResults.length > 0) {
|
|
||||||
links.push(...indexResults.map((x) => x.resolved_url));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const twoDaysAgo = new Date();
|
const twoDaysAgo = new Date();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user