improve fns

commit c75fad5e79
parent 6ba57306c3
@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
 import Redis from "ioredis";
 import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
 import { getIndexQueue } from "../../services/queue-service";
-import { generateURLSplits, hashURL, index_supabase_service, useIndex as globalUseIndex } from "../../services/index";
+import { queryIndexAtSplitLevel } from "../../services/index";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@@ -45,25 +45,11 @@ interface MapResult {
 }
 
 async function queryIndex(url: string, limit: number, useIndex: boolean): Promise<string[]> {
-  if (!globalUseIndex || !useIndex || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true") {
+  if (!useIndex) {
     return [];
   }
 
-  const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
-
-  const { data, error } = await index_supabase_service
-    .from("index")
-    .select("resolved_url")
-    .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
-    .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
-    .limit(limit)
-
-  if (error) {
-    logger.warn("Error querying index", { error });
-    return [];
-  }
-
-  return (data ?? []).map((x) => x.resolved_url);
+  return await queryIndexAtSplitLevel(url, limit);
 }
 
 export async function getMapResults({
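With the inline Supabase query gone, this controller-level helper keeps only the per-request useIndex opt-out; the global useIndex flag and the FIRECRAWL_INDEX_WRITE_ONLY guard move into queryIndexAtSplitLevel (next hunk). A minimal illustrative call, inside an async context, with placeholder arguments:

    // Placeholder URL and limit, not values from the repo.
    const indexedLinks: string[] = await queryIndex("https://example.com/blog", 500, true);
    // -> [] when the request opts out of the index; otherwise whatever
    //    queryIndexAtSplitLevel resolves for this URL.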
@@ -214,3 +214,28 @@ export async function processIndexInsertJobs() {
 export async function getIndexInsertQueueLength(): Promise<number> {
   return await redisEvictConnection.llen(INDEX_INSERT_QUEUE_KEY) ?? 0;
 }
+
+export async function queryIndexAtSplitLevel(url: string, limit: number): Promise<string[]> {
+  if (!useIndex || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true") {
+    return [];
+  }
+
+  const urlObj = new URL(url);
+  urlObj.search = "";
+
+  const urlSplitsHash = generateURLSplits(urlObj.href).map(x => hashURL(x));
+
+  const { data, error } = await index_supabase_service
+    .from("index")
+    .select("resolved_url")
+    .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
+    .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
+    .limit(limit)
+
+  if (error) {
+    logger.warn("Error querying index", { error, url, limit });
+    return [];
+  }
+
+  return [...new Set((data ?? []).map((x) => x.resolved_url))];
+}
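A usage sketch for the new shared helper; the import path mirrors the map-controller hunk above, and the URL and limit are placeholders:

    import { queryIndexAtSplitLevel } from "../../services/index";

    async function recentlyIndexedUnder(prefix: string): Promise<string[]> {
      // Deduplicated resolved_url values recorded in the index during the last
      // two days for the deepest split of the query-stripped URL; returns []
      // when the index is disabled, in write-only mode
      // (FIRECRAWL_INDEX_WRITE_ONLY=true), or when the Supabase query errors.
      return await queryIndexAtSplitLevel(prefix, 100);
    }

    // e.g. const links = await recentlyIndexedUnder("https://example.com/docs");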
@@ -86,7 +86,7 @@ import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
 import { RateLimiterMode } from "../types";
 import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
 import { redisEvictConnection } from "./redis";
-import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index";
+import { generateURLSplits, queryIndexAtSplitLevel } from "./index";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import type { Logger } from "winston";
 
@@ -916,29 +916,20 @@ const workerFun = async (
 };
 
 async function kickoffGetIndexLinks(sc: StoredCrawl, crawler: WebCrawler, url: string) {
+  if (sc.crawlerOptions.ignoreSitemap) {
+    return [];
+  }
+
   const trimmedURL = new URL(url);
   trimmedURL.search = "";
 
-  const urlSplits = generateURLSplits(trimmedURL.href).map(x => hashURL(x));
-
-  const index = (sc.crawlerOptions.ignoreSitemap || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true" || !useIndex)
-    ? []
-    : sc.crawlerOptions.allowBackwardCrawling
-      ? (await index_supabase_service
-        .from("index")
-        .select("resolved_url")
-        .eq("url_split_0_hash", urlSplits[0])
-        .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
-        .limit(sc.crawlerOptions.limit ?? 100)).data ?? []
-      : (await index_supabase_service
-        .from("index")
-        .select("resolved_url")
-        .eq("url_split_" + (urlSplits.length - 1) + "_hash", urlSplits[urlSplits.length - 1])
-        .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
-        .limit(sc.crawlerOptions.limit ?? 100)).data ?? [];
+  const index = await queryIndexAtSplitLevel(
+    sc.crawlerOptions.allowBackwardCrawling ? generateURLSplits(trimmedURL.href)[0] : trimmedURL.href,
+    sc.crawlerOptions.limit ?? 100,
+  );
 
   const validIndexLinks = crawler.filterLinks(
-    [...new Set(index.map(x => x.resolved_url))].filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
+    index.filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
     sc.crawlerOptions.limit ?? 100,
     sc.crawlerOptions.maxDepth ?? 10,
     false,
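The two hand-rolled Supabase branches here collapse into one call whose first argument picks the lookup level, so the read policy (useIndex, write-only mode, two-day window, dedup) now lives in a single place. A sketch of that argument selection, assuming, as the removed url_split_0_hash branch suggests, that split 0 is the broadest (domain-level) prefix:

    // Illustrative restatement of the ternary in the hunk above.
    const lookupTarget = sc.crawlerOptions.allowBackwardCrawling
      ? generateURLSplits(trimmedURL.href)[0] // broadest split: allow links outside the starting path
      : trimmedURL.href;                      // exact query-stripped URL: stay under the starting path
    const index = await queryIndexAtSplitLevel(lookupTarget, sc.crawlerOptions.limit ?? 100);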