mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 03:55:55 +08:00
feat: use url split columns
This commit is contained in:
parent
2fd31174fb
commit
96c753f9a9
@ -65,6 +65,7 @@ describe("Queue Concurrency Integration", () => {
|
||||
fastMode: false,
|
||||
blockAds: true,
|
||||
maxAge: 0,
|
||||
dontStoreInCache: false,
|
||||
};
|
||||
|
||||
beforeEach(() => {
|
||||
|
@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
|
||||
import Redis from "ioredis";
|
||||
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
||||
import { getIndexQueue } from "../../services/queue-service";
|
||||
import { hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
|
||||
import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
|
||||
|
||||
configDotenv();
|
||||
const redis = new Redis(process.env.REDIS_URL!);
|
||||
@ -165,6 +165,8 @@ export async function getMapResults({
|
||||
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
|
||||
}
|
||||
|
||||
const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
|
||||
|
||||
// Parallelize sitemap index query with search results
|
||||
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
|
||||
querySitemapIndex(url, abort),
|
||||
@ -172,7 +174,7 @@ export async function getMapResults({
|
||||
index_supabase_service
|
||||
.from("index")
|
||||
.select("resolved_url")
|
||||
.overlaps("url_splits_hash", [await hashURL(normalizeURLForIndex(url))])
|
||||
.eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
|
||||
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
|
||||
.limit(limit)
|
||||
) : Promise.resolve({ data: [], error: null }),
|
||||
|
@ -132,6 +132,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
fastMode: false,
|
||||
blockAds: false,
|
||||
maxAge: 0,
|
||||
dontStoreInCache: false,
|
||||
},
|
||||
}, logger, costTracking, acuc?.flags ?? null);
|
||||
return response.length > 0 ? response : [];
|
||||
|
@ -68,6 +68,10 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||
location_country: meta.options.location?.country ?? null,
|
||||
location_languages: meta.options.location?.languages ?? null,
|
||||
status: document.metadata.statusCode,
|
||||
...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({
|
||||
...a,
|
||||
[`url_split_${i}_hash`]: x,
|
||||
}), {})),
|
||||
});
|
||||
|
||||
if (error) {
|
||||
|
@ -1242,8 +1242,6 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
);
|
||||
}
|
||||
|
||||
indexJob(job, doc);
|
||||
|
||||
logger.debug("Declaring job as done...");
|
||||
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user