mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 07:05:57 +08:00
feat: use url split columns
This commit is contained in:
parent
2fd31174fb
commit
96c753f9a9
@ -65,6 +65,7 @@ describe("Queue Concurrency Integration", () => {
|
|||||||
fastMode: false,
|
fastMode: false,
|
||||||
blockAds: true,
|
blockAds: true,
|
||||||
maxAge: 0,
|
maxAge: 0,
|
||||||
|
dontStoreInCache: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
|
@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
|
|||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
||||||
import { getIndexQueue } from "../../services/queue-service";
|
import { getIndexQueue } from "../../services/queue-service";
|
||||||
import { hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
|
import { generateURLSplits, hashURL, index_supabase_service, normalizeURLForIndex, useIndex } from "../../services/index";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
@ -165,6 +165,8 @@ export async function getMapResults({
|
|||||||
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
|
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const urlSplitsHash = generateURLSplits(url).map(x => hashURL(x));
|
||||||
|
|
||||||
// Parallelize sitemap index query with search results
|
// Parallelize sitemap index query with search results
|
||||||
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
|
const [sitemapIndexResult, { data: indexResults, error: indexError }, ...searchResults] = await Promise.all([
|
||||||
querySitemapIndex(url, abort),
|
querySitemapIndex(url, abort),
|
||||||
@ -172,7 +174,7 @@ export async function getMapResults({
|
|||||||
index_supabase_service
|
index_supabase_service
|
||||||
.from("index")
|
.from("index")
|
||||||
.select("resolved_url")
|
.select("resolved_url")
|
||||||
.overlaps("url_splits_hash", [await hashURL(normalizeURLForIndex(url))])
|
.eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
|
||||||
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
|
.gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
|
||||||
.limit(limit)
|
.limit(limit)
|
||||||
) : Promise.resolve({ data: [], error: null }),
|
) : Promise.resolve({ data: [], error: null }),
|
||||||
|
@ -132,6 +132,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
|||||||
fastMode: false,
|
fastMode: false,
|
||||||
blockAds: false,
|
blockAds: false,
|
||||||
maxAge: 0,
|
maxAge: 0,
|
||||||
|
dontStoreInCache: false,
|
||||||
},
|
},
|
||||||
}, logger, costTracking, acuc?.flags ?? null);
|
}, logger, costTracking, acuc?.flags ?? null);
|
||||||
return response.length > 0 ? response : [];
|
return response.length > 0 ? response : [];
|
||||||
|
@ -68,6 +68,10 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
|||||||
location_country: meta.options.location?.country ?? null,
|
location_country: meta.options.location?.country ?? null,
|
||||||
location_languages: meta.options.location?.languages ?? null,
|
location_languages: meta.options.location?.languages ?? null,
|
||||||
status: document.metadata.statusCode,
|
status: document.metadata.statusCode,
|
||||||
|
...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({
|
||||||
|
...a,
|
||||||
|
[`url_split_${i}_hash`]: x,
|
||||||
|
}), {})),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
|
@ -1242,8 +1242,6 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
indexJob(job, doc);
|
|
||||||
|
|
||||||
logger.debug("Declaring job as done...");
|
logger.debug("Declaring job as done...");
|
||||||
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user