feat(index): batch insert

commit 18a7462fea
parent 369a8f6050

apps/api/src/controllers/v0/admin/index-queue-prometheus.ts | 12 ++++++++++++ (new file)
@@ -0,0 +1,12 @@
+import type { Request, Response } from "express";
+import { getIndexInsertQueueLength } from "../../../services";
+
+export async function indexQueuePrometheus(req: Request, res: Response) {
+  const queueLength = await getIndexInsertQueueLength();
+  res.setHeader("Content-Type", "text/plain");
+  res.send(`\
+# HELP firecrawl_index_queue_length The number of items in the index insert queue
+# TYPE firecrawl_index_queue_length gauge
+firecrawl_index_queue_length ${queueLength}
+`);
+}
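This controller emits the standard Prometheus text exposition format: a HELP line, a TYPE line declaring the metric a gauge, and the current sample. A scrape of the route would return a body like the following (the value 1342 is purely illustrative):

# HELP firecrawl_index_queue_length The number of items in the index insert queue
# TYPE firecrawl_index_queue_length gauge
firecrawl_index_queue_length 1342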
@@ -10,6 +10,7 @@ import { wrap } from "./v1";
 import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
 import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine";
 import { cclogController } from "../controllers/v0/admin/cclog";
+import { indexQueuePrometheus } from "../controllers/v0/admin/index-queue-prometheus";

 export const adminRouter = express.Router();

@@ -49,3 +50,8 @@ adminRouter.get(
   `/admin/${process.env.BULL_AUTH_KEY}/cclog`,
   wrap(cclogController),
 );
+
+adminRouter.get(
+  `/admin/${process.env.BULL_AUTH_KEY}/index-queue-prometheus`,
+  wrap(indexQueuePrometheus),
+);
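The route is keyed on BULL_AUTH_KEY like the other admin endpoints. A minimal smoke test of the wiring, sketched in TypeScript (Node 18+ ESM for global fetch and top-level await; the host and port are assumptions about the local setup):

// Hypothetical check; assumes the API listens on localhost:3002
// and BULL_AUTH_KEY is set in the environment.
const url = `http://localhost:3002/admin/${process.env.BULL_AUTH_KEY}/index-queue-prometheus`;
const res = await fetch(url);
console.log(res.status, await res.text()); // expect 200 and the gauge text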
@@ -433,7 +433,8 @@ export function buildFallbackList(meta: Meta): {
   }

   const shouldUseIndex =
-    !meta.options.formats.includes("changeTracking")
+    useIndex
+    && !meta.options.formats.includes("changeTracking")
     && meta.options.maxAge !== 0
     && (
       meta.options.headers === undefined
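The only behavioral change here is the extra `useIndex` guard at the head of the condition chain; because `&&` short-circuits, the remaining checks are skipped entirely when indexing is disabled. A minimal sketch of the pattern with illustrative names (the real source of `useIndex` is not shown in this diff):

// Illustrative only: the actual flag may come from config rather than an env var.
const useIndex = process.env.USE_DB_INDEX === "true";

function shouldUseIndexFor(formats: string[], maxAge: number): boolean {
  // Check the cheap global kill switch first, then the per-request conditions.
  return useIndex && !formats.includes("changeTracking") && maxAge !== 0;
}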
@@ -1,7 +1,7 @@
 import { Document } from "../../../../controllers/v1/types";
 import { EngineScrapeResult } from "..";
 import { Meta } from "../..";
-import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits } from "../../../../services";
+import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits, addIndexInsertJob } from "../../../../services";
 import { EngineError, IndexMissError } from "../../error";
 import crypto from "crypto";

@@ -51,9 +51,8 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
     return document;
   }

-  const { error } = await index_supabase_service
-    .from("index")
-    .insert({
+  try {
+    await addIndexInsertJob({
       id: indexId,
       url: normalizedURL,
       url_hash: urlHash,
@@ -73,12 +72,10 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
       [`url_split_${i}_hash`]: x,
     }), {})),
     });
-
-  if (error) {
-    meta.logger.error("Failed to save document to index", {
+  } catch (error) {
+    meta.logger.error("Failed to add document to index insert queue", {
       error,
     });
-    return document;
   }

   return document;
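The net effect: the hot scrape path no longer performs a Supabase insert per document; it does a single Redis RPUSH and treats enqueue failure as non-fatal, always returning the document. A minimal sketch of that fire-and-soft-fail pattern using ioredis directly (client construction and the warning sink are assumptions for the sketch, not the repo's code):

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");

// Enqueue a payload; on failure, log and move on so the caller's result
// is never blocked by indexing problems.
async function enqueueOrWarn(queueKey: string, payload: unknown): Promise<void> {
  try {
    await redis.rpush(queueKey, JSON.stringify(payload));
  } catch (error) {
    console.warn("index enqueue failed; continuing without indexing", error);
  }
}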
@@ -3,6 +3,7 @@ import { logger } from "../lib/logger";
 import { configDotenv } from "dotenv";
 import { Storage } from "@google-cloud/storage";
 import crypto from "crypto";
+import { redisEvictConnection } from "./redis";

 configDotenv();

 // SupabaseService class initializes the Supabase client conditionally based on environment variables.
@@ -183,3 +184,33 @@ export function generateURLSplits(url: string): string[] {

   return [...new Set(urls.map(x => normalizeURLForIndex(x)))];
 }
+
+const INDEX_INSERT_QUEUE_KEY = "index-insert-queue";
+const INDEX_INSERT_BATCH_SIZE = 1000;
+
+export async function addIndexInsertJob(data: any) {
+  await redisEvictConnection.rpush(INDEX_INSERT_QUEUE_KEY, JSON.stringify(data));
+}
+
+export async function getIndexInsertJobs(): Promise<any[]> {
+  const jobs = (await redisEvictConnection.lpop(INDEX_INSERT_QUEUE_KEY, INDEX_INSERT_BATCH_SIZE)) ?? [];
+  return jobs.map(x => JSON.parse(x));
+}
+
+export async function processIndexInsertJobs() {
+  const jobs = await getIndexInsertJobs();
+  if (jobs.length === 0) {
+    return;
+  }
+  logger.info(`Index inserter found jobs to insert`, { jobCount: jobs.length });
+  try {
+    await index_supabase_service.from("index").insert(jobs);
+    logger.info(`Index inserter inserted jobs`, { jobCount: jobs.length });
+  } catch (error) {
+    logger.error(`Index inserter failed to insert jobs`, { error, jobCount: jobs.length });
+  }
+}
+
+export async function getIndexInsertQueueLength(): Promise<number> {
+  return await redisEvictConnection.llen(INDEX_INSERT_QUEUE_KEY) ?? 0;
+}
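Draining relies on LPOP with a count argument (Redis >= 6.2), which atomically removes and returns up to INDEX_INSERT_BATCH_SIZE entries, so two concurrent drainers can never pop the same job. A self-contained sketch of the same drain (connection details are assumed):

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");
const QUEUE_KEY = "index-insert-queue";
const BATCH_SIZE = 1000;

// LPOP with a count pops up to BATCH_SIZE items in one atomic round trip;
// it returns null when the list is empty, hence the ?? [] fallback.
async function drainBatch(): Promise<unknown[]> {
  const raw = (await redis.lpop(QUEUE_KEY, BATCH_SIZE)) ?? [];
  return raw.map((x) => JSON.parse(x));
}

One trade-off of this design: because LPOP removes the jobs before the Supabase insert runs, a batch that fails mid-insert is logged and dropped rather than re-queued.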
@@ -14,6 +14,7 @@ import { saveCrawlMap } from "./crawl-maps-index";
 import { processBillingBatch, queueBillingOperation, startBillingBatchProcessing } from "../billing/batch_billing";
 import systemMonitor from "../system-monitor";
 import { v4 as uuidv4 } from "uuid";
+import { processIndexInsertJobs } from "..";

 const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
 const workerStalledCheckInterval =
@@ -226,6 +227,8 @@ const workerFun = async (queue: Queue, jobProcessor: (token: string, job: Job) =
   process.exit(0);
 };

+const INDEX_INSERT_INTERVAL = 15000;
+
 // Start the workers
 (async () => {
   // Start index worker
@@ -235,6 +238,16 @@ const workerFun = async (queue: Queue, jobProcessor: (token: string, job: Job) =
   startBillingBatchProcessing();
   const billingWorkerPromise = workerFun(getBillingQueue(), processBillingJobInternal);

+  const indexInserterInterval = setInterval(async () => {
+    if (isShuttingDown) {
+      return;
+    }
+
+    await processIndexInsertJobs();
+  }, INDEX_INSERT_INTERVAL);
+
   // Wait for both workers to complete (which should only happen on shutdown)
   await Promise.all([indexWorkerPromise, billingWorkerPromise]);
+
+  clearInterval(indexInserterInterval);
 })();
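One subtlety: setInterval does not await an async callback, so if a batch insert ever takes longer than the 15-second interval, ticks can overlap. A sketch of a self-rescheduling alternative that guarantees sequential runs; this is not the commit's approach, and the shutdown flag here is a stand-in for the worker's real one:

import { processIndexInsertJobs } from ".."; // same import as the worker above

let isShuttingDown = false; // stand-in for the worker's shutdown flag

async function drainLoop(): Promise<void> {
  if (isShuttingDown) return;
  // Errors are already logged inside processIndexInsertJobs; swallow here
  // so the loop keeps rescheduling itself.
  await processIndexInsertJobs().catch(() => {});
  setTimeout(drainLoop, 15000); // schedule the next run only after this one ends
}
drainLoop();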