Nick: init

commit 27457ed5db
parent 81cf05885b
apps/api/src/controllers/v1/extract-status.ts (new file, 53 lines)
@@ -0,0 +1,53 @@
+import { Response } from "express";
+import {
+  supabaseGetJobByIdOnlyData,
+  supabaseGetJobsById,
+} from "../../lib/supabase-jobs";
+import { scrapeStatusRateLimiter } from "../../services/rate-limiter";
+import { RequestWithAuth } from "./types";
+
+export async function extractStatusController(
+  req: RequestWithAuth<{ jobId: string }, any, any>,
+  res: Response,
+) {
+  try {
+    const rateLimiter = scrapeStatusRateLimiter;
+    const incomingIP = (req.headers["x-forwarded-for"] ||
+      req.socket.remoteAddress) as string;
+    const iptoken = incomingIP;
+    await rateLimiter.consume(iptoken);
+
+    const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+    if (!job || job.team_id !== req.auth.team_id) {
+      return res.status(403).json({
+        success: false,
+        error: "You are not allowed to access this resource.",
+      });
+    }
+
+    const jobData = await supabaseGetJobsById([req.params.jobId]);
+    if (!jobData || jobData.length === 0) {
+      return res.status(404).json({
+        success: false,
+        error: "Job not found",
+      });
+    }
+
+    return res.status(200).json({
+      success: true,
+      data: jobData[0].docs,
+    });
+  } catch (error) {
+    if (error instanceof Error && error.message == "Too Many Requests") {
+      return res.status(429).json({
+        success: false,
+        error: "Rate limit exceeded. Please try again later.",
+      });
+    } else {
+      return res.status(500).json({
+        success: false,
+        error: "An unexpected error occurred.",
+      });
+    }
+  }
+}
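For illustration, a minimal client sketch for this new status endpoint, assuming it is mounted as GET /v1/extract/:jobId behind bearer-token auth like the other v1 routes (see the router change later in this commit). The base URL, API-key handling, and helper name are placeholders, not part of the commit; the response body mirrors the controller above:

// Hypothetical client call; baseUrl and bearer auth are assumptions.
interface ExtractStatusBody {
  success: boolean;
  data?: unknown; // jobData[0].docs on a 200
  error?: string; // message on 403 / 404 / 429 / 500
}

async function getExtractStatus(
  jobId: string,
  apiKey: string,
  baseUrl = "https://api.example.com/v1", // placeholder
): Promise<{ status: number; body: ExtractStatusBody }> {
  const res = await fetch(`${baseUrl}/extract/${jobId}`, {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  return { status: res.status, body: (await res.json()) as ExtractStatusBody };
}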
@@ -5,7 +5,9 @@ import {
   extractRequestSchema,
   ExtractResponse,
 } from "./types";
-import { performExtraction } from "../../lib/extract/extraction-service";
+import { getExtractQueue } from "../../services/queue-service";
+import * as Sentry from "@sentry/node";
+import { v4 as uuidv4 } from "uuid";
 
 /**
  * Extracts data from the provided URLs based on the request parameters.
@@ -29,12 +31,47 @@ export async function extractController(
     });
   }
 
-  const result = await performExtraction({
+  const extractId = crypto.randomUUID();
+  const jobData = {
     request: req.body,
     teamId: req.auth.team_id,
     plan: req.auth.plan,
-    subId: req.acuc?.sub_id || undefined,
-  });
+    subId: req.acuc?.sub_id,
+    extractId,
+  };
 
-  return res.status(result.success ? 200 : 400).json(result);
+  if (Sentry.isInitialized()) {
+    const size = JSON.stringify(jobData).length;
+    await Sentry.startSpan(
+      {
+        name: "Add extract job",
+        op: "queue.publish",
+        attributes: {
+          "messaging.message.id": extractId,
+          "messaging.destination.name": getExtractQueue().name,
+          "messaging.message.body.size": size,
+        },
+      },
+      async (span) => {
+        await getExtractQueue().add(extractId, {
+          ...jobData,
+          sentry: {
+            trace: Sentry.spanToTraceHeader(span),
+            baggage: Sentry.spanToBaggageHeader(span),
+            size,
+          },
+        });
+      },
+    );
+  } else {
+    await getExtractQueue().add(extractId, jobData, {
+      jobId: extractId,
+    });
+  }
+
+  return res.status(202).json({
+    success: true,
+    id: extractId,
+    urlTrace: [],
+  });
 }
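The controller now answers 202 with an id and defers the work to the extract queue instead of calling performExtraction inline. A sketch of the payload that lands on the queue, inferred from the jobData object and the Sentry branch above; the interface name and the annotated field types are illustrative, not declared anywhere in this commit:

// Illustrative shape of an extract job as enqueued by extractController.
interface ExtractJobData {
  extractId: string; // crypto.randomUUID(), also used as the BullMQ job name
  request: unknown;  // the validated extract request body (req.body)
  teamId: string;    // req.auth.team_id
  plan: string;      // req.auth.plan
  subId?: string;    // req.acuc?.sub_id
  sentry?: {         // present only when Sentry is initialized
    trace: string;               // Sentry.spanToTraceHeader(span)
    baggage: string | undefined; // Sentry.spanToBaggageHeader(span)
    size: number;                // JSON.stringify(jobData).length
  };
}

The worker added later in this commit reads exactly these fields back off job.data (request, teamId, plan, subId, extractId).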
@@ -478,10 +478,11 @@ export interface URLTrace {
 
 export interface ExtractResponse {
   success: boolean;
+  error?: string;
   data?: any;
   scrape_id?: string;
+  id?: string;
   warning?: string;
-  error?: string;
   urlTrace?: URLTrace[];
 }
 
@@ -20,7 +20,7 @@ interface ExtractServiceOptions {
 interface ExtractResult {
   success: boolean;
   data?: any;
-  scrapeId: string;
+  extractId: string;
   warning?: string;
   urlTrace?: URLTrace[];
   error?: string;
@@ -38,9 +38,8 @@ function getRootDomain(url: string): string {
   }
 }
 
-export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
+export async function performExtraction(extractId, options: ExtractServiceOptions): Promise<ExtractResult> {
   const { request, teamId, plan, subId } = options;
-  const scrapeId = crypto.randomUUID();
   const urlTraces: URLTrace[] = [];
   let docs: Document[] = [];
 
@@ -65,7 +64,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
     return {
       success: false,
       error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
-      scrapeId,
+      extractId,
       urlTrace: urlTraces,
     };
   }
@@ -89,7 +88,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
     return {
       success: false,
       error: error.message,
-      scrapeId,
+      extractId,
       urlTrace: urlTraces,
     };
   }
@@ -191,7 +190,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
 
   // Log job
   logJob({
-    job_id: scrapeId,
+    job_id: extractId,
     success: true,
     message: "Extract completed",
     num_docs: 1,
@@ -208,7 +207,7 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
   return {
     success: true,
     data: completions.extract ?? {},
-    scrapeId,
+    extractId,
     warning: completions.warning,
     urlTrace: request.urlTrace ? urlTraces : undefined,
   };
@@ -24,13 +24,7 @@ import { scrapeStatusController } from "../controllers/v1/scrape-status";
 import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
 import { batchScrapeController } from "../controllers/v1/batch-scrape";
 import { extractController } from "../controllers/v1/extract";
-// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
-// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
-// import { searchController } from "../../src/controllers/v1/search";
-// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
-// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
-// import { livenessController } from "../controllers/v1/liveness";
-// import { readinessController } from "../controllers/v1/readiness";
+import { extractStatusController } from "../controllers/v1/extract-status";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
@@ -215,6 +209,12 @@ v1Router.post(
   wrap(extractController),
 );
 
+v1Router.get(
+  "/extract/:jobId",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(extractStatusController),
+);
+
 // v1Router.post("/crawlWebsitePreview", crawlPreviewController);
 
 v1Router.delete(
@@ -3,12 +3,16 @@ import { logger } from "../lib/logger";
 import IORedis from "ioredis";
 
 let scrapeQueue: Queue;
+let extractQueue: Queue;
+let loggingQueue: Queue;
 
 export const redisConnection = new IORedis(process.env.REDIS_URL!, {
   maxRetriesPerRequest: null,
 });
 
 export const scrapeQueueName = "{scrapeQueue}";
+export const extractQueueName = "{extractQueue}";
+export const loggingQueueName = "{loggingQueue}";
 
 export function getScrapeQueue() {
   if (!scrapeQueue) {
@@ -24,24 +28,35 @@ export function getScrapeQueue() {
             age: 90000, // 25 hours
           },
         },
-      },
-      // {
-      //   settings: {
-      //     lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
-      //     lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
-      //     stalledInterval: 30 * 1000,
-      //     maxStalledCount: 10,
-      //   },
-      //   defaultJobOptions:{
-      //     attempts: 5
-      //   }
-      // }
+      }
     );
     logger.info("Web scraper queue created");
   }
   return scrapeQueue;
 }
 
+export function getExtractQueue() {
+  if (!extractQueue) {
+    extractQueue = new Queue(
+      extractQueueName,
+      {
+        connection: redisConnection,
+        defaultJobOptions: {
+          removeOnComplete: {
+            age: 90000, // 25 hours
+          },
+          removeOnFail: {
+            age: 90000, // 25 hours
+          },
+        },
+      }
+    );
+    logger.info("Extraction queue created");
+  }
+  return extractQueue;
+}
+
+
 // === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
 // import { QueueEvents } from 'bullmq';
 // export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
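getExtractQueue mirrors getScrapeQueue's lazy-singleton pattern, so every caller shares one BullMQ queue. As a sketch, a job added with jobId: extractId (the non-Sentry branch of the controller) could be inspected directly through stock BullMQ APIs; the helper name is illustrative and the relative import assumes a sibling module:

import { getExtractQueue } from "./queue-service";

// Illustrative helper: report the state of an extract job by its id.
// Uses standard BullMQ APIs: Queue.getJob, Job.getState, Job.returnvalue.
async function describeExtractJob(extractId: string) {
  const job = await getExtractQueue().getJob(extractId);
  if (!job) return { found: false as const };

  const state = await job.getState(); // "waiting" | "active" | "completed" | "failed" | ...
  return {
    found: true as const,
    state,
    // For completed jobs, returnvalue is whatever the processor resolved with
    // (the ExtractResult passed to job.moveToCompleted in the worker below).
    result: state === "completed" ? job.returnvalue : undefined,
    failedReason: state === "failed" ? job.failedReason : undefined,
  };
}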
@@ -4,8 +4,10 @@ import * as Sentry from "@sentry/node";
 import { CustomError } from "../lib/custom-error";
 import {
   getScrapeQueue,
+  getExtractQueue,
   redisConnection,
   scrapeQueueName,
+  extractQueueName,
 } from "./queue-service";
 import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
@@ -50,6 +52,7 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
+import { performExtraction } from "../lib/extract/extraction-service";
 
 configDotenv();
 
@@ -243,6 +246,52 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
   return err;
 };
 
+const processExtractJobInternal = async (token: string, job: Job & { id: string }) => {
+  const logger = _logger.child({
+    module: "extract-worker",
+    method: "processJobInternal",
+    jobId: job.id,
+    extractId: job.data.extractId,
+    teamId: job.data?.teamId ?? undefined,
+  });
+
+  const extendLockInterval = setInterval(async () => {
+    logger.info(`🔄 Worker extending lock on job ${job.id}`);
+    await job.extendLock(token, jobLockExtensionTime);
+  }, jobLockExtendInterval);
+
+  try {
+    const result = await performExtraction(job.data.extractId, {
+      request: job.data.request,
+      teamId: job.data.teamId,
+      plan: job.data.plan,
+      subId: job.data.subId,
+    });
+
+    if (result.success) {
+      // Move job to completed state in Redis
+      await job.moveToCompleted(result, token, false);
+      return result;
+    } else {
+      throw new Error(result.error || "Unknown error during extraction");
+    }
+  } catch (error) {
+    logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });
+
+    Sentry.captureException(error, {
+      data: {
+        job: job.id,
+      },
+    });
+
+    // Move job to failed state in Redis
+    await job.moveToFailed(error, token, false);
+    throw error;
+  } finally {
+    clearInterval(extendLockInterval);
+  }
+};
+
 let isShuttingDown = false;
 
 process.on("SIGINT", () => {
@@ -399,7 +448,9 @@ const workerFun = async (
   }
 };
 
+// Start both workers
 workerFun(getScrapeQueue(), processJobInternal);
+workerFun(getExtractQueue(), processExtractJobInternal);
 
 async function processKickoffJob(job: Job & { id: string }, token: string) {
   const logger = _logger.child({
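Taken together, the pieces in this commit make /extract asynchronous: the controller enqueues and returns 202 with an id, processExtractJobInternal runs performExtraction on the worker, and the status controller serves the stored result. A hedged end-to-end sketch from a client's perspective; the base URL, auth header, poll interval, and the "still running" handling are assumptions for illustration:

// Illustrative end-to-end use of the async extract flow added in this commit.
async function runExtract(
  requestBody: unknown,
  apiKey: string,
  baseUrl = "https://api.example.com/v1", // placeholder
): Promise<unknown> {
  // 1. Submit: extractController enqueues the job and replies 202 { success, id }.
  const submit = await fetch(`${baseUrl}/extract`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(requestBody),
  });
  const { success, id } = (await submit.json()) as { success: boolean; id: string };
  if (!submit.ok || !success) throw new Error("extract request was rejected");

  // 2. Poll the status endpoint until results are stored for the job.
  for (let attempt = 0; attempt < 60; attempt++) {
    const res = await fetch(`${baseUrl}/extract/${id}`, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    if (res.status === 200) {
      const body = (await res.json()) as { success: boolean; data?: unknown };
      if (body.success && body.data != null) return body.data;
    }
    await new Promise((resolve) => setTimeout(resolve, 2000)); // assumed poll interval
  }
  throw new Error("extract did not complete within the polling window");
}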