Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-05 04:40:42 +08:00)

Merge branch 'main' into v1-webscraper

This commit is contained in commit 4d0acc9722.
@@ -4,6 +4,7 @@ import { Job } from "bullmq";
 import { Logger } from "../../../lib/logger";
 import { getScrapeQueue } from "../../../services/queue-service";
 import { checkAlerts } from "../../../services/alerts";
+import { sendSlackWebhook } from "../../../services/alerts/slack";
 
 export async function cleanBefore24hCompleteJobsController(
   req: Request,
@@ -54,34 +55,145 @@ export async function cleanBefore24hCompleteJobsController(
   }
 }
 
 export async function checkQueuesController(req: Request, res: Response) {
   try {
     await checkAlerts();
     return res.status(200).send("Alerts initialized");
   } catch (error) {
     Logger.debug(`Failed to initialize alerts: ${error}`);
     return res.status(500).send("Failed to initialize alerts");
   }
 }
 
 // Use this as a "health check" that way we dont destroy the server
 export async function queuesController(req: Request, res: Response) {
   try {
     const scrapeQueue = getScrapeQueue();
 
     const [webScraperActive] = await Promise.all([
       scrapeQueue.getActiveCount(),
     ]);
 
     const noActiveJobs = webScraperActive === 0;
     // 200 if no active jobs, 503 if there are active jobs
     return res.status(noActiveJobs ? 200 : 500).json({
       webScraperActive,
       noActiveJobs,
     });
   } catch (error) {
     Logger.error(error);
     return res.status(500).json({ error: error.message });
   }
 }
+
+export async function autoscalerController(req: Request, res: Response) {
+  try {
+    const maxNumberOfMachines = 80;
+    const minNumberOfMachines = 20;
+
+    const scrapeQueue = getScrapeQueue();
+
+    const [webScraperActive, webScraperWaiting, webScraperPriority] =
+      await Promise.all([
+        scrapeQueue.getActiveCount(),
+        scrapeQueue.getWaitingCount(),
+        scrapeQueue.getPrioritizedCount(),
+      ]);
+
+    let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
+
+    // get number of machines active
+    const request = await fetch(
+      "https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
+      {
+        headers: {
+          Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
+        },
+      }
+    );
+    const machines = await request.json();
+
+    // Only worker machines
+    const activeMachines = machines.filter(
+      (machine) =>
+        (machine.state === "started" ||
+          machine.state === "starting" ||
+          machine.state === "replacing") &&
+        machine.config.env["FLY_PROCESS_GROUP"] === "worker"
+    ).length;
+
+    let targetMachineCount = activeMachines;
+
+    const baseScaleUp = 10;
+    // Slow scale down
+    const baseScaleDown = 2;
+
+    // Scale up logic
+    if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
+      targetMachineCount = Math.min(
+        maxNumberOfMachines,
+        activeMachines + baseScaleUp * 3
+      );
+    } else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
+      targetMachineCount = Math.min(
+        maxNumberOfMachines,
+        activeMachines + baseScaleUp * 2
+      );
+    } else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
+      targetMachineCount = Math.min(
+        maxNumberOfMachines,
+        activeMachines + baseScaleUp
+      );
+    }
+
+    // Scale down logic
+    if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
+      targetMachineCount = Math.max(
+        minNumberOfMachines,
+        activeMachines - baseScaleDown * 3
+      );
+    } else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
+      targetMachineCount = Math.max(
+        minNumberOfMachines,
+        activeMachines - baseScaleDown * 2
+      );
+    } else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
+      targetMachineCount = Math.max(
+        minNumberOfMachines,
+        activeMachines - baseScaleDown
+      );
+    }
+
+    if (targetMachineCount !== activeMachines) {
+      Logger.info(
+        `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
+      );
+
+      if (targetMachineCount > activeMachines) {
+        sendSlackWebhook(
+          `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
+          false,
+          process.env.SLACK_AUTOSCALER ?? ""
+        );
+      } else {
+        sendSlackWebhook(
+          `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
+          false,
+          process.env.SLACK_AUTOSCALER ?? ""
+        );
+      }
+
+      return res.status(200).json({
+        mode: "scale-descale",
+        count: targetMachineCount,
+      });
+    }
+
+    return res.status(200).json({
+      mode: "normal",
+      count: activeMachines,
+    });
+  } catch (error) {
+    Logger.error(error);
+    return res.status(500).send("Failed to initialize autoscaler");
+  }
+}
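Note on the hunk above: the controller scales the Fly worker fleet up in bursts of 10/20/30 machines and down in steps of 2/4/6, clamped to a 20–80 machine range. A minimal sketch that restates that decision as a pure function, for illustration only (the function name and its parameter list are assumptions, not part of the commit):

```ts
// Illustrative sketch of the thresholds used by autoscalerController above.
function decideTargetMachineCount(
  active: number,             // scrapeQueue.getActiveCount()
  waitingAndPriority: number, // waiting + prioritized jobs
  activeMachines: number,     // running Fly "worker" machines
  minMachines = 20,
  maxMachines = 80,
  baseScaleUp = 10,
  baseScaleDown = 2
): number {
  let target = activeMachines;

  // Scale up: the bigger the backlog, the bigger the burst, capped at maxMachines.
  if (active > 9000 || waitingAndPriority > 2000) {
    target = Math.min(maxMachines, activeMachines + baseScaleUp * 3);
  } else if (active > 5000 || waitingAndPriority > 1000) {
    target = Math.min(maxMachines, activeMachines + baseScaleUp * 2);
  } else if (active > 1000 || waitingAndPriority > 500) {
    target = Math.min(maxMachines, activeMachines + baseScaleUp);
  }

  // Scale down: small backlogs shrink the fleet slowly, never below minMachines.
  if (active < 100 && waitingAndPriority < 50) {
    target = Math.max(minMachines, activeMachines - baseScaleDown * 3);
  } else if (active < 500 && waitingAndPriority < 200) {
    target = Math.max(minMachines, activeMachines - baseScaleDown * 2);
  } else if (active < 1000 && waitingAndPriority < 500) {
    target = Math.max(minMachines, activeMachines - baseScaleDown);
  }

  return target;
}

// Example: 1200 active jobs on 25 machines => scale up by one step to 35.
// decideTargetMachineCount(1200, 300, 25) === 35
```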
@@ -222,7 +222,8 @@ export async function supaAuthenticateUser(
       rateLimiter = getRateLimiter(
         RateLimiterMode.Scrape,
         token,
-        subscriptionData.plan
+        subscriptionData.plan,
+        teamId
       );
       break;
     case RateLimiterMode.Search:
@@ -1,5 +1,5 @@
-export function parseMarkdown(html: string) {
+export async function parseMarkdown(html: string) {
   var TurndownService = require("turndown");
   var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
 
@@ -21,7 +21,27 @@ export function parseMarkdown(html: string) {
   });
   var gfm = turndownPluginGfm.gfm;
   turndownService.use(gfm);
-  let markdownContent = turndownService.turndown(html);
+  let markdownContent = "";
+  const turndownPromise = new Promise<string>((resolve, reject) => {
+    try {
+      const result = turndownService.turndown(html);
+      resolve(result);
+    } catch (error) {
+      reject("Error converting HTML to Markdown: " + error);
+    }
+  });
+
+  const timeoutPromise = new Promise<string>((resolve, reject) => {
+    const timeout = 5000; // Timeout in milliseconds
+    setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
+  });
+
+  try {
+    markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
+  } catch (error) {
+    console.error(error);
+    return ""; // Optionally return an empty string or handle the error as needed
+  }
+
   // multiple line links
   let insideLinkContent = false;
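The conversion is now raced against a 5-second timer, so callers of the new async parseMarkdown get an empty string instead of an unbounded wait when conversion fails. A hedged sketch of the same pattern as a reusable helper (the helper name is an assumption; the 5000 ms default mirrors the diff):

```ts
// Sketch: race a conversion against a timeout, as parseMarkdown now does.
async function withTimeout<T>(work: () => T | Promise<T>, timeoutMs = 5000): Promise<T> {
  const workPromise = new Promise<T>((resolve, reject) => {
    try {
      resolve(work()); // note: a synchronous work() runs to completion here,
    } catch (error) {  // so the timer below can only win against async work
      reject(error);
    }
  });
  const timeoutPromise = new Promise<T>((_, reject) =>
    setTimeout(() => reject(new Error(`Conversion timed out after ${timeoutMs}ms`)), timeoutMs)
  );
  return Promise.race([workPromise, timeoutPromise]);
}

// Usage matching the new contract: fall back to "" on timeout or error.
// const markdown = await withTimeout(() => turndownService.turndown(html)).catch(() => "");
```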
@@ -1,6 +1,7 @@
 import express from "express";
 import { redisHealthController } from "../controllers/v0/admin/redis-health";
 import {
+  autoscalerController,
   checkQueuesController,
   cleanBefore24hCompleteJobsController,
   queuesController,
@@ -27,3 +28,8 @@ adminRouter.get(
   `/admin/${process.env.BULL_AUTH_KEY}/queues`,
   queuesController
 );
+
+adminRouter.get(
+  `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
+  autoscalerController
+);
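With the route registered, the autoscaler only runs when something calls the endpoint. A minimal sketch of polling it on a schedule (the FIRECRAWL_API_URL env var, the fallback base URL, and the one-minute cadence are assumptions for illustration; BULL_AUTH_KEY and the response shape come from the diff):

```ts
// Sketch: poll the new admin endpoint so the autoscaler re-evaluates the queues.
const base = process.env.FIRECRAWL_API_URL ?? "http://localhost:3002"; // assumed base URL
const url = `${base}/admin/${process.env.BULL_AUTH_KEY}/autoscaler`;

setInterval(async () => {
  const res = await fetch(url);
  const body = await res.json(); // { mode: "normal" | "scale-descale", count: number }
  console.log(`autoscaler: mode=${body.mode}, count=${body.count}`);
}, 60_000); // assumed 1-minute cadence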
@@ -24,8 +24,8 @@ import { clientSideError } from "../../strings";
 dotenv.config();
 
 export const baseScrapers = [
-  "fire-engine",
   "fire-engine;chrome-cdp",
+  "fire-engine",
   "scrapingBee",
   process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
   "scrapingBeeLoad",
@@ -85,8 +85,8 @@ function getScrapingFallbackOrder(
   });
 
   let defaultOrder = [
-    !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
     !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
+    !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
     "scrapingBee",
     process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
     "scrapingBeeLoad",
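Both lists now place "fire-engine;chrome-cdp" ahead of plain "fire-engine", so the CDP variant is tried first. The ternaries leave undefined placeholders when an engine is disabled; a small sketch of how such a list compacts to the effective order (the filter step is an assumption about the surrounding code, which this diff does not show):

```ts
// Sketch: disabled engines become undefined and are dropped before use.
const useDb = Boolean(process.env.USE_DB_AUTHENTICATION);

const effectiveOrder = [
  !useDb ? undefined : "fire-engine;chrome-cdp", // now tried before plain fire-engine
  !useDb ? undefined : "fire-engine",
  "scrapingBee",
  useDb ? undefined : "playwright",
  "scrapingBeeLoad",
].filter((engine): engine is string => engine !== undefined);
```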
@@ -15,7 +15,8 @@ const socialMediaBlocklist = [
   'wechat.com',
   'telegram.org',
   'researchhub.com',
-  'youtube.com'
+  'youtube.com',
+  'corterix.com',
 ];
 
 const allowedKeywords = [
@@ -3,9 +3,9 @@ import { Logger } from "../../../src/lib/logger";
 
 export async function sendSlackWebhook(
   message: string,
-  alertEveryone: boolean = false
+  alertEveryone: boolean = false,
+  webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
 ) {
-  const webhookUrl = process.env.SLACK_WEBHOOK_URL;
   const messagePrefix = alertEveryone ? "<!channel> " : "";
   const payload = {
     text: `${messagePrefix} ${message}`,
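sendSlackWebhook now takes the destination as a defaulted third parameter instead of reading SLACK_WEBHOOK_URL inside the function body, which is what lets the autoscaler post to a separate channel. A brief usage sketch (the message strings are placeholders; the import path and env vars come from the diff):

```ts
import { sendSlackWebhook } from "../../../services/alerts/slack";

// Default destination: the webhook configured in SLACK_WEBHOOK_URL.
sendSlackWebhook("Queue backlog is growing", false);

// Autoscaler events go to their own webhook instead.
sendSlackWebhook(
  "🐂 Scaling from 20 to 30 machines",
  false,
  process.env.SLACK_AUTOSCALER ?? ""
);
```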
@@ -97,16 +97,28 @@ export const testSuiteRateLimiter = new RateLimiterRedis({
   duration: 60, // Duration in seconds
 });
 
+export const devBRateLimiter = new RateLimiterRedis({
+  storeClient: redisRateLimitClient,
+  keyPrefix: "dev-b",
+  points: 1200,
+  duration: 60, // Duration in seconds
+});
+
 export function getRateLimiter(
   mode: RateLimiterMode,
   token: string,
-  plan?: string
+  plan?: string,
+  teamId?: string
 ) {
 
   if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) {
     return testSuiteRateLimiter;
   }
 
+  if(teamId === process.env.DEV_B_TEAM_ID) {
+    return devBRateLimiter;
+  }
+
   const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
 
   if (!rateLimitConfig) return serverRateLimiter;
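getRateLimiter now accepts an optional teamId, and requests from the team identified by DEV_B_TEAM_ID are routed to the new devBRateLimiter (1200 points per 60 s window) before the per-plan limits are consulted. A minimal sketch of a call site (token and plan values are placeholders; getRateLimiter and RateLimiterMode are assumed to be imported from the rate-limiter module):

```ts
// Sketch: pick a limiter for an authenticated request, then consume a point.
const limiter = getRateLimiter(
  RateLimiterMode.Scrape,
  "fc-example-token",        // placeholder API token
  "standard",                // placeholder plan
  process.env.DEV_B_TEAM_ID  // matching teamId => devBRateLimiter (1200 req/min)
);

await limiter.consume("fc-example-token"); // rejects once the window is exhausted
```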
File diff suppressed because one or more lines are too long