Merge branch 'main' into v1-webscraper

This commit is contained in:
Nicolas 2024-08-26 16:22:05 -03:00
commit 4d0acc9722
9 changed files with 692 additions and 31 deletions

View File

@ -4,6 +4,7 @@ import { Job } from "bullmq";
import { Logger } from "../../../lib/logger"; import { Logger } from "../../../lib/logger";
import { getScrapeQueue } from "../../../services/queue-service"; import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../../services/alerts"; import { checkAlerts } from "../../../services/alerts";
import { sendSlackWebhook } from "../../../services/alerts/slack";
export async function cleanBefore24hCompleteJobsController( export async function cleanBefore24hCompleteJobsController(
req: Request, req: Request,
@ -54,34 +55,145 @@ export async function cleanBefore24hCompleteJobsController(
} }
} }
/**
 * Admin endpoint that runs the alert checks once.
 *
 * Responds 200 with "Alerts initialized" when `checkAlerts()` resolves, and
 * 500 with "Failed to initialize alerts" when it throws.
 *
 * @param req - Incoming request (not read).
 * @param res - Response used to send the plain-text outcome.
 */
export async function checkQueuesController(req: Request, res: Response) {
  try {
    await checkAlerts();
    return res.status(200).send("Alerts initialized");
  } catch (error) {
    // A failed alert run is an operational failure: log it at error level (as
    // queuesController does for its failures) so it is not dropped when debug
    // output is disabled.
    Logger.error(`Failed to initialize alerts: ${error}`);
    return res.status(500).send("Failed to initialize alerts");
  }
}
/**
 * Use this as a "health check" so the server is not destroyed while it is
 * still working: reports whether the scrape queue currently has active jobs.
 *
 * Responds with `{ webScraperActive, noActiveJobs }` and status 200 when no
 * job is active, or status 500 when at least one job is active. If the queue
 * cannot be read, responds 500 with `{ error: <message> }`.
 *
 * @param req - Incoming request (not read).
 * @param res - Response used to send the JSON status.
 */
export async function queuesController(req: Request, res: Response) {
  try {
    const scrapeQueue = getScrapeQueue();
    const webScraperActive = await scrapeQueue.getActiveCount();

    const noActiveJobs = webScraperActive === 0;
    // 200 if no active jobs, 500 if there are active jobs
    return res.status(noActiveJobs ? 200 : 500).json({
      webScraperActive,
      noActiveJobs,
    });
  } catch (error) {
    Logger.error(error);
    // The thrown value is not guaranteed to be an Error; never read `.message`
    // off an arbitrary value (it may be undefined, or throw for null).
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ error: message });
  }
}
/** Fields of a Fly Machines API entry that the autoscaler reads. */
interface FlyMachine {
  state?: string;
  config?: { env?: Record<string, string | undefined> };
}

/** Machine states that count as "active" when sizing the worker pool. */
const ACTIVE_MACHINE_STATES: ReadonlySet<string> = new Set([
  "started",
  "starting",
  "replacing",
]);

/**
 * Counts machines that are in an active state and belong to the "worker"
 * process group. Entries missing `state`, `config` or `config.env` are
 * skipped instead of throwing.
 */
function countActiveWorkerMachines(
  machines: ReadonlyArray<FlyMachine | null | undefined>
): number {
  return machines.filter(
    (machine) =>
      ACTIVE_MACHINE_STATES.has(machine?.state ?? "") &&
      machine?.config?.env?.["FLY_PROCESS_GROUP"] === "worker"
  ).length;
}

/**
 * Pure scaling decision: returns the machine count to aim for.
 *
 * Scale up (fast, steps of 10) when EITHER metric is high:
 *   active > 9000 or queued > 2000  -> +30
 *   active > 5000 or queued > 1000  -> +20
 *   active > 1000 or queued > 500   -> +10
 * Scale down (slow, steps of 2) only when BOTH metrics are low:
 *   active < 100  and queued < 50   -> -6
 *   active < 500  and queued < 200  -> -4
 *   active < 1000 and queued < 500  -> -2
 * Otherwise the current count is kept. Scale-up results are capped at
 * `maxMachines`; scale-down results are floored at `minMachines`.
 *
 * The scale-up and scale-down conditions cannot hold at the same time, so
 * returning early from the first matching rule is equivalent to evaluating
 * both rule chains in sequence.
 */
function computeTargetMachineCount(
  activeJobs: number,
  queuedJobs: number,
  currentMachines: number,
  minMachines: number,
  maxMachines: number
): number {
  const baseScaleUp = 10;
  // Slow scale down
  const baseScaleDown = 2;

  // Scale up logic
  if (activeJobs > 9000 || queuedJobs > 2000) {
    return Math.min(maxMachines, currentMachines + baseScaleUp * 3);
  }
  if (activeJobs > 5000 || queuedJobs > 1000) {
    return Math.min(maxMachines, currentMachines + baseScaleUp * 2);
  }
  if (activeJobs > 1000 || queuedJobs > 500) {
    return Math.min(maxMachines, currentMachines + baseScaleUp);
  }

  // Scale down logic
  if (activeJobs < 100 && queuedJobs < 50) {
    return Math.max(minMachines, currentMachines - baseScaleDown * 3);
  }
  if (activeJobs < 500 && queuedJobs < 200) {
    return Math.max(minMachines, currentMachines - baseScaleDown * 2);
  }
  if (activeJobs < 1000 && queuedJobs < 500) {
    return Math.max(minMachines, currentMachines - baseScaleDown);
  }

  return currentMachines;
}

/**
 * Admin endpoint that recommends how many worker machines should be running.
 *
 * Reads the active, waiting and prioritized job counts from the scrape queue,
 * fetches the machine list of the `firecrawl-scraper-js` app from the Fly
 * Machines API, and derives a target count between 20 and 80 machines.
 *
 * Responses:
 * - 200 `{ mode: "scale-descale", count }` when the target differs from the
 *   current number of active worker machines (also logged and posted to the
 *   `SLACK_AUTOSCALER` webhook).
 * - 200 `{ mode: "normal", count }` when no change is needed.
 * - 500 "Failed to initialize autoscaler" when the queue or the Fly API
 *   cannot be read.
 *
 * @param req - Incoming request (not read).
 * @param res - Response used to send the JSON recommendation.
 */
export async function autoscalerController(req: Request, res: Response) {
  try {
    const maxNumberOfMachines = 80;
    const minNumberOfMachines = 20;

    const scrapeQueue = getScrapeQueue();
    const [webScraperActive, webScraperWaiting, webScraperPriority] =
      await Promise.all([
        scrapeQueue.getActiveCount(),
        scrapeQueue.getWaitingCount(),
        scrapeQueue.getPrioritizedCount(),
      ]);
    const waitingAndPriorityCount = webScraperWaiting + webScraperPriority;

    // get number of machines active
    const response = await fetch(
      "https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
      {
        headers: {
          Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
        },
      }
    );
    // Fail with a descriptive error instead of a TypeError from `.filter`
    // when the API answers with an error status or a non-array body.
    if (!response.ok) {
      throw new Error(
        `Fly Machines API responded with status ${response.status}`
      );
    }
    const machines: unknown = await response.json();
    if (!Array.isArray(machines)) {
      throw new Error("Fly Machines API returned a non-array machine list");
    }

    // Only worker machines
    const activeMachines = countActiveWorkerMachines(machines as FlyMachine[]);
    const targetMachineCount = computeTargetMachineCount(
      webScraperActive,
      waitingAndPriorityCount,
      activeMachines,
      minNumberOfMachines,
      maxNumberOfMachines
    );

    if (targetMachineCount !== activeMachines) {
      const summary = `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`;
      Logger.info(summary);

      // Same notification for scale-up and scale-down. Deliberately not
      // awaited so the HTTP response is not delayed; a failed notification
      // is logged rather than left as an unhandled rejection.
      void sendSlackWebhook(
        `${summary} - Current DateTime: ${new Date().toISOString()}`,
        false,
        process.env.SLACK_AUTOSCALER ?? ""
      ).catch((slackError: unknown) => {
        Logger.error(slackError);
      });

      return res.status(200).json({
        mode: "scale-descale",
        count: targetMachineCount,
      });
    }

    return res.status(200).json({
      mode: "normal",
      count: activeMachines,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).send("Failed to initialize autoscaler");
  }
}

View File

@ -222,7 +222,8 @@ export async function supaAuthenticateUser(
rateLimiter = getRateLimiter( rateLimiter = getRateLimiter(
RateLimiterMode.Scrape, RateLimiterMode.Scrape,
token, token,
subscriptionData.plan subscriptionData.plan,
teamId
); );
break; break;
case RateLimiterMode.Search: case RateLimiterMode.Search:

View File

@ -1,5 +1,5 @@
export function parseMarkdown(html: string) { export async function parseMarkdown(html: string) {
var TurndownService = require("turndown"); var TurndownService = require("turndown");
var turndownPluginGfm = require('joplin-turndown-plugin-gfm') var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
@ -21,7 +21,27 @@ export function parseMarkdown(html: string) {
}); });
var gfm = turndownPluginGfm.gfm; var gfm = turndownPluginGfm.gfm;
turndownService.use(gfm); turndownService.use(gfm);
let markdownContent = turndownService.turndown(html); let markdownContent = "";
const turndownPromise = new Promise<string>((resolve, reject) => {
try {
const result = turndownService.turndown(html);
resolve(result);
} catch (error) {
reject("Error converting HTML to Markdown: " + error);
}
});
const timeoutPromise = new Promise<string>((resolve, reject) => {
const timeout = 5000; // Timeout in milliseconds
setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
});
try {
markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
} catch (error) {
console.error(error);
return ""; // Optionally return an empty string or handle the error as needed
}
// multiple line links // multiple line links
let insideLinkContent = false; let insideLinkContent = false;

View File

@ -1,6 +1,7 @@
import express from "express"; import express from "express";
import { redisHealthController } from "../controllers/v0/admin/redis-health"; import { redisHealthController } from "../controllers/v0/admin/redis-health";
import { import {
autoscalerController,
checkQueuesController, checkQueuesController,
cleanBefore24hCompleteJobsController, cleanBefore24hCompleteJobsController,
queuesController, queuesController,
@ -27,3 +28,8 @@ adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/queues`, `/admin/${process.env.BULL_AUTH_KEY}/queues`,
queuesController queuesController
); );
// Autoscaler endpoint: GET on the admin path that embeds BULL_AUTH_KEY,
// handled by autoscalerController.
const autoscalerRoutePath = `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`;
adminRouter.get(autoscalerRoutePath, autoscalerController);

View File

@ -24,8 +24,8 @@ import { clientSideError } from "../../strings";
dotenv.config(); dotenv.config();
export const baseScrapers = [ export const baseScrapers = [
"fire-engine",
"fire-engine;chrome-cdp", "fire-engine;chrome-cdp",
"fire-engine",
"scrapingBee", "scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
@ -85,8 +85,8 @@ function getScrapingFallbackOrder(
}); });
let defaultOrder = [ let defaultOrder = [
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
"scrapingBee", "scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",

View File

@ -15,7 +15,8 @@ const socialMediaBlocklist = [
'wechat.com', 'wechat.com',
'telegram.org', 'telegram.org',
'researchhub.com', 'researchhub.com',
'youtube.com' 'youtube.com',
'corterix.com',
]; ];
const allowedKeywords = [ const allowedKeywords = [

View File

@ -3,9 +3,9 @@ import { Logger } from "../../../src/lib/logger";
export async function sendSlackWebhook( export async function sendSlackWebhook(
message: string, message: string,
alertEveryone: boolean = false alertEveryone: boolean = false,
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
) { ) {
const webhookUrl = process.env.SLACK_WEBHOOK_URL;
const messagePrefix = alertEveryone ? "<!channel> " : ""; const messagePrefix = alertEveryone ? "<!channel> " : "";
const payload = { const payload = {
text: `${messagePrefix} ${message}`, text: `${messagePrefix} ${message}`,

View File

@ -97,16 +97,28 @@ export const testSuiteRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
/**
 * Redis-backed rate limiter stored under the "dev-b" key prefix,
 * configured with 1200 points per 60-second duration.
 */
export const devBRateLimiter = new RateLimiterRedis({
  keyPrefix: "dev-b",
  storeClient: redisRateLimitClient,
  // Duration in seconds
  duration: 60,
  points: 1200,
});
export function getRateLimiter( export function getRateLimiter(
mode: RateLimiterMode, mode: RateLimiterMode,
token: string, token: string,
plan?: string plan?: string,
teamId?: string
) { ) {
if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) { if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) {
return testSuiteRateLimiter; return testSuiteRateLimiter;
} }
if(teamId === process.env.DEV_B_TEAM_ID) {
return devBRateLimiter;
}
const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
if (!rateLimitConfig) return serverRateLimiter; if (!rateLimitConfig) return serverRateLimiter;

File diff suppressed because one or more lines are too long