Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-04 16:00:37 +08:00)
Commit a75d6889c7
@@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL=
# Resend API Key for transactional emails
RESEND_API_KEY=

+ # LOGGING_LEVEL determines the verbosity of logs that the system will output.
+ # Available levels are:
+ # NONE - No logs will be output.
+ # ERROR - For logging error messages that indicate a failure in a specific operation.
+ # WARN - For logging potentially harmful situations that are not necessarily errors.
+ # INFO - For logging informational messages that highlight the progress of the application.
+ # DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
+ # TRACE - For logging more detailed information than the DEBUG level.
+ # Set LOGGING_LEVEL to one of the above options to control logging output.
+ LOGGING_LEVEL=INFO
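The comments above define an ordered set of verbosity levels. As a rough sketch of how such a setting is typically consumed (illustrative only; the actual implementation is the Logger class added in apps/api/src/lib/logger.ts later in this commit), a message is printed only when its level sits at or below the configured verbosity:

// Illustrative TypeScript sketch, not part of the commit.
const levels = ["NONE", "ERROR", "WARN", "INFO", "DEBUG", "TRACE"];

function shouldLog(messageLevel: string, configuredLevel: string = process.env.LOGGING_LEVEL ?? "INFO"): boolean {
  // Higher index = more verbose; print when the configured level is at least as verbose as the message.
  return levels.indexOf(configuredLevel) >= levels.indexOf(messageLevel);
}

console.log(shouldLog("WARN"));  // true with LOGGING_LEVEL=INFO
console.log(shouldLog("DEBUG")); // false with LOGGING_LEVEL=INFO (DEBUG and TRACE are suppressed)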
@@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
- console.log(crawlData)
expect(crawlData.length).toBeGreaterThan(0);
expect(crawlData).toEqual(expect.arrayContaining([
expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),

@@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../services/notification/email_notification";
+ import { Logger } from "../lib/logger";

export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode);

@@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
api_key
});
} catch (error) {
- console.error('Error setting trace attributes:', error);
+ Logger.error(`Error setting trace attributes: ${error.message}`);
}

}

@@ -82,7 +83,7 @@ export async function supaAuthenticateUser(
// $$ language plpgsql;

if (error) {
- console.error('Error fetching key and price_id:', error);
+ Logger.warn(`Error fetching key and price_id: ${error.message}`);
} else {
// console.log('Key and Price ID:', data);
}

@@ -135,7 +136,7 @@ export async function supaAuthenticateUser(
try {
await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) {
- console.error(rateLimiterRes);
+ Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

@@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
+ import { Logger } from "../../src/lib/logger";

export async function crawlCancelController(req: Request, res: Response) {
try {

@@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) {
const { partialDocs } = await job.progress();

if (partialDocs && partialDocs.length > 0 && jobState === "active") {
- console.log("Billing team for partial docs...");
+ Logger.info("Billing team for partial docs...");
// Note: the credits that we will bill them here might be lower than the actual
// due to promises that are not yet resolved
await billTeam(team_id, partialDocs.length);

@@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true);
} catch (error) {
- console.error(error);
+ Logger.error(error);
}

const newJobState = await job.getState();

@@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) {
status: "cancelled"
});
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
+ import { Logger } from "../../src/lib/logger";

export async function crawlStatusController(req: Request, res: Response) {
try {

@@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) {
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+ import { Logger } from "../../src/lib/logger";

export async function crawlController(req: Request, res: Response) {
try {

@@ -30,7 +31,7 @@ export async function crawlController(req: Request, res: Response) {
try {
createIdempotencyKey(req);
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -83,7 +84,7 @@ export async function crawlController(req: Request, res: Response) {
documents: docs,
});
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -101,7 +102,7 @@ export async function crawlController(req: Request, res: Response) {

res.json({ jobId: job.id });
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -3,6 +3,7 @@ import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
+ import { Logger } from "../../src/lib/logger";

export async function crawlPreviewController(req: Request, res: Response) {
try {

@@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

res.json({ jobId: job.id });
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -9,6 +9,7 @@ import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
+ import { Logger } from '../lib/logger';

export async function scrapeHelper(
req: Request,

@@ -112,7 +113,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
- console.error(error);
+ Logger.error(error);
earlyReturn = true;
return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
}

@@ -188,7 +189,7 @@ export async function scrapeController(req: Request, res: Response) {

return res.status(result.returnCode).json(result);
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -7,6 +7,7 @@ import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
+ import { Logger } from "../lib/logger";

export async function searchHelper(
req: Request,

@@ -155,7 +156,7 @@ export async function searchController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
const startTime = new Date().getTime();

@@ -183,7 +184,7 @@ export async function searchController(req: Request, res: Response) {
});
return res.status(result.returnCode).json(result);
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -1,6 +1,7 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
+ import { Logger } from "../../src/lib/logger";

export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {

@@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Response) {
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -12,16 +12,17 @@ import { sendSlackWebhook } from "./services/alerts/slack";
import { checkAlerts } from "./services/alerts";
import Redis from "ioredis";
import { redisRateLimitClient } from "./services/rate-limiter";
+ import { Logger } from "./lib/logger";

const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express");

const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
- console.log(`Number of CPUs: ${numCPUs} available`);
+ Logger.info(`Number of CPUs: ${numCPUs} available`);

if (cluster.isMaster) {
- console.log(`Master ${process.pid} is running`);
+ Logger.info(`Master ${process.pid} is running`);

// Fork workers.
for (let i = 0; i < numCPUs; i++) {

@@ -30,8 +31,8 @@ if (cluster.isMaster) {

cluster.on("exit", (worker, code, signal) => {
if (code !== null) {
- console.log(`Worker ${worker.process.pid} exited`);
- console.log("Starting a new worker");
+ Logger.info(`Worker ${worker.process.pid} exited`);
+ Logger.info("Starting a new worker");
cluster.fork();
}
});

@@ -81,14 +82,9 @@ if (cluster.isMaster) {

function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
- console.log(`Worker ${process.pid} listening on port ${port}`);
- console.log(
- `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
- );
- console.log("");
- console.log("1. Make sure Redis is running on port 6379 by default");
- console.log(
- "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
+ Logger.info(`Worker ${process.pid} listening on port ${port}`);
+ Logger.info(
+ `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
});
return server;

@@ -114,7 +110,7 @@ if (cluster.isMaster) {
noActiveJobs,
});
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
});

@@ -132,7 +128,7 @@ if (cluster.isMaster) {
waitingJobs,
});
} catch (error) {
- console.error(error);
+ Logger.error(error);
return res.status(500).json({ error: error.message });
}
});

@@ -177,13 +173,13 @@ if (cluster.isMaster) {
});

if (!response.ok) {
- console.error("Failed to send Slack notification");
+ Logger.error("Failed to send Slack notification");
}
}
}, timeout);
}
} catch (error) {
- console.error(error);
+ Logger.debug(error);
}
};

@@ -198,7 +194,7 @@ if (cluster.isMaster) {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
- console.error("Failed to initialize alerts:", error);
+ Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}

@@ -207,6 +203,7 @@ if (cluster.isMaster) {
app.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
async (req, res) => {
+ Logger.info("🐂 Cleaning jobs older than 24h");
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;

@@ -241,12 +238,12 @@ if (cluster.isMaster) {
await job.remove();
count++;
} catch (jobError) {
- console.error(`Failed to remove job with ID ${job.id}:`, jobError);
+ Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
- console.error("Failed to clean last 24h complete jobs:", error);
+ Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}

@@ -272,7 +269,7 @@ if (cluster.isMaster) {
queueRedisHealth = await queueRedis.get(testKey);
await queueRedis.del(testKey);
} catch (error) {
- console.error("queueRedis health check failed:", error);
+ Logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null;
}

@@ -283,7 +280,7 @@ if (cluster.isMaster) {
redisRateLimitHealth = await redisRateLimitClient.get(testKey);
await redisRateLimitClient.del(testKey);
} catch (error) {
- console.error("redisRateLimitClient health check failed:", error);
+ Logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null;
}

@@ -297,12 +294,12 @@ if (cluster.isMaster) {
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
- console.log("Both Redis instances are healthy");
+ Logger.info("Both Redis instances are healthy");
return res
.status(200)
.json({ status: "healthy", details: healthStatus });
} else {
- console.log("Redis instances health check:", healthStatus);
+ Logger.info(`Redis instances health check: ${JSON.stringify(healthStatus)}`);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${JSON.stringify(
healthStatus

@@ -314,7 +311,7 @@ if (cluster.isMaster) {
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
- console.error("Redis health check failed:", error);
+ Logger.error(`Redis health check failed: ${error}`);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${error.message}`,
true

@@ -326,5 +323,5 @@ if (cluster.isMaster) {
}
);

- console.log(`Worker ${process.pid} started`);
+ Logger.info(`Worker ${process.pid} started`);
}

@@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation

import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";
+ import { Logger } from "../logger";

// Generate completion using OpenAI
export async function generateCompletions(

@@ -44,7 +45,7 @@ export async function generateCompletions(

return completionResult;
} catch (error) {
- console.error(`Error generating completions: ${error}`);
+ Logger.error(`Error generating completions: ${error}`);
throw new Error(`Error generating completions: ${error.message}`);
}
default:
apps/api/src/lib/logger.ts (new file, 53 lines)

@@ -0,0 +1,53 @@
+ enum LogLevel {
+   NONE = 'NONE', // No logs will be output.
+   ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
+   WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
+   INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
+   DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
+   TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
+ }
+ export class Logger {
+   static colors = {
+     ERROR: '\x1b[31m%s\x1b[0m', // Red
+     WARN: '\x1b[33m%s\x1b[0m', // Yellow
+     INFO: '\x1b[34m%s\x1b[0m', // Blue
+     DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
+     TRACE: '\x1b[35m%s\x1b[0m' // Magenta
+   };
+
+   static log (message: string, level: LogLevel) {
+     const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
+     const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
+     const currentLevelIndex = levels.indexOf(logLevel);
+     const messageLevelIndex = levels.indexOf(level);
+
+     if (currentLevelIndex >= messageLevelIndex) {
+       const color = Logger.colors[level];
+       console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
+
+       // if (process.env.USE_DB_AUTH) {
+       //   save to supabase? another place?
+       //   supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
+       // }
+     }
+   }
+   static error(message: string | any) {
+     Logger.log(message, LogLevel.ERROR);
+   }
+
+   static warn(message: string) {
+     Logger.log(message, LogLevel.WARN);
+   }
+
+   static info(message: string) {
+     Logger.log(message, LogLevel.INFO);
+   }
+
+   static debug(message: string) {
+     Logger.log(message, LogLevel.DEBUG);
+   }
+
+   static trace(message: string) {
+     Logger.log(message, LogLevel.TRACE);
+   }
+ }
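Everywhere else in this commit, direct console.* calls are replaced with these static methods, with error details interpolated into a template string rather than passed as a second argument. A minimal usage sketch (illustrative only; it assumes a file under apps/api/src so the relative import resolves, and that LOGGING_LEVEL is set as in the .env change above):

import { Logger } from "./lib/logger";

// With the default LOGGING_LEVEL=INFO, messages at INFO and above are printed.
Logger.info("Worker started");                   // printed at INFO, DEBUG and TRACE
Logger.debug("Queue drained, 0 jobs remaining"); // printed only at DEBUG or TRACE

try {
  throw new Error("connection refused");
} catch (error) {
  // The migration pattern used throughout this commit: interpolate the error
  // into the message instead of logging it as a separate argument.
  Logger.error(`Redis health check failed: ${error}`);
}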
@ -1,4 +1,5 @@
|
|||||||
import { AuthResponse } from "../../src/types";
|
import { AuthResponse } from "../../src/types";
|
||||||
|
import { Logger } from "./logger";
|
||||||
|
|
||||||
let warningCount = 0;
|
let warningCount = 0;
|
||||||
|
|
||||||
@ -8,7 +9,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||||||
return async function (...args: U): Promise<T> {
|
return async function (...args: U): Promise<T> {
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||||
if (warningCount < 5) {
|
if (warningCount < 5) {
|
||||||
console.warn("WARNING - You're bypassing authentication");
|
Logger.warn("You're bypassing authentication");
|
||||||
warningCount++;
|
warningCount++;
|
||||||
}
|
}
|
||||||
return { success: true } as T;
|
return { success: true } as T;
|
||||||
@ -16,7 +17,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||||||
try {
|
try {
|
||||||
return await originalFunction(...args);
|
return await originalFunction(...args);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error in withAuth function: ", error);
|
Logger.error(`Error in withAuth function: ${error}`);
|
||||||
return { success: false, error: error.message } as T;
|
return { success: false, error: error.message } as T;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@ import { DocumentUrl, Progress } from "../lib/entities";
|
|||||||
import { billTeam } from "../services/billing/credit_billing";
|
import { billTeam } from "../services/billing/credit_billing";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
|
import { Logger } from "../lib/logger";
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
@ -23,6 +24,7 @@ export async function startWebScraperPipeline({
|
|||||||
crawlerOptions: job.data.crawlerOptions,
|
crawlerOptions: job.data.crawlerOptions,
|
||||||
pageOptions: job.data.pageOptions,
|
pageOptions: job.data.pageOptions,
|
||||||
inProgress: (progress) => {
|
inProgress: (progress) => {
|
||||||
|
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||||
if (progress.currentDocument) {
|
if (progress.currentDocument) {
|
||||||
partialDocs.push(progress.currentDocument);
|
partialDocs.push(progress.currentDocument);
|
||||||
if (partialDocs.length > 50) {
|
if (partialDocs.length > 50) {
|
||||||
@ -32,9 +34,11 @@ export async function startWebScraperPipeline({
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
onSuccess: (result) => {
|
onSuccess: (result) => {
|
||||||
|
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||||
saveJob(job, result);
|
saveJob(job, result);
|
||||||
},
|
},
|
||||||
onError: (error) => {
|
onError: (error) => {
|
||||||
|
Logger.error(`🐂 Job failed ${job.id}`);
|
||||||
job.moveToFailed(error);
|
job.moveToFailed(error);
|
||||||
},
|
},
|
||||||
team_id: job.data.team_id,
|
team_id: job.data.team_id,
|
||||||
@ -108,7 +112,6 @@ export async function runWebScraper({
|
|||||||
// this return doesn't matter too much for the job completion result
|
// this return doesn't matter too much for the job completion result
|
||||||
return { success: true, message: "", docs: filteredDocs };
|
return { success: true, message: "", docs: filteredDocs };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error running web scraper", error);
|
|
||||||
onError(error);
|
onError(error);
|
||||||
return { success: false, message: error.message, docs: [] };
|
return { success: false, message: error.message, docs: [] };
|
||||||
}
|
}
|
||||||
@ -136,6 +139,6 @@ const saveJob = async (job: Job, result: any) => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Failed to update job status:", error);
|
Logger.error(`🐂 Failed to update job status: ${error}`);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url";
|
|||||||
import robotsParser from "robots-parser";
|
import robotsParser from "robots-parser";
|
||||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||||
|
import { Logger } from "../../../src/lib/logger";
|
||||||
|
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
private initialUrl: string;
|
private initialUrl: string;
|
||||||
@ -116,7 +117,7 @@ export class WebCrawler {
|
|||||||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||||
// Check if the link is disallowed by robots.txt
|
// Check if the link is disallowed by robots.txt
|
||||||
if (!isAllowed) {
|
if (!isAllowed) {
|
||||||
console.log(`Link disallowed by robots.txt: ${link}`);
|
Logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -133,15 +134,19 @@ export class WebCrawler {
|
|||||||
limit: number = 10000,
|
limit: number = 10000,
|
||||||
maxDepth: number = 10
|
maxDepth: number = 10
|
||||||
): Promise<{ url: string, html: string }[]> {
|
): Promise<{ url: string, html: string }[]> {
|
||||||
|
|
||||||
|
Logger.debug(`Crawler starting with ${this.initialUrl}`);
|
||||||
// Fetch and parse robots.txt
|
// Fetch and parse robots.txt
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||||
this.robots = robotsParser(this.robotsTxtUrl, response.data);
|
this.robots = robotsParser(this.robotsTxtUrl, response.data);
|
||||||
|
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!crawlerOptions?.ignoreSitemap){
|
if (!crawlerOptions?.ignoreSitemap){
|
||||||
|
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
if (sitemapLinks.length > 0) {
|
if (sitemapLinks.length > 0) {
|
||||||
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||||
@ -175,6 +180,7 @@ export class WebCrawler {
|
|||||||
inProgress?: (progress: Progress) => void,
|
inProgress?: (progress: Progress) => void,
|
||||||
): Promise<{ url: string, html: string }[]> {
|
): Promise<{ url: string, html: string }[]> {
|
||||||
const queue = async.queue(async (task: string, callback) => {
|
const queue = async.queue(async (task: string, callback) => {
|
||||||
|
Logger.debug(`Crawling ${task}`);
|
||||||
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
|
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
|
||||||
if (callback && typeof callback === "function") {
|
if (callback && typeof callback === "function") {
|
||||||
callback();
|
callback();
|
||||||
@ -216,16 +222,18 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
}, concurrencyLimit);
|
}, concurrencyLimit);
|
||||||
|
|
||||||
|
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
|
||||||
queue.push(
|
queue.push(
|
||||||
urls.filter(
|
urls.filter(
|
||||||
(url) =>
|
(url) =>
|
||||||
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
|
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
|
||||||
),
|
),
|
||||||
(err) => {
|
(err) => {
|
||||||
if (err) console.error(err);
|
if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
await queue.drain();
|
await queue.drain();
|
||||||
|
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
|
||||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -282,7 +290,6 @@ export class WebCrawler {
|
|||||||
const urlObj = new URL(fullUrl);
|
const urlObj = new URL(fullUrl);
|
||||||
const path = urlObj.pathname;
|
const path = urlObj.pathname;
|
||||||
|
|
||||||
|
|
||||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||||
if (this.isInternalLink(fullUrl) &&
|
if (this.isInternalLink(fullUrl) &&
|
||||||
this.noSections(fullUrl) &&
|
this.noSections(fullUrl) &&
|
||||||
@ -452,7 +459,7 @@ export class WebCrawler {
|
|||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
||||||
if (response) {
|
if (response) {
|
||||||
sitemapLinks = response;
|
sitemapLinks = response;
|
||||||
@ -467,7 +474,7 @@ export class WebCrawler {
|
|||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
|
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
|
import { Logger } from "../../../lib/logger";
|
||||||
|
|
||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
url: string
|
url: string
|
||||||
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||||
// Check for Readme Docs special case
|
// Check for Readme Docs special case
|
||||||
if (text.includes('<meta name="readme-deploy"')) {
|
if (text.includes('<meta name="readme-deploy"')) {
|
||||||
console.log(
|
Logger.debug(
|
||||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
@ -19,7 +21,7 @@ export async function handleCustomScraping(
|
|||||||
|
|
||||||
// Check for Vanta security portals
|
// Check for Vanta security portals
|
||||||
if (text.includes('<link href="https://static.vanta.com')) {
|
if (text.includes('<link href="https://static.vanta.com')) {
|
||||||
console.log(
|
Logger.debug(
|
||||||
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
@ -34,7 +36,7 @@ export async function handleCustomScraping(
|
|||||||
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||||
if (googleDriveMetaMatch) {
|
if (googleDriveMetaMatch) {
|
||||||
const url = googleDriveMetaMatch[1];
|
const url = googleDriveMetaMatch[1];
|
||||||
console.log(`Google Drive PDF link detected: ${url}`);
|
Logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||||
|
|
||||||
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
||||||
if (fileIdMatch) {
|
if (fileIdMatch) {
|
||||||
|
@ -19,6 +19,7 @@ import { generateCompletions } from "../../lib/LLM-extraction";
|
|||||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
||||||
|
import { Logger } from "../../lib/logger";
|
||||||
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
private bullJobId: string;
|
private bullJobId: string;
|
||||||
@ -89,14 +90,14 @@ export class WebScraperDataProvider {
|
|||||||
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
||||||
const jobStatus = await job.getState();
|
const jobStatus = await job.getState();
|
||||||
if (jobStatus === "failed") {
|
if (jobStatus === "failed") {
|
||||||
console.error(
|
Logger.info(
|
||||||
"Job has failed or has been cancelled by the user. Stopping the job..."
|
"Job has failed or has been cancelled by the user. Stopping the job..."
|
||||||
);
|
);
|
||||||
return [] as Document[];
|
return [] as Document[];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
Logger.error(error.message);
|
||||||
return [] as Document[];
|
return [] as Document[];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -270,7 +271,7 @@ export class WebScraperDataProvider {
|
|||||||
this.mode === "single_urls" && links.length > 0
|
this.mode === "single_urls" && links.length > 0
|
||||||
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
|
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
|
||||||
(error) => {
|
(error) => {
|
||||||
console.error("Failed to fetch sitemap data:", error);
|
Logger.debug(`Failed to fetch sitemap data: ${error}`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -460,7 +461,7 @@ export class WebScraperDataProvider {
|
|||||||
let documents: Document[] = [];
|
let documents: Document[] = [];
|
||||||
for (const url of urls) {
|
for (const url of urls) {
|
||||||
const normalizedUrl = this.normalizeUrl(url);
|
const normalizedUrl = this.normalizeUrl(url);
|
||||||
console.log(
|
Logger.debug(
|
||||||
"Getting cached document for web-scraper-cache:" + normalizedUrl
|
"Getting cached document for web-scraper-cache:" + normalizedUrl
|
||||||
);
|
);
|
||||||
const cachedDocumentString = await getValue(
|
const cachedDocumentString = await getValue(
|
||||||
|
@ -2,6 +2,7 @@ import axios from "axios";
|
|||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
import { logScrape } from "../../../services/logging/scrape_log";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
import { universalTimeout } from "../global";
|
import { universalTimeout } from "../global";
|
||||||
|
import { Logger } from "../../../lib/logger";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scrapes a URL with Axios
|
* Scrapes a URL with Axios
|
||||||
@ -34,9 +35,7 @@ export async function scrapWithFetch(
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
console.error(
|
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`);
|
||||||
`[Axios] Error fetching url: ${url} with status: ${response.status}`
|
|
||||||
);
|
|
||||||
logParams.error_message = response.statusText;
|
logParams.error_message = response.statusText;
|
||||||
logParams.response_code = response.status;
|
logParams.response_code = response.status;
|
||||||
return {
|
return {
|
||||||
@ -63,10 +62,10 @@ export async function scrapWithFetch(
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.code === "ECONNABORTED") {
|
if (error.code === "ECONNABORTED") {
|
||||||
logParams.error_message = "Request timed out";
|
logParams.error_message = "Request timed out";
|
||||||
console.log(`[Axios] Request timed out for ${url}`);
|
Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
|
||||||
} else {
|
} else {
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
|
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
|
||||||
}
|
}
|
||||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
|
|||||||
import { generateRequestParams } from "../single_url";
|
import { generateRequestParams } from "../single_url";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
import { universalTimeout } from "../global";
|
import { universalTimeout } from "../global";
|
||||||
|
import { Logger } from "../../../lib/logger";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scrapes a URL with Fire-Engine
|
* Scrapes a URL with Fire-Engine
|
||||||
@ -59,12 +60,10 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
||||||
|
|
||||||
console.log(
|
Logger.info(
|
||||||
`[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
|
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||||
);
|
);
|
||||||
|
|
||||||
// console.log(fireEngineOptionsParam)
|
|
||||||
|
|
||||||
const response = await axios.post(
|
const response = await axios.post(
|
||||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||||
{
|
{
|
||||||
@ -84,15 +83,15 @@ export async function scrapWithFireEngine({
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
console.error(
|
Logger.debug(
|
||||||
`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`
|
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`
|
||||||
);
|
);
|
||||||
|
|
||||||
logParams.error_message = response.data?.pageError;
|
logParams.error_message = response.data?.pageError;
|
||||||
logParams.response_code = response.data?.pageStatusCode;
|
logParams.response_code = response.data?.pageStatusCode;
|
||||||
|
|
||||||
if(response.data && response.data?.pageStatusCode !== 200) {
|
if(response.data && response.data?.pageStatusCode !== 200) {
|
||||||
console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`);
|
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -130,10 +129,10 @@ export async function scrapWithFireEngine({
|
|||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.code === "ECONNABORTED") {
|
if (error.code === "ECONNABORTED") {
|
||||||
console.log(`[Fire-Engine] Request timed out for ${url}`);
|
Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
|
||||||
logParams.error_message = "Request timed out";
|
logParams.error_message = "Request timed out";
|
||||||
} else {
|
} else {
|
||||||
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
}
|
}
|
||||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
|
@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
|
|||||||
import { generateRequestParams } from "../single_url";
|
import { generateRequestParams } from "../single_url";
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
import { universalTimeout } from "../global";
|
import { universalTimeout } from "../global";
|
||||||
|
import { Logger } from "../../../lib/logger";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scrapes a URL with Playwright
|
* Scrapes a URL with Playwright
|
||||||
@ -51,8 +52,8 @@ export async function scrapWithPlaywright(
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
console.error(
|
Logger.debug(
|
||||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
`⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
|
||||||
);
|
);
|
||||||
logParams.error_message = response.data?.pageError;
|
logParams.error_message = response.data?.pageError;
|
||||||
logParams.response_code = response.data?.pageStatusCode;
|
logParams.response_code = response.data?.pageStatusCode;
|
||||||
@ -86,8 +87,8 @@ export async function scrapWithPlaywright(
|
|||||||
};
|
};
|
||||||
} catch (jsonError) {
|
} catch (jsonError) {
|
||||||
logParams.error_message = jsonError.message || jsonError;
|
logParams.error_message = jsonError.message || jsonError;
|
||||||
console.error(
|
Logger.debug(
|
||||||
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
|
`⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
|
||||||
);
|
);
|
||||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
}
|
}
|
||||||
@ -95,10 +96,10 @@ export async function scrapWithPlaywright(
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.code === "ECONNABORTED") {
|
if (error.code === "ECONNABORTED") {
|
||||||
logParams.error_message = "Request timed out";
|
logParams.error_message = "Request timed out";
|
||||||
console.log(`[Playwright] Request timed out for ${url}`);
|
Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
|
||||||
} else {
|
} else {
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
|
||||||
}
|
}
|
||||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url";
|
|||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
import { universalTimeout } from "../global";
|
import { universalTimeout } from "../global";
|
||||||
import { ScrapingBeeClient } from "scrapingbee";
|
import { ScrapingBeeClient } from "scrapingbee";
|
||||||
|
import { Logger } from "../../../lib/logger";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scrapes a URL with ScrapingBee
|
* Scrapes a URL with ScrapingBee
|
||||||
@ -56,8 +57,8 @@ export async function scrapWithScrapingBee(
|
|||||||
text = decoder.decode(response.data);
|
text = decoder.decode(response.data);
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
} catch (decodeError) {
|
} catch (decodeError) {
|
||||||
console.error(
|
Logger.debug(
|
||||||
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
|
`⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
|
||||||
);
|
);
|
||||||
logParams.error_message = decodeError.message || decodeError;
|
logParams.error_message = decodeError.message || decodeError;
|
||||||
}
|
}
|
||||||
@ -72,7 +73,7 @@ export async function scrapWithScrapingBee(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
logParams.response_code = error.response?.status;
|
logParams.response_code = error.response?.status;
|
||||||
return {
|
return {
|
||||||
|
@@ -17,6 +17,7 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine";
 import { scrapWithPlaywright } from "./scrapers/playwright";
 import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
 import { extractLinks } from "./utils/utils";
+import { Logger } from "../../lib/logger";
 
 dotenv.config();
 
@@ -48,7 +49,7 @@ export async function generateRequestParams(
     return defaultParams;
   }
   } catch (error) {
-    console.error(`Error generating URL key: ${error}`);
+    Logger.error(`Error generating URL key: ${error}`);
     return defaultParams;
   }
 }
@@ -154,7 +155,6 @@ export async function scrapSingleUrl(
   }
 
   if (process.env.FIRE_ENGINE_BETA_URL) {
-    console.log(`Scraping ${url} with Fire Engine`);
     const response = await scrapWithFireEngine({
       url,
       waitFor: pageOptions.waitFor,
@@ -277,7 +277,7 @@ export async function scrapSingleUrl(
   try {
     urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
   } catch (error) {
-    console.error(`Invalid URL key, trying: ${urlToScrap}`);
+    Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
   }
   const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
   const scrapersInOrder = getScrapingFallbackOrder(
@@ -311,12 +311,18 @@ export async function scrapSingleUrl(
       pageError = undefined;
     }
 
-    if (text && text.trim().length >= 100) break;
-    if (pageStatusCode && pageStatusCode == 404) break;
-    const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
-    if (nextScraperIndex < scrapersInOrder.length) {
-      console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
+    if (text && text.trim().length >= 100) {
+      Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
+      break;
     }
+    if (pageStatusCode && pageStatusCode == 404) {
+      Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
+      break;
+    }
+    // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
+    // if (nextScraperIndex < scrapersInOrder.length) {
+    //   Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
+    // }
   }
 
   if (!text) {
@@ -372,7 +378,7 @@ export async function scrapSingleUrl(
 
     return document;
   } catch (error) {
-    console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
+    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
     return {
       content: "",
       markdown: "",

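The -311,+311 hunk above tightens the early-exit logic inside scrapSingleUrl's scraper loop, but only fragments of that loop appear as context. The following is a simplified, self-contained sketch of the fallback flow those break statements live in; the attempt callback and result shape are placeholders for illustration, not the file's real helpers.

type ScrapeAttempt = { text: string; pageStatusCode?: number; pageError?: string };

// Sketch only: the per-scraper attempt is injected so the loop stays self-contained.
async function scrapeWithFallback(
  urlToScrap: string,
  scrapersInOrder: string[],
  attempt: (url: string, scraper: string) => Promise<ScrapeAttempt>
): Promise<ScrapeAttempt> {
  let result: ScrapeAttempt = { text: "" };
  for (const scraper of scrapersInOrder) {
    result = await attempt(urlToScrap, scraper);
    // Enough content was extracted: stop falling back.
    if (result.text && result.text.trim().length >= 100) break;
    // A definite 404 is also final: no later scraper can make a missing page exist.
    if (result.pageStatusCode === 404) break;
    // Otherwise continue with the next scraper in the fallback order.
  }
  return result;
}

Treating a 404 as terminal is a deliberate choice: retrying the remaining scrapers on a page that is known to be missing would only waste requests.
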
@@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 import { scrapWithFireEngine } from "./scrapers/fireEngine";
 import { WebCrawler } from "./crawler";
+import { Logger } from "../../lib/logger";
 
 export async function getLinksFromSitemap(
   {
@@ -26,7 +27,7 @@ export async function getLinksFromSitemap(
       content = response.html;
     }
   } catch (error) {
-    console.error(`Request failed for ${sitemapUrl}: ${error}`);
+    Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
 
     return allUrls;
   }
@@ -48,7 +49,7 @@ export async function getLinksFromSitemap(
       }
     }
   } catch (error) {
-    console.error(`Error processing ${sitemapUrl}: ${error}`);
+    Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
   }
 
   return allUrls;

@@ -1,3 +1,4 @@
+import { Logger } from '../../../../lib/logger';
 import { isUrlBlocked } from '../blocklist';
 
 describe('isUrlBlocked', () => {
@@ -19,7 +20,7 @@ describe('isUrlBlocked', () => {
 
   blockedUrls.forEach(url => {
     if (!isUrlBlocked(url)) {
-      console.log(`URL not blocked: ${url}`);
+      Logger.debug(`URL not blocked: ${url}`);
     }
     expect(isUrlBlocked(url)).toBe(true);
   });

@@ -1,3 +1,5 @@
+import { Logger } from "../../../lib/logger";
 
 const socialMediaBlocklist = [
   'facebook.com',
   'x.com',
@@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean {
     return isBlocked;
   } catch (e) {
     // If an error occurs (e.g., invalid URL), return false
-    console.error(`Error parsing the following URL: ${url}`);
+    Logger.error(`Error parsing the following URL: ${url}`);
     return false;
   }
 }

@@ -1,5 +1,6 @@
 import Anthropic from '@anthropic-ai/sdk';
 import axios from 'axios';
+import { Logger } from '../../../lib/logger';
 
 export async function getImageDescription(
   imageUrl: string,
@@ -82,7 +83,7 @@ export async function getImageDescription(
       }
     }
   } catch (error) {
-    console.error("Error generating image alt text:", error?.message);
+    Logger.error(`Error generating image alt text: ${error}`);
     return "";
   }
 }

@@ -1,4 +1,6 @@
 import { CheerioAPI } from "cheerio";
+import { Logger } from "../../../lib/logger";
 
 interface Metadata {
   title?: string;
   description?: string;
@@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
 
   } catch (error) {
-    console.error("Error extracting metadata:", error);
+    Logger.error(`Error extracting metadata: ${error}`);
   }
 
   return {

@@ -7,6 +7,7 @@ import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
 import { axiosTimeout } from "../../../lib/timeout";
+import { Logger } from "../../../lib/logger";
 
 dotenv.config();
 
@@ -39,6 +40,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   let content = "";
 
   if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
+    Logger.debug("Processing pdf document w/ LlamaIndex");
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -81,7 +83,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
         await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
       }
     } catch (error) {
-      console.error("Error fetching result w/ LlamaIndex");
+      Logger.debug("Error fetching result w/ LlamaIndex");
       attempt++;
       await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
       // You may want to handle specific errors differently
@@ -93,7 +95,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
     }
     content = resultResponse.data[resultType];
   } catch (error) {
-    console.error("Error processing pdf document w/ LlamaIndex(2)");
+    Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
     content = await processPdf(filePath);
   }
   } else if (parsePDF) {

@@ -1,3 +1,4 @@
+import { Logger } from "../../../lib/logger";
 import { Document } from "../../../lib/entities";
 
 export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
@@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
 
     return documents;
   } catch (error) {
-    console.error("Error replacing paths with absolute paths", error);
+    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
     return documents;
   }
 };
@@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
 
     return documents;
   } catch (error) {
-    console.error("Error replacing img paths with absolute paths", error);
+    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
     return documents;
   }
 };

@@ -1,5 +1,6 @@
 import axios from "axios";
 import * as cheerio from "cheerio";
+import { Logger } from "../../../lib/logger";
 
 
 export async function attemptScrapWithRequests(
@@ -9,13 +10,13 @@ export async function attemptScrapWithRequests(
     const response = await axios.get(urlToScrap, { timeout: 15000 });
 
     if (!response.data) {
-      console.log("Failed normal requests as well");
+      Logger.debug("Failed normal requests as well");
      return null;
     }
 
     return response.data;
   } catch (error) {
-    console.error(`Error in attemptScrapWithRequests: ${error}`);
+    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
     return null;
   }
 }

@@ -2,6 +2,7 @@ import axios from 'axios';
 import * as cheerio from 'cheerio';
 import * as querystring from 'querystring';
 import { SearchResult } from '../../src/lib/entities';
+import { Logger } from '../../src/lib/logger';
 
 const _useragent_list = [
   'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
@@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results
       await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
     } catch (error) {
       if (error.message === 'Too many requests') {
-        console.warn('Too many requests, breaking the loop');
+        Logger.warn('Too many requests, breaking the loop');
         break;
       }
       throw error;
@@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results
     }
   }
   if (attempts >= maxAttempts) {
-    console.warn('Max attempts reached, breaking the loop');
+    Logger.warn('Max attempts reached, breaking the loop');
   }
   return results
 }

@@ -1,3 +1,4 @@
+import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
 import { google_search } from "./googlesearch";
 import { serper_search } from "./serper";
@@ -47,7 +48,7 @@ export async function search({
       timeout
     );
   } catch (error) {
-    console.error("Error in search function: ", error);
+    Logger.error(`Error in search function: ${error}`);
     return []
   }
   // if process.env.SERPER_API_KEY is set, use serper

@@ -1,3 +1,4 @@
+import { Logger } from "../../../src/lib/logger";
 import { getWebScraperQueue } from "../queue-service";
 import { sendSlackWebhook } from "./slack";
 
@@ -9,13 +10,13 @@ export async function checkAlerts() {
     process.env.ALERT_NUM_ACTIVE_JOBS &&
     process.env.ALERT_NUM_WAITING_JOBS
   ) {
-    console.info("Initializing alerts");
+    Logger.info("Initializing alerts");
     const checkActiveJobs = async () => {
       try {
         const webScraperQueue = getWebScraperQueue();
         const activeJobs = await webScraperQueue.getActiveCount();
         if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
-          console.warn(
+          Logger.warn(
             `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
           );
           sendSlackWebhook(
@@ -23,12 +24,12 @@ export async function checkAlerts() {
             true
           );
         } else {
-          console.info(
+          Logger.info(
             `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
           );
         }
       } catch (error) {
-        console.error("Failed to check active jobs:", error);
+        Logger.error(`Failed to check active jobs: ${error}`);
       }
     };
 
@@ -38,7 +39,7 @@ export async function checkAlerts() {
       const paused = await webScraperQueue.getPausedCount();
 
      if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
-        console.warn(
+        Logger.warn(
          `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
         );
         sendSlackWebhook(
@@ -57,6 +58,6 @@ export async function checkAlerts() {
     // setInterval(checkAll, 10000); // Run every
   }
   } catch (error) {
-    console.error("Failed to initialize alerts:", error);
+    Logger.error(`Failed to initialize alerts: ${error}`);
   }
 }

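checkAlerts only arms itself when the alert thresholds are present in the environment, and it coerces them with Number(...), which yields NaN when a variable is unset. As a small, hypothetical illustration of that read-and-validate step (the function and parameter names below are made up and are not part of this diff):

// Illustrative helper: read a numeric alert threshold and compare a queue count against it.
async function alertIfOver(
  envName: string, // e.g. "ALERT_NUM_ACTIVE_JOBS" or "ALERT_NUM_WAITING_JOBS"
  getCount: () => Promise<number>, // e.g. a queue's active or waiting counter
  notify: (message: string) => void
): Promise<void> {
  const threshold = Number(process.env[envName]);
  // Number(undefined) is NaN, so an unset variable silently disables the alert.
  if (!Number.isFinite(threshold)) return;
  const count = await getCount();
  if (count > threshold) {
    notify(`Alert: ${envName} threshold of ${threshold} exceeded, current count: ${count}.`);
  }
}
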
@@ -1,4 +1,5 @@
 import axios from "axios";
+import { Logger } from "../../../src/lib/logger";
 
 export async function sendSlackWebhook(
   message: string,
@@ -16,8 +17,8 @@ export async function sendSlackWebhook(
         "Content-Type": "application/json",
       },
     });
-    console.log("Webhook sent successfully:", response.data);
+    Logger.log("Webhook sent successfully:", response.data);
   } catch (error) {
-    console.error("Error sending webhook:", error);
+    Logger.debug(`Error sending webhook: ${error}`);
   }
 }

@@ -2,6 +2,7 @@ import { NotificationType } from "../../types";
 import { withAuth } from "../../lib/withAuth";
 import { sendNotification } from "../notification/email_notification";
 import { supabase_service } from "../supabase";
+import { Logger } from "../../lib/logger";
 
 const FREE_CREDITS = 500;
 
@@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) {
   if (team_id === "preview") {
     return { success: true, message: "Preview team, no credits used" };
   }
-  console.log(`Billing team ${team_id} for ${credits} credits`);
+  Logger.info(`Billing team ${team_id} for ${credits} credits`);
   // When the API is used, you can log the credit usage in the credit_usage table:
   // team_id: The ID of the team using the API.
   // subscription_id: The ID of the team's active subscription.
@@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
       0
     );
 
-    console.log("totalCreditsUsed", totalCreditsUsed);
+    Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
 
     const end = new Date();
     end.setDate(end.getDate() + 30);
@@ -262,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     });
 
     if (creditUsageError) {
-      console.error("Error calculating credit usage:", creditUsageError);
+      Logger.error(`Error calculating credit usage: ${creditUsageError}`);
     }
 
     if (creditUsages && creditUsages.length > 0) {
       totalCreditsUsed = creditUsages[0].total_credits_used;
     }
   } catch (error) {
-    console.error("Error calculating credit usage:", error);
+    Logger.error(`Error calculating credit usage: ${error}`);
   }
 
   // Adjust total credits used by subtracting coupon value

@@ -1,5 +1,6 @@
 import { Request } from "express";
 import { supabase_service } from "../supabase";
+import { Logger } from "../../../src/lib/logger";
 
 export async function createIdempotencyKey(
   req: Request,
@@ -14,7 +15,7 @@ export async function createIdempotencyKey(
     .insert({ key: idempotencyKey });
 
   if (error) {
-    console.error("Failed to create idempotency key:", error);
+    Logger.error(`Failed to create idempotency key: ${error}`);
     throw error;
   }
 

@@ -1,6 +1,7 @@
 import { Request } from "express";
 import { supabase_service } from "../supabase";
 import { validate as isUuid } from 'uuid';
+import { Logger } from "../../../src/lib/logger";
 
 export async function validateIdempotencyKey(
   req: Request,
@@ -13,7 +14,7 @@ export async function validateIdempotencyKey(
   // Ensure idempotencyKey is treated as a string
   const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey;
   if (!isUuid(key)) {
-    console.error("Invalid idempotency key provided in the request headers.");
+    Logger.debug("Invalid idempotency key provided in the request headers.");
     return false;
   }
 
@@ -23,7 +24,7 @@ export async function validateIdempotencyKey(
     .eq("key", idempotencyKey);
 
   if (error) {
-    console.error(error);
+    Logger.error(`Error validating idempotency key: ${error}`);
   }
 
   if (!data || data.length === 0) {

@@ -1,4 +1,5 @@
 import { supabase_service } from "../supabase";
+import { Logger } from "../../../src/lib/logger";
 import "dotenv/config";
 
 export async function logCrawl(job_id: string, team_id: string) {
@@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) {
         },
       ]);
   } catch (error) {
-    console.error("Error logging crawl job:\n", error);
+    Logger.error(`Error logging crawl job to supabase:\n${error}`);
   }
   }
 }

@@ -3,6 +3,7 @@ import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
 import { posthog } from "../posthog";
 import "dotenv/config";
+import { Logger } from "../../lib/logger";
 
 export async function logJob(job: FirecrawlJob) {
   try {
@@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) {
       posthog.capture(phLog);
     }
     if (error) {
-      console.error("Error logging job:\n", error);
+      Logger.error(`Error logging job: ${error.message}`);
     }
   } catch (error) {
-    console.error("Error logging job:\n", error);
+    Logger.error(`Error logging job: ${error.message}`);
   }
 }

@@ -2,15 +2,16 @@ import "dotenv/config";
 import { ScrapeLog } from "../../types";
 import { supabase_service } from "../supabase";
 import { PageOptions } from "../../lib/entities";
+import { Logger } from "../../lib/logger";
 
 export async function logScrape(
   scrapeLog: ScrapeLog,
   pageOptions?: PageOptions
 ) {
   if (process.env.USE_DB_AUTHENTICATION === "false") {
+    Logger.debug("Skipping logging scrape to Supabase");
     return;
   }
 
   try {
     // Only log jobs in production
     // if (process.env.ENV !== "production") {
@@ -43,9 +44,9 @@ export async function logScrape(
       ]);
 
     if (error) {
-      console.error("Error logging proxy:\n", error);
+      Logger.error(`Error logging proxy:\n${error}`);
     }
   } catch (error) {
-    console.error("Error logging proxy:\n", error);
+    Logger.error(`Error logging proxy:\n${error}`);
   }
 }

@@ -1,19 +1,20 @@
 import { Logtail } from "@logtail/node";
 import "dotenv/config";
+import { Logger } from "../lib/logger";
 
 // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
 class MockLogtail {
   info(message: string, context?: Record<string, any>): void {
-    console.log(message, context);
+    Logger.debug(`${message} - ${context}`);
   }
   error(message: string, context: Record<string, any> = {}): void {
-    console.error(message, context);
+    Logger.error(`${message} - ${context}`);
   }
 }
 
 // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
 // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
 export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
-  console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
+  Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
   return new MockLogtail();
 })();

@@ -2,6 +2,7 @@ import { supabase_service } from "../supabase";
 import { withAuth } from "../../lib/withAuth";
 import { Resend } from "resend";
 import { NotificationType } from "../../types";
+import { Logger } from "../../../src/lib/logger";
 
 const emailTemplates: Record<
   NotificationType,
@@ -52,11 +53,11 @@ async function sendEmailNotification(
     });
 
     if (error) {
-      console.error("Error sending email: ", error);
+      Logger.debug(`Error sending email: ${error}`);
       return { success: false };
     }
   } catch (error) {
-    console.error("Error sending email (2): ", error);
+    Logger.debug(`Error sending email (2): ${error}`);
     return { success: false };
   }
 }
@@ -79,7 +80,7 @@ export async function sendNotificationInternal(
     .lte("sent_date", endDateString);
 
   if (error) {
-    console.error("Error fetching notifications: ", error);
+    Logger.debug(`Error fetching notifications: ${error}`);
     return { success: false };
   }
 
@@ -93,7 +94,7 @@ export async function sendNotificationInternal(
     .eq("team_id", team_id);
 
   if (emailsError) {
-    console.error("Error fetching emails: ", emailsError);
+    Logger.debug(`Error fetching emails: ${emailsError}`);
     return { success: false };
   }
 
@@ -112,7 +113,7 @@ export async function sendNotificationInternal(
     ]);
 
   if (insertError) {
-    console.error("Error inserting notification record: ", insertError);
+    Logger.debug(`Error inserting notification record: ${insertError}`);
     return { success: false };
   }
 

@@ -1,5 +1,6 @@
 import { PostHog } from 'posthog-node';
 import "dotenv/config";
+import { Logger } from '../../src/lib/logger';
 
 export default function PostHogClient() {
   const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, {
@@ -19,7 +20,7 @@ class MockPostHog {
 export const posthog = process.env.POSTHOG_API_KEY
   ? PostHogClient()
   : (() => {
-      console.warn(
+      Logger.warn(
         "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more."
       );
       return new MockPostHog();

@@ -1,5 +1,6 @@
 import Queue from "bull";
 import { Queue as BullQueue } from "bull";
+import { Logger } from "../lib/logger";
 
 let webScraperQueue: BullQueue;
 
@@ -16,7 +17,7 @@ export function getWebScraperQueue() {
         attempts: 5
       }
     });
-    console.log("Web scraper queue created");
+    Logger.info("Web scraper queue created");
   }
   return webScraperQueue;
 }

@@ -7,8 +7,9 @@ import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
 import { initSDK } from '@hyperdx/node-opentelemetry';
 import { Job } from "bull";
+import { Logger } from "../lib/logger";
 
-if(process.env.ENV === 'production') {
+if (process.env.ENV === 'production') {
   initSDK({
     consoleCapture: true,
     additionalInstrumentations: [],
@@ -18,7 +19,7 @@ if(process.env.ENV === 'production') {
 const wsq = getWebScraperQueue();
 
 async function processJob(job: Job, done) {
-  console.log("taking job", job.id);
+  Logger.debug(`🐂 Worker taking job ${job.id}`);
   try {
     job.progress({
       current: 1,
@@ -58,18 +59,18 @@ async function processJob(job: Job, done) {
       pageOptions: job.data.pageOptions,
       origin: job.data.origin,
     });
-    console.log("job done", job.id);
+    Logger.debug(`🐂 Job done ${job.id}`);
     done(null, data);
   } catch (error) {
-    console.log("job errored", job.id, error);
+    Logger.error(`🐂 Job errored ${job.id} - ${error}`);
     if (await getWebScraperQueue().isPaused(false)) {
-      console.log("queue is paused, ignoring");
+      Logger.debug("🐂Queue is paused, ignoring");
       return;
     }
 
     if (error instanceof CustomError) {
       // Here we handle the error, then save the failed job
-      console.error(error.message); // or any other error handling
+      Logger.error(error.message); // or any other error handling
 
       logtail.error("Custom error while ingesting", {
         job_id: job.id,
@@ -77,7 +78,7 @@ async function processJob(job: Job, done) {
         dataIngestionJob: error.dataIngestionJob,
       });
     }
-    console.log(error);
+    Logger.error(error);
 
     logtail.error("Overall error ingesting", {
       job_id: job.id,

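The worker hunks above only touch logging inside processJob; how a (job, done)-style handler gets attached to the Bull queue is implied rather than shown. A minimal, hypothetical wiring looks like the following — the queue name, concurrency, and Redis URL are assumptions for illustration, not values taken from this diff.

import Queue from "bull";

// Sketch only: attach a callback-style processor to a Bull queue.
const queue = new Queue("web-scraper", process.env.REDIS_URL ?? "redis://localhost:6379");

queue.process(8, (job, done) => {
  Promise.resolve()
    .then(async () => {
      // A real worker would run the crawl/scrape described by job.data here.
      return { jobId: job.id };
    })
    .then((result) => done(null, result))
    .catch((err) => done(err as Error));
});

Because the handler declares a done parameter, Bull treats it as callback-style and waits for done to be called before marking the job completed or failed.
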
@@ -1,14 +1,15 @@
 import Redis from "ioredis";
 import { redisRateLimitClient } from "./rate-limiter";
+import { Logger } from "../lib/logger";
 
 // Listen to 'error' events to the Redis connection
 redisRateLimitClient.on("error", (error) => {
   try {
     if (error.message === "ECONNRESET") {
-      console.log("Connection to Redis Session Store timed out.");
+      Logger.error("Connection to Redis Session Rate Limit Store timed out.");
     } else if (error.message === "ECONNREFUSED") {
-      console.log("Connection to Redis Session Store refused!");
-    } else console.log(error);
+      Logger.error("Connection to Redis Session Rate Limit Store refused!");
+    } else Logger.error(error);
   } catch (error) {}
 });
 
@@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => {
 redisRateLimitClient.on("reconnecting", (err) => {
   try {
     if (redisRateLimitClient.status === "reconnecting")
-      console.log("Reconnecting to Redis Session Store...");
-    else console.log("Error reconnecting to Redis Session Store.");
+      Logger.info("Reconnecting to Redis Session Rate Limit Store...");
+    else Logger.error("Error reconnecting to Redis Session Rate Limit Store.");
   } catch (error) {}
 });
 
 // Listen to the 'connect' event to Redis
 redisRateLimitClient.on("connect", (err) => {
   try {
-    if (!err) console.log("Connected to Redis Session Store!");
+    if (!err) Logger.info("Connected to Redis Session Rate Limit Store!");
   } catch (error) {}
 });
 

@@ -1,4 +1,5 @@
 import { createClient, SupabaseClient } from "@supabase/supabase-js";
+import { Logger } from "../lib/logger";
 
 // SupabaseService class initializes the Supabase client conditionally based on environment variables.
 class SupabaseService {
@@ -10,13 +11,13 @@ class SupabaseService {
     // Only initialize the Supabase client if both URL and Service Token are provided.
     if (process.env.USE_DB_AUTHENTICATION === "false") {
       // Warn the user that Authentication is disabled by setting the client to null
-      console.warn(
-        "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
+      Logger.warn(
+        "Authentication is disabled. Supabase client will not be initialized."
       );
       this.client = null;
     } else if (!supabaseUrl || !supabaseServiceToken) {
-      console.error(
-        "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
+      Logger.error(
+        "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
       );
     } else {
       this.client = createClient(supabaseUrl, supabaseServiceToken);
@@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy(
   new SupabaseService(),
   {
     get: function (target, prop, receiver) {
+      if (process.env.USE_DB_AUTHENTICATION === "false") {
+        Logger.debug(
+          "Attempted to access Supabase client when it's not configured."
+        );
+      }
       const client = target.getClient();
       // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
       if (client === null) {
-        console.error(
+        Logger.error(
           "Attempted to access Supabase client when it's not configured."
         );
         return () => {

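Both supabase.ts files in this commit wrap the service in a Proxy so that touching an unconfigured client logs a message and degrades to a no-op instead of crashing on undefined. Stripped of the Supabase specifics, the underlying pattern looks roughly like this; the type parameter, function name, and warn callback are placeholders for illustration, not symbols from the repository.

// Generic sketch of the "guarded client" Proxy pattern used in the supabase.ts hunks.
function guardedClient<T extends object>(
  getClient: () => T | null,
  warn: (message: string) => void
): T {
  return new Proxy({} as T, {
    get(_target, prop) {
      const client = getClient();
      if (client === null) {
        warn(`Attempted to access '${String(prop)}' on a client that is not configured.`);
        // Fail softly: callers get a harmless no-op instead of a crash.
        return () => {};
      }
      const value = Reflect.get(client, prop);
      // Bind methods so they keep the real client as their `this`.
      return typeof value === "function" ? value.bind(client) : value;
    },
  });
}

The actual files apply the same idea with SupabaseService.getClient() and Logger, returning a stub function when USE_DB_AUTHENTICATION has disabled initialization.
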
@@ -1,3 +1,4 @@
+import { Logger } from "../../src/lib/logger";
 import { supabase_service } from "./supabase";
 
 export const callWebhook = async (teamId: string, jobId: string,data: any) => {
@@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
       .eq("team_id", teamId)
       .limit(1);
     if (error) {
-      console.error(
-        `Error fetching webhook URL for team ID: ${teamId}`,
-        error.message
-      );
+      Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`);
       return null;
     }
 
@@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
       }),
     });
   } catch (error) {
-    console.error(
-      `Error sending webhook for team ID: ${teamId}`,
-      error.message
-    );
+    Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`);
   }
 };

@@ -1,5 +1,7 @@
 import { createClient, SupabaseClient } from "@supabase/supabase-js";
 import "dotenv/config";
+import { Logger } from "../../api/src/lib/logger";
 
 // SupabaseService class initializes the Supabase client conditionally based on environment variables.
 class SupabaseService {
   private client: SupabaseClient | null = null;
@@ -10,13 +12,13 @@ class SupabaseService {
     // Only initialize the Supabase client if both URL and Service Token are provided.
     if (process.env.USE_DB_AUTHENTICATION === "false") {
       // Warn the user that Authentication is disabled by setting the client to null
-      console.warn(
-        "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
+      Logger.warn(
+        "Authentication is disabled. Supabase client will not be initialized."
      );
       this.client = null;
     } else if (!supabaseUrl || !supabaseServiceToken) {
-      console.error(
-        "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
+      Logger.error(
+        "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
       );
     } else {
       this.client = createClient(supabaseUrl, supabaseServiceToken);
@@ -35,10 +37,15 @@ export const supabase_service: SupabaseClient = new Proxy(
   new SupabaseService(),
   {
     get: function (target, prop, receiver) {
+      if (process.env.USE_DB_AUTHENTICATION === "false") {
+        Logger.debug(
+          "Attempted to access Supabase client when it's not configured."
+        );
+      }
       const client = target.getClient();
       // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
       if (client === null) {
-        console.error(
+        Logger.error(
           "Attempted to access Supabase client when it's not configured."
         );
         return () => {