This commit is contained in:
rafaelsideguide 2024-07-25 17:48:43 -03:00
commit e0954d7f59
55 changed files with 463 additions and 174 deletions

View File

@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL=
# Resend API Key for transactional emails
RESEND_API_KEY=
# LOGGING_LEVEL determines the verbosity of logs that the system will output.
# Available levels are:
# NONE - No logs will be output.
# ERROR - For logging error messages that indicate a failure in a specific operation.
# WARN - For logging potentially harmful situations that are not necessarily errors.
# INFO - For logging informational messages that highlight the progress of the application.
# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
# TRACE - For logging more detailed information than the DEBUG level.
# Set LOGGING_LEVEL to one of the above options to control logging output.
LOGGING_LEVEL=INFO

View File

@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
console.log(crawlData)
expect(crawlData.length).toBeGreaterThan(0);
expect(crawlData).toEqual(expect.arrayContaining([
expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),

View File

@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode);
@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
api_key
});
} catch (error) {
console.error('Error setting trace attributes:', error);
Logger.error(`Error setting trace attributes: ${error.message}`);
}
}
@ -82,7 +83,7 @@ export async function supaAuthenticateUser(
// $$ language plpgsql;
if (error) {
console.error('Error fetching key and price_id:', error);
Logger.warn(`Error fetching key and price_id: ${error.message}`);
} else {
// console.log('Key and Price ID:', data);
}
@ -135,7 +136,7 @@ export async function supaAuthenticateUser(
try {
await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) {
console.error(rateLimiterRes);
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

View File

@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
import { Logger } from "../../src/lib/logger";
export async function crawlCancelController(req: Request, res: Response) {
try {
@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) {
const { partialDocs } = await job.progress();
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
console.log("Billing team for partial docs...");
Logger.info("Billing team for partial docs...");
// Note: the credits that we will bill them here might be lower than the actual
// due to promises that are not yet resolved
await billTeam(team_id, partialDocs.length);
@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true);
} catch (error) {
console.error(error);
Logger.error(error);
}
const newJobState = await job.getState();
@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) {
status: "cancelled"
});
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlStatusController(req: Request, res: Response) {
try {
@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) {
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -10,6 +10,8 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
export async function crawlController(req: Request, res: Response) {
try {
@ -30,7 +32,7 @@ export async function crawlController(req: Request, res: Response) {
try {
createIdempotencyKey(req);
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
@ -60,10 +62,11 @@ export async function crawlController(req: Request, res: Response) {
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
if (mode === "single_urls" && !url.includes(",")) {
if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
try {
const a = new WebScraperDataProvider();
await a.setOptions({
jobId: uuidv4(),
mode: "single_urls",
urls: [url],
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
@ -83,7 +86,7 @@ export async function crawlController(req: Request, res: Response) {
documents: docs,
});
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
@ -101,7 +104,7 @@ export async function crawlController(req: Request, res: Response) {
res.json({ jobId: job.id });
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -3,6 +3,7 @@ import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";
export async function crawlPreviewController(req: Request, res: Response) {
try {
@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
res.json({ jobId: job.id });
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -9,8 +9,11 @@ import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
import { v4 as uuidv4 } from "uuid";
import { Logger } from '../lib/logger';
export async function scrapeHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
@ -35,6 +38,7 @@ export async function scrapeHelper(
const a = new WebScraperDataProvider();
await a.setOptions({
jobId,
mode: "single_urls",
urls: [url],
crawlerOptions: {
@ -112,7 +116,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
console.error(error);
Logger.error(error);
earlyReturn = true;
return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
}
@ -127,8 +131,11 @@ export async function scrapeController(req: Request, res: Response) {
checkCredits();
}
const jobId = uuidv4();
const startTime = new Date().getTime();
const result = await scrapeHelper(
jobId,
req,
team_id,
crawlerOptions,
@ -169,6 +176,7 @@ export async function scrapeController(req: Request, res: Response) {
}
logJob({
job_id: jobId,
success: result.success,
message: result.error,
num_docs: 1,
@ -188,7 +196,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(result.returnCode).json(result);
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -7,8 +7,11 @@ import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";
export async function searchHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
@ -75,6 +78,7 @@ export async function searchHelper(
const a = new WebScraperDataProvider();
await a.setOptions({
jobId,
mode: "single_urls",
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
crawlerOptions: {
@ -148,6 +152,8 @@ export async function searchController(req: Request, res: Response) {
const searchOptions = req.body.searchOptions ?? { limit: 7 };
const jobId = uuidv4();
try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
@ -155,11 +161,12 @@ export async function searchController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
const startTime = new Date().getTime();
const result = await searchHelper(
jobId,
req,
team_id,
crawlerOptions,
@ -169,6 +176,7 @@ export async function searchController(req: Request, res: Response) {
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: jobId,
success: result.success,
message: result.error,
num_docs: result.data ? result.data.length : 0,
@ -183,7 +191,7 @@ export async function searchController(req: Request, res: Response) {
});
return res.status(result.returnCode).json(result);
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -1,6 +1,7 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {
@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -4,6 +4,7 @@ async function example() {
const example = new WebScraperDataProvider();
await example.setOptions({
jobId: "TEST",
mode: "crawl",
urls: ["https://mendable.ai"],
crawlerOptions: {},

View File

@ -12,16 +12,18 @@ import { sendSlackWebhook } from "./services/alerts/slack";
import { checkAlerts } from "./services/alerts";
import Redis from "ioredis";
import { redisRateLimitClient } from "./services/rate-limiter";
import { Logger } from "./lib/logger";
import { ScrapeEvents } from "./lib/scrape-events";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
console.log(`Number of CPUs: ${numCPUs} available`);
Logger.info(`Number of CPUs: ${numCPUs} available`);
if (cluster.isMaster) {
console.log(`Master ${process.pid} is running`);
Logger.info(`Master ${process.pid} is running`);
// Fork workers.
for (let i = 0; i < numCPUs; i++) {
@ -30,8 +32,8 @@ if (cluster.isMaster) {
cluster.on("exit", (worker, code, signal) => {
if (code !== null) {
console.log(`Worker ${worker.process.pid} exited`);
console.log("Starting a new worker");
Logger.info(`Worker ${worker.process.pid} exited`);
Logger.info("Starting a new worker");
cluster.fork();
}
});
@ -81,14 +83,9 @@ if (cluster.isMaster) {
function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
console.log(`Worker ${process.pid} listening on port ${port}`);
console.log(
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
console.log("");
console.log("1. Make sure Redis is running on port 6379 by default");
console.log(
"2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
Logger.info(`Worker ${process.pid} listening on port ${port}`);
Logger.info(
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
});
return server;
@ -114,7 +111,7 @@ if (cluster.isMaster) {
noActiveJobs,
});
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
});
@ -132,7 +129,7 @@ if (cluster.isMaster) {
waitingJobs,
});
} catch (error) {
console.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
});
@ -177,13 +174,13 @@ if (cluster.isMaster) {
});
if (!response.ok) {
console.error("Failed to send Slack notification");
Logger.error("Failed to send Slack notification");
}
}
}, timeout);
}
} catch (error) {
console.error(error);
Logger.debug(error);
}
};
@ -198,7 +195,7 @@ if (cluster.isMaster) {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
console.error("Failed to initialize alerts:", error);
Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
@ -207,6 +204,7 @@ if (cluster.isMaster) {
app.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
async (req, res) => {
Logger.info("🐂 Cleaning jobs older than 24h");
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;
@ -241,12 +239,12 @@ if (cluster.isMaster) {
await job.remove();
count++;
} catch (jobError) {
console.error(`Failed to remove job with ID ${job.id}:`, jobError);
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}` );
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
console.error("Failed to clean last 24h complete jobs:", error);
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
@ -272,7 +270,7 @@ if (cluster.isMaster) {
queueRedisHealth = await queueRedis.get(testKey);
await queueRedis.del(testKey);
} catch (error) {
console.error("queueRedis health check failed:", error);
Logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null;
}
@ -283,7 +281,7 @@ if (cluster.isMaster) {
redisRateLimitHealth = await redisRateLimitClient.get(testKey);
await redisRateLimitClient.del(testKey);
} catch (error) {
console.error("redisRateLimitClient health check failed:", error);
Logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null;
}
@ -297,12 +295,12 @@ if (cluster.isMaster) {
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
console.log("Both Redis instances are healthy");
Logger.info("Both Redis instances are healthy");
return res
.status(200)
.json({ status: "healthy", details: healthStatus });
} else {
console.log("Redis instances health check:", healthStatus);
Logger.info(`Redis instances health check: ${JSON.stringify(healthStatus)}`);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${JSON.stringify(
healthStatus
@ -314,7 +312,7 @@ if (cluster.isMaster) {
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
console.error("Redis health check failed:", error);
Logger.error(`Redis health check failed: ${error}`);
await sendSlackWebhook(
`[REDIS DOWN] Redis instances health check: ${error.message}`,
true
@ -326,5 +324,14 @@ if (cluster.isMaster) {
}
);
console.log(`Worker ${process.pid} started`);
Logger.info(`Worker ${process.pid} started`);
}
const wsq = getWebScraperQueue();
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

View File

@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";
import { Logger } from "../logger";
// Generate completion using OpenAI
export async function generateCompletions(
@ -44,7 +45,7 @@ export async function generateCompletions(
return completionResult;
} catch (error) {
console.error(`Error generating completions: ${error}`);
Logger.error(`Error generating completions: ${error}`);
throw new Error(`Error generating completions: ${error.message}`);
}
default:

View File

@ -56,6 +56,7 @@ export type CrawlerOptions = {
}
export type WebScraperOptions = {
jobId: string;
urls: string[];
mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: CrawlerOptions;

View File

@ -0,0 +1,53 @@
enum LogLevel {
NONE = 'NONE', // No logs will be output.
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
}
export class Logger {
static colors = {
ERROR: '\x1b[31m%s\x1b[0m', // Red
WARN: '\x1b[33m%s\x1b[0m', // Yellow
INFO: '\x1b[34m%s\x1b[0m', // Blue
DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
TRACE: '\x1b[35m%s\x1b[0m' // Magenta
};
static log (message: string, level: LogLevel) {
const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
const currentLevelIndex = levels.indexOf(logLevel);
const messageLevelIndex = levels.indexOf(level);
if (currentLevelIndex >= messageLevelIndex) {
const color = Logger.colors[level];
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
// if (process.env.USE_DB_AUTH) {
// save to supabase? another place?
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
// }
}
}
static error(message: string | any) {
Logger.log(message, LogLevel.ERROR);
}
static warn(message: string) {
Logger.log(message, LogLevel.WARN);
}
static info(message: string) {
Logger.log(message, LogLevel.INFO);
}
static debug(message: string) {
Logger.log(message, LogLevel.DEBUG);
}
static trace(message: string) {
Logger.log(message, LogLevel.TRACE);
}
}

View File

@ -0,0 +1,84 @@
import { Job, JobId } from "bull";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
export type ScrapeErrorEvent = {
type: "error",
message: string,
stack?: string,
}
export type ScrapeScrapeEvent = {
type: "scrape",
url: string,
worker?: string,
method: (typeof baseScrapers)[number],
result: null | {
success: boolean,
response_code?: number,
response_size?: number,
error?: string | object,
// proxy?: string,
time_taken: number,
},
}
export type ScrapeQueueEvent = {
type: "queue",
event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed",
worker?: string,
}
export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
export class ScrapeEvents {
static async insert(jobId: string, content: ScrapeEvent) {
if (jobId === "TEST") return null;
if (process.env.USE_DB_AUTHENTICATION) {
try {
const result = await supabase.from("scrape_events").insert({
job_id: jobId,
type: content.type,
content: content,
// created_at
}).select().single();
return (result.data as any).id;
} catch (error) {
Logger.error(`Error inserting scrape event: ${error}`);
return null;
}
}
return null;
}
static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
if (logId === null) return;
try {
const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any;
await supabase.from("scrape_events").update({
content: {
...previousLog.content,
result,
}
}).eq("id", logId);
} catch (error) {
Logger.error(`Error updating scrape result: ${error}`);
}
}
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
try {
await this.insert(((job as any).id ? (job as any).id : job) as string, {
type: "queue",
event,
worker: process.env.FLY_MACHINE_ID,
});
} catch (error) {
Logger.error(`Error logging job event: ${error}`);
}
}
}

View File

@ -1,4 +1,5 @@
import { AuthResponse } from "../../src/types";
import { Logger } from "./logger";
let warningCount = 0;
@ -8,7 +9,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
return async function (...args: U): Promise<T> {
if (process.env.USE_DB_AUTHENTICATION === "false") {
if (warningCount < 5) {
console.warn("WARNING - You're bypassing authentication");
Logger.warn("You're bypassing authentication");
warningCount++;
}
return { success: true } as T;
@ -16,7 +17,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
try {
return await originalFunction(...args);
} catch (error) {
console.error("Error in withAuth function: ", error);
Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T;
}
}

View File

@ -10,6 +10,8 @@ import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
export async function startWebScraperPipeline({
job,
@ -23,6 +25,7 @@ export async function startWebScraperPipeline({
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
inProgress: (progress) => {
Logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) {
partialDocs.push(progress.currentDocument);
if (partialDocs.length > 50) {
@ -32,9 +35,12 @@ export async function startWebScraperPipeline({
}
},
onSuccess: (result) => {
Logger.debug(`🐂 Job completed ${job.id}`);
saveJob(job, result);
},
onError: (error) => {
Logger.error(`🐂 Job failed ${job.id}`);
ScrapeEvents.logJobEvent(job, "failed");
job.moveToFailed(error);
},
team_id: job.data.team_id,
@ -56,6 +62,7 @@ export async function runWebScraper({
const provider = new WebScraperDataProvider();
if (mode === "crawl") {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: [url],
crawlerOptions: crawlerOptions,
@ -64,6 +71,7 @@ export async function runWebScraper({
});
} else {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: url.split(","),
crawlerOptions: crawlerOptions,
@ -108,7 +116,6 @@ export async function runWebScraper({
// this return doesn't matter too much for the job completion result
return { success: true, message: "", docs: filteredDocs };
} catch (error) {
console.error("Error running web scraper", error);
onError(error);
return { success: false, message: error.message, docs: [] };
}
@ -135,7 +142,8 @@ const saveJob = async (job: Job, result: any) => {
// I think the job won't exist here anymore
}
}
ScrapeEvents.logJobEvent(job, "completed");
} catch (error) {
console.error("Failed to update job status:", error);
Logger.error(`🐂 Failed to update job status: ${error}`);
}
};

View File

@ -42,6 +42,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl,
includes: [],
excludes: [],
@ -76,6 +77,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl,
includes: [],
excludes: [],
@ -104,6 +106,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl,
includes: [],
excludes: [],
@ -133,6 +136,7 @@ describe('WebCrawler', () => {
crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl,
includes: [],
excludes: [],
@ -161,6 +165,7 @@ describe('WebCrawler', () => {
// Setup the crawler with the specific test case options
const crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl,
includes: [],
excludes: [],
@ -194,6 +199,7 @@ describe('WebCrawler', () => {
const limit = 2; // Set a limit for the number of links
crawler = new WebCrawler({
jobId: "TEST",
initialUrl: initialUrl,
includes: [],
excludes: [],

View File

@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml);
const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml);
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
expect(resultWithHtml.html).toBeDefined();
expect(resultWithoutHtml.html).toBeUndefined();
@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
const url = 'https://mendable.ai';
const pageOptions: PageOptions = { includeHtml: true };
const result = await scrapSingleUrl(url, pageOptions);
const result = await scrapSingleUrl("TEST", url, pageOptions);
// Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined();

View File

@ -8,8 +8,10 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
private baseUrl: string;
private includes: string[];
@ -26,6 +28,7 @@ export class WebCrawler {
private allowExternalContentLinks: boolean;
constructor({
jobId,
initialUrl,
includes,
excludes,
@ -36,6 +39,7 @@ export class WebCrawler {
allowBackwardCrawling = false,
allowExternalContentLinks = false
}: {
jobId: string;
initialUrl: string;
includes?: string[];
excludes?: string[];
@ -46,6 +50,7 @@ export class WebCrawler {
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
}) {
this.jobId = jobId;
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
this.includes = includes ?? [];
@ -116,7 +121,7 @@ export class WebCrawler {
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt
if (!isAllowed) {
console.log(`Link disallowed by robots.txt: ${link}`);
Logger.debug(`Link disallowed by robots.txt: ${link}`);
return false;
}
@ -133,15 +138,19 @@ export class WebCrawler {
limit: number = 10000,
maxDepth: number = 10
): Promise<{ url: string, html: string }[]> {
Logger.debug(`Crawler starting with ${this.initialUrl}`);
// Fetch and parse robots.txt
try {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
this.robots = robotsParser(this.robotsTxtUrl, response.data);
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
} catch (error) {
console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
}
if(!crawlerOptions?.ignoreSitemap){
if (!crawlerOptions?.ignoreSitemap){
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
@ -175,6 +184,7 @@ export class WebCrawler {
inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => {
Logger.debug(`Crawling ${task}`);
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
if (callback && typeof callback === "function") {
callback();
@ -216,16 +226,18 @@ export class WebCrawler {
}
}, concurrencyLimit);
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
queue.push(
urls.filter(
(url) =>
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
),
(err) => {
if (err) console.error(err);
if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
}
);
await queue.drain();
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
@ -253,7 +265,7 @@ export class WebCrawler {
// If it is the first link, fetch with single url
if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
content = page.html ?? "";
pageStatusCode = page.metadata?.pageStatusCode;
pageError = page.metadata?.pageError || undefined;
@ -282,7 +294,6 @@ export class WebCrawler {
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
@ -452,7 +463,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
}
} catch (error) {
console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) {
sitemapLinks = response;
@ -467,7 +478,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
}
} catch (error) {
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
}

View File

@ -1,10 +1,12 @@
import { Logger } from "../../../lib/logger";
export async function handleCustomScraping(
text: string,
url: string
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) {
console.log(
Logger.debug(
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
);
return {
@ -19,7 +21,7 @@ export async function handleCustomScraping(
// Check for Vanta security portals
if (text.includes('<link href="https://static.vanta.com')) {
console.log(
Logger.debug(
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
);
return {
@ -34,7 +36,7 @@ export async function handleCustomScraping(
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
if (googleDriveMetaMatch) {
const url = googleDriveMetaMatch[1];
console.log(`Google Drive PDF link detected: ${url}`);
Logger.debug(`Google Drive PDF link detected: ${url}`);
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
if (fileIdMatch) {

View File

@ -19,8 +19,10 @@ import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger";
export class WebScraperDataProvider {
private jobId: string;
private bullJobId: string;
private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@ -65,6 +67,7 @@ export class WebScraperDataProvider {
batchUrls.map(async (url, index) => {
const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(
this.jobId,
url,
this.pageOptions,
this.extractorOptions,
@ -89,14 +92,14 @@ export class WebScraperDataProvider {
const job = await getWebScraperQueue().getJob(this.bullJobId);
const jobStatus = await job.getState();
if (jobStatus === "failed") {
console.error(
Logger.info(
"Job has failed or has been cancelled by the user. Stopping the job..."
);
return [] as Document[];
}
}
} catch (error) {
console.error(error);
Logger.error(error.message);
return [] as Document[];
}
}
@ -165,6 +168,7 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const crawler = new WebCrawler({
jobId: this.jobId,
initialUrl: this.urls[0],
includes: this.includes,
excludes: this.excludes,
@ -270,7 +274,7 @@ export class WebScraperDataProvider {
this.mode === "single_urls" && links.length > 0
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
(error) => {
console.error("Failed to fetch sitemap data:", error);
Logger.debug(`Failed to fetch sitemap data: ${error}`);
return null;
}
)
@ -460,7 +464,7 @@ export class WebScraperDataProvider {
let documents: Document[] = [];
for (const url of urls) {
const normalizedUrl = this.normalizeUrl(url);
console.log(
Logger.debug(
"Getting cached document for web-scraper-cache:" + normalizedUrl
);
const cachedDocumentString = await getValue(
@ -499,6 +503,7 @@ export class WebScraperDataProvider {
throw new Error("Urls are required");
}
this.jobId = options.jobId;
this.bullJobId = options.bullJobId;
this.urls = options.urls;
this.mode = options.mode;

View File

@ -2,6 +2,7 @@ import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/**
* Scrapes a URL with Axios
@ -34,9 +35,7 @@ export async function scrapWithFetch(
});
if (response.status !== 200) {
console.error(
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`);
logParams.error_message = response.statusText;
logParams.response_code = response.status;
return {
@ -63,10 +62,10 @@ export async function scrapWithFetch(
} catch (error) {
if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out";
console.log(`[Axios] Request timed out for ${url}`);
Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
} else {
logParams.error_message = error.message || error;
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
}
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {

View File

@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/**
* Scrapes a URL with Fire-Engine
@ -59,12 +60,10 @@ export async function scrapWithFireEngine({
let engine = engineParam; // do we want fireEngineOptions as first choice?
console.log(
`[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);
// console.log(fireEngineOptionsParam)
const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{
@ -84,15 +83,15 @@ export async function scrapWithFireEngine({
);
if (response.status !== 200) {
console.error(
`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`
Logger.debug(
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`
);
logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode;
if(response.data && response.data?.pageStatusCode !== 200) {
console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`);
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
}
return {
@ -130,10 +129,10 @@ export async function scrapWithFireEngine({
}
} catch (error) {
if (error.code === "ECONNABORTED") {
console.log(`[Fire-Engine] Request timed out for ${url}`);
Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
logParams.error_message = "Request timed out";
} else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error;
}
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };

View File

@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/**
* Scrapes a URL with Playwright
@ -51,8 +52,8 @@ export async function scrapWithPlaywright(
);
if (response.status !== 200) {
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
Logger.debug(
`⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
);
logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode;
@ -86,8 +87,8 @@ export async function scrapWithPlaywright(
};
} catch (jsonError) {
logParams.error_message = jsonError.message || jsonError;
console.error(
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
Logger.debug(
`⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
);
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
}
@ -95,10 +96,10 @@ export async function scrapWithPlaywright(
} catch (error) {
if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out";
console.log(`[Playwright] Request timed out for ${url}`);
Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
} else {
logParams.error_message = error.message || error;
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
}
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {

View File

@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { ScrapingBeeClient } from "scrapingbee";
import { Logger } from "../../../lib/logger";
/**
* Scrapes a URL with ScrapingBee
@ -56,8 +57,8 @@ export async function scrapWithScrapingBee(
text = decoder.decode(response.data);
logParams.success = true;
} catch (decodeError) {
console.error(
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
Logger.debug(
`⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
);
logParams.error_message = decodeError.message || decodeError;
}
@ -72,7 +73,7 @@ export async function scrapWithScrapingBee(
};
}
} catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error;
logParams.response_code = error.response?.status;
return {

View File

@ -17,10 +17,12 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils";
import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
dotenv.config();
const baseScrapers = [
export const baseScrapers = [
"fire-engine",
"fire-engine;chrome-cdp",
"scrapingBee",
@ -48,7 +50,7 @@ export async function generateRequestParams(
return defaultParams;
}
} catch (error) {
console.error(`Error generating URL key: ${error}`);
Logger.error(`Error generating URL key: ${error}`);
return defaultParams;
}
}
@ -117,6 +119,7 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl(
jobId: string,
urlToScrap: string,
pageOptions: PageOptions = {
onlyMainContent: true,
@ -144,6 +147,15 @@ export async function scrapSingleUrl(
} = { text: "", screenshot: "", metadata: {} };
let screenshot = "";
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(jobId, {
type: "scrape",
url,
worker: process.env.FLY_MACHINE_ID,
method,
result: null,
});
switch (method) {
case "fire-engine":
case "fire-engine;chrome-cdp":
@ -154,7 +166,6 @@ export async function scrapSingleUrl(
}
if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine({
url,
waitFor: pageOptions.waitFor,
@ -254,8 +265,19 @@ export async function scrapSingleUrl(
}
//* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
const text = await parseMarkdown(cleanedHtml);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: scraperResponse.text.length,
success: !scraperResponse.metadata.pageError && !!text,
error: scraperResponse.metadata.pageError,
response_code: scraperResponse.metadata.pageStatusCode,
time_taken: Date.now() - timer,
});
return {
text: await parseMarkdown(cleanedHtml),
text,
html: cleanedHtml,
rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot,
@ -277,7 +299,7 @@ export async function scrapSingleUrl(
try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
} catch (error) {
console.error(`Invalid URL key, trying: ${urlToScrap}`);
Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
}
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = getScrapingFallbackOrder(
@ -311,12 +333,18 @@ export async function scrapSingleUrl(
pageError = undefined;
}
if (text && text.trim().length >= 100) break;
if (pageStatusCode && pageStatusCode == 404) break;
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
if (text && text.trim().length >= 100) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
break;
}
if (pageStatusCode && pageStatusCode == 404) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
break;
}
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
// if (nextScraperIndex < scrapersInOrder.length) {
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
// }
}
if (!text) {
@ -372,7 +400,12 @@ export async function scrapSingleUrl(
return document;
} catch (error) {
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
ScrapeEvents.insert(jobId, {
type: "error",
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
stack: error.stack,
});
return {
content: "",
markdown: "",

View File

@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { WebCrawler } from "./crawler";
import { Logger } from "../../lib/logger";
export async function getLinksFromSitemap(
{
@ -26,7 +27,7 @@ export async function getLinksFromSitemap(
content = response.html;
}
} catch (error) {
console.error(`Request failed for ${sitemapUrl}: ${error}`);
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
return allUrls;
}
@ -48,7 +49,7 @@ export async function getLinksFromSitemap(
}
}
} catch (error) {
console.error(`Error processing ${sitemapUrl}: ${error}`);
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
}
return allUrls;

View File

@ -1,3 +1,4 @@
import { Logger } from '../../../../lib/logger';
import { isUrlBlocked } from '../blocklist';
describe('isUrlBlocked', () => {
@ -19,7 +20,7 @@ describe('isUrlBlocked', () => {
blockedUrls.forEach(url => {
if (!isUrlBlocked(url)) {
console.log(`URL not blocked: ${url}`);
Logger.debug(`URL not blocked: ${url}`);
}
expect(isUrlBlocked(url)).toBe(true);
});

View File

@ -1,3 +1,5 @@
import { Logger } from "../../../lib/logger";
const socialMediaBlocklist = [
'facebook.com',
'x.com',
@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean {
return isBlocked;
} catch (e) {
// If an error occurs (e.g., invalid URL), return false
console.error(`Error parsing the following URL: ${url}`);
Logger.error(`Error parsing the following URL: ${url}`);
return false;
}
}

View File

@ -1,5 +1,6 @@
import Anthropic from '@anthropic-ai/sdk';
import axios from 'axios';
import { Logger } from '../../../lib/logger';
export async function getImageDescription(
imageUrl: string,
@ -82,7 +83,7 @@ export async function getImageDescription(
}
}
} catch (error) {
console.error("Error generating image alt text:", error?.message);
Logger.error(`Error generating image alt text: ${error}`);
return "";
}
}

View File

@ -1,4 +1,6 @@
import { CheerioAPI } from "cheerio";
import { Logger } from "../../../lib/logger";
interface Metadata {
title?: string;
description?: string;
@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
} catch (error) {
console.error("Error extracting metadata:", error);
Logger.error(`Error extracting metadata: ${error}`);
}
return {

View File

@ -7,6 +7,7 @@ import pdf from "pdf-parse";
import path from "path";
import os from "os";
import { axiosTimeout } from "../../../lib/timeout";
import { Logger } from "../../../lib/logger";
dotenv.config();
@ -39,6 +40,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
let content = "";
if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
Logger.debug("Processing pdf document w/ LlamaIndex");
const apiKey = process.env.LLAMAPARSE_API_KEY;
const headers = {
Authorization: `Bearer ${apiKey}`,
@ -81,7 +83,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
}
} catch (error) {
console.error("Error fetching result w/ LlamaIndex");
Logger.debug("Error fetching result w/ LlamaIndex");
attempt++;
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently
@ -93,7 +95,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
}
content = resultResponse.data[resultType];
} catch (error) {
console.error("Error processing pdf document w/ LlamaIndex(2)");
Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
content = await processPdf(filePath);
}
} else if (parsePDF) {

View File

@ -1,3 +1,4 @@
import { Logger } from "../../../lib/logger";
import { Document } from "../../../lib/entities";
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
return documents;
} catch (error) {
console.error("Error replacing paths with absolute paths", error);
Logger.debug(`Error replacing paths with absolute paths: ${error}`);
return documents;
}
};
@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
return documents;
} catch (error) {
console.error("Error replacing img paths with absolute paths", error);
Logger.error(`Error replacing img paths with absolute paths: ${error}`);
return documents;
}
};

View File

@ -1,5 +1,6 @@
import axios from "axios";
import * as cheerio from "cheerio";
import { Logger } from "../../../lib/logger";
export async function attemptScrapWithRequests(
@ -9,13 +10,13 @@ export async function attemptScrapWithRequests(
const response = await axios.get(urlToScrap, { timeout: 15000 });
if (!response.data) {
console.log("Failed normal requests as well");
Logger.debug("Failed normal requests as well");
return null;
}
return response.data;
} catch (error) {
console.error(`Error in attemptScrapWithRequests: ${error}`);
Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
return null;
}
}

View File

@ -2,6 +2,7 @@ import axios from 'axios';
import * as cheerio from 'cheerio';
import * as querystring from 'querystring';
import { SearchResult } from '../../src/lib/entities';
import { Logger } from '../../src/lib/logger';
const _useragent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results
await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
} catch (error) {
if (error.message === 'Too many requests') {
console.warn('Too many requests, breaking the loop');
Logger.warn('Too many requests, breaking the loop');
break;
}
throw error;
@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results
}
}
if (attempts >= maxAttempts) {
console.warn('Max attempts reached, breaking the loop');
Logger.warn('Max attempts reached, breaking the loop');
}
return results
}

View File

@ -1,3 +1,4 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
@ -47,7 +48,7 @@ export async function search({
timeout
);
} catch (error) {
console.error("Error in search function: ", error);
Logger.error(`Error in search function: ${error}`);
return []
}
// if process.env.SERPER_API_KEY is set, use serper

View File

@ -1,3 +1,4 @@
import { Logger } from "../../../src/lib/logger";
import { getWebScraperQueue } from "../queue-service";
import { sendSlackWebhook } from "./slack";
@ -9,13 +10,13 @@ export async function checkAlerts() {
process.env.ALERT_NUM_ACTIVE_JOBS &&
process.env.ALERT_NUM_WAITING_JOBS
) {
console.info("Initializing alerts");
Logger.info("Initializing alerts");
const checkActiveJobs = async () => {
try {
const webScraperQueue = getWebScraperQueue();
const activeJobs = await webScraperQueue.getActiveCount();
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
console.warn(
Logger.warn(
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
);
sendSlackWebhook(
@ -23,12 +24,12 @@ export async function checkAlerts() {
true
);
} else {
console.info(
Logger.info(
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
);
}
} catch (error) {
console.error("Failed to check active jobs:", error);
Logger.error(`Failed to check active jobs: ${error}`);
}
};
@ -38,7 +39,7 @@ export async function checkAlerts() {
const paused = await webScraperQueue.getPausedCount();
if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
console.warn(
Logger.warn(
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
);
sendSlackWebhook(
@ -57,6 +58,6 @@ export async function checkAlerts() {
// setInterval(checkAll, 10000); // Run every
}
} catch (error) {
console.error("Failed to initialize alerts:", error);
Logger.error(`Failed to initialize alerts: ${error}`);
}
}

View File

@ -1,4 +1,5 @@
import axios from "axios";
import { Logger } from "../../../src/lib/logger";
export async function sendSlackWebhook(
message: string,
@ -16,8 +17,8 @@ export async function sendSlackWebhook(
"Content-Type": "application/json",
},
});
console.log("Webhook sent successfully:", response.data);
Logger.log("Webhook sent successfully:", response.data);
} catch (error) {
console.error("Error sending webhook:", error);
Logger.debug(`Error sending webhook: ${error}`);
}
}

View File

@ -2,6 +2,7 @@ import { NotificationType } from "../../types";
import { withAuth } from "../../lib/withAuth";
import { sendNotification } from "../notification/email_notification";
import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger";
const FREE_CREDITS = 500;
@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) {
if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" };
}
console.log(`Billing team ${team_id} for ${credits} credits`);
Logger.info(`Billing team ${team_id} for ${credits} credits`);
// When the API is used, you can log the credit usage in the credit_usage table:
// team_id: The ID of the team using the API.
// subscription_id: The ID of the team's active subscription.
@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
0
);
console.log("totalCreditsUsed", totalCreditsUsed);
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
const end = new Date();
end.setDate(end.getDate() + 30);
@ -262,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
});
if (creditUsageError) {
console.error("Error calculating credit usage:", creditUsageError);
Logger.error(`Error calculating credit usage: ${creditUsageError}`);
}
if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used;
}
} catch (error) {
console.error("Error calculating credit usage:", error);
Logger.error(`Error calculating credit usage: ${error}`);
}
// Adjust total credits used by subtracting coupon value

View File

@ -1,5 +1,6 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
export async function createIdempotencyKey(
req: Request,
@ -14,7 +15,7 @@ export async function createIdempotencyKey(
.insert({ key: idempotencyKey });
if (error) {
console.error("Failed to create idempotency key:", error);
Logger.error(`Failed to create idempotency key: ${error}`);
throw error;
}

View File

@ -1,6 +1,7 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
import { validate as isUuid } from 'uuid';
import { Logger } from "../../../src/lib/logger";
export async function validateIdempotencyKey(
req: Request,
@ -13,7 +14,7 @@ export async function validateIdempotencyKey(
// Ensure idempotencyKey is treated as a string
const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey;
if (!isUuid(key)) {
console.error("Invalid idempotency key provided in the request headers.");
Logger.debug("Invalid idempotency key provided in the request headers.");
return false;
}
@ -23,7 +24,7 @@ export async function validateIdempotencyKey(
.eq("key", idempotencyKey);
if (error) {
console.error(error);
Logger.error(`Error validating idempotency key: ${error}`);
}
if (!data || data.length === 0) {

View File

@ -1,4 +1,5 @@
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
import "dotenv/config";
export async function logCrawl(job_id: string, team_id: string) {
@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) {
},
]);
} catch (error) {
console.error("Error logging crawl job:\n", error);
Logger.error(`Error logging crawl job to supabase:\n${error}`);
}
}
}

View File

@ -3,6 +3,7 @@ import { supabase_service } from "../supabase";
import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog";
import "dotenv/config";
import { Logger } from "../../lib/logger";
export async function logJob(job: FirecrawlJob) {
try {
@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) {
posthog.capture(phLog);
}
if (error) {
console.error("Error logging job:\n", error);
Logger.error(`Error logging job: ${error.message}`);
}
} catch (error) {
console.error("Error logging job:\n", error);
Logger.error(`Error logging job: ${error.message}`);
}
}

View File

@ -2,15 +2,16 @@ import "dotenv/config";
import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities";
import { Logger } from "../../lib/logger";
export async function logScrape(
scrapeLog: ScrapeLog,
pageOptions?: PageOptions
) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
Logger.debug("Skipping logging scrape to Supabase");
return;
}
try {
// Only log jobs in production
// if (process.env.ENV !== "production") {
@ -43,9 +44,9 @@ export async function logScrape(
]);
if (error) {
console.error("Error logging proxy:\n", error);
Logger.error(`Error logging proxy:\n${error}`);
}
} catch (error) {
console.error("Error logging proxy:\n", error);
Logger.error(`Error logging proxy:\n${error}`);
}
}

View File

@ -1,19 +1,20 @@
import { Logtail } from "@logtail/node";
import "dotenv/config";
import { Logger } from "../lib/logger";
// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
class MockLogtail {
info(message: string, context?: Record<string, any>): void {
console.log(message, context);
Logger.debug(`${message} - ${context}`);
}
error(message: string, context: Record<string, any> = {}): void {
console.error(message, context);
Logger.error(`${message} - ${context}`);
}
}
// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
return new MockLogtail();
})();

View File

@ -2,6 +2,7 @@ import { supabase_service } from "../supabase";
import { withAuth } from "../../lib/withAuth";
import { Resend } from "resend";
import { NotificationType } from "../../types";
import { Logger } from "../../../src/lib/logger";
const emailTemplates: Record<
NotificationType,
@ -52,11 +53,11 @@ async function sendEmailNotification(
});
if (error) {
console.error("Error sending email: ", error);
Logger.debug(`Error sending email: ${error}`);
return { success: false };
}
} catch (error) {
console.error("Error sending email (2): ", error);
Logger.debug(`Error sending email (2): ${error}`);
return { success: false };
}
}
@ -79,7 +80,7 @@ export async function sendNotificationInternal(
.lte("sent_date", endDateString);
if (error) {
console.error("Error fetching notifications: ", error);
Logger.debug(`Error fetching notifications: ${error}`);
return { success: false };
}
@ -93,7 +94,7 @@ export async function sendNotificationInternal(
.eq("team_id", team_id);
if (emailsError) {
console.error("Error fetching emails: ", emailsError);
Logger.debug(`Error fetching emails: ${emailsError}`);
return { success: false };
}
@ -112,7 +113,7 @@ export async function sendNotificationInternal(
]);
if (insertError) {
console.error("Error inserting notification record: ", insertError);
Logger.debug(`Error inserting notification record: ${insertError}`);
return { success: false };
}

View File

@ -1,5 +1,6 @@
import { PostHog } from 'posthog-node';
import "dotenv/config";
import { Logger } from '../../src/lib/logger';
export default function PostHogClient() {
const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, {
@ -19,7 +20,7 @@ class MockPostHog {
export const posthog = process.env.POSTHOG_API_KEY
? PostHogClient()
: (() => {
console.warn(
Logger.warn(
"POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more."
);
return new MockPostHog();

View File

@ -1,5 +1,6 @@
import Queue from "bull";
import { Queue as BullQueue } from "bull";
import { Logger } from "../lib/logger";
let webScraperQueue: BullQueue;
@ -16,7 +17,7 @@ export function getWebScraperQueue() {
attempts: 5
}
});
console.log("Web scraper queue created");
Logger.info("Web scraper queue created");
}
return webScraperQueue;
}

View File

@ -7,8 +7,10 @@ import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from '@hyperdx/node-opentelemetry';
import { Job } from "bull";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
if(process.env.ENV === 'production') {
if (process.env.ENV === 'production') {
initSDK({
consoleCapture: true,
additionalInstrumentations: [],
@ -18,7 +20,8 @@ if(process.env.ENV === 'production') {
const wsq = getWebScraperQueue();
async function processJob(job: Job, done) {
console.log("taking job", job.id);
Logger.debug(`🐂 Worker taking job ${job.id}`);
try {
job.progress({
current: 1,
@ -58,18 +61,18 @@ async function processJob(job: Job, done) {
pageOptions: job.data.pageOptions,
origin: job.data.origin,
});
console.log("job done", job.id);
Logger.debug(`🐂 Job done ${job.id}`);
done(null, data);
} catch (error) {
console.log("job errored", job.id, error);
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
if (await getWebScraperQueue().isPaused(false)) {
console.log("queue is paused, ignoring");
Logger.debug("🐂Queue is paused, ignoring");
return;
}
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
console.error(error.message); // or any other error handling
Logger.error(error.message); // or any other error handling
logtail.error("Custom error while ingesting", {
job_id: job.id,
@ -77,7 +80,7 @@ async function processJob(job: Job, done) {
dataIngestionJob: error.dataIngestionJob,
});
}
console.log(error);
Logger.error(error);
logtail.error("Overall error ingesting", {
job_id: job.id,
@ -113,3 +116,10 @@ wsq.process(
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
processJob
);
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

View File

@ -1,14 +1,15 @@
import Redis from "ioredis";
import { redisRateLimitClient } from "./rate-limiter";
import { Logger } from "../lib/logger";
// Listen to 'error' events to the Redis connection
redisRateLimitClient.on("error", (error) => {
try {
if (error.message === "ECONNRESET") {
console.log("Connection to Redis Session Store timed out.");
Logger.error("Connection to Redis Session Rate Limit Store timed out.");
} else if (error.message === "ECONNREFUSED") {
console.log("Connection to Redis Session Store refused!");
} else console.log(error);
Logger.error("Connection to Redis Session Rate Limit Store refused!");
} else Logger.error(error);
} catch (error) {}
});
@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => {
redisRateLimitClient.on("reconnecting", (err) => {
try {
if (redisRateLimitClient.status === "reconnecting")
console.log("Reconnecting to Redis Session Store...");
else console.log("Error reconnecting to Redis Session Store.");
Logger.info("Reconnecting to Redis Session Rate Limit Store...");
else Logger.error("Error reconnecting to Redis Session Rate Limit Store.");
} catch (error) {}
});
// Listen to the 'connect' event to Redis
redisRateLimitClient.on("connect", (err) => {
try {
if (!err) console.log("Connected to Redis Session Store!");
if (!err) Logger.info("Connected to Redis Session Rate Limit Store!");
} catch (error) {}
});

View File

@ -1,4 +1,5 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { Logger } from "../lib/logger";
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@ -10,13 +11,13 @@ class SupabaseService {
// Only initialize the Supabase client if both URL and Service Token are provided.
if (process.env.USE_DB_AUTHENTICATION === "false") {
// Warn the user that Authentication is disabled by setting the client to null
console.warn(
"\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
Logger.warn(
"Authentication is disabled. Supabase client will not be initialized."
);
this.client = null;
} else if (!supabaseUrl || !supabaseServiceToken) {
console.error(
"\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
Logger.error(
"Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
);
} else {
this.client = createClient(supabaseUrl, supabaseServiceToken);
@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(),
{
get: function (target, prop, receiver) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
Logger.debug(
"Attempted to access Supabase client when it's not configured."
);
}
const client = target.getClient();
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
if (client === null) {
console.error(
Logger.error(
"Attempted to access Supabase client when it's not configured."
);
return () => {

View File

@ -1,3 +1,4 @@
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";
export const callWebhook = async (teamId: string, jobId: string,data: any) => {
@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
.eq("team_id", teamId)
.limit(1);
if (error) {
console.error(
`Error fetching webhook URL for team ID: ${teamId}`,
error.message
);
Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`);
return null;
}
@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
}),
});
} catch (error) {
console.error(
`Error sending webhook for team ID: ${teamId}`,
error.message
);
Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`);
}
};

View File

@ -12,12 +12,12 @@ class SupabaseService {
if (process.env.USE_DB_AUTHENTICATION === "false") {
// Warn the user that Authentication is disabled by setting the client to null
console.warn(
"\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
"Authentication is disabled. Supabase client will not be initialized."
);
this.client = null;
} else if (!supabaseUrl || !supabaseServiceToken) {
console.error(
"\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
"Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
);
} else {
this.client = createClient(supabaseUrl, supabaseServiceToken);
@ -36,6 +36,11 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(),
{
get: function (target, prop, receiver) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
console.debug(
"Attempted to access Supabase client when it's not configured."
);
}
const client = target.getClient();
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
if (client === null) {