Merge branch 'main' into feat/queue-scrapes
commit 46bcbd931f
@@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL=

# Resend API Key for transactional emails
RESEND_API_KEY=

# LOGGING_LEVEL determines the verbosity of logs that the system will output.
# Available levels are:
# NONE - No logs will be output.
# ERROR - For logging error messages that indicate a failure in a specific operation.
# WARN - For logging potentially harmful situations that are not necessarily errors.
# INFO - For logging informational messages that highlight the progress of the application.
# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
# TRACE - For logging more detailed information than the DEBUG level.
# Set LOGGING_LEVEL to one of the above options to control logging output.
LOGGING_LEVEL=INFO
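These levels are consumed by the Logger introduced later in this commit (apps/api/src/lib/logger.ts). A minimal usage sketch, assuming LOGGING_LEVEL=INFO and an illustrative import path:

import { Logger } from "./lib/logger"; // illustrative path, relative to apps/api/src

Logger.error("printed: ERROR is more severe than the configured INFO level");
Logger.info("printed: INFO matches the configured level");
Logger.debug("suppressed: DEBUG is more verbose than INFO");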
4 apps/api/.gitignore (vendored)
@@ -3,4 +3,6 @@
.env
*.csv
dump.rdb
/mongo-data
+
+/.next/
@@ -4,15 +4,15 @@
#

app = 'firecrawl-scraper-js'
-primary_region = 'mia'
+primary_region = 'iad'
kill_signal = 'SIGINT'
kill_timeout = '30s'

[build]

[processes]
-  app = 'node --max-old-space-size=4096 dist/src/index.js'
-  worker = 'node --max-old-space-size=4096 dist/src/services/queue-worker.js'
+  app = 'node --max-old-space-size=8192 dist/src/index.js'
+  worker = 'node --max-old-space-size=8192 dist/src/services/queue-worker.js'

[http_service]
  internal_port = 8080
@@ -24,8 +24,8 @@ kill_timeout = '30s'

  [http_service.concurrency]
    type = "requests"
-   hard_limit = 100
-   soft_limit = 50
+   hard_limit = 200
+   soft_limit = 75

[[http_service.checks]]
  grace_period = "20s"
@@ -26,6 +26,7 @@
  "license": "ISC",
  "devDependencies": {
    "@flydotio/dockerfile": "^0.4.10",
+   "@jest/globals": "^29.7.0",
    "@tsconfig/recommended": "^1.0.3",
    "@types/body-parser": "^1.19.2",
    "@types/bull": "^4.10.0",
@@ -63,6 +64,7 @@
    "axios": "^1.3.4",
    "bottleneck": "^2.19.5",
    "bull": "^4.15.0",
+   "cacheable-lookup": "^6.1.0",
    "cheerio": "^1.0.0-rc.12",
    "cohere": "^1.1.1",
    "cors": "^2.8.5",
20 apps/api/pnpm-lock.yaml (generated)
@ -59,6 +59,9 @@ importers:
|
||||
bull:
|
||||
specifier: ^4.15.0
|
||||
version: 4.15.0
|
||||
cacheable-lookup:
|
||||
specifier: ^6.1.0
|
||||
version: 6.1.0
|
||||
cheerio:
|
||||
specifier: ^1.0.0-rc.12
|
||||
version: 1.0.0-rc.12
|
||||
@ -189,6 +192,9 @@ importers:
|
||||
'@flydotio/dockerfile':
|
||||
specifier: ^0.4.10
|
||||
version: 0.4.11
|
||||
'@jest/globals':
|
||||
specifier: ^29.7.0
|
||||
version: 29.7.0
|
||||
'@tsconfig/recommended':
|
||||
specifier: ^1.0.3
|
||||
version: 1.0.6
|
||||
@ -1937,6 +1943,10 @@ packages:
|
||||
resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
||||
cacheable-lookup@6.1.0:
|
||||
resolution: {integrity: sha512-KJ/Dmo1lDDhmW2XDPMo+9oiy/CeqosPguPCrgcVzKyZrL6pM1gU2GmPY/xo6OQPTUaA/c0kwHuywB4E6nmT9ww==}
|
||||
engines: {node: '>=10.6.0'}
|
||||
|
||||
call-bind@1.0.7:
|
||||
resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==}
|
||||
engines: {node: '>= 0.4'}
|
||||
@ -4369,8 +4379,8 @@ packages:
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
typescript@5.5.3:
|
||||
resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==}
|
||||
typescript@5.5.4:
|
||||
resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==}
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
@ -6917,6 +6927,8 @@ snapshots:
|
||||
|
||||
bytes@3.1.2: {}
|
||||
|
||||
cacheable-lookup@6.1.0: {}
|
||||
|
||||
call-bind@1.0.7:
|
||||
dependencies:
|
||||
es-define-property: 1.0.0
|
||||
@ -8927,7 +8939,7 @@ snapshots:
|
||||
csv-parse: 5.5.6
|
||||
gpt3-tokenizer: 1.1.5
|
||||
openai: 3.3.0
|
||||
typescript: 5.5.3
|
||||
typescript: 5.5.4
|
||||
uuid: 9.0.1
|
||||
zod: 3.23.8
|
||||
transitivePeerDependencies:
|
||||
@ -9519,7 +9531,7 @@ snapshots:
|
||||
|
||||
typescript@5.4.5: {}
|
||||
|
||||
typescript@5.5.3: {}
|
||||
typescript@5.5.4: {}
|
||||
|
||||
typesense@1.8.2(@babel/runtime@7.24.6):
|
||||
dependencies:
|
||||
|
@@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => {
          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
        }
      }
-     console.log(crawlData)
      expect(crawlData.length).toBeGreaterThan(0);
      expect(crawlData).toEqual(expect.arrayContaining([
        expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
87 apps/api/src/controllers/admin/queue.ts (new file)
@@ -0,0 +1,87 @@
import { Request, Response } from "express";

import { Job } from "bull";
import { Logger } from "../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";

export async function cleanBefore24hCompleteJobsController(
  req: Request,
  res: Response
) {
  Logger.info("🐂 Cleaning jobs older than 24h");
  try {
    const webScraperQueue = getWebScraperQueue();
    const batchSize = 10;
    const numberOfBatches = 9; // Adjust based on your needs
    const completedJobsPromises: Promise<Job[]>[] = [];
    for (let i = 0; i < numberOfBatches; i++) {
      completedJobsPromises.push(
        webScraperQueue.getJobs(
          ["completed"],
          i * batchSize,
          i * batchSize + batchSize,
          true
        )
      );
    }
    const completedJobs: Job[] = (
      await Promise.all(completedJobsPromises)
    ).flat();
    const before24hJobs =
      completedJobs.filter(
        (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
      ) || [];

    let count = 0;

    if (!before24hJobs) {
      return res.status(200).send(`No jobs to remove.`);
    }

    for (const job of before24hJobs) {
      try {
        await job.remove();
        count++;
      } catch (jobError) {
        Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
      }
    }
    return res.status(200).send(`Removed ${count} completed jobs.`);
  } catch (error) {
    Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
    return res.status(500).send("Failed to clean jobs");
  }
}


export async function checkQueuesController(req: Request, res: Response) {
  try {
    await checkAlerts();
    return res.status(200).send("Alerts initialized");
  } catch (error) {
    Logger.debug(`Failed to initialize alerts: ${error}`);
    return res.status(500).send("Failed to initialize alerts");
  }
}

// Use this as a "health check" that way we dont destroy the server
export async function queuesController(req: Request, res: Response) {
  try {
    const webScraperQueue = getWebScraperQueue();

    const [webScraperActive] = await Promise.all([
      webScraperQueue.getActiveCount(),
    ]);

    const noActiveJobs = webScraperActive === 0;
    // 200 if no active jobs, 503 if there are active jobs
    return res.status(noActiveJobs ? 200 : 500).json({
      webScraperActive,
      noActiveJobs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
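A sketch of how the cleanup controller above might be exercised once it is mounted under the admin routes added later in this commit; the host, port, and schedule are assumptions, and fetch assumes Node 18+:

// Hypothetical maintenance script, not part of the diff.
const adminBase = `http://localhost:3002/admin/${process.env.BULL_AUTH_KEY}`;

async function cleanupCompletedJobs() {
  const res = await fetch(`${adminBase}/clean-before-24h-complete-jobs`);
  console.log(await res.text()); // e.g. "Removed 12 completed jobs."
}

// Run roughly once an hour.
setInterval(cleanupCompletedJobs, 60 * 60 * 1000);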
86 apps/api/src/controllers/admin/redis-health.ts (new file)
@@ -0,0 +1,86 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { sendSlackWebhook } from "../../services/alerts/slack";
import { redisRateLimitClient } from "../../services/rate-limiter";

export async function redisHealthController(req: Request, res: Response) {
  const retryOperation = async (operation, retries = 3) => {
    for (let attempt = 1; attempt <= retries; attempt++) {
      try {
        return await operation();
      } catch (error) {
        if (attempt === retries) throw error;
        Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
        await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
      }
    }
  };

  try {
    const queueRedis = new Redis(process.env.REDIS_URL);

    const testKey = "test";
    const testValue = "test";

    // Test queueRedis
    let queueRedisHealth;
    try {
      await retryOperation(() => queueRedis.set(testKey, testValue));
      queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
      await retryOperation(() => queueRedis.del(testKey));
    } catch (error) {
      Logger.error(`queueRedis health check failed: ${error}`);
      queueRedisHealth = null;
    }

    // Test redisRateLimitClient
    let redisRateLimitHealth;
    try {
      await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
      redisRateLimitHealth = await retryOperation(() =>
        redisRateLimitClient.get(testKey)
      );
      await retryOperation(() => redisRateLimitClient.del(testKey));
    } catch (error) {
      Logger.error(`redisRateLimitClient health check failed: ${error}`);
      redisRateLimitHealth = null;
    }

    const healthStatus = {
      queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
      redisRateLimitClient:
        redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
    };

    if (
      healthStatus.queueRedis === "healthy" &&
      healthStatus.redisRateLimitClient === "healthy"
    ) {
      Logger.info("Both Redis instances are healthy");
      return res.status(200).json({ status: "healthy", details: healthStatus });
    } else {
      Logger.info(
        `Redis instances health check: ${JSON.stringify(healthStatus)}`
      );
      await sendSlackWebhook(
        `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
          healthStatus
        )}`,
        true
      );
      return res
        .status(500)
        .json({ status: "unhealthy", details: healthStatus });
    }
  } catch (error) {
    Logger.error(`Redis health check failed: ${error}`);
    await sendSlackWebhook(
      `[REDIS DOWN] Redis instances health check: ${error.message}`,
      true
    );
    return res
      .status(500)
      .json({ status: "unhealthy", message: error.message });
  }
}
@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
|
||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||
@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
|
||||
api_key
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error setting trace attributes:', error);
|
||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
||||
}
|
||||
|
||||
}
|
||||
@ -82,12 +83,15 @@ export async function supaAuthenticateUser(
|
||||
// $$ language plpgsql;
|
||||
|
||||
if (error) {
|
||||
console.error('Error fetching key and price_id:', error);
|
||||
Logger.warn(`Error fetching key and price_id: ${error.message}`);
|
||||
} else {
|
||||
// console.log('Key and Price ID:', data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
@ -135,7 +139,7 @@ export async function supaAuthenticateUser(
|
||||
try {
|
||||
await rateLimiter.consume(team_endpoint_token);
|
||||
} catch (rateLimiterRes) {
|
||||
console.error(rateLimiterRes);
|
||||
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||
|
||||
@ -177,7 +181,10 @@ export async function supaAuthenticateUser(
|
||||
.select("*")
|
||||
.eq("key", normalizedApi);
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
@ -190,7 +197,6 @@ export async function supaAuthenticateUser(
|
||||
|
||||
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
|
||||
}
|
||||
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
@ -199,11 +205,14 @@ function getPlanByPriceId(price_id: string) {
|
||||
return 'standard';
|
||||
case process.env.STRIPE_PRICE_ID_SCALE:
|
||||
return 'scale';
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY || process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY:
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
return 'hobby';
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW || process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
return 'standardnew';
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH || process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH:
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
return 'growth';
|
||||
default:
|
||||
return 'free';
|
||||
|
@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||
const { partialDocs } = await job.progress();
|
||||
|
||||
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
|
||||
console.log("Billing team for partial docs...");
|
||||
Logger.info("Billing team for partial docs...");
|
||||
// Note: the credits that we will bill them here might be lower than the actual
|
||||
// due to promises that are not yet resolved
|
||||
await billTeam(team_id, partialDocs.length);
|
||||
@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||
await job.discard();
|
||||
await job.moveToFailed(Error("Job cancelled by user"), true);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
}
|
||||
|
||||
const newJobState = await job.getState();
|
||||
@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||
status: "cancelled"
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,8 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -30,7 +32,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
@ -60,10 +62,11 @@ export async function crawlController(req: Request, res: Response) {
|
||||
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
if (mode === "single_urls" && !url.includes(",")) {
|
||||
if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
try {
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId: uuidv4(),
|
||||
mode: "single_urls",
|
||||
urls: [url],
|
||||
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
@ -83,7 +86,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||
documents: docs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
@ -101,7 +104,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||
|
||||
res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||
|
||||
res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
6 apps/api/src/controllers/liveness.ts (new file)
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function livenessController(req: Request, res: Response) {
  //TODO: add checks if the application is live and healthy like checking the redis connection
  res.status(200).json({ status: "ok" });
}

6 apps/api/src/controllers/readiness.ts (new file)
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function readinessController(req: Request, res: Response) {
  // TODO: add checks when the application is ready to serve traffic
  res.status(200).json({ status: "ok" });
}
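A small sketch of how an orchestrator-style probe could hit these endpoints once they are registered under /v0/health (see the route changes further down); the host and port are assumptions based on the API's default port of 3002:

// Hypothetical probe, not part of the diff; assumes Node 18+ fetch.
async function probe(): Promise<boolean> {
  const liveness = await fetch("http://localhost:3002/v0/health/liveness");
  const readiness = await fetch("http://localhost:3002/v0/health/readiness");
  // Both currently return 200 with { status: "ok" }; the TODOs above note
  // that real Redis/readiness checks are still to be added.
  return liveness.ok && readiness.ok;
}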
@ -12,8 +12,11 @@ import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOri
|
||||
import { addWebScraperJob } from '../services/queue-jobs';
|
||||
import { getWebScraperQueue } from '../services/queue-service';
|
||||
import { supabase_service } from '../services/supabase';
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../lib/logger';
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
@ -148,7 +151,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
|
||||
}
|
||||
@ -163,8 +166,11 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
checkCredits();
|
||||
}
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
@ -205,6 +211,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
@ -224,7 +231,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -7,8 +7,11 @@ import { logJob } from "../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||
import { search } from "../search";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
@ -75,6 +78,7 @@ export async function searchHelper(
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId,
|
||||
mode: "single_urls",
|
||||
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||
crawlerOptions: {
|
||||
@ -148,6 +152,8 @@ export async function searchController(req: Request, res: Response) {
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 7 };
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
@ -155,11 +161,12 @@ export async function searchController(req: Request, res: Response) {
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: "Internal server error" });
|
||||
}
|
||||
const startTime = new Date().getTime();
|
||||
const result = await searchHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
@ -169,6 +176,7 @@ export async function searchController(req: Request, res: Response) {
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: result.data ? result.data.length : 0,
|
||||
@ -183,7 +191,7 @@ export async function searchController(req: Request, res: Response) {
|
||||
});
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
import { Request, Response } from "express";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ async function example() {
|
||||
const example = new WebScraperDataProvider();
|
||||
|
||||
await example.setOptions({
|
||||
jobId: "TEST",
|
||||
mode: "crawl",
|
||||
urls: ["https://mendable.ai"],
|
||||
crawlerOptions: {},
|
||||
|
@ -7,21 +7,30 @@ import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import cluster from "cluster";
|
||||
import os from "os";
|
||||
import { Job } from "bull";
|
||||
import { sendSlackWebhook } from "./services/alerts/slack";
|
||||
import { checkAlerts } from "./services/alerts";
|
||||
import Redis from "ioredis";
|
||||
import { redisRateLimitClient } from "./services/rate-limiter";
|
||||
import { Logger } from "./lib/logger";
|
||||
import { adminRouter } from "./routes/admin";
|
||||
import { ScrapeEvents } from "./lib/scrape-events";
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
const { ExpressAdapter } = require("@bull-board/express");
|
||||
|
||||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||
console.log(`Number of CPUs: ${numCPUs} available`);
|
||||
Logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
|
||||
const cacheable = new CacheableLookup({
|
||||
// this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme
|
||||
lookup:false
|
||||
});
|
||||
|
||||
cacheable.install(http.globalAgent);
|
||||
cacheable.install(https.globalAgent)
|
||||
|
||||
if (cluster.isMaster) {
|
||||
console.log(`Master ${process.pid} is running`);
|
||||
Logger.info(`Master ${process.pid} is running`);
|
||||
|
||||
// Fork workers.
|
||||
for (let i = 0; i < numCPUs; i++) {
|
||||
@ -30,8 +39,8 @@ if (cluster.isMaster) {
|
||||
|
||||
cluster.on("exit", (worker, code, signal) => {
|
||||
if (code !== null) {
|
||||
console.log(`Worker ${worker.process.pid} exited`);
|
||||
console.log("Starting a new worker");
|
||||
Logger.info(`Worker ${worker.process.pid} exited`);
|
||||
Logger.info("Starting a new worker");
|
||||
cluster.fork();
|
||||
}
|
||||
});
|
||||
@ -45,7 +54,6 @@ if (cluster.isMaster) {
|
||||
|
||||
app.use(cors()); // Add this line to enable CORS
|
||||
|
||||
|
||||
const serverAdapter = new ExpressAdapter();
|
||||
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
||||
|
||||
@ -70,6 +78,7 @@ if (cluster.isMaster) {
|
||||
|
||||
// register router
|
||||
app.use(v0Router);
|
||||
app.use(adminRouter);
|
||||
|
||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
const HOST = process.env.HOST ?? "localhost";
|
||||
@ -81,14 +90,9 @@ if (cluster.isMaster) {
|
||||
|
||||
function startServer(port = DEFAULT_PORT) {
|
||||
const server = app.listen(Number(port), HOST, () => {
|
||||
console.log(`Worker ${process.pid} listening on port ${port}`);
|
||||
console.log(
|
||||
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||
);
|
||||
console.log("");
|
||||
console.log("1. Make sure Redis is running on port 6379 by default");
|
||||
console.log(
|
||||
"2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
|
||||
Logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
Logger.info(
|
||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||
);
|
||||
});
|
||||
return server;
|
||||
@ -98,27 +102,6 @@ if (cluster.isMaster) {
|
||||
startServer();
|
||||
}
|
||||
|
||||
// Use this as a "health check" that way we dont destroy the server
|
||||
app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
|
||||
const [webScraperActive] = await Promise.all([
|
||||
webScraperQueue.getActiveCount(),
|
||||
]);
|
||||
|
||||
const noActiveJobs = webScraperActive === 0;
|
||||
// 200 if no active jobs, 503 if there are active jobs
|
||||
return res.status(noActiveJobs ? 200 : 500).json({
|
||||
webScraperActive,
|
||||
noActiveJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
app.get(`/serverHealthCheck`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
@ -132,7 +115,7 @@ if (cluster.isMaster) {
|
||||
waitingJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
@ -177,13 +160,13 @@ if (cluster.isMaster) {
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
console.error("Failed to send Slack notification");
|
||||
Logger.error("Failed to send Slack notification");
|
||||
}
|
||||
}
|
||||
}, timeout);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.debug(error);
|
||||
}
|
||||
};
|
||||
|
||||
@ -191,140 +174,18 @@ if (cluster.isMaster) {
|
||||
}
|
||||
});
|
||||
|
||||
app.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
|
||||
async (req, res) => {
|
||||
try {
|
||||
await checkAlerts();
|
||||
return res.status(200).send("Alerts initialized");
|
||||
} catch (error) {
|
||||
console.error("Failed to initialize alerts:", error);
|
||||
return res.status(500).send("Failed to initialize alerts");
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
app.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
|
||||
async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const batchSize = 10;
|
||||
const numberOfBatches = 9; // Adjust based on your needs
|
||||
const completedJobsPromises: Promise<Job[]>[] = [];
|
||||
for (let i = 0; i < numberOfBatches; i++) {
|
||||
completedJobsPromises.push(
|
||||
webScraperQueue.getJobs(
|
||||
["completed"],
|
||||
i * batchSize,
|
||||
i * batchSize + batchSize,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
const completedJobs: Job[] = (
|
||||
await Promise.all(completedJobsPromises)
|
||||
).flat();
|
||||
const before24hJobs =
|
||||
completedJobs.filter(
|
||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
) || [];
|
||||
|
||||
let count = 0;
|
||||
|
||||
if (!before24hJobs) {
|
||||
return res.status(200).send(`No jobs to remove.`);
|
||||
}
|
||||
|
||||
for (const job of before24hJobs) {
|
||||
try {
|
||||
await job.remove();
|
||||
count++;
|
||||
} catch (jobError) {
|
||||
console.error(`Failed to remove job with ID ${job.id}:`, jobError);
|
||||
}
|
||||
}
|
||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||
} catch (error) {
|
||||
console.error("Failed to clean last 24h complete jobs:", error);
|
||||
return res.status(500).send("Failed to clean jobs");
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
app.get("/is-production", (req, res) => {
|
||||
res.send({ isProduction: global.isProduction });
|
||||
});
|
||||
|
||||
app.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
|
||||
async (req, res) => {
|
||||
try {
|
||||
const queueRedis = new Redis(process.env.REDIS_URL);
|
||||
|
||||
const testKey = "test";
|
||||
const testValue = "test";
|
||||
|
||||
// Test queueRedis
|
||||
let queueRedisHealth;
|
||||
try {
|
||||
await queueRedis.set(testKey, testValue);
|
||||
queueRedisHealth = await queueRedis.get(testKey);
|
||||
await queueRedis.del(testKey);
|
||||
} catch (error) {
|
||||
console.error("queueRedis health check failed:", error);
|
||||
queueRedisHealth = null;
|
||||
}
|
||||
|
||||
// Test redisRateLimitClient
|
||||
let redisRateLimitHealth;
|
||||
try {
|
||||
await redisRateLimitClient.set(testKey, testValue);
|
||||
redisRateLimitHealth = await redisRateLimitClient.get(testKey);
|
||||
await redisRateLimitClient.del(testKey);
|
||||
} catch (error) {
|
||||
console.error("redisRateLimitClient health check failed:", error);
|
||||
redisRateLimitHealth = null;
|
||||
}
|
||||
|
||||
const healthStatus = {
|
||||
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
|
||||
redisRateLimitClient:
|
||||
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
|
||||
};
|
||||
|
||||
if (
|
||||
healthStatus.queueRedis === "healthy" &&
|
||||
healthStatus.redisRateLimitClient === "healthy"
|
||||
) {
|
||||
console.log("Both Redis instances are healthy");
|
||||
return res
|
||||
.status(200)
|
||||
.json({ status: "healthy", details: healthStatus });
|
||||
} else {
|
||||
console.log("Redis instances health check:", healthStatus);
|
||||
await sendSlackWebhook(
|
||||
`[REDIS DOWN] Redis instances health check: ${JSON.stringify(
|
||||
healthStatus
|
||||
)}`,
|
||||
true
|
||||
);
|
||||
return res
|
||||
.status(500)
|
||||
.json({ status: "unhealthy", details: healthStatus });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Redis health check failed:", error);
|
||||
await sendSlackWebhook(
|
||||
`[REDIS DOWN] Redis instances health check: ${error.message}`,
|
||||
true
|
||||
);
|
||||
return res
|
||||
.status(500)
|
||||
.json({ status: "unhealthy", message: error.message });
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
console.log(`Worker ${process.pid} started`);
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
}
|
||||
|
||||
const wsq = getWebScraperQueue();
|
||||
|
||||
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||
|
||||
import { generateOpenAICompletions } from "./models";
|
||||
import { Document, ExtractorOptions } from "../entities";
|
||||
import { Logger } from "../logger";
|
||||
|
||||
// Generate completion using OpenAI
|
||||
export async function generateCompletions(
|
||||
@ -44,7 +45,7 @@ export async function generateCompletions(
|
||||
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
console.error(`Error generating completions: ${error}`);
|
||||
Logger.error(`Error generating completions: ${error}`);
|
||||
throw new Error(`Error generating completions: ${error.message}`);
|
||||
}
|
||||
default:
|
||||
|
@@ -48,7 +48,7 @@ function prepareOpenAIDoc(

export async function generateOpenAICompletions({
  client,
-  model = "gpt-4o",
+  model = process.env.MODEL_NAME || "gpt-4o",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
@@ -1,6 +1,6 @@
export const defaultOrigin = "api";

-export const defaultTimeout = 30000; // 30 seconds
+export const defaultTimeout = 45000; // 45 seconds

export const defaultPageOptions = {
  onlyMainContent: false,
@ -56,6 +56,7 @@ export type CrawlerOptions = {
|
||||
}
|
||||
|
||||
export type WebScraperOptions = {
|
||||
jobId: string;
|
||||
urls: string[];
|
||||
mode: "single_urls" | "sitemap" | "crawl";
|
||||
crawlerOptions?: CrawlerOptions;
|
||||
@ -138,4 +139,5 @@ export interface FireEngineOptions{
|
||||
engine?: string;
|
||||
blockMedia?: boolean;
|
||||
blockAds?: boolean;
|
||||
disableJsDom?: boolean;
|
||||
}
|
||||
|
53 apps/api/src/lib/logger.ts (new file)
@@ -0,0 +1,53 @@
enum LogLevel {
  NONE = 'NONE', // No logs will be output.
  ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
  WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
  INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
  DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
  TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
}
export class Logger {
  static colors = {
    ERROR: '\x1b[31m%s\x1b[0m', // Red
    WARN: '\x1b[33m%s\x1b[0m', // Yellow
    INFO: '\x1b[34m%s\x1b[0m', // Blue
    DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
    TRACE: '\x1b[35m%s\x1b[0m' // Magenta
  };

  static log (message: string, level: LogLevel) {
    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
    const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
    const currentLevelIndex = levels.indexOf(logLevel);
    const messageLevelIndex = levels.indexOf(level);

    if (currentLevelIndex >= messageLevelIndex) {
      const color = Logger.colors[level];
      console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);

      // if (process.env.USE_DB_AUTH) {
      //   save to supabase? another place?
      //   supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
      // }
    }
  }
  static error(message: string | any) {
    Logger.log(message, LogLevel.ERROR);
  }

  static warn(message: string) {
    Logger.log(message, LogLevel.WARN);
  }

  static info(message: string) {
    Logger.log(message, LogLevel.INFO);
  }

  static debug(message: string) {
    Logger.log(message, LogLevel.DEBUG);
  }

  static trace(message: string) {
    Logger.log(message, LogLevel.TRACE);
  }
}
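A hedged test sketch for the gating behaviour above, assuming a Jest setup like the rest of apps/api and a hypothetical test file placed next to the logger:

import { Logger } from "../logger"; // hypothetical path: apps/api/src/lib/__tests__/logger.test.ts

it("suppresses INFO when LOGGING_LEVEL=ERROR", () => {
  process.env.LOGGING_LEVEL = "ERROR"; // read at call time inside Logger.log
  const infoSpy = jest.spyOn(console, "info").mockImplementation(() => {});
  const errorSpy = jest.spyOn(console, "error").mockImplementation(() => {});

  Logger.info("hidden");
  Logger.error("shown");

  expect(infoSpy).not.toHaveBeenCalled();
  expect(errorSpy).toHaveBeenCalled();
});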
84
apps/api/src/lib/scrape-events.ts
Normal file
84
apps/api/src/lib/scrape-events.ts
Normal file
@ -0,0 +1,84 @@
|
||||
import { Job, JobId } from "bull";
|
||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||
import { supabase_service as supabase } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
export type ScrapeErrorEvent = {
|
||||
type: "error",
|
||||
message: string,
|
||||
stack?: string,
|
||||
}
|
||||
|
||||
export type ScrapeScrapeEvent = {
|
||||
type: "scrape",
|
||||
url: string,
|
||||
worker?: string,
|
||||
method: (typeof baseScrapers)[number],
|
||||
result: null | {
|
||||
success: boolean,
|
||||
response_code?: number,
|
||||
response_size?: number,
|
||||
error?: string | object,
|
||||
// proxy?: string,
|
||||
time_taken: number,
|
||||
},
|
||||
}
|
||||
|
||||
export type ScrapeQueueEvent = {
|
||||
type: "queue",
|
||||
event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed",
|
||||
worker?: string,
|
||||
}
|
||||
|
||||
export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
|
||||
|
||||
export class ScrapeEvents {
|
||||
static async insert(jobId: string, content: ScrapeEvent) {
|
||||
if (jobId === "TEST") return null;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION) {
|
||||
try {
|
||||
const result = await supabase.from("scrape_events").insert({
|
||||
job_id: jobId,
|
||||
type: content.type,
|
||||
content: content,
|
||||
// created_at
|
||||
}).select().single();
|
||||
return (result.data as any).id;
|
||||
} catch (error) {
|
||||
Logger.error(`Error inserting scrape event: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
|
||||
if (logId === null) return;
|
||||
|
||||
try {
|
||||
const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any;
|
||||
await supabase.from("scrape_events").update({
|
||||
content: {
|
||||
...previousLog.content,
|
||||
result,
|
||||
}
|
||||
}).eq("id", logId);
|
||||
} catch (error) {
|
||||
Logger.error(`Error updating scrape result: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
|
||||
try {
|
||||
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
||||
type: "queue",
|
||||
event,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error logging job event: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,4 +1,5 @@
|
||||
import { AuthResponse } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
let warningCount = 0;
|
||||
|
||||
@ -8,7 +9,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
||||
return async function (...args: U): Promise<T> {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
if (warningCount < 5) {
|
||||
console.warn("WARNING - You're bypassing authentication");
|
||||
Logger.warn("You're bypassing authentication");
|
||||
warningCount++;
|
||||
}
|
||||
return { success: true } as T;
|
||||
@ -16,7 +17,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
||||
try {
|
||||
return await originalFunction(...args);
|
||||
} catch (error) {
|
||||
console.error("Error in withAuth function: ", error);
|
||||
Logger.error(`Error in withAuth function: ${error}`);
|
||||
return { success: false, error: error.message } as T;
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,8 @@ import { DocumentUrl, Progress } from "../lib/entities";
|
||||
import { billTeam } from "../services/billing/credit_billing";
|
||||
import { Document } from "../lib/entities";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
@ -23,6 +25,7 @@ export async function startWebScraperPipeline({
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
partialDocs.push(progress.currentDocument);
|
||||
if (partialDocs.length > 50) {
|
||||
@ -32,9 +35,12 @@ export async function startWebScraperPipeline({
|
||||
}
|
||||
},
|
||||
onSuccess: (result) => {
|
||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||
saveJob(job, result);
|
||||
},
|
||||
onError: (error) => {
|
||||
Logger.error(`🐂 Job failed ${job.id}`);
|
||||
ScrapeEvents.logJobEvent(job, "failed");
|
||||
job.moveToFailed(error);
|
||||
},
|
||||
team_id: job.data.team_id,
|
||||
@ -56,6 +62,7 @@ export async function runWebScraper({
|
||||
const provider = new WebScraperDataProvider();
|
||||
if (mode === "crawl") {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: [url],
|
||||
crawlerOptions: crawlerOptions,
|
||||
@ -64,6 +71,7 @@ export async function runWebScraper({
|
||||
});
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: url.split(","),
|
||||
crawlerOptions: crawlerOptions,
|
||||
@ -108,7 +116,6 @@ export async function runWebScraper({
|
||||
// this return doesn't matter too much for the job completion result
|
||||
return { success: true, message: "", docs: filteredDocs };
|
||||
} catch (error) {
|
||||
console.error("Error running web scraper", error);
|
||||
onError(error);
|
||||
return { success: false, message: error.message, docs: [] };
|
||||
}
|
||||
@ -135,7 +142,8 @@ const saveJob = async (job: Job, result: any) => {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
}
|
||||
ScrapeEvents.logJobEvent(job, "completed");
|
||||
} catch (error) {
|
||||
console.error("Failed to update job status:", error);
|
||||
Logger.error(`🐂 Failed to update job status: ${error}`);
|
||||
}
|
||||
};
|
||||
|
29 apps/api/src/routes/admin.ts (new file)
@@ -0,0 +1,29 @@
import express from "express";
import { redisHealthController } from "../controllers/admin/redis-health";
import {
  checkQueuesController,
  cleanBefore24hCompleteJobsController,
  queuesController,
} from "../controllers/admin/queue";

export const adminRouter = express.Router();

adminRouter.get(
  `/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
  redisHealthController
);

adminRouter.get(
  `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
  cleanBefore24hCompleteJobsController
);

adminRouter.get(
  `/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
  checkQueuesController
);

adminRouter.get(
  `/admin/${process.env.BULL_AUTH_KEY}/queues`,
  queuesController
);
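A sketch of how a monitor could poll the two read-only endpoints registered above; the URLs follow the routes in this file, while the host, port, and polling cadence are assumptions:

// Hypothetical monitor, not part of the diff; assumes Node 18+ fetch.
const admin = `http://localhost:3002/admin/${process.env.BULL_AUTH_KEY}`;

async function pollAdminHealth() {
  const queues = await fetch(`${admin}/queues`);       // 200 only when no jobs are active
  const redis = await fetch(`${admin}/redis-health`);  // 200 only when both Redis clients respond
  console.log(`queues: ${queues.status}, redis: ${redis.status}`);
}

setInterval(pollAdminHealth, 30_000); // every 30 seconds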
@@ -7,6 +7,8 @@ import { crawlJobStatusPreviewController } from "../../src/controllers/status";
import { searchController } from "../../src/controllers/search";
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
import { keyAuthController } from "../../src/controllers/keyAuth";
+import { livenessController } from "../controllers/liveness";
+import { readinessController } from "../controllers/readiness";

export const v0Router = express.Router();

@@ -23,3 +25,6 @@ v0Router.get("/v0/keyAuth", keyAuthController);
// Search routes
v0Router.post("/v0/search", searchController);

+// Health/Probe routes
+v0Router.get("/v0/health/liveness", livenessController);
+v0Router.get("/v0/health/readiness", readinessController);
@ -42,6 +42,7 @@ describe('WebCrawler', () => {
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
@ -76,6 +77,7 @@ describe('WebCrawler', () => {
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
@ -104,6 +106,7 @@ describe('WebCrawler', () => {
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
@ -133,6 +136,7 @@ describe('WebCrawler', () => {
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
@ -161,6 +165,7 @@ describe('WebCrawler', () => {
|
||||
|
||||
// Setup the crawler with the specific test case options
|
||||
const crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
@ -194,6 +199,7 @@ describe('WebCrawler', () => {
|
||||
const limit = 2; // Set a limit for the number of links
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
15 apps/api/src/scraper/WebScraper/__tests__/dns.test.ts (new file)
@@ -0,0 +1,15 @@
import CacheableLookup from 'cacheable-lookup';
import https from 'node:https';
import axios from "axios";

describe("DNS", () => {
  it("cached dns", async () => {
    const cachedDns = new CacheableLookup();
    cachedDns.install(https.globalAgent);
    jest.spyOn(cachedDns, "lookupAsync");

    const res = await axios.get("https://example.com");
    expect(res.status).toBe(200);
    expect(cachedDns.lookupAsync).toHaveBeenCalled();
  });
});
@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
|
||||
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
||||
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
||||
|
||||
const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml);
|
||||
const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml);
|
||||
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
|
||||
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
|
||||
|
||||
expect(resultWithHtml.html).toBeDefined();
|
||||
expect(resultWithoutHtml.html).toBeUndefined();
|
||||
@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
|
||||
const url = 'https://mendable.ai';
|
||||
const pageOptions: PageOptions = { includeHtml: true };
|
||||
|
||||
const result = await scrapSingleUrl(url, pageOptions);
|
||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||
|
||||
// Check if the result contains a list of links
|
||||
expect(result.linksOnPage).toBeDefined();
|
||||
|
@ -8,8 +8,10 @@ import { scrapSingleUrl } from "./single_url";
|
||||
import robotsParser from "robots-parser";
|
||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
private initialUrl: string;
|
||||
private baseUrl: string;
|
||||
private includes: string[];
|
||||
@ -26,6 +28,7 @@ export class WebCrawler {
|
||||
private allowExternalContentLinks: boolean;
|
||||
|
||||
constructor({
|
||||
jobId,
|
||||
initialUrl,
|
||||
includes,
|
||||
excludes,
|
||||
@ -36,6 +39,7 @@ export class WebCrawler {
|
||||
allowBackwardCrawling = false,
|
||||
allowExternalContentLinks = false
|
||||
}: {
|
||||
jobId: string;
|
||||
initialUrl: string;
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
@ -46,6 +50,7 @@ export class WebCrawler {
|
||||
allowBackwardCrawling?: boolean;
|
||||
allowExternalContentLinks?: boolean;
|
||||
}) {
|
||||
this.jobId = jobId;
|
||||
this.initialUrl = initialUrl;
|
||||
this.baseUrl = new URL(initialUrl).origin;
|
||||
this.includes = includes ?? [];
|
||||
@ -64,7 +69,7 @@ export class WebCrawler {
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
const url = new URL(link);
|
||||
const url = new URL(link.trim(), this.baseUrl);
|
||||
const path = url.pathname;
|
||||
|
||||
const depth = getURLDepth(url.toString());
|
||||
@ -116,7 +121,7 @@ export class WebCrawler {
|
||||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||
// Check if the link is disallowed by robots.txt
|
||||
if (!isAllowed) {
|
||||
console.log(`Link disallowed by robots.txt: ${link}`);
|
||||
Logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -133,15 +138,19 @@ export class WebCrawler {
|
||||
limit: number = 10000,
|
||||
maxDepth: number = 10
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
|
||||
Logger.debug(`Crawler starting with ${this.initialUrl}`);
|
||||
// Fetch and parse robots.txt
|
||||
try {
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||
this.robots = robotsParser(this.robotsTxtUrl, response.data);
|
||||
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
|
||||
} catch (error) {
|
||||
console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
}
|
||||
|
||||
if(!crawlerOptions?.ignoreSitemap){
|
||||
if (!crawlerOptions?.ignoreSitemap){
|
||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||
@ -175,6 +184,7 @@ export class WebCrawler {
|
||||
inProgress?: (progress: Progress) => void,
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
const queue = async.queue(async (task: string, callback) => {
|
||||
Logger.debug(`Crawling ${task}`);
|
||||
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
@ -216,16 +226,18 @@ export class WebCrawler {
|
||||
}
|
||||
}, concurrencyLimit);
|
||||
|
||||
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
|
||||
queue.push(
|
||||
urls.filter(
|
||||
(url) =>
|
||||
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
|
||||
),
|
||||
(err) => {
|
||||
if (err) console.error(err);
|
||||
if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
|
||||
}
|
||||
);
|
||||
await queue.drain();
|
||||
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
|
||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
@ -253,7 +265,7 @@ export class WebCrawler {

// If it is the first link, fetch with single url
if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
content = page.html ?? "";
pageStatusCode = page.metadata?.pageStatusCode;
pageError = page.metadata?.pageError || undefined;
@ -282,7 +294,6 @@ export class WebCrawler {
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;

if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
@ -452,7 +463,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
}
} catch (error) {
console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) {
sitemapLinks = response;
@ -467,7 +478,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
}
} catch (error) {
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
}
@ -1,10 +1,12 @@
import { Logger } from "../../../lib/logger";

export async function handleCustomScraping(
text: string,
url: string
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) {
console.log(
Logger.debug(
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
);
return {
@ -19,7 +21,7 @@ export async function handleCustomScraping(

// Check for Vanta security portals
if (text.includes('<link href="https://static.vanta.com')) {
console.log(
Logger.debug(
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
);
return {
@ -34,7 +36,7 @@ export async function handleCustomScraping(
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
if (googleDriveMetaMatch) {
const url = googleDriveMetaMatch[1];
console.log(`Google Drive PDF link detected: ${url}`);
Logger.debug(`Google Drive PDF link detected: ${url}`);

const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
if (fileIdMatch) {
@ -19,8 +19,10 @@ import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger";

export class WebScraperDataProvider {
private jobId: string;
private bullJobId: string;
private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@ -65,6 +67,7 @@ export class WebScraperDataProvider {
batchUrls.map(async (url, index) => {
const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(
this.jobId,
url,
this.pageOptions,
this.extractorOptions,
@ -89,14 +92,14 @@ export class WebScraperDataProvider {
const job = await getWebScraperQueue().getJob(this.bullJobId);
const jobStatus = await job.getState();
if (jobStatus === "failed") {
console.error(
Logger.info(
"Job has failed or has been cancelled by the user. Stopping the job..."
);
return [] as Document[];
}
}
} catch (error) {
console.error(error);
Logger.error(error.message);
return [] as Document[];
}
}
@ -165,6 +168,7 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const crawler = new WebCrawler({
jobId: this.jobId,
initialUrl: this.urls[0],
includes: this.includes,
excludes: this.excludes,
@ -270,7 +274,7 @@ export class WebScraperDataProvider {
this.mode === "single_urls" && links.length > 0
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
(error) => {
console.error("Failed to fetch sitemap data:", error);
Logger.debug(`Failed to fetch sitemap data: ${error}`);
return null;
}
)
@ -460,7 +464,7 @@ export class WebScraperDataProvider {
let documents: Document[] = [];
for (const url of urls) {
const normalizedUrl = this.normalizeUrl(url);
console.log(
Logger.debug(
"Getting cached document for web-scraper-cache:" + normalizedUrl
);
const cachedDocumentString = await getValue(
@ -499,6 +503,7 @@ export class WebScraperDataProvider {
throw new Error("Urls are required");
}

this.jobId = options.jobId;
this.bullJobId = options.bullJobId;
this.urls = options.urls;
this.mode = options.mode;
@ -2,6 +2,7 @@ import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
* Scrapes a URL with Axios
@ -34,9 +35,7 @@ export async function scrapWithFetch(
});

if (response.status !== 200) {
console.error(
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`);
logParams.error_message = response.statusText;
logParams.response_code = response.status;
return {
@ -63,10 +62,10 @@ export async function scrapWithFetch(
} catch (error) {
if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out";
console.log(`[Axios] Request timed out for ${url}`);
Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
} else {
logParams.error_message = error.message || error;
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
}
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {
@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
* Scrapes a URL with Fire-Engine
@ -59,12 +60,10 @@ export async function scrapWithFireEngine({

let engine = engineParam; // do we want fireEngineOptions as first choice?

console.log(
`[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);

// console.log(fireEngineOptionsParam)

const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{
@ -84,15 +83,15 @@ export async function scrapWithFireEngine({
);

if (response.status !== 200) {
console.error(
`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`
Logger.debug(
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`
);

logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode;

if(response.data && response.data?.pageStatusCode !== 200) {
console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`);
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
}

return {
@ -130,10 +129,10 @@ export async function scrapWithFireEngine({
}
} catch (error) {
if (error.code === "ECONNABORTED") {
console.log(`[Fire-Engine] Request timed out for ${url}`);
Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
logParams.error_message = "Request timed out";
} else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error;
}
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
* Scrapes a URL with Playwright
@ -51,8 +52,8 @@ export async function scrapWithPlaywright(
);

if (response.status !== 200) {
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
Logger.debug(
`⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
);
logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode;
@ -86,8 +87,8 @@ export async function scrapWithPlaywright(
};
} catch (jsonError) {
logParams.error_message = jsonError.message || jsonError;
console.error(
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
Logger.debug(
`⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
);
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
}
@ -95,10 +96,10 @@ export async function scrapWithPlaywright(
} catch (error) {
if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out";
console.log(`[Playwright] Request timed out for ${url}`);
Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
} else {
logParams.error_message = error.message || error;
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
}
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {
@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { ScrapingBeeClient } from "scrapingbee";
import { Logger } from "../../../lib/logger";

/**
* Scrapes a URL with ScrapingBee
@ -56,8 +57,8 @@ export async function scrapWithScrapingBee(
text = decoder.decode(response.data);
logParams.success = true;
} catch (decodeError) {
console.error(
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
Logger.debug(
`⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
);
logParams.error_message = decodeError.message || decodeError;
}
@ -72,7 +73,7 @@ export async function scrapWithScrapingBee(
};
}
} catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error;
logParams.response_code = error.response?.status;
return {
@ -17,17 +17,20 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { scrapWithPlaywright } from "./scrapers/playwright";
|
||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
||||
import { extractLinks } from "./utils/utils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
import { clientSideError } from "../../strings";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const baseScrapers = [
|
||||
export const baseScrapers = [
|
||||
"fire-engine",
|
||||
"fire-engine;chrome-cdp",
|
||||
"scrapingBee",
|
||||
"playwright",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
] as const;
|
||||
].filter(Boolean);
|
||||
|
||||
export async function generateRequestParams(
|
||||
url: string,
|
||||
@ -48,7 +51,7 @@ export async function generateRequestParams(
|
||||
return defaultParams;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error generating URL key: ${error}`);
|
||||
Logger.error(`Error generating URL key: ${error}`);
|
||||
return defaultParams;
|
||||
}
|
||||
}
|
||||
@ -82,22 +85,22 @@ function getScrapingFallbackOrder(
|
||||
});
|
||||
|
||||
let defaultOrder = [
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
|
||||
"scrapingBee",
|
||||
"fire-engine",
|
||||
"fire-engine;chrome-cdp",
|
||||
"playwright",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
];
|
||||
].filter(Boolean);
|
||||
|
||||
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
defaultOrder = [
|
||||
"fire-engine",
|
||||
"playwright",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
...defaultOrder.filter(
|
||||
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
),
|
||||
];
|
||||
].filter(Boolean);
|
||||
}
|
||||
|
||||
const filteredDefaultOrder = defaultOrder.filter(
|
||||
@ -117,6 +120,7 @@ function getScrapingFallbackOrder(
|
||||
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
@ -144,6 +148,15 @@ export async function scrapSingleUrl(
|
||||
} = { text: "", screenshot: "", metadata: {} };
|
||||
let screenshot = "";
|
||||
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(jobId, {
|
||||
type: "scrape",
|
||||
url,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method,
|
||||
result: null,
|
||||
});
|
||||
|
||||
switch (method) {
|
||||
case "fire-engine":
|
||||
case "fire-engine;chrome-cdp":
|
||||
@ -154,7 +167,6 @@ export async function scrapSingleUrl(
|
||||
}
|
||||
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
console.log(`Scraping ${url} with Fire Engine`);
|
||||
const response = await scrapWithFireEngine({
|
||||
url,
|
||||
waitFor: pageOptions.waitFor,
|
||||
@ -254,8 +266,19 @@ export async function scrapSingleUrl(
|
||||
}
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||
const text = await parseMarkdown(cleanedHtml);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: scraperResponse.text.length,
|
||||
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
|
||||
error: scraperResponse.metadata.pageError,
|
||||
response_code: scraperResponse.metadata.pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
text: await parseMarkdown(cleanedHtml),
|
||||
text,
|
||||
html: cleanedHtml,
|
||||
rawHtml: scraperResponse.text,
|
||||
screenshot: scraperResponse.screenshot,
|
||||
@ -277,7 +300,7 @@ export async function scrapSingleUrl(
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
} catch (error) {
|
||||
console.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
}
|
||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
@ -289,7 +312,7 @@ export async function scrapSingleUrl(
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
// If exists text coming from crawler, use it
|
||||
if (existingHtml && existingHtml.trim().length >= 100) {
|
||||
if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
|
||||
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
||||
text = await parseMarkdown(cleanedHtml);
|
||||
html = cleanedHtml;
|
||||
@ -311,12 +334,18 @@ export async function scrapSingleUrl(
|
||||
pageError = undefined;
|
||||
}
|
||||
|
||||
if (text && text.trim().length >= 100) break;
|
||||
if (pageStatusCode && pageStatusCode == 404) break;
|
||||
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
if (nextScraperIndex < scrapersInOrder.length) {
|
||||
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
if (text && text.trim().length >= 100) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && pageStatusCode == 404) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
|
||||
break;
|
||||
}
|
||||
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
// if (nextScraperIndex < scrapersInOrder.length) {
|
||||
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
// }
|
||||
}
|
||||
|
||||
if (!text) {
|
||||
@ -372,7 +401,12 @@ export async function scrapSingleUrl(
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
|
||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
||||
ScrapeEvents.insert(jobId, {
|
||||
type: "error",
|
||||
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
||||
stack: error.stack,
|
||||
});
|
||||
return {
|
||||
content: "",
|
||||
markdown: "",
|
||||
|
@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout";
|
||||
import { parseStringPromise } from "xml2js";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function getLinksFromSitemap(
|
||||
{
|
||||
@ -22,11 +23,11 @@ export async function getLinksFromSitemap(
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} });
|
||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"tlsclient", disableJsDom: true, mobileProxy: true } });
|
||||
content = response.html;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Request failed for ${sitemapUrl}: ${error}`);
|
||||
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
@ -48,7 +49,7 @@ export async function getLinksFromSitemap(
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error processing ${sitemapUrl}: ${error}`);
|
||||
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||
}
|
||||
|
||||
return allUrls;
|
||||
|
@ -1,3 +1,4 @@
|
||||
import { Logger } from '../../../../lib/logger';
|
||||
import { isUrlBlocked } from '../blocklist';
|
||||
|
||||
describe('isUrlBlocked', () => {
|
||||
@ -19,7 +20,7 @@ describe('isUrlBlocked', () => {
|
||||
|
||||
blockedUrls.forEach(url => {
|
||||
if (!isUrlBlocked(url)) {
|
||||
console.log(`URL not blocked: ${url}`);
|
||||
Logger.debug(`URL not blocked: ${url}`);
|
||||
}
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
});
|
||||
|
@ -1,3 +1,5 @@
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
const socialMediaBlocklist = [
|
||||
'facebook.com',
|
||||
'x.com',
|
||||
@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean {
|
||||
return isBlocked;
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
console.error(`Error parsing the following URL: ${url}`);
|
||||
Logger.error(`Error parsing the following URL: ${url}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -212,4 +212,24 @@ export const urlSpecificParams = {
|
||||
engine: "playwright",
|
||||
}
|
||||
},
|
||||
"mendable.ai":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
mobileProxy: true,
|
||||
method: "get",
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
},
|
||||
"developer.apple.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
engine: "playwright",
|
||||
wait: 2000,
|
||||
fireEngineOptions: {
|
||||
blockMedia: false,
|
||||
}
|
||||
},
|
||||
},
|
||||
};
|
||||
|
@ -1,5 +1,6 @@
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import axios from 'axios';
|
||||
import { Logger } from '../../../lib/logger';
|
||||
|
||||
export async function getImageDescription(
|
||||
imageUrl: string,
|
||||
@ -82,7 +83,7 @@ export async function getImageDescription(
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error generating image alt text:", error?.message);
|
||||
Logger.error(`Error generating image alt text: ${error}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,6 @@
|
||||
import { CheerioAPI } from "cheerio";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
interface Metadata {
|
||||
title?: string;
|
||||
description?: string;
|
||||
@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
|
||||
|
||||
} catch (error) {
|
||||
console.error("Error extracting metadata:", error);
|
||||
Logger.error(`Error extracting metadata: ${error}`);
|
||||
}
|
||||
|
||||
return {
|
||||
|
@ -7,14 +7,20 @@ import pdf from "pdf-parse";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import { axiosTimeout } from "../../../lib/timeout";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
|
||||
const content = await processPdfToText(tempFilePath, parsePDF);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return { content, pageStatusCode, pageError };
|
||||
try {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
|
||||
const content = await processPdfToText(tempFilePath, parsePDF);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return { content, pageStatusCode, pageError };
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to fetch and process PDF: ${error.message}`);
|
||||
return { content: "", pageStatusCode: 500, pageError: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
|
||||
@ -39,6 +45,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
||||
let content = "";
|
||||
|
||||
if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
|
||||
Logger.debug("Processing pdf document w/ LlamaIndex");
|
||||
const apiKey = process.env.LLAMAPARSE_API_KEY;
|
||||
const headers = {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
@ -81,7 +88,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
||||
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error fetching result w/ LlamaIndex");
|
||||
Logger.debug("Error fetching result w/ LlamaIndex");
|
||||
attempt++;
|
||||
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
|
||||
// You may want to handle specific errors differently
|
||||
@ -93,7 +100,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
||||
}
|
||||
content = resultResponse.data[resultType];
|
||||
} catch (error) {
|
||||
console.error("Error processing pdf document w/ LlamaIndex(2)");
|
||||
Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
|
||||
content = await processPdf(filePath);
|
||||
}
|
||||
} else if (parsePDF) {
|
||||
|
@ -1,3 +1,4 @@
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { Document } from "../../../lib/entities";
|
||||
|
||||
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
|
||||
@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
||||
|
||||
return documents;
|
||||
} catch (error) {
|
||||
console.error("Error replacing paths with absolute paths", error);
|
||||
Logger.debug(`Error replacing paths with absolute paths: ${error}`);
|
||||
return documents;
|
||||
}
|
||||
};
|
||||
@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
||||
|
||||
return documents;
|
||||
} catch (error) {
|
||||
console.error("Error replacing img paths with absolute paths", error);
|
||||
Logger.error(`Error replacing img paths with absolute paths: ${error}`);
|
||||
return documents;
|
||||
}
|
||||
};
|
@ -1,5 +1,6 @@
|
||||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
|
||||
export async function attemptScrapWithRequests(
|
||||
@ -9,13 +10,13 @@ export async function attemptScrapWithRequests(
|
||||
const response = await axios.get(urlToScrap, { timeout: 15000 });
|
||||
|
||||
if (!response.data) {
|
||||
console.log("Failed normal requests as well");
|
||||
Logger.debug("Failed normal requests as well");
|
||||
return null;
|
||||
}
|
||||
|
||||
return response.data;
|
||||
} catch (error) {
|
||||
console.error(`Error in attemptScrapWithRequests: ${error}`);
|
||||
Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import * as querystring from 'querystring';
|
||||
import { SearchResult } from '../../src/lib/entities';
|
||||
import { Logger } from '../../src/lib/logger';
|
||||
|
||||
const _useragent_list = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
|
||||
@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results
|
||||
await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
|
||||
} catch (error) {
|
||||
if (error.message === 'Too many requests') {
|
||||
console.warn('Too many requests, breaking the loop');
|
||||
Logger.warn('Too many requests, breaking the loop');
|
||||
break;
|
||||
}
|
||||
throw error;
|
||||
@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results
|
||||
}
|
||||
}
|
||||
if (attempts >= maxAttempts) {
|
||||
console.warn('Max attempts reached, breaking the loop');
|
||||
Logger.warn('Max attempts reached, breaking the loop');
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
import { google_search } from "./googlesearch";
|
||||
import { serper_search } from "./serper";
|
||||
@ -47,7 +48,7 @@ export async function search({
|
||||
timeout
|
||||
);
|
||||
} catch (error) {
|
||||
console.error("Error in search function: ", error);
|
||||
Logger.error(`Error in search function: ${error}`);
|
||||
return []
|
||||
}
|
||||
// if process.env.SERPER_API_KEY is set, use serper
|
||||
|
@ -1,3 +1,4 @@
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getWebScraperQueue } from "../queue-service";
|
||||
import { sendSlackWebhook } from "./slack";
|
||||
|
||||
@ -9,13 +10,13 @@ export async function checkAlerts() {
|
||||
process.env.ALERT_NUM_ACTIVE_JOBS &&
|
||||
process.env.ALERT_NUM_WAITING_JOBS
|
||||
) {
|
||||
console.info("Initializing alerts");
|
||||
Logger.info("Initializing alerts");
|
||||
const checkActiveJobs = async () => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const activeJobs = await webScraperQueue.getActiveCount();
|
||||
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
|
||||
console.warn(
|
||||
Logger.warn(
|
||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
|
||||
);
|
||||
sendSlackWebhook(
|
||||
@ -23,12 +24,12 @@ export async function checkAlerts() {
|
||||
true
|
||||
);
|
||||
} else {
|
||||
console.info(
|
||||
Logger.info(
|
||||
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to check active jobs:", error);
|
||||
Logger.error(`Failed to check active jobs: ${error}`);
|
||||
}
|
||||
};
|
||||
|
||||
@ -38,7 +39,7 @@ export async function checkAlerts() {
|
||||
const paused = await webScraperQueue.getPausedCount();
|
||||
|
||||
if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
||||
console.warn(
|
||||
Logger.warn(
|
||||
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
|
||||
);
|
||||
sendSlackWebhook(
|
||||
@ -57,6 +58,6 @@ export async function checkAlerts() {
|
||||
// setInterval(checkAll, 10000); // Run every
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to initialize alerts:", error);
|
||||
Logger.error(`Failed to initialize alerts: ${error}`);
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
import axios from "axios";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
export async function sendSlackWebhook(
|
||||
message: string,
|
||||
@ -16,8 +17,8 @@ export async function sendSlackWebhook(
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
console.log("Webhook sent successfully:", response.data);
|
||||
Logger.log("Webhook sent successfully:", response.data);
|
||||
} catch (error) {
|
||||
console.error("Error sending webhook:", error);
|
||||
Logger.debug(`Error sending webhook: ${error}`);
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ import { NotificationType } from "../../types";
|
||||
import { withAuth } from "../../lib/withAuth";
|
||||
import { sendNotification } from "../notification/email_notification";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) {
|
||||
if (team_id === "preview") {
|
||||
return { success: true, message: "Preview team, no credits used" };
|
||||
}
|
||||
console.log(`Billing team ${team_id} for ${credits} credits`);
|
||||
Logger.info(`Billing team ${team_id} for ${credits} credits`);
|
||||
// When the API is used, you can log the credit usage in the credit_usage table:
|
||||
// team_id: The ID of the team using the API.
|
||||
// subscription_id: The ID of the team's active subscription.
|
||||
@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
0
|
||||
);
|
||||
|
||||
console.log("totalCreditsUsed", totalCreditsUsed);
|
||||
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
|
||||
|
||||
const end = new Date();
|
||||
end.setDate(end.getDate() + 30);
|
||||
@ -262,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
});
|
||||
|
||||
if (creditUsageError) {
|
||||
console.error("Error calculating credit usage:", creditUsageError);
|
||||
Logger.error(`Error calculating credit usage: ${creditUsageError}`);
|
||||
}
|
||||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error calculating credit usage:", error);
|
||||
Logger.error(`Error calculating credit usage: ${error}`);
|
||||
}
|
||||
|
||||
// Adjust total credits used by subtracting coupon value
|
||||
|
@ -1,5 +1,6 @@
|
||||
import { Request } from "express";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
export async function createIdempotencyKey(
|
||||
req: Request,
|
||||
@ -14,7 +15,7 @@ export async function createIdempotencyKey(
|
||||
.insert({ key: idempotencyKey });
|
||||
|
||||
if (error) {
|
||||
console.error("Failed to create idempotency key:", error);
|
||||
Logger.error(`Failed to create idempotency key: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
import { Request } from "express";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { validate as isUuid } from 'uuid';
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
export async function validateIdempotencyKey(
|
||||
req: Request,
|
||||
@ -13,7 +14,7 @@ export async function validateIdempotencyKey(
|
||||
// Ensure idempotencyKey is treated as a string
|
||||
const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey;
|
||||
if (!isUuid(key)) {
|
||||
console.error("Invalid idempotency key provided in the request headers.");
|
||||
Logger.debug("Invalid idempotency key provided in the request headers.");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -23,7 +24,7 @@ export async function validateIdempotencyKey(
|
||||
.eq("key", idempotencyKey);
|
||||
|
||||
if (error) {
|
||||
console.error(error);
|
||||
Logger.error(`Error validating idempotency key: ${error}`);
|
||||
}
|
||||
|
||||
if (!data || data.length === 0) {
|
||||
|
@ -1,4 +1,5 @@
|
||||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import "dotenv/config";
|
||||
|
||||
export async function logCrawl(job_id: string, team_id: string) {
|
||||
@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) {
|
||||
},
|
||||
]);
|
||||
} catch (error) {
|
||||
console.error("Error logging crawl job:\n", error);
|
||||
Logger.error(`Error logging crawl job to supabase:\n${error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ import { supabase_service } from "../supabase";
|
||||
import { FirecrawlJob } from "../../types";
|
||||
import { posthog } from "../posthog";
|
||||
import "dotenv/config";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function logJob(job: FirecrawlJob) {
|
||||
try {
|
||||
@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) {
|
||||
posthog.capture(phLog);
|
||||
}
|
||||
if (error) {
|
||||
console.error("Error logging job:\n", error);
|
||||
Logger.error(`Error logging job: ${error.message}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error logging job:\n", error);
|
||||
Logger.error(`Error logging job: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
@ -2,15 +2,16 @@ import "dotenv/config";
|
||||
import { ScrapeLog } from "../../types";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { PageOptions } from "../../lib/entities";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function logScrape(
|
||||
scrapeLog: ScrapeLog,
|
||||
pageOptions?: PageOptions
|
||||
) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
Logger.debug("Skipping logging scrape to Supabase");
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Only log jobs in production
|
||||
// if (process.env.ENV !== "production") {
|
||||
@ -43,9 +44,9 @@ export async function logScrape(
|
||||
]);
|
||||
|
||||
if (error) {
|
||||
console.error("Error logging proxy:\n", error);
|
||||
Logger.error(`Error logging proxy:\n${error}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error logging proxy:\n", error);
|
||||
Logger.error(`Error logging proxy:\n${error}`);
|
||||
}
|
||||
}
|
||||
|
@ -1,19 +1,20 @@
|
||||
import { Logtail } from "@logtail/node";
|
||||
import "dotenv/config";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
|
||||
class MockLogtail {
|
||||
info(message: string, context?: Record<string, any>): void {
|
||||
console.log(message, context);
|
||||
Logger.debug(`${message} - ${context}`);
|
||||
}
|
||||
error(message: string, context: Record<string, any> = {}): void {
|
||||
console.error(message, context);
|
||||
Logger.error(`${message} - ${context}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
|
||||
// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
|
||||
export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
|
||||
console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
|
||||
Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
|
||||
return new MockLogtail();
|
||||
})();
|
||||
|
@ -2,6 +2,7 @@ import { supabase_service } from "../supabase";
|
||||
import { withAuth } from "../../lib/withAuth";
|
||||
import { Resend } from "resend";
|
||||
import { NotificationType } from "../../types";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
const emailTemplates: Record<
|
||||
NotificationType,
|
||||
@ -52,11 +53,11 @@ async function sendEmailNotification(
|
||||
});
|
||||
|
||||
if (error) {
|
||||
console.error("Error sending email: ", error);
|
||||
Logger.debug(`Error sending email: ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error sending email (2): ", error);
|
||||
Logger.debug(`Error sending email (2): ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
}
|
||||
@ -70,7 +71,28 @@ export async function sendNotificationInternal(
|
||||
if (team_id === "preview") {
|
||||
return { success: true };
|
||||
}
|
||||
|
||||
const fifteenDaysAgo = new Date();
|
||||
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
|
||||
|
||||
const { data, error } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("notification_type", notificationType)
|
||||
.gte("sent_date", fifteenDaysAgo.toISOString());
|
||||
|
||||
if (error) {
|
||||
Logger.debug(`Error fetching notifications: ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (data.length !== 0) {
|
||||
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
const { data: recentData, error: recentError } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
@ -78,14 +100,16 @@ export async function sendNotificationInternal(
|
||||
.gte("sent_date", startDateString)
|
||||
.lte("sent_date", endDateString);
|
||||
|
||||
if (error) {
|
||||
console.error("Error fetching notifications: ", error);
|
||||
if (recentError) {
|
||||
Logger.debug(`Error fetching recent notifications: ${recentError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (data.length !== 0) {
|
||||
if (recentData.length !== 0) {
|
||||
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`);
|
||||
return { success: false };
|
||||
} else {
|
||||
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
|
||||
// get the emails from the user with the team_id
|
||||
const { data: emails, error: emailsError } = await supabase_service
|
||||
.from("users")
|
||||
@ -93,7 +117,7 @@ export async function sendNotificationInternal(
|
||||
.eq("team_id", team_id);
|
||||
|
||||
if (emailsError) {
|
||||
console.error("Error fetching emails: ", emailsError);
|
||||
Logger.debug(`Error fetching emails: ${emailsError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
@ -112,7 +136,7 @@ export async function sendNotificationInternal(
|
||||
]);
|
||||
|
||||
if (insertError) {
|
||||
console.error("Error inserting notification record: ", insertError);
|
||||
Logger.debug(`Error inserting notification record: ${insertError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
import { PostHog } from 'posthog-node';
|
||||
import "dotenv/config";
|
||||
import { Logger } from '../../src/lib/logger';
|
||||
|
||||
export default function PostHogClient() {
|
||||
const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, {
|
||||
@ -19,7 +20,7 @@ class MockPostHog {
|
||||
export const posthog = process.env.POSTHOG_API_KEY
|
||||
? PostHogClient()
|
||||
: (() => {
|
||||
console.warn(
|
||||
Logger.warn(
|
||||
"POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more."
|
||||
);
|
||||
return new MockPostHog();
|
||||
|
@ -1,5 +1,6 @@
|
||||
import Queue from "bull";
|
||||
import { Queue as BullQueue } from "bull";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
let webScraperQueue: BullQueue;
|
||||
|
||||
@ -16,7 +17,7 @@ export function getWebScraperQueue() {
|
||||
attempts: 5
|
||||
}
|
||||
});
|
||||
console.log("Web scraper queue created");
|
||||
Logger.info("Web scraper queue created");
|
||||
}
|
||||
return webScraperQueue;
|
||||
}
|
||||
|
@ -7,8 +7,10 @@ import { callWebhook } from "./webhook";
|
||||
import { logJob } from "./logging/log_job";
|
||||
import { initSDK } from '@hyperdx/node-opentelemetry';
|
||||
import { Job } from "bull";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
|
||||
if(process.env.ENV === 'production') {
|
||||
if (process.env.ENV === 'production') {
|
||||
initSDK({
|
||||
consoleCapture: true,
|
||||
additionalInstrumentations: [],
|
||||
@ -18,7 +20,8 @@ if(process.env.ENV === 'production') {
|
||||
const wsq = getWebScraperQueue();
|
||||
|
||||
async function processJob(job: Job, done) {
|
||||
console.log("taking job", job.id);
|
||||
Logger.debug(`🐂 Worker taking job ${job.id}`);
|
||||
|
||||
try {
|
||||
job.progress({
|
||||
current: 1,
|
||||
@ -60,18 +63,18 @@ async function processJob(job: Job, done) {
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
console.log("job done", job.id);
|
||||
Logger.debug(`🐂 Job done ${job.id}`);
|
||||
done(null, data);
|
||||
} catch (error) {
|
||||
console.log("job errored", job.id, error);
|
||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||
if (await getWebScraperQueue().isPaused(false)) {
|
||||
console.log("queue is paused, ignoring");
|
||||
Logger.debug("🐂Queue is paused, ignoring");
|
||||
return;
|
||||
}
|
||||
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
console.error(error.message); // or any other error handling
|
||||
Logger.error(error.message); // or any other error handling
|
||||
|
||||
logtail.error("Custom error while ingesting", {
|
||||
job_id: job.id,
|
||||
@ -79,7 +82,7 @@ async function processJob(job: Job, done) {
|
||||
dataIngestionJob: error.dataIngestionJob,
|
||||
});
|
||||
}
|
||||
console.log(error);
|
||||
Logger.error(error);
|
||||
|
||||
logtail.error("Overall error ingesting", {
|
||||
job_id: job.id,
|
||||
@ -117,3 +120,10 @@ wsq.process(
|
||||
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
|
||||
processJob
|
||||
);
|
||||
|
||||
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
@ -1,14 +1,15 @@
|
||||
import Redis from "ioredis";
|
||||
import { redisRateLimitClient } from "./rate-limiter";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
// Listen to 'error' events to the Redis connection
|
||||
redisRateLimitClient.on("error", (error) => {
|
||||
try {
|
||||
if (error.message === "ECONNRESET") {
|
||||
console.log("Connection to Redis Session Store timed out.");
|
||||
Logger.error("Connection to Redis Session Rate Limit Store timed out.");
|
||||
} else if (error.message === "ECONNREFUSED") {
|
||||
console.log("Connection to Redis Session Store refused!");
|
||||
} else console.log(error);
|
||||
Logger.error("Connection to Redis Session Rate Limit Store refused!");
|
||||
} else Logger.error(error);
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => {
|
||||
redisRateLimitClient.on("reconnecting", (err) => {
|
||||
try {
|
||||
if (redisRateLimitClient.status === "reconnecting")
|
||||
console.log("Reconnecting to Redis Session Store...");
|
||||
else console.log("Error reconnecting to Redis Session Store.");
|
||||
Logger.info("Reconnecting to Redis Session Rate Limit Store...");
|
||||
else Logger.error("Error reconnecting to Redis Session Rate Limit Store.");
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
// Listen to the 'connect' event to Redis
|
||||
redisRateLimitClient.on("connect", (err) => {
|
||||
try {
|
||||
if (!err) console.log("Connected to Redis Session Store!");
|
||||
if (!err) Logger.info("Connected to Redis Session Rate Limit Store!");
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
import { createClient, SupabaseClient } from "@supabase/supabase-js";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
||||
class SupabaseService {
|
||||
@ -10,13 +11,13 @@ class SupabaseService {
|
||||
// Only initialize the Supabase client if both URL and Service Token are provided.
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
// Warn the user that Authentication is disabled by setting the client to null
|
||||
console.warn(
|
||||
"\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
|
||||
Logger.warn(
|
||||
"Authentication is disabled. Supabase client will not be initialized."
|
||||
);
|
||||
this.client = null;
|
||||
} else if (!supabaseUrl || !supabaseServiceToken) {
|
||||
console.error(
|
||||
"\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
|
||||
Logger.error(
|
||||
"Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
|
||||
);
|
||||
} else {
|
||||
this.client = createClient(supabaseUrl, supabaseServiceToken);
|
||||
@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy(
|
||||
new SupabaseService(),
|
||||
{
|
||||
get: function (target, prop, receiver) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
Logger.debug(
|
||||
"Attempted to access Supabase client when it's not configured."
|
||||
);
|
||||
}
|
||||
const client = target.getClient();
|
||||
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
|
||||
if (client === null) {
|
||||
console.error(
|
||||
Logger.error(
|
||||
"Attempted to access Supabase client when it's not configured."
|
||||
);
|
||||
return () => {
|
||||
|
@ -1,3 +1,4 @@
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { supabase_service } from "./supabase";
|
||||
|
||||
export const callWebhook = async (teamId: string, jobId: string,data: any) => {
|
||||
@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
|
||||
.eq("team_id", teamId)
|
||||
.limit(1);
|
||||
if (error) {
|
||||
console.error(
|
||||
`Error fetching webhook URL for team ID: ${teamId}`,
|
||||
error.message
|
||||
);
|
||||
Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
|
||||
}),
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(
|
||||
`Error sending webhook for team ID: ${teamId}`,
|
||||
error.message
|
||||
);
|
||||
Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`);
|
||||
}
|
||||
};
|
||||
|
@ -1,2 +1,4 @@
export const errorNoResults =
"No results found, please check the URL or contact us at help@mendable.ai to file a ticket.";

export const clientSideError = "client-side exception has occurred"
@ -5,7 +5,7 @@ the HTML content of a specified URL. It supports optional proxy settings and med
|
||||
|
||||
from os import environ
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi import FastAPI, Response
|
||||
from fastapi.responses import JSONResponse
|
||||
from playwright.async_api import Browser, async_playwright
|
||||
from pydantic import BaseModel
|
||||
@ -39,14 +39,28 @@ async def shutdown_event():
|
||||
"""Event handler for application shutdown to close the browser."""
|
||||
await browser.close()
|
||||
|
||||
@app.get("/health/liveness")
|
||||
def liveness_probe():
|
||||
"""Endpoint for liveness probe."""
|
||||
return JSONResponse(content={"status": "ok"}, status_code=200)
|
||||
|
||||
|
||||
@app.get("/health/readiness")
|
||||
async def readiness_probe():
|
||||
"""Endpoint for readiness probe. Checks if the browser instance is ready."""
|
||||
if browser:
|
||||
return JSONResponse(content={"status": "ok"}, status_code=200)
|
||||
return JSONResponse(content={"status": "Service Unavailable"}, status_code=503)
|
||||
|
||||
|
||||
@app.post("/html")
|
||||
async def root(body: UrlModel):
|
||||
"""
|
||||
Endpoint to fetch and return HTML content of a given URL.
|
||||
|
||||
|
||||
Args:
|
||||
body (UrlModel): The URL model containing the target URL, wait time, and timeout.
|
||||
|
||||
|
||||
Returns:
|
||||
JSONResponse: The HTML content of the page.
|
||||
"""
|
||||
|
@ -1,5 +1,6 @@
|
||||
import { createClient, SupabaseClient } from "@supabase/supabase-js";
|
||||
import "dotenv/config";
|
||||
|
||||
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
||||
class SupabaseService {
|
||||
private client: SupabaseClient | null = null;
|
||||
@ -11,12 +12,12 @@ class SupabaseService {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
// Warn the user that Authentication is disabled by setting the client to null
|
||||
console.warn(
|
||||
"\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
|
||||
"Authentication is disabled. Supabase client will not be initialized."
|
||||
);
|
||||
this.client = null;
|
||||
} else if (!supabaseUrl || !supabaseServiceToken) {
|
||||
console.error(
|
||||
"\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
|
||||
"Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
|
||||
);
|
||||
} else {
|
||||
this.client = createClient(supabaseUrl, supabaseServiceToken);
|
||||
@ -35,6 +36,11 @@ export const supabase_service: SupabaseClient = new Proxy(
|
||||
new SupabaseService(),
|
||||
{
|
||||
get: function (target, prop, receiver) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
console.debug(
|
||||
"Attempted to access Supabase client when it's not configured."
|
||||
);
|
||||
}
|
||||
const client = target.getClient();
|
||||
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
|
||||
if (client === null) {
|
||||
|
@ -1,42 +0,0 @@
|
||||
#root {
|
||||
max-width: 1280px;
|
||||
margin: 0 auto;
|
||||
padding: 2rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.logo {
|
||||
height: 6em;
|
||||
padding: 1.5em;
|
||||
will-change: filter;
|
||||
transition: filter 300ms;
|
||||
}
|
||||
.logo:hover {
|
||||
filter: drop-shadow(0 0 2em #646cffaa);
|
||||
}
|
||||
.logo.react:hover {
|
||||
filter: drop-shadow(0 0 2em #61dafbaa);
|
||||
}
|
||||
|
||||
@keyframes logo-spin {
|
||||
from {
|
||||
transform: rotate(0deg);
|
||||
}
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
@media (prefers-reduced-motion: no-preference) {
|
||||
a:nth-of-type(2) .logo {
|
||||
animation: logo-spin infinite 20s linear;
|
||||
}
|
||||
}
|
||||
|
||||
.card {
|
||||
padding: 2em;
|
||||
}
|
||||
|
||||
.read-the-docs {
|
||||
color: #888;
|
||||
}
|
@ -1,4 +1,3 @@
|
||||
import "./App.css";
|
||||
import FirecrawlComponent from "./components/ingestion";
|
||||
|
||||
function App() {
|
||||
|
@ -1 +0,0 @@
|
||||
React logo SVG asset (image diff; before: 4.0 KiB)
@ -15,11 +15,12 @@ import {
  CollapsibleContent,
  CollapsibleTrigger,
} from "@/components/ui/collapsible";
import { ChevronDown } from "lucide-react";
import { ChevronDown, ChevronLeft, ChevronRight } from "lucide-react";

// Hardcoded values (not recommended for production)
//! Hardcoded values (not recommended for production)
//! Highly recommended to move all Firecrawl API calls to the backend (e.g. Next.js API route)
const FIRECRAWL_API_URL = "https://api.firecrawl.dev"; // Replace with your actual API URL whether it is local or using Firecrawl Cloud
const FIRECRAWL_API_KEY = ""; // Replace with your actual API key
const FIRECRAWL_API_KEY = "fc-YOUR_API_KEY"; // Replace with your actual API key

interface FormData {
  url: string;
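The `//!` comments above recommend proxying Firecrawl calls through a backend so the API key never ships to the browser. A minimal sketch of what that could look like as a Next.js App Router route handler follows; the route path, env var name, and request shape are illustrative assumptions, not part of this commit.

```ts
// app/api/scrape/route.ts (hypothetical, not part of this commit).
// Keeps FIRECRAWL_API_KEY on the server; the React component would POST here
// instead of calling api.firecrawl.dev directly with a client-side key.
import { NextResponse } from "next/server";

export async function POST(request: Request) {
  const { url } = await request.json();

  const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`, // server-side env only
    },
    body: JSON.stringify({ url }),
  });

  if (!res.ok) {
    return NextResponse.json({ success: false }, { status: res.status });
  }
  return NextResponse.json(await res.json());
}
```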
@ -100,7 +101,8 @@ export default function FirecrawlComponent() {
  const [elapsedTime, setElapsedTime] = useState<number>(0);
  const [showCrawlStatus, setShowCrawlStatus] = useState<boolean>(false);
  const [isScraping, setIsScraping] = useState<boolean>(false);
  const [showAllUrls, setShowAllUrls] = useState<boolean>(false);
  const [currentPage, setCurrentPage] = useState<number>(1);
  const urlsPerPage = 10;

  useEffect(() => {
    let timer: NodeJS.Timeout;
@ -289,6 +291,7 @@ export default function FirecrawlComponent() {
        const data: ScrapeResult = await response.json();
        newScrapeResults[url] = data;
        setCrawlStatus((prev) => ({ ...prev, current: index + 1 }));
        setScrapeResults({ ...scrapeResults, ...newScrapeResults });
      } catch (error) {
        console.error(`Error scraping ${url}:`, error);
        newScrapeResults[url] = {
@ -312,22 +315,35 @@ export default function FirecrawlComponent() {
      }
    }

    setScrapeResults(newScrapeResults);
    setLoading(false);
    setIsScraping(false);
  };

  const handlePageChange = (newPage: number) => {
    setCurrentPage(newPage);
  };

  const paginatedUrls = crawledUrls.slice(
    (currentPage - 1) * urlsPerPage,
    currentPage * urlsPerPage
  );

  return (
    <div className="max-w-2xl mx-auto p-4">
      <Card>
        <CardHeader className="flex items-center justify-between">
          <CardTitle className="flex items-center space-x-2">
            <span>Extract web content with Firecrawl 🔥</span>
        <CardHeader className="flex items-start justify-between mb-0 pb-4">
          <CardTitle className="flex items-center justify-between w-full space-x-2">
            <span className="text-base">Extract web content</span>
            <a
              href="https://www.firecrawl.dev"
              className="text-xs text-gray-500 font-normal px-3 py-1 bg-zinc-100 rounded-xl hover:bg-zinc-200 transition-colors"
            >
              Powered by Firecrawl 🔥
            </a>
          </CardTitle>
          <div className="text-sm text-gray-500 w-11/12 items-center">
            Use this component to quickly build your own UI for Firecrawl. Plug
            in your API key and the component will handle the rest. Learn more
            on the{" "}
            Use this component to quickly give your users the ability to connect
            their AI apps to web data with Firecrawl. Learn more on the{" "}
            <a
              href="https://docs.firecrawl.dev/"
              className="text-sm text-blue-500"
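The pagination introduced in this hunk is plain slice arithmetic: page N shows items from index (N-1) * urlsPerPage up to, but not including, N * urlsPerPage. A standalone sketch of the same math for anyone adapting the component; the helper and variable names here are illustrative, not from the commit.

```ts
// Illustrative helper mirroring the component's pagination math (not part of this commit).
function paginate<T>(items: T[], page: number, perPage = 10): T[] {
  return items.slice((page - 1) * perPage, page * perPage);
}

const urls = Array.from({ length: 23 }, (_, i) => `https://example.com/${i}`);
console.log(paginate(urls, 3).length);    // 3    -> last, partial page
console.log(Math.ceil(urls.length / 10)); // 3    -> total pages shown in the UI
console.log(3 * 10 >= urls.length);       // true -> "next" button disabled on page 3
```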
@ -347,7 +363,36 @@ export default function FirecrawlComponent() {
              onChange={handleChange}
            />
            <Button type="submit" variant="default" disabled={loading}>
              {loading ? "Running..." : "Run"}
              {loading ? (
                <div
                  role="status"
                  className="flex items-center justify-between space-x-2"
                >
                  <svg
                    className="animate-spin h-4 w-4 text-white"
                    xmlns="http://www.w3.org/2000/svg"
                    fill="none"
                    viewBox="0 0 24 24"
                  >
                    <circle
                      className="opacity-25"
                      cx="12"
                      cy="12"
                      r="10"
                      stroke="currentColor"
                      strokeWidth="4"
                    ></circle>
                    <path
                      className="opacity-75"
                      fill="currentColor"
                      d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"
                    ></path>
                  </svg>
                  <span className="sr-only">Loading...</span>
                </div>
              ) : (
                "Run"
              )}
            </Button>
          </div>
          <Collapsible
@ -361,7 +406,7 @@ export default function FirecrawlComponent() {
                <ChevronDown className="h-4 w-4 opacity-50" />
              </Button>
            </CollapsibleTrigger>
            <CollapsibleContent className="space-y-4 mt-4">
            <CollapsibleContent className="space-y-4 mt-4 px-2">
              <div className="flex items-center space-x-2">
                <Checkbox
                  id="crawlSubPages"
@ -484,8 +529,8 @@ export default function FirecrawlComponent() {
                  className="text-sm cursor-pointer"
                >
                  {selectedUrls.length === crawledUrls.length
                    ? "Unselect All"
                    : "Select All"}
                    ? `Unselect All (${selectedUrls.length})`
                    : `Select All (${selectedUrls.length})`}
                </label>
              </>
            )}
@ -503,41 +548,57 @@ export default function FirecrawlComponent() {
            !isScraping && (
              <>
                <ul className="pl-2">
                  {(showAllUrls ? crawledUrls : crawledUrls.slice(0, 10)).map(
                    (url, index) => (
                      <li key={index} className="flex items-center space-x-2">
                        <Checkbox
                          checked={selectedUrls.includes(url)}
                          onCheckedChange={() =>
                            setSelectedUrls((prev) =>
                              prev.includes(url)
                                ? prev.filter((u) => u !== url)
                                : [...prev, url]
                            )
                          }
                        />
                        <span>{url}</span>
                      </li>
                    )
                  )}
                </ul>
                {crawledUrls.length > 10 && (
                  <div className="flex justify-center mt-2">
                    <Button
                      variant="link"
                      onClick={() => setShowAllUrls(!showAllUrls)}
                  {paginatedUrls.map((url, index) => (
                    <li
                      key={index}
                      className="flex items-center space-x-2 my-2 text-sm"
                    >
                      {showAllUrls ? "Show Less" : "Show All"}
                    </Button>
                  </div>
                )}
                      <Checkbox
                        checked={selectedUrls.includes(url)}
                        onCheckedChange={() =>
                          setSelectedUrls((prev) =>
                            prev.includes(url)
                              ? prev.filter((u) => u !== url)
                              : [...prev, url]
                          )
                        }
                      />
                      <span className="flex items-center max-w-lg">
                        {url.length > 70 ? `${url.slice(0, 70)}...` : url}
                      </span>
                    </li>
                  ))}
                </ul>
                <div className="flex items-center justify-between mt-4">
                  <Button
                    variant="outline"
                    className="px-2"
                    onClick={() => handlePageChange(currentPage - 1)}
                    disabled={currentPage === 1}
                  >
                    <ChevronLeft className="h-5 w-5" />
                  </Button>
                  <span className="text-sm text-gray-500">
                    Page {currentPage} of{" "}
                    {Math.ceil(crawledUrls.length / urlsPerPage)}
                  </span>
                  <Button
                    variant="outline"
                    className="px-2"
                    onClick={() => handlePageChange(currentPage + 1)}
                    disabled={currentPage * urlsPerPage >= crawledUrls.length}
                  >
                    <ChevronRight className="h-5 w-5 " />
                  </Button>
                </div>
              </>
            )}
          </CardContent>
          <CardFooter className="flex justify-center">
          <CardFooter className="w-full flex justify-center">
            {crawledUrls.length > 0 && !scrapingSelectedLoading && (
              <Button
                variant="default"
                className="w-full"
                onClick={handleScrapeSelected}
                disabled={loading || selectedUrls.length === 0}
              >
@ -549,45 +610,42 @@ export default function FirecrawlComponent() {

        {Object.keys(scrapeResults).length > 0 && (
          <div className="mt-4">
            <h2 className="text-2xl font-bold mb-4">Scrape Results</h2>
            <div className="grid grid-cols-2 gap-4">
            <h2 className="text-base font-bold ">Scrape Results</h2>
            <p className="text-sm text-gray-500">
              You can do whatever you want with the scrape results. Here is a
              basic showcase of the markdown.
            </p>
            <div className="flex flex-col gap-4 mt-4 w-full">
              {Object.entries(scrapeResults).map(([url, result]) => (
                <Card key={url} className="overflow-hidden">
                  <CardContent>
                    <div className="max-h-60 overflow-y-auto">
                      <div className="text-base font-bold py-2">
                        {url
                          .replace(/^(https?:\/\/)?(www\.)?/, "")
                          .replace(/\/$/, "")}
                      </div>
                <Card key={url} className="relative p-4 w-full">
                  <CardTitle className="text-sm font-normal flex flex-col">
                    <span>{result.data.metadata.title}</span>
                    <span className="text-xs text-gray-500">
                      {url
                        .replace(/^(https?:\/\/)?(www\.)?/, "")
                        .replace(/\/$/, "")}
                    </span>
                  </CardTitle>
                  <CardContent className="relative px-0 pt-2 !text-xs w-full">
                    <div className=" overflow-y-auto h-32 bg-zinc-100 rounded-md p-2 w-full">
                      {result.success ? (
                        <>
                          <pre className="text-xs whitespace-pre-wrap">
                            {result.data.markdown.trim().slice(0, 200)}...
                            {result.data.markdown.trim()}
                          </pre>
                        </>
                      ) : (
                        <p className="text-red-500">Failed to scrape this URL</p>
                        <>
                          <p className="text-red-500">
                            Failed to scrape this URL
                          </p>
                          <p className="text-zinc-500 font-mono">
                            {result.toString()}
                          </p>
                        </>
                      )}
                    </div>
                  </CardContent>
                  <CardFooter className="flex justify-center">
                    <Button
                      variant="outline"
                      size="sm"
                      onClick={(event) => {
                        navigator.clipboard.writeText(result.data.markdown);
                        const button = event.currentTarget as HTMLButtonElement;
                        const originalText = button.textContent;
                        button.textContent = "Copied!";
                        setTimeout(() => {
                          button.textContent = originalText;
                        }, 2000);
                      }}
                    >
                      Copy Markdown
                    </Button>
                  </CardFooter>
                </Card>
              ))}
            </div>
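As the copy in this hunk says, the component only previews the markdown and the results can be used however you like. A hedged sketch of one downstream use, assuming the same result shape the component reads (`success` flag plus `data.markdown` and `data.metadata.title`); the interface and function names below are illustrative.

```ts
// Illustrative consumer of the scrape results (not part of this commit).
// Assumes the shape used by the component: { success, data: { markdown, metadata: { title } } }.
interface ScrapeResultLike {
  success: boolean;
  data: { markdown: string; metadata: { title: string } };
}

// Collapse all successful scrapes into one markdown document, e.g. to write to disk
// or feed into an LLM context window.
function toCombinedMarkdown(results: Record<string, ScrapeResultLike>): string {
  return Object.entries(results)
    .filter(([, r]) => r.success)
    .map(([url, r]) => `# ${r.data.metadata.title}\n<!-- ${url} -->\n\n${r.data.markdown.trim()}`)
    .join("\n\n---\n\n");
}
```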
@ -13,6 +13,8 @@ x-common-service: &common-service
      - PORT=${PORT:-3002}
      - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - OPENAI_BASE_URL=${OPENAI_BASE_URL}
      - MODEL_NAME=${MODEL_NAME:-gpt-4o}
      - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
      - SERPER_API_KEY=${SERPER_API_KEY}
      - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
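The two new compose variables let self-hosters point LLM calls at an OpenAI-compatible endpoint and choose a model, with `gpt-4o` as the compose-level default. A hedged sketch of how settings like these are typically read in a Node service; the exact wiring inside apps/api may differ.

```ts
// Hypothetical consumption of the new env vars (the real code in apps/api may differ).
import OpenAI from "openai";

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
  // When OPENAI_BASE_URL is set, requests go to an OpenAI-compatible server
  // (e.g. a local proxy) instead of api.openai.com.
  baseURL: process.env.OPENAI_BASE_URL || undefined,
});

const model = process.env.MODEL_NAME ?? "gpt-4o"; // mirrors the compose default
```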
@ -4,12 +4,12 @@
2. Build Docker images, and host it in your Docker Registry (replace the target registry with your own)
   1. API (which is also used as a worker image)
      1. ```bash
         docker build -t ghcr.io/winkk-dev/firecrawl:latest ../../apps/api
         docker build --no-cache -t ghcr.io/winkk-dev/firecrawl:latest ../../../apps/api
         docker push ghcr.io/winkk-dev/firecrawl:latest
         ```
   2. Playwright
      1. ```bash
         docker build -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../apps/playwright-service
         docker build --no-cache -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../../apps/playwright-service
         docker push ghcr.io/winkk-dev/firecrawl-playwright:latest
         ```
3. Replace the image in [worker.yaml](worker.yaml), [api.yaml](api.yaml) and [playwright-service.yaml](playwright-service.yaml)
@ -15,16 +15,35 @@ spec:
      imagePullSecrets:
        - name: docker-registry-secret
      containers:
        - name: api
          image: ghcr.io/winkk-dev/firecrawl:latest
          args: [ "pnpm", "run", "start:production" ]
          ports:
            - containerPort: 3002
          envFrom:
            - configMapRef:
                name: firecrawl-config
            - secretRef:
                name: firecrawl-secret
        - name: api
          image: ghcr.io/winkk-dev/firecrawl:latest
          imagePullPolicy: Always
          args: [ "pnpm", "run", "start:production" ]
          ports:
            - containerPort: 3002
          envFrom:
            - configMapRef:
                name: firecrawl-config
            #- secretRef:
            #    name: firecrawl-secret
          livenessProbe:
            httpGet:
              path: /v0/health/liveness
              port: 3002
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /v0/health/readiness
              port: 3002
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 3
---
apiVersion: v1
kind: Service
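The new probes expect the API container to answer 200 on `/v0/health/liveness` and `/v0/health/readiness` at port 3002. For orientation, a minimal sketch of handlers that would satisfy these probes, assuming an Express app like the one in apps/api; the actual routes in the codebase may perform additional checks.

```ts
// Hypothetical minimal handlers satisfying the probes above
// (the real routes in apps/api may check Redis, queues, etc.).
import express from "express";

const app = express();

app.get("/v0/health/liveness", (_req, res) => {
  res.status(200).json({ status: "ok" }); // process is up
});

app.get("/v0/health/readiness", (_req, res) => {
  // A fuller implementation might verify Redis / queue connectivity here.
  res.status(200).json({ status: "ok" });
});

app.listen(3002);
```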
@ -7,8 +7,7 @@ data:
  PORT: "3002"
  HOST: "0.0.0.0"
  REDIS_URL: "redis://redis:6379"
  PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000"
  REDIS_RATE_LIMIT_URL: "redis://redis:6379"
  PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/html"
  USE_DB_AUTHENTICATION: "false"
  SUPABASE_ANON_TOKEN: ""
  SUPABASE_URL: ""
  SUPABASE_SERVICE_TOKEN: ""
  HDX_NODE_BETA_MODE: "1"
@ -1,3 +1,10 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: playwright-service-config
data:
  PORT: "3000"
---
apiVersion: apps/v1
kind: Deployment
metadata:
@ -15,13 +22,32 @@ spec:
      imagePullSecrets:
        - name: docker-registry-secret
      containers:
        - name: playwright-service
          image: ghcr.io/winkk-dev/firecrawl-playwright:latest
          ports:
            - containerPort: 3000
          envFrom:
            - configMapRef:
                name: firecrawl-config
        - name: playwright-service
          image: ghcr.io/winkk-dev/firecrawl-playwright:latest
          imagePullPolicy: Always
          ports:
            - containerPort: 3000
          envFrom:
            - configMapRef:
                name: playwright-service-config
          livenessProbe:
            httpGet:
              path: /health/liveness
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health/readiness
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 3
---
apiVersion: v1
kind: Service
@ -15,10 +15,12 @@ spec:
      imagePullSecrets:
        - name: docker-registry-secret
      containers:
        - name: worker
          image: ghcr.io/winkk-dev/firecrawl:latest
          envFrom:
            - configMapRef:
                name: firecrawl-config
            - secretRef:
                name: firecrawl-secret
        - name: worker
          image: ghcr.io/winkk-dev/firecrawl:latest
          imagePullPolicy: Always
          args: [ "pnpm", "run", "workers" ]
          envFrom:
            - configMapRef:
                name: firecrawl-config
            #- secretRef:
            #    name: firecrawl-secret