feat(app): add extra crawl logging (app-side only for now)

commit 845c2744a9
parent cce94289ee
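The hunks below all follow the same pattern: the module-level winston logger is imported as _logger, a child logger carrying crawl-scoped metadata (crawlId, module, method, teamId, plan) is derived from it, and debug lines are emitted at each queueing step. A minimal sketch of that pattern, assuming a winston Logger is exported from lib/logger; the function and variable names here are illustrative, not part of this commit:

// Illustrative sketch only (not from the diff): the child-logger pattern
// used throughout this commit, with winston's child() and debug() APIs.
import { logger as _logger } from "./lib/logger"; // assumed winston Logger instance

function handleCrawlRequest(crawlId: string, teamId: string) {
  // child() returns a logger that attaches this metadata to every entry it writes
  const logger = _logger.child({ crawlId, module: "api/v1", method: "handleCrawlRequest", teamId });

  // the second argument is merged into the structured log entry
  logger.debug("Crawl " + crawlId + " starting", { teamId });
  logger.debug("Adding scrape jobs to Redis...");
}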
@@ -5,6 +5,7 @@ import {
   batchScrapeRequestSchema,
   CrawlResponse,
   RequestWithAuth,
+  ScrapeOptions,
 } from "./types";
 import {
   addCrawlJobs,
@@ -14,10 +15,10 @@ import {
   StoredCrawl,
 } from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
-import { getScrapeQueue } from "../../services/queue-service";
 import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
+import { logger as _logger } from "../../lib/logger";

 export async function batchScrapeController(
   req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@@ -26,6 +27,8 @@ export async function batchScrapeController(
   req.body = batchScrapeRequestSchema.parse(req.body);

   const id = req.body.appendToId ?? uuidv4();
+  const logger = _logger.child({ crawlId: id, batchScrapeId: id, module: "api/v1", method: "batchScrapeController", teamId: req.auth.team_id, plan: req.auth.plan });
+  logger.debug("Batch scrape " + id + " starting", { urlsLength: req.body.urls, appendToId: req.body.appendToId, account: req.account });

   if (!req.body.appendToId) {
     await logCrawl(id, req.auth.team_id);
@@ -58,6 +61,7 @@ export async function batchScrapeController(
     // set base to 21
     jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
   }
+  logger.debug("Using job priority " + jobPriority, { jobPriority });

   const scrapeOptions: ScrapeOptions = { ...req.body };
   delete (scrapeOptions as any).urls;
@@ -85,18 +89,22 @@ export async function batchScrapeController(
     };
   });

+  logger.debug("Locking URLs...");
   await lockURLs(
     id,
     sc,
     jobs.map((x) => x.data.url)
   );
+  logger.debug("Adding scrape jobs to Redis...");
   await addCrawlJobs(
     id,
     jobs.map((x) => x.opts.jobId)
   );
+  logger.debug("Adding scrape jobs to BullMQ...");
   await addScrapeJobs(jobs);

   if(req.body.webhook) {
+    logger.debug("Calling webhook with batch_scrape.started...", { webhook: req.body.webhook });
     await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
   }

@@ -19,7 +19,7 @@ import {
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob } from "../../services/queue-jobs";
-import { logger } from "../../lib/logger";
+import { logger as _logger } from "../../lib/logger";
 import { getJobPriority } from "../../lib/job-priority";
 import { callWebhook } from "../../services/webhook";
 import { scrapeOptions as scrapeOptionsSchema } from "./types";
@@ -28,9 +28,12 @@ export async function crawlController(
   req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
   res: Response<CrawlResponse>
 ) {
+  const preNormalizedBody = req.body;
   req.body = crawlRequestSchema.parse(req.body);

   const id = uuidv4();
+  const logger = _logger.child({ crawlId: id, module: "api/v1", method: "crawlController", teamId: req.auth.team_id, plan: req.auth.plan });
+  logger.debug("Crawl " + id + " starting", { request: req.body, originalRequest: preNormalizedBody, account: req.account });

   await logCrawl(id, req.auth.team_id);

@@ -68,7 +71,9 @@ export async function crawlController(
     }
   }

+  const originalLimit = crawlerOptions.limit;
   crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
+  logger.debug("Determined limit: " + crawlerOptions.limit, { remainingCredits, bodyLimit: originalLimit, originalBodyLimit: preNormalizedBody.limit });

   const sc: StoredCrawl = {
     originUrl: req.body.url,
@@ -85,11 +90,7 @@ export async function crawlController(
   try {
     sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
   } catch (e) {
-    logger.debug(
-      `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
-        e
-      )}`
-    );
+    logger.debug("Failed to get robots.txt (this is probably fine!)", { error: e });
   }

   await saveCrawl(id, sc);
@@ -97,15 +98,18 @@ export async function crawlController(
   const sitemap = sc.crawlerOptions.ignoreSitemap
     ? null
     : await crawler.tryGetSitemap();

   if (sitemap !== null && sitemap.length > 0) {
+    logger.debug("Using sitemap of length " + sitemap.length, { sitemapLength: sitemap.length });
     let jobPriority = 20;
     // If it is over 1000, we need to get the job priority,
     // otherwise we can use the default priority of 20
     if(sitemap.length > 1000){
       // set base to 21
       jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
     }
+    logger.debug("Using job priority " + jobPriority, { jobPriority });
+
     const jobs = sitemap.map((x) => {
       const url = x.url;
       const uuid = uuidv4();
@@ -131,19 +135,26 @@ export async function crawlController(
       };
     })

+    logger.debug("Locking URLs...");
     await lockURLs(
       id,
       sc,
       jobs.map((x) => x.data.url)
     );
+    logger.debug("Adding scrape jobs to Redis...");
     await addCrawlJobs(
       id,
       jobs.map((x) => x.opts.jobId)
     );
+    logger.debug("Adding scrape jobs to BullMQ...");
     await getScrapeQueue().addBulk(jobs);
   } else {
+    logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap });
+
+    logger.debug("Locking URL...");
     await lockURL(id, sc, req.body.url);
     const jobId = uuidv4();
+    logger.debug("Adding scrape job to Redis...", { jobId });
     await addScrapeJob(
       {
         url: req.body.url,
@@ -162,10 +173,13 @@ export async function crawlController(
       },
       jobId,
     );
+    logger.debug("Adding scrape job to BullMQ...", { jobId });
     await addCrawlJob(id, jobId);
   }
+  logger.debug("Done queueing jobs!");

   if(req.body.webhook) {
+    logger.debug("Calling webhook with crawl.started...", { webhook: req.body.webhook });
     await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
   }

@@ -2,7 +2,7 @@ import { InternalOptions } from "../scraper/scrapeURL";
 import { ScrapeOptions } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import { redisConnection } from "../services/queue-service";
-import { logger } from "./logger";
+import { logger as _logger } from "./logger";
 import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";

 export type StoredCrawl = {
@@ -18,6 +18,7 @@ export type StoredCrawl = {
 };

 export async function saveCrawl(id: string, crawl: StoredCrawl) {
+  _logger.debug("Saving crawl " + id + " to Redis...", { crawl, module: "crawl-redis", method: "saveCrawl", crawlId: id, teamId: crawl.team_id, plan: crawl.plan });
   await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
   await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
 }
@@ -41,16 +42,19 @@ export async function getCrawlExpiry(id: string): Promise<Date> {
 }

 export async function addCrawlJob(id: string, job_id: string) {
+  _logger.debug("Adding crawl job " + job_id + " to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJob", crawlId: id });
   await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
 }

 export async function addCrawlJobs(id: string, job_ids: string[]) {
+  _logger.debug("Adding crawl jobs to Redis...", { jobIds: job_ids, module: "crawl-redis", method: "addCrawlJobs", crawlId: id });
   await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
 }

 export async function addCrawlJobDone(id: string, job_id: string) {
+  _logger.debug("Adding done crawl job to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJobDone", crawlId: id });
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
   await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
@@ -75,11 +79,14 @@ export async function isCrawlFinishedLocked(id: string) {

 export async function finishCrawl(id: string) {
   if (await isCrawlFinished(id)) {
+    _logger.debug("Marking crawl as finished.", { module: "crawl-redis", method: "finishCrawl", crawlId: id });
     const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
     if (set === 1) {
       await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
     }
     return set === 1
+  } else {
+    _logger.debug("Crawl can not be finished yet, not marking as finished.", { module: "crawl-redis", method: "finishCrawl", crawlId: id });
   }
 }

@@ -132,14 +139,19 @@ export function generateURLPermutations(url: string | URL): URL[] {
 }

 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
+  const logger = _logger.child({ crawlId: id, module: "crawl-redis", method: "lockURL", preNormalizedURL: url, teamId: sc.team_id, plan: sc.plan });
+
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
+      logger.debug("Crawl has already hit visited_unique limit, not locking URL.");
       return false;
     }
   }

   url = normalizeURL(url, sc);
+  logger.defaultMeta.url = url;

+  logger.debug("Locking URL " + JSON.stringify(url) + "...");
   await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
   await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");

@@ -147,20 +159,25 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
   if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
     res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   } else {
-    const permutations = generateURLPermutations(url);
-    const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href)));
+    const permutations = generateURLPermutations(url).map(x => x.href);
+    logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
+    const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations));
     res = x === permutations.length;
   }

   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

+  logger.debug("lockURL final result: " + res, { res });
   return res;
 }

 /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
 export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
   urls = urls.map(url => normalizeURL(url, sc));
+  const logger = _logger.child({ crawlId: id, module: "crawl-redis", method: "lockURL", teamId: sc.team_id, plan: sc.plan });
+
   // Add to visited_unique set
+  logger.debug("Locking " + urls.length + " URLs...");
   await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
   await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");

@@ -170,11 +187,14 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
     res = x === urls.length;
   } else {
     const allPermutations = urls.flatMap(url => generateURLPermutations(url).map(x => x.href));
+    logger.debug("Adding " + allPermutations.length + " URL permutations...");
     const x = await redisConnection.sadd("crawl:" + id + ":visited", ...allPermutations);
     res = x === allPermutations.length;
   }

   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

+  logger.debug("lockURLs final result: " + res, { res });
   return res;
 }

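For context on the lockURL/lockURLs debug lines above: the lock result they report comes from SADD's return value, which is the number of members newly added to the set. A minimal sketch of that semantics, assuming an ioredis connection; the helper name and key shape mirror the diff but are illustrative:

// Illustrative sketch (assumes ioredis); shows why `res` is derived from
// the SADD return value in lockURL above.
import Redis from "ioredis";

async function tryLockUrl(redis: Redis, crawlId: string, url: string): Promise<boolean> {
  // SADD returns how many members were newly added: 1 = first visit, 0 = already seen.
  const added = await redis.sadd("crawl:" + crawlId + ":visited", url);
  return added !== 0;
}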
@@ -5,7 +5,7 @@ import { getLinksFromSitemap } from "./sitemap";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
-import { logger } from "../../../src/lib/logger";
+import { logger as _logger } from "../../../src/lib/logger";
 import https from "https";
 export class WebCrawler {
   private jobId: string;
@@ -25,6 +25,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private logger: typeof _logger;

   constructor({
     jobId,
@@ -71,6 +72,7 @@ export class WebCrawler {
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }

   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -85,7 +87,7 @@ export class WebCrawler {
       try {
         url = new URL(link.trim(), this.baseUrl);
       } catch (error) {
-        logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
+        this.logger.debug(`Error processing link: ${link}`, { link, error, method: "filterLinks" });
         return false;
       }
       const path = url.pathname;
@@ -144,7 +146,7 @@ export class WebCrawler {
       const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
       // Check if the link is disallowed by robots.txt
       if (!isAllowed) {
-        logger.debug(`Link disallowed by robots.txt: ${link}`);
+        this.logger.debug(`Link disallowed by robots.txt: ${link}`, { method: "filterLinks", link });
         return false;
       }

@@ -173,7 +175,7 @@ export class WebCrawler {
   }

   public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
-    logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
+    this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { method: "tryGetSitemap" });
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if(fromMap && onlySitemap) {
       return sitemapLinks.map(link => ({ url: link, html: "" }));
@@ -350,7 +352,7 @@ export class WebCrawler {
       const urlWithoutQuery = url.split('?')[0].toLowerCase();
       return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
     } catch (error) {
-      logger.error(`Error processing URL in isFile: ${error}`);
+      this.logger.error(`Error processing URL in isFile`, { method: "isFile", error });
       return false;
     }
   }
@@ -390,14 +392,14 @@ export class WebCrawler {
     try {
       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
       }
     } catch (error) {
-      logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
+      this.logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}`, { method: "tryFetchSitemapLinks", sitemapUrl, error });
       if (error instanceof AxiosError && error.response?.status === 404) {
         // ignore 404
       } else {
-        const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
+        const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }, this.logger);
         if (response) {
           sitemapLinks = response;
         }
@@ -409,14 +411,14 @@ export class WebCrawler {
     try {
       const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
       if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }, this.logger);
       }
     } catch (error) {
-      logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+      this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", sitemapUrl: baseUrlSitemap, error });
       if (error instanceof AxiosError && error.response?.status === 404) {
         // ignore 404
       } else {
-        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }, this.logger);
       }
     }
   }
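The new `private logger: typeof _logger` field above is a small TypeScript idiom: the member is typed from the exported logger value itself rather than from winston's Logger type. A hedged sketch of the same idiom in isolation; the class, field, and method names here are illustrative, not from the diff:

// Illustrative sketch of the `typeof _logger` field pattern used by WebCrawler above.
// Adjust the relative import path for wherever this file would live.
import { logger as _logger } from "../../../src/lib/logger";

class ExampleCrawler {
  // typed from the value itself, so it stays in sync with lib/logger's export
  private logger: typeof _logger;

  constructor(private crawlId: string) {
    this.logger = _logger.child({ crawlId, module: "ExampleCrawler" });
  }

  fetchSomething(url: string) {
    this.logger.debug("Fetching " + url, { method: "fetchSomething", url });
  }
}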
@@ -2,9 +2,9 @@ import axios from "axios";
 import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 import { WebCrawler } from "./crawler";
-import { logger } from "../../lib/logger";
 import { scrapeURL } from "../scrapeURL";
 import { scrapeOptions } from "../../controllers/v1/types";
+import type { Logger } from "winston";

 export async function getLinksFromSitemap(
   {
@@ -15,7 +15,8 @@ export async function getLinksFromSitemap(
     sitemapUrl: string,
     allUrls?: string[],
     mode?: 'axios' | 'fire-engine'
-  }
+  },
+  logger: Logger,
 ): Promise<string[]> {
   try {
     let content: string = "";
@@ -31,7 +32,7 @@ export async function getLinksFromSitemap(
         content = response.document.rawHtml!;
       }
     } catch (error) {
-      logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
+      logger.error(`Request failed for ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error });

       return allUrls;
     }
@@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
     if (root && root.sitemap) {
       const sitemapPromises = root.sitemap
         .filter(sitemap => sitemap.loc && sitemap.loc.length > 0)
-        .map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }));
+        .map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }, logger));
       await Promise.all(sitemapPromises);
     } else if (root && root.url) {
       const validUrls = root.url
@@ -51,7 +52,7 @@ export async function getLinksFromSitemap(
       allUrls.push(...validUrls);
     }
   } catch (error) {
-    logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
+    logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error });
   }

   return allUrls;
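With the change above, getLinksFromSitemap no longer reaches for the module-level logger; callers thread one in as a second argument, as the crawler call sites earlier in this commit now do. A hedged usage sketch of the new call shape, assuming a winston child logger is in scope; the caller function and sitemap URL are placeholders:

// Illustrative caller (not from the diff): passing a crawl-scoped child logger
// into getLinksFromSitemap's new second parameter.
import { logger as _logger } from "../../lib/logger";
import { getLinksFromSitemap } from "./sitemap";

async function collectSitemapLinks(crawlId: string): Promise<string[]> {
  const logger = _logger.child({ crawlId, module: "example-caller" });
  return getLinksFromSitemap(
    { sitemapUrl: "https://example.com/sitemap.xml" }, // placeholder URL
    logger,
  );
}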