From e6531278f65494de8ad9afeeecafeb6d2553fd7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Fri, 17 Jan 2025 17:12:04 +0100
Subject: [PATCH] feat(v1): crawl/batch scrape errors route

---
 apps/api/src/controllers/v1/crawl-errors.ts | 81 +++++++++++++++++++++
 apps/api/src/controllers/v1/types.ts        | 13 ++++
 apps/api/src/routes/v1.ts                   | 13 ++++
 apps/api/src/scraper/WebScraper/crawler.ts  | 10 +++
 4 files changed, 117 insertions(+)
 create mode 100644 apps/api/src/controllers/v1/crawl-errors.ts

diff --git a/apps/api/src/controllers/v1/crawl-errors.ts b/apps/api/src/controllers/v1/crawl-errors.ts
new file mode 100644
index 00000000..b64d02fa
--- /dev/null
+++ b/apps/api/src/controllers/v1/crawl-errors.ts
@@ -0,0 +1,81 @@
+import { Response } from "express";
+import {
+  CrawlErrorsResponse,
+  CrawlStatusParams,
+  CrawlStatusResponse,
+  ErrorResponse,
+  RequestWithAuth,
+} from "./types";
+import {
+  getCrawl,
+  getCrawlExpiry,
+  getCrawlJobs,
+  getDoneJobsOrdered,
+  getDoneJobsOrderedLength,
+  getThrottledJobs,
+  isCrawlFinished,
+} from "../../lib/crawl-redis";
+import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import {
+  supabaseGetJobById,
+  supabaseGetJobsById,
+} from "../../lib/supabase-jobs";
+import { configDotenv } from "dotenv";
+import { Job, JobState } from "bullmq";
+import { logger } from "../../lib/logger";
+configDotenv();
+
+export async function getJob(id: string) {
+  const job = await getScrapeQueue().getJob(id);
+  if (!job) return job;
+
+  return job;
+}
+
+export async function getJobs(ids: string[]) {
+  const jobs: (Job & { id: string })[] = (
+    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
+  ).filter((x) => x) as (Job & { id: string })[];
+
+  return jobs;
+}
+
+export async function crawlErrorsController(
+  req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
+  res: Response<CrawlErrorsResponse>,
+) {
+  const sc = await getCrawl(req.params.jobId);
+  if (!sc) {
+    return res.status(404).json({ success: false, error: "Job not found" });
+  }
+
+  if (sc.team_id !== req.auth.team_id) {
+    return res.status(403).json({ success: false, error: "Forbidden" });
+  }
+
+  let jobStatuses = await Promise.all(
+    (await getCrawlJobs(req.params.jobId)).map(
+      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
+    ),
+  );
+
+  const failedJobIDs: string[] = [];
+
+  for (const [id, status] of jobStatuses) {
+    if (
+      status === "failed"
+    ) {
+      failedJobIDs.push(id);
+    }
+  }
+
+  res.status(200).json({
+    errors: (await getJobs(failedJobIDs)).map(x => ({
+      id: x.id,
+      timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
+      url: x.data.url,
+      error: x.failedReason,
+    })),
+    robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
+  });
+}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index ac3743e8..ee141625 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -568,6 +568,19 @@ export type CrawlStatusResponse =
       data: Document[];
     };
 
+
+export type CrawlErrorsResponse =
+  | ErrorResponse
+  | {
+      errors: {
+        id: string,
+        timestamp?: string,
+        url: string,
+        error: string,
+      }[];
+      robotsBlocked: string[];
+    };
+
 type AuthObject = {
   team_id: string;
   plan: PlanType | undefined;
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index a916dd40..4aacfe18 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
+import { crawlErrorsController } from "../controllers/v1/crawl-errors";
 
 function checkCreditsMiddleware(
   minimum?: number,
@@ -192,6 +193,18 @@ v1Router.get(
   wrap((req: any, res): any => crawlStatusController(req, res, true)),
 );
 
+v1Router.get(
+  "/crawl/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
+v1Router.get(
+  "/batch/scrape/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
 v1Router.get(
   "/scrape/:jobId",
   authMiddleware(RateLimiterMode.CrawlStatus),
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index a6645152..7d4be97b 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -299,6 +299,16 @@ export class WebCrawler {
         this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
+      } else if (
+        this.isInternalLink(fullUrl) &&
+        this.noSections(fullUrl) &&
+        !this.matchesExcludes(path) &&
+        !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
+      ) {
+        (async() => {
+          await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
+          await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
+        })();
       }
     } else {
       // EXTERNAL LINKS
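
Usage sketch (not applied by git am; everything after the diff body is ignored):
once this lands, a client can fetch failed-page details for a crawl or batch
scrape from the new GET routes. The sketch below assumes the v1 router is
mounted under /v1 with bearer-token auth, as with the other v1 routes; the
base URL, API key, and job ID are placeholders, and the success shape mirrors
the CrawlErrorsResponse type added in types.ts.

    // Minimal TypeScript client sketch for the new errors routes.
    // Assumptions (not part of the patch): /v1 mount point, bearer auth,
    // and a runtime with global fetch (Node 18+ or a browser).
    type CrawlErrors = {
      errors: {
        id: string;          // BullMQ job id of the failed scrape job
        timestamp?: string;  // ISO 8601, derived from the job's finishedOn
        url: string;         // page that failed to scrape
        error: string;       // the job's failedReason
      }[];
      robotsBlocked: string[]; // URLs skipped because robots.txt disallowed them
    };

    async function getCrawlErrors(
      baseUrl: string, // e.g. "https://api.example.com" (placeholder)
      apiKey: string,  // placeholder credential
      jobId: string,   // crawl job id; batch scrape jobs use
                       // `${baseUrl}/v1/batch/scrape/${jobId}/errors` instead
    ): Promise<CrawlErrors> {
      const res = await fetch(`${baseUrl}/v1/crawl/${jobId}/errors`, {
        headers: { Authorization: `Bearer ${apiKey}` },
      });
      // Non-2xx responses carry the ErrorResponse variant ({ success: false, error }).
      if (!res.ok) throw new Error(`errors route returned HTTP ${res.status}`);
      return (await res.json()) as CrawlErrors;
    }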
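Design note on the crawler.ts hunk: the Redis writes run inside a
fire-and-forget async IIFE, so recording a robots-blocked URL never blocks
filterURL's synchronous return path. The "NX" flag on expire (an EXPIRE
option available since Redis 7.0) applies the 24-hour TTL only when the key
has no TTL yet, so repeated sadd calls during a long crawl don't keep
extending the set's lifetime. The hunk doesn't add an import for
redisConnection, so it presumably is already in scope in crawler.ts.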