feat(v1): crawl/batch scrape errors route

Gergő Móricz 2025-01-17 17:12:04 +01:00
parent dcd3d6d98d
commit e6531278f6
4 changed files with 117 additions and 0 deletions
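For reference, a minimal sketch of calling the new endpoint. The route paths come from this commit; the base URL and Bearer-token header are assumptions about a typical deployment, not part of the diff:

// Sketch: list the failures of a crawl (or batch scrape) job.
const res = await fetch(
  "http://localhost:3002/v1/crawl/<jobId>/errors", // or /v1/batch/scrape/<jobId>/errors
  { headers: { Authorization: "Bearer <api-key>" } },
);
const body = await res.json();
// On success: { errors: [{ id, timestamp?, url, error }], robotsBlocked: string[] }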

View File

@@ -0,0 +1,81 @@
import { Response } from "express";
import {
  CrawlErrorsResponse,
  CrawlStatusParams,
  RequestWithAuth,
} from "./types";
import { getCrawl, getCrawlJobs } from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import { configDotenv } from "dotenv";
import { Job } from "bullmq";
configDotenv();

// Fetch a single scrape job from the queue; resolves to undefined
// if the job no longer exists.
export async function getJob(id: string) {
  return getScrapeQueue().getJob(id);
}

// Resolve job IDs to their live BullMQ jobs, dropping any that have
// already been removed from the queue.
export async function getJobs(ids: string[]) {
  const jobs: (Job & { id: string })[] = (
    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
  ).filter((x) => x) as (Job & { id: string })[];
  return jobs;
}
export async function crawlErrorsController(
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
res: Response<CrawlErrorsResponse>,
) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
}
if (sc.team_id !== req.auth.team_id) {
return res.status(403).json({ success: false, error: "Forbidden" });
}
  // Pair every job in this crawl with its current queue state.
  const jobStatuses = await Promise.all(
    (await getCrawlJobs(req.params.jobId)).map(
      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
    ),
  );

  const failedJobIDs: string[] = [];
  for (const [id, status] of jobStatuses) {
    if (status === "failed") {
      failedJobIDs.push(id);
    }
  }
  res.status(200).json({
    errors: (await getJobs(failedJobIDs)).map((x) => ({
      id: x.id,
      timestamp:
        x.finishedOn !== undefined
          ? new Date(x.finishedOn).toISOString()
          : undefined,
      url: x.data.url,
      error: x.failedReason,
    })),
    robotsBlocked: await redisConnection.smembers(
      "crawl:" + req.params.jobId + ":robots_blocked",
    ),
  });
}
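
For context, the "failed" state and failedReason that this controller reads are standard BullMQ behavior: when a processor throws, the job moves to the failed state and the error message is stored on the job. A minimal sketch (queue name and error text are illustrative, not from this commit):

import { Queue, Worker } from "bullmq";

const connection = { host: "localhost", port: 6379 };
new Queue("scrape", { connection });
new Worker(
  "scrape",
  async () => {
    // Throwing moves the job to the "failed" state; the message
    // becomes job.failedReason, which the controller returns as `error`.
    throw new Error("net::ERR_TIMED_OUT");
  },
  { connection },
);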

View File

@@ -568,6 +568,19 @@ export type CrawlStatusResponse =
data: Document[];
};
export type CrawlErrorsResponse =
  | ErrorResponse
  | {
      errors: {
        id: string;
        timestamp?: string;
        url: string;
        error: string;
      }[];
      robotsBlocked: string[];
    };
type AuthObject = {
team_id: string;
plan: PlanType | undefined;
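
For illustration, a value that satisfies the success branch of CrawlErrorsResponse (all data below is made up):

const example: CrawlErrorsResponse = {
  errors: [
    {
      id: "3f1c9d2e",                        // failed scrape job id
      timestamp: "2025-01-17T16:12:04.000Z", // job.finishedOn as ISO string, if set
      url: "https://example.com/page",
      error: "Request timed out",            // BullMQ failedReason
    },
  ],
  robotsBlocked: ["https://example.com/private"],
};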

View File

@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
function checkCreditsMiddleware(
minimum?: number,
@@ -192,6 +193,18 @@
wrap((req: any, res): any => crawlStatusController(req, res, true)),
);
v1Router.get(
"/crawl/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/batch/scrape/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
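
Both new routes reuse the CrawlStatus rate-limiter mode and share crawlErrorsController, so /crawl/:jobId/errors and /batch/scrape/:jobId/errors behave identically. wrap itself is not shown in this diff; a typical shape for such a helper (a hypothetical sketch, not this repo's exact code) forwards async rejections to Express error handling:

import { NextFunction, Request, Response } from "express";

// Hypothetical: without this, a rejected controller promise would
// leave the request hanging instead of reaching error middleware.
const wrapSketch =
  (controller: (req: Request, res: Response) => Promise<unknown>) =>
  (req: Request, res: Response, next: NextFunction) => {
    controller(req, res).catch(next);
  };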

View File

@@ -299,6 +299,16 @@ export class WebCrawler {
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
return fullUrl;
      } else if (
        this.isInternalLink(fullUrl) &&
        this.noSections(fullUrl) &&
        !this.matchesExcludes(path) &&
        !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
      ) {
        // Same filters as the branch above, but robots.txt disallows the
        // URL: record it (fire-and-forget) for /crawl/:jobId/errors.
        (async () => {
          await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
          await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
        })().catch(() => {}); // losing an entry beats an unhandled rejection
      }
} else {
// EXTERNAL LINKS
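
A note on the NX flag used above: since Redis 7.0, EXPIRE key ttl NX sets the TTL only if the key has none yet, so the robots_blocked set expires 24 hours after the first blocked URL instead of having its clock reset by every later sadd. A standalone illustration with ioredis (the key follows the same pattern the crawler uses):

import Redis from "ioredis";

const redis = new Redis(); // assumes Redis >= 7.0 on localhost:6379

async function recordBlocked(jobId: string, url: string) {
  const key = "crawl:" + jobId + ":robots_blocked";
  await redis.sadd(key, url);
  // NX: only set a TTL if none exists, so the 24h window
  // starts at the first blocked URL for this crawl.
  await redis.expire(key, 24 * 60 * 60, "NX");
}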