mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-05 20:16:03 +08:00)
feat(v1): crawl/batch scrape errors route
This commit is contained in:
parent dcd3d6d98d · commit e6531278f6
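This commit adds GET /v1/crawl/:jobId/errors and GET /v1/batch/scrape/:jobId/errors, which report per-page failures and robots.txt-blocked URLs for a crawl or batch scrape. A minimal client sketch, assuming the hosted base URL https://api.firecrawl.dev and a standard Bearer API key (the key and jobId below are placeholders):

// Sketch only: base URL, API key, and jobId are placeholders.
const jobId = "00000000-0000-0000-0000-000000000000";
const res = await fetch(`https://api.firecrawl.dev/v1/crawl/${jobId}/errors`, {
  headers: { Authorization: "Bearer fc-YOUR-API-KEY" },
});
const { errors, robotsBlocked } = await res.json();
// errors: [{ id, timestamp?, url, error }], robotsBlocked: string[]

Substituting `/batch/scrape/${jobId}/errors` for the path hits the batch scrape variant registered in the routes hunk below.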
apps/api/src/controllers/v1/crawl-errors.ts (new file, 81 lines added)
@@ -0,0 +1,81 @@
import { Response } from "express";
import {
  CrawlErrorsResponse,
  CrawlStatusParams,
  RequestWithAuth,
} from "./types";
import { getCrawl, getCrawlJobs } from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import { configDotenv } from "dotenv";
import { Job } from "bullmq";
configDotenv();

// Fetches a single job from the scrape queue; resolves to undefined if the
// job no longer exists (e.g. it has already been cleaned up).
export async function getJob(id: string) {
  return await getScrapeQueue().getJob(id);
}

// Fetches many jobs in parallel, dropping any IDs that no longer resolve.
export async function getJobs(ids: string[]) {
  const jobs: (Job & { id: string })[] = (
    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
  ).filter((x) => x) as (Job & { id: string })[];

  return jobs;
}

export async function crawlErrorsController(
  req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
  res: Response<CrawlErrorsResponse>,
) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return res.status(404).json({ success: false, error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return res.status(403).json({ success: false, error: "Forbidden" });
  }

  // Resolve the BullMQ state of every job belonging to this crawl.
  const jobStatuses = await Promise.all(
    (await getCrawlJobs(req.params.jobId)).map(
      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
    ),
  );

  const failedJobIDs: string[] = [];
  for (const [id, status] of jobStatuses) {
    if (status === "failed") {
      failedJobIDs.push(id);
    }
  }

  res.status(200).json({
    // One entry per failed page, with the failure reason BullMQ recorded.
    errors: (await getJobs(failedJobIDs)).map((x) => ({
      id: x.id,
      timestamp:
        x.finishedOn !== undefined
          ? new Date(x.finishedOn).toISOString()
          : undefined,
      url: x.data.url,
      error: x.failedReason,
    })),
    // URLs the crawler skipped because robots.txt disallowed them; the
    // crawler writes these into a Redis set (see the WebCrawler hunk below).
    robotsBlocked: await redisConnection.smembers(
      "crawl:" + req.params.jobId + ":robots_blocked",
    ),
  });
}
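For reference, the body of a 200 response from this controller would look like the following (illustrative values only):

{
  "errors": [
    {
      "id": "a1b2c3d4-0000-0000-0000-000000000000",
      "timestamp": "2025-08-05T12:00:00.000Z",
      "url": "https://example.com/page-3",
      "error": "Request timed out"
    }
  ],
  "robotsBlocked": ["https://example.com/admin"]
}

Note that `timestamp` is omitted when BullMQ recorded no `finishedOn` for the job.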
apps/api/src/controllers/v1/types.ts
@@ -568,6 +568,19 @@ export type CrawlStatusResponse =
      data: Document[];
    };

export type CrawlErrorsResponse =
  | ErrorResponse
  | {
      errors: {
        id: string;
        timestamp?: string;
        url: string;
        error: string;
      }[];
      robotsBlocked: string[];
    };

type AuthObject = {
  team_id: string;
  plan: PlanType | undefined;
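The success arm of this union has no `success` discriminant, so consumers narrow it structurally, e.g. with an `in` check. A minimal sketch (the handler function is hypothetical):

function handleErrorsResponse(body: CrawlErrorsResponse) {
  if ("errors" in body) {
    // Success arm: per-page failures plus robots-blocked URLs.
    for (const e of body.errors) console.log(e.url, "->", e.error);
    console.log(body.robotsBlocked.length, "URLs blocked by robots.txt");
  } else {
    // ErrorResponse arm, e.g. the 404/403 bodies the controller returns.
    console.error(body.error);
  }
}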
apps/api/src/routes/v1.ts
@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";

function checkCreditsMiddleware(
  minimum?: number,
@@ -192,6 +193,18 @@ v1Router.get(
  wrap((req: any, res): any => crawlStatusController(req, res, true)),
);

// Crawl and batch scrape errors are served by the same controller; both
// routes reuse the CrawlStatus rate limiter.
v1Router.get(
  "/crawl/:jobId/errors",
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(crawlErrorsController),
);

v1Router.get(
  "/batch/scrape/:jobId/errors",
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(crawlErrorsController),
);

v1Router.get(
  "/scrape/:jobId",
  authMiddleware(RateLimiterMode.CrawlStatus),
apps/api/src/scraper/WebScraper/crawler.ts
@@ -299,6 +299,16 @@ export class WebCrawler {
      this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
    ) {
      return fullUrl;
    } else if (
      this.isInternalLink(fullUrl) &&
      this.noSections(fullUrl) &&
      !this.matchesExcludes(path) &&
      !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
    ) {
      // Internal link that only failed the robots.txt check: record it in a
      // per-crawl Redis set so the errors route can report it. Fire-and-forget
      // so link filtering never blocks on Redis.
      (async () => {
        await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
        await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
      })();
    }
  } else {
    // EXTERNAL LINKS
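The EXPIRE ... NX call applies the 24-hour TTL only when the key has no TTL yet, so later additions do not keep extending the set's lifetime (the NX flag requires Redis >= 7.0). Condensed as a helper, the pattern looks like this (the helper name is illustrative; the import path matches the controller above):

import { redisConnection } from "../../services/queue-service";

// Record a robots-blocked URL for a crawl; the set expires 24h after the
// first URL is added (NX: set a TTL only if the key has none).
async function recordRobotsBlocked(jobId: string, url: string) {
  const key = "crawl:" + jobId + ":robots_blocked";
  await redisConnection.sadd(key, url);
  await redisConnection.expire(key, 24 * 60 * 60, "NX");
}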