mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 20:16:03 +08:00
feat(v1): crawl/batch scrape errors route
This commit is contained in:
parent
dcd3d6d98d
commit
e6531278f6
81
apps/api/src/controllers/v1/crawl-errors.ts
Normal file
81
apps/api/src/controllers/v1/crawl-errors.ts
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
import { Response } from "express";
|
||||||
|
import {
|
||||||
|
CrawlErrorsResponse,
|
||||||
|
CrawlStatusParams,
|
||||||
|
CrawlStatusResponse,
|
||||||
|
ErrorResponse,
|
||||||
|
RequestWithAuth,
|
||||||
|
} from "./types";
|
||||||
|
import {
|
||||||
|
getCrawl,
|
||||||
|
getCrawlExpiry,
|
||||||
|
getCrawlJobs,
|
||||||
|
getDoneJobsOrdered,
|
||||||
|
getDoneJobsOrderedLength,
|
||||||
|
getThrottledJobs,
|
||||||
|
isCrawlFinished,
|
||||||
|
} from "../../lib/crawl-redis";
|
||||||
|
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
|
||||||
|
import {
|
||||||
|
supabaseGetJobById,
|
||||||
|
supabaseGetJobsById,
|
||||||
|
} from "../../lib/supabase-jobs";
|
||||||
|
import { configDotenv } from "dotenv";
|
||||||
|
import { Job, JobState } from "bullmq";
|
||||||
|
import { logger } from "../../lib/logger";
|
||||||
|
configDotenv();
|
||||||
|
|
||||||
|
export async function getJob(id: string) {
|
||||||
|
const job = await getScrapeQueue().getJob(id);
|
||||||
|
if (!job) return job;
|
||||||
|
|
||||||
|
return job;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getJobs(ids: string[]) {
|
||||||
|
const jobs: (Job & { id: string })[] = (
|
||||||
|
await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
|
||||||
|
).filter((x) => x) as (Job & { id: string })[];
|
||||||
|
|
||||||
|
return jobs;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function crawlErrorsController(
|
||||||
|
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
|
||||||
|
res: Response<CrawlErrorsResponse>,
|
||||||
|
) {
|
||||||
|
const sc = await getCrawl(req.params.jobId);
|
||||||
|
if (!sc) {
|
||||||
|
return res.status(404).json({ success: false, error: "Job not found" });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sc.team_id !== req.auth.team_id) {
|
||||||
|
return res.status(403).json({ success: false, error: "Forbidden" });
|
||||||
|
}
|
||||||
|
|
||||||
|
let jobStatuses = await Promise.all(
|
||||||
|
(await getCrawlJobs(req.params.jobId)).map(
|
||||||
|
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
const failedJobIDs: string[] = [];
|
||||||
|
|
||||||
|
for (const [id, status] of jobStatuses) {
|
||||||
|
if (
|
||||||
|
status === "failed"
|
||||||
|
) {
|
||||||
|
failedJobIDs.push(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res.status(200).json({
|
||||||
|
errors: (await getJobs(failedJobIDs)).map(x => ({
|
||||||
|
id: x.id,
|
||||||
|
timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
|
||||||
|
url: x.data.url,
|
||||||
|
error: x.failedReason,
|
||||||
|
})),
|
||||||
|
robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
|
||||||
|
});
|
||||||
|
}
|
@ -568,6 +568,19 @@ export type CrawlStatusResponse =
|
|||||||
data: Document[];
|
data: Document[];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Response body for GET /v1/crawl/:jobId/errors and
 * GET /v1/batch/scrape/:jobId/errors.
 *
 * Either a standard ErrorResponse, or the error report for the crawl.
 */
export type CrawlErrorsResponse =
  | ErrorResponse
  | {
      // One entry per failed scrape job in the crawl.
      errors: {
        id: string,
        // ISO-8601 completion time; absent when the job never recorded one.
        timestamp?: string,
        url: string,
        error: string,
      }[];
      // URLs skipped because robots.txt disallowed them.
      robotsBlocked: string[];
    };
|
||||||
|
|
||||||
type AuthObject = {
|
type AuthObject = {
|
||||||
team_id: string;
|
team_id: string;
|
||||||
plan: PlanType | undefined;
|
plan: PlanType | undefined;
|
||||||
|
@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
|
|||||||
import { creditUsageController } from "../controllers/v1/credit-usage";
|
import { creditUsageController } from "../controllers/v1/credit-usage";
|
||||||
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||||
import { searchController } from "../controllers/v1/search";
|
import { searchController } from "../controllers/v1/search";
|
||||||
|
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
|
||||||
|
|
||||||
function checkCreditsMiddleware(
|
function checkCreditsMiddleware(
|
||||||
minimum?: number,
|
minimum?: number,
|
||||||
@ -192,6 +193,18 @@ v1Router.get(
|
|||||||
wrap((req: any, res): any => crawlStatusController(req, res, true)),
|
wrap((req: any, res): any => crawlStatusController(req, res, true)),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Crawl error report: failed jobs + robots.txt-blocked URLs.
v1Router.get(
  "/crawl/:jobId/errors",
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(crawlErrorsController),
);

// Batch scrape reuses the same error controller as crawl.
v1Router.get(
  "/batch/scrape/:jobId/errors",
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(crawlErrorsController),
);
|
||||||
|
|
||||||
v1Router.get(
|
v1Router.get(
|
||||||
"/scrape/:jobId",
|
"/scrape/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
|
@ -299,6 +299,16 @@ export class WebCrawler {
|
|||||||
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
|
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
|
||||||
) {
|
) {
|
||||||
return fullUrl;
|
return fullUrl;
|
||||||
|
} else if (
|
||||||
|
this.isInternalLink(fullUrl) &&
|
||||||
|
this.noSections(fullUrl) &&
|
||||||
|
!this.matchesExcludes(path) &&
|
||||||
|
!this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
|
||||||
|
) {
|
||||||
|
(async() => {
|
||||||
|
await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
|
||||||
|
await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
|
||||||
|
})();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// EXTERNAL LINKS
|
// EXTERNAL LINKS
|
||||||
|
Loading…
x
Reference in New Issue
Block a user