import { Response } from "express";
|
|
import { v4 as uuidv4 } from "uuid";
|
|
import {
|
|
CrawlRequest,
|
|
crawlRequestSchema,
|
|
CrawlResponse,
|
|
legacyCrawlerOptions,
|
|
legacyScrapeOptions,
|
|
RequestWithAuth,
|
|
} from "./types";
|
|
import {
|
|
addCrawlJob,
|
|
addCrawlJobs,
|
|
crawlToCrawler,
|
|
lockURL,
|
|
lockURLs,
|
|
saveCrawl,
|
|
StoredCrawl,
|
|
} from "../../lib/crawl-redis";
|
|
import { logCrawl } from "../../services/logging/crawl_log";
|
|
import { getScrapeQueue } from "../../services/queue-service";
|
|
import { addScrapeJob } from "../../services/queue-jobs";
|
|
import { Logger } from "../../lib/logger";
|
|
|
|
export async function crawlController(
|
|
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
|
res: Response<CrawlResponse>
|
|
) {
|
|
req.body = crawlRequestSchema.parse(req.body);
|
|
|
|
const id = uuidv4();
|
|
|
|
await logCrawl(id, req.auth.team_id);
|
|
|
|
const { remainingCredits } = req.account;
|
|
|
|
// TODO: Get rid of crawlerOptions
|
|
const crawlerOptions = legacyCrawlerOptions(req.body);
|
|
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
|
|
|
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
|
|
|
const sc: StoredCrawl = {
|
|
originUrl: req.body.url,
|
|
crawlerOptions,
|
|
pageOptions,
|
|
team_id: req.auth.team_id,
|
|
createdAt: Date.now(),
|
|
};
|
|
|
|
const crawler = crawlToCrawler(id, sc);
|
|
|
|
try {
|
|
sc.robots = await crawler.getRobotsTxt();
|
|
} catch (e) {
|
|
Logger.debug(
|
|
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
|
e
|
|
)}`
|
|
);
|
|
}
|
|
|
|
await saveCrawl(id, sc);
|
|
|
|
const sitemap = sc.crawlerOptions.ignoreSitemap
|
|
? null
|
|
: await crawler.tryGetSitemap();
|
|
|
|
if (sitemap !== null) {
|
|
const jobs = sitemap.map((x) => {
|
|
const url = x.url;
|
|
const uuid = uuidv4();
|
|
return {
|
|
name: uuid,
|
|
data: {
|
|
url,
|
|
mode: "single_urls",
|
|
team_id: req.auth.team_id,
|
|
crawlerOptions,
|
|
pageOptions,
|
|
origin: "api",
|
|
crawl_id: id,
|
|
sitemapped: true,
|
|
v1: true,
|
|
},
|
|
opts: {
|
|
jobId: uuid,
|
|
priority: 20,
|
|
},
|
|
};
|
|
});
|
|
|
|
await lockURLs(
|
|
id,
|
|
jobs.map((x) => x.data.url)
|
|
);
|
|
await addCrawlJobs(
|
|
id,
|
|
jobs.map((x) => x.opts.jobId)
|
|
);
|
|
await getScrapeQueue().addBulk(jobs);
|
|
} else {
|
|
await lockURL(id, sc, req.body.url);
|
|
const job = await addScrapeJob(
|
|
{
|
|
url: req.body.url,
|
|
mode: "single_urls",
|
|
crawlerOptions: crawlerOptions,
|
|
team_id: req.auth.team_id,
|
|
pageOptions: pageOptions,
|
|
origin: "api",
|
|
crawl_id: id,
|
|
webhook: req.body.webhook,
|
|
v1: true,
|
|
},
|
|
{
|
|
priority: 15,
|
|
}
|
|
);
|
|
await addCrawlJob(id, job.id);
|
|
}
|
|
|
|
return res.status(200).json({
|
|
success: true,
|
|
id,
|
|
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
|
});
|
|
}
|
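/*
 * Usage sketch (not part of this file; the middleware and wrapper names below
 * are assumptions for illustration): a controller like this is typically
 * mounted on the v1 Express router with authentication applied first, e.g.
 *
 *   import express from "express";
 *   const v1Router = express.Router();
 *   v1Router.post("/crawl", authMiddleware, wrap(crawlController));
 *
 * where the auth middleware is expected to populate `req.auth` and
 * `req.account` before the controller runs, and `wrap` forwards rejected
 * promises to the Express error handler so the async controller can throw
 * safely.
 */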