Update crawl.ts

Nicolas 2024-08-19 11:02:24 -03:00
parent 36b35dbc67
commit 8e4ca86463


@@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
-import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import {
+  defaultCrawlPageOptions,
+  defaultCrawlerOptions,
+  defaultOrigin,
+} from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
@@ -38,10 +50,12 @@ export async function crawlController(req: Request, res: Response) {
       }
     }
 
-    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+    const crawlerOptions = {
+      ...defaultCrawlerOptions,
+      ...req.body.crawlerOptions,
+    };
     const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
     const limitCheck = crawlerOptions?.limit ?? 1;
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
       await checkTeamCredits(team_id, limitCheck);
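Note on the hunk above: it only reflows the crawler-options merge; behavior is unchanged. A minimal sketch of how the spread merge resolves overlapping keys follows (the default values shown are illustrative assumptions, not the actual contents of default-values.ts):

// Illustrative sketch: these defaults are assumptions, not the real
// definitions in ../../src/lib/default-values.
const defaultCrawlerOptions = { limit: 10000, ignoreSitemap: false };
const requestCrawlerOptions = { limit: 100 }; // stands in for req.body.crawlerOptions

// Later spreads overwrite earlier ones key by key, so request-supplied
// fields take precedence over the defaults.
const crawlerOptions = {
  ...defaultCrawlerOptions,
  ...requestCrawlerOptions,
};
// crawlerOptions => { limit: 100, ignoreSitemap: false }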
@@ -63,9 +77,7 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     if (isUrlBlocked(url)) {
-      return res
-        .status(403)
-        .json({
-          error:
-            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-        });
+      return res.status(403).json({
+        error:
+          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+      });
@@ -73,7 +85,6 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
     //   try {
     //     const a = new WebScraperDataProvider();
@@ -123,10 +134,12 @@ export async function crawlController(req: Request, res: Response) {
     await saveCrawl(id, sc);
 
-    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+    const sitemap = sc.crawlerOptions?.ignoreSitemap
+      ? null
+      : await crawler.tryGetSitemap();
 
     if (sitemap !== null) {
-      const jobs = sitemap.map(x => {
+      const jobs = sitemap.map((x) => {
         const url = x.url;
         const uuid = uuidv4();
         return {
@@ -144,16 +157,23 @@ export async function crawlController(req: Request, res: Response) {
           opts: {
             jobId: uuid,
             priority: 20,
-          }
+          },
         };
-      })
+      });
 
-      await lockURLs(id, jobs.map(x => x.data.url));
-      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await lockURLs(
+        id,
+        jobs.map((x) => x.data.url)
+      );
+      await addCrawlJobs(
+        id,
+        jobs.map((x) => x.opts.jobId)
+      );
       await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
-      const job = await addScrapeJob({
-        url,
-        mode: "single_urls",
-        crawlerOptions: crawlerOptions,
+      const job = await addScrapeJob(
+        {
+          url,
+          mode: "single_urls",
+          crawlerOptions: crawlerOptions,
@@ -161,9 +181,11 @@ export async function crawlController(req: Request, res: Response) {
-        pageOptions: pageOptions,
-        origin: req.body.origin ?? defaultOrigin,
-        crawl_id: id,
-      }, {
-        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
-      });
+          pageOptions: pageOptions,
+          origin: req.body.origin ?? defaultOrigin,
+          crawl_id: id,
+        },
+        {
+          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
+        }
+      );
       await addCrawlJob(id, job.id);
     }
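For context on the two enqueue paths touched in the last hunks: sitemap-discovered URLs are locked and added in bulk at priority 20, while the single-URL fallback enqueues one job at priority 15. Assuming the queue behind getScrapeQueue() is a Bull/BullMQ-style queue (this diff does not show its implementation), a lower priority number is served first, so the user's original request URL is picked up before the sitemap fan-out. A hedged sketch of that ordering; the queue name and payload fields are illustrative assumptions, and only the 15 vs. 20 priorities come from the controller above:

// Hedged sketch only: queue name and payload shapes are assumptions;
// the 15 vs. 20 priorities are the values used in crawl.ts above.
import { Queue } from "bullmq";

const scrapeQueue = new Queue("scrapeQueue");

// Initial crawl request URL: priority 15, served ahead of sitemap jobs.
await scrapeQueue.add(
  "scrape",
  { url: "https://example.com", mode: "single_urls" },
  { priority: 15 }
);

// Sitemap fan-out: one bulk call, each job at the lower priority 20.
await scrapeQueue.addBulk([
  { name: "scrape", data: { url: "https://example.com/a" }, opts: { priority: 20 } },
  { name: "scrape", data: { url: "https://example.com/b" }, opts: { priority: 20 } },
]);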