From 36b35dbc67ee82ebe3e2fb00a841c3ae1ed28360 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 19 Aug 2024 11:01:26 -0300
Subject: [PATCH 1/2] Update crawl.ts

---
 apps/api/src/controllers/crawl.ts | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 54eb1f40..a80ac10f 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -38,8 +38,14 @@ export async function crawlController(req: Request, res: Response) {
     }
   }
 
+  const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+  const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
+
+
+  const limitCheck = crawlerOptions?.limit ?? 1;
   const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-    await checkTeamCredits(team_id, 1);
+    await checkTeamCredits(team_id, limitCheck);
+
   if (!creditsCheckSuccess) {
     return res.status(402).json({ error: "Insufficient credits" });
   }
@@ -67,8 +73,6 @@ export async function crawlController(req: Request, res: Response) {
 
   const mode = req.body.mode ?? "crawl";
 
-  const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-  const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
   // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
   //   try {
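The change in PATCH 1/2 is behavioral: the options merge moves above the billing call so that checkTeamCredits can reserve the crawl's full page limit up front instead of a flat single credit. A minimal sketch of the resulting gate, assuming a numeric credits-per-page model; checkTeamCreditsStub is a hypothetical stand-in for the real billing call, whose signature is only inferred from the call site in the diff:

    // TypeScript sketch; checkTeamCreditsStub stands in for the real billing call.
    type CreditCheck = { success: boolean; message?: string };

    async function checkTeamCreditsStub(
      team_id: string,
      credits: number
    ): Promise<CreditCheck> {
      const remaining = 50; // hypothetical balance, for illustration only
      return credits <= remaining
        ? { success: true }
        : { success: false, message: "Insufficient credits" };
    }

    async function creditGate(
      team_id: string,
      crawlerOptions: { limit?: number }
    ): Promise<void> {
      // Before this patch: checkTeamCredits(team_id, 1), so a team with one
      // credit could start an arbitrarily large crawl. After: the requested
      // limit is checked in full before any job is enqueued.
      const limitCheck = crawlerOptions?.limit ?? 1;
      const { success } = await checkTeamCreditsStub(team_id, limitCheck);
      if (!success) {
        throw new Error("Insufficient credits"); // the controller maps this to HTTP 402
      }
    }

The ?? 1 fallback only matters if neither req.body.crawlerOptions nor defaultCrawlerOptions supplies a limit; in that case the check degrades to the old one-credit behavior.
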
From 8e4ca8646347c7a44e2cf04479bab564db013289 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 19 Aug 2024 11:02:24 -0300
Subject: [PATCH 2/2] Update crawl.ts

---
 apps/api/src/controllers/crawl.ts | 78 ++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 28 deletions(-)

diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index a80ac10f..bf402d38 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -7,10 +7,22 @@
 import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
-import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import {
+  defaultCrawlPageOptions,
+  defaultCrawlerOptions,
+  defaultOrigin,
+} from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
@@ -38,10 +50,12 @@ export async function crawlController(req: Request, res: Response) {
     }
   }
 
-  const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+  const crawlerOptions = {
+    ...defaultCrawlerOptions,
+    ...req.body.crawlerOptions,
+  };
   const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
-
   const limitCheck = crawlerOptions?.limit ?? 1;
   const { success: creditsCheckSuccess, message: creditsCheckMessage } =
     await checkTeamCredits(team_id, limitCheck);
@@ -63,17 +77,14 @@ export async function crawlController(req: Request, res: Response) {
   }
 
   if (isUrlBlocked(url)) {
-    return res
-      .status(403)
-      .json({
-        error:
-          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-      });
+    return res.status(403).json({
+      error:
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+    });
   }
 
   const mode = req.body.mode ?? "crawl";
 
-
   // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
   //   try {
   //     const a = new WebScraperDataProvider();
@@ -123,10 +134,12 @@ export async function crawlController(req: Request, res: Response) {
 
   await saveCrawl(id, sc);
 
-  const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+  const sitemap = sc.crawlerOptions?.ignoreSitemap
+    ? null
+    : await crawler.tryGetSitemap();
 
   if (sitemap !== null) {
-    const jobs = sitemap.map(x => {
+    const jobs = sitemap.map((x) => {
       const url = x.url;
       const uuid = uuidv4();
       return {
@@ -144,26 +157,35 @@ export async function crawlController(req: Request, res: Response) {
         opts: {
           jobId: uuid,
           priority: 20,
-        }
+        },
       };
-    })
+    });
 
-    await lockURLs(id, jobs.map(x => x.data.url));
-    await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+    await lockURLs(
+      id,
+      jobs.map((x) => x.data.url)
+    );
+    await addCrawlJobs(
+      id,
+      jobs.map((x) => x.opts.jobId)
+    );
     await getScrapeQueue().addBulk(jobs);
   } else {
     await lockURL(id, sc, url);
-    const job = await addScrapeJob({
-      url,
-      mode: "single_urls",
-      crawlerOptions: crawlerOptions,
-      team_id: team_id,
-      pageOptions: pageOptions,
-      origin: req.body.origin ?? defaultOrigin,
-      crawl_id: id,
-    }, {
-      priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
-    });
+    const job = await addScrapeJob(
+      {
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+        origin: req.body.origin ?? defaultOrigin,
+        crawl_id: id,
+      },
+      {
+        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
+      }
+    );
     await addCrawlJob(id, job.id);
   }
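
PATCH 2/2 is almost entirely Prettier-style reformatting (multi-line imports, wrapped call arguments, (x) => callbacks); it introduces no new behavior beyond carrying the limit-based credit check forward. For readers skimming the re-wrapped sitemap branch, the shape it builds is sketched below; the job fields are inferred from the visible call sites (jobs.map((x) => x.data.url), jobs.map((x) => x.opts.jobId)), a Bull-style queue is assumed for addBulk, and the hunk boundaries elide the rest of the data object, so treat names outside the diff as illustrative:

    import { v4 as uuidv4 } from "uuid";

    // Job shape inferred from the call sites in the diff; the full data object
    // (mode, team_id, pageOptions, ...) is elided by the hunk boundaries.
    type SitemapJob = {
      data: { url: string; crawl_id: string };
      opts: { jobId: string; priority: number };
    };

    function buildSitemapJobs(
      sitemap: { url: string }[],
      crawl_id: string
    ): SitemapJob[] {
      return sitemap.map((x) => {
        // The uuid doubles as the queue jobId, so addCrawlJobs(id, jobIds) can
        // later track completion of every page in the crawl.
        const uuid = uuidv4();
        return {
          data: { url: x.url, crawl_id },
          opts: { jobId: uuid, priority: 20 },
        };
      });
    }

The priorities encode the scheduling policy: sitemap-discovered pages enqueue at 20, while the single seed URL in the else branch uses 15 to match plain scrape jobs (per the inline comment; in Bull-style queues a lower number means higher priority). lockURLs and addCrawlJobs both run before addBulk, so the crawl's Redis state knows every URL and job id before any worker can pick a job up.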