diff --git a/apps/api/src/controllers/v1/bulk-scrape.ts b/apps/api/src/controllers/v1/bulk-scrape.ts
new file mode 100644
index 00000000..3e1afbd0
--- /dev/null
+++ b/apps/api/src/controllers/v1/bulk-scrape.ts
@@ -0,0 +1,99 @@
+import { Response } from "express";
+import { v4 as uuidv4 } from "uuid";
+import {
+  BulkScrapeRequest,
+  bulkScrapeRequestSchema,
+  CrawlResponse,
+  legacyScrapeOptions,
+  RequestWithAuth,
+} from "./types";
+import {
+  addCrawlJobs,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../lib/crawl-redis";
+import { logCrawl } from "../../services/logging/crawl_log";
+import { getScrapeQueue } from "../../services/queue-service";
+import { getJobPriority } from "../../lib/job-priority";
+
+export async function bulkScrapeController(
+  req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>,
+  res: Response
+) {
+  req.body = bulkScrapeRequestSchema.parse(req.body);
+
+  const id = uuidv4();
+
+  await logCrawl(id, req.auth.team_id);
+
+  let { remainingCredits } = req.account;
+  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+  if(!useDbAuthentication){
+    remainingCredits = Infinity;
+  }
+
+  const pageOptions = legacyScrapeOptions(req.body);
+
+  const sc: StoredCrawl = {
+    crawlerOptions: null,
+    pageOptions,
+    team_id: req.auth.team_id,
+    createdAt: Date.now(),
+    plan: req.auth.plan,
+  };
+
+  await saveCrawl(id, sc);
+
+  let jobPriority = 20;
+
+  // If it is over 1000, we need to get the job priority,
+  // otherwise we can use the default priority of 20
+  if(req.body.urls.length > 1000){
+    // set base to 21
+    jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
+  }
+
+  const jobs = req.body.urls.map((x) => {
+    const uuid = uuidv4();
+    return {
+      name: uuid,
+      data: {
+        url: x,
+        mode: "single_urls",
+        team_id: req.auth.team_id,
+        plan: req.auth.plan,
+        crawlerOptions: null,
+        pageOptions,
+        origin: "api",
+        crawl_id: id,
+        sitemapped: true,
+        v1: true,
+      },
+      opts: {
+        jobId: uuid,
+        priority: 20,
+      },
+    };
+  });
+
+  await lockURLs(
+    id,
+    jobs.map((x) => x.data.url)
+  );
+  await addCrawlJobs(
+    id,
+    jobs.map((x) => x.opts.jobId)
+  );
+  await getScrapeQueue().addBulk(jobs);
+
+  const protocol = process.env.ENV === "local" ? req.protocol : "https";
+
+  return res.status(200).json({
+    success: true,
+    id,
+    url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`,
+  });
+}
+
+
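Note: a minimal client-side sketch of calling the new endpoint follows. The base URL, API key environment variables, Bearer auth header, and the "markdown" format value are illustrative assumptions, not part of this diff; only the request body shape (urls plus the usual scrape options) and the { success, id, url } response come from the controller above.

// Hypothetical helper (Node 18+, global fetch assumed).
async function startBulkScrape(urls: string[]) {
  const res = await fetch(`${process.env.FIRECRAWL_URL}/v1/bulk/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // Bearer auth is assumed to match the other authenticated v1 routes.
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      urls,
      formats: ["markdown"], // any field accepted by scrapeOptions should also be valid here
    }),
  });

  // Per bulkScrapeController, a successful response is:
  // { success: true, id: "<uuid>", url: "https://<host>/v1/bulk/scrape/<uuid>" }
  return (await res.json()) as { success: boolean; id: string; url: string };
}
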
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 033de6e0..56c944ec 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -141,19 +141,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
   return obj;
 });
 
-// export type ScrapeRequest = {
-//   url: string;
-//   formats?: Format[];
-//   headers?: { [K: string]: string };
-//   includeTags?: string[];
-//   excludeTags?: string[];
-//   onlyMainContent?: boolean;
-//   timeout?: number;
-//   waitFor?: number;
-// }
-
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 
+export const bulkScrapeRequestSchema = scrapeOptions.extend({
+  urls: url.array(),
+  origin: z.string().optional().default("api"),
+}).strict(strictMessage).refine(
+  (obj) => {
+    const hasExtractFormat = obj.formats?.includes("extract");
+    const hasExtractOptions = obj.extract !== undefined;
+    return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
+  },
+  {
+    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+  }
+).transform((obj) => {
+  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
+    return { ...obj, timeout: 60000 };
+  }
+  return obj;
+});
+
+export type BulkScrapeRequest = z.infer<typeof bulkScrapeRequestSchema>;
+
 const crawlerOptions = z.object({
   includePaths: z.string().array().default([]),
   excludePaths: z.string().array().default([]),
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index f0ece43f..379bc179 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
 import { Logger } from "./logger";
 
 export type StoredCrawl = {
-  originUrl: string;
+  originUrl?: string;
   crawlerOptions: any;
   pageOptions: any;
   team_id: string;
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 6e642c65..8eb679e7 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -112,7 +112,7 @@ export async function runWebScraper({
   }
 
   // remove docs with empty content
-  const filteredDocs = crawlerOptions.returnOnlyUrls
+  const filteredDocs = crawlerOptions?.returnOnlyUrls
     ? docs.map((doc) => {
         if (doc.metadata.sourceURL) {
           return { url: doc.metadata.sourceURL };
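Note: bulkScrapeRequestSchema above pairs a .refine() check (the "extract" format and the extract options must appear together) with a .transform() that defaults timeout to 60000 when extraction is requested. Below is a simplified, standalone zod sketch of that pattern; the real schema extends the shared scrapeOptions and url definitions, which are not reproduced here, and the field shapes used are assumptions for illustration.

import { z } from "zod";

// Standalone sketch of the refine/transform pattern used by bulkScrapeRequestSchema.
const bulkSketch = z
  .object({
    urls: z.string().url().array(),
    formats: z.enum(["markdown", "extract"]).array().optional(),
    extract: z.object({ prompt: z.string() }).optional(),
    timeout: z.number().optional(),
  })
  .refine(
    (obj) => {
      const hasExtractFormat = obj.formats?.includes("extract");
      const hasExtractOptions = obj.extract !== undefined;
      return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
    },
    { message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa" }
  )
  .transform((obj) =>
    (obj.formats?.includes("extract") || obj.extract) && !obj.timeout
      ? { ...obj, timeout: 60000 }
      : obj
  );

// Valid: extract format plus extract options; timeout is defaulted to 60000.
const parsed = bulkSketch.parse({
  urls: ["https://example.com"],
  formats: ["extract"],
  extract: { prompt: "summarise" },
});
console.log(parsed.timeout); // 60000

// Invalid: extract format without extract options throws a ZodError.
// bulkSketch.parse({ urls: ["https://example.com"], formats: ["extract"] });
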
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index b0ceceb4..2bd3d3ea 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
 import { Logger } from "../lib/logger";
 import { scrapeStatusController } from "../controllers/v1/scrape-status";
 import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
+import { bulkScrapeController } from "../controllers/v1/bulk-scrape";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -122,6 +123,15 @@ v1Router.post(
   wrap(crawlController)
 );
 
+v1Router.post(
+  "/bulk/scrape",
+  authMiddleware(RateLimiterMode.Crawl),
+  checkCreditsMiddleware(),
+  blocklistMiddleware,
+  idempotencyMiddleware,
+  wrap(bulkScrapeController)
+);
+
 v1Router.post(
   "/map",
   authMiddleware(RateLimiterMode.Map),
@@ -136,6 +146,12 @@ v1Router.get(
   wrap(crawlStatusController)
 );
 
+v1Router.get(
+  "/bulk/scrape/:jobId",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlStatusController)
+);
+
 v1Router.get(
   "/scrape/:jobId",
   wrap(scrapeStatusController)
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index bff51f74..1ea4775a 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -365,7 +365,7 @@ async function processJob(job: Job, token: string) {
 
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
-    if (!job.data.sitemapped) {
+    if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
       if (!sc.cancelled) {
         const crawler = crawlToCrawler(job.data.crawl_id, sc);
@@ -414,9 +414,7 @@ async function processJob(job: Job, token: string) {
       }
     }
 
-    if (await finishCrawl(job.data.crawl_id)) {
-
-
+    if (await finishCrawl(job.data.crawl_id) && job.data.crawlerOptions !== null) {
       if (!job.data.v1) {
         const jobIDs = await getCrawlJobs(job.data.crawl_id);
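Note: the new GET /v1/bulk/scrape/:jobId route reuses crawlStatusController, so the url returned by the POST above can be polled directly. A minimal sketch follows; the status payload shape is not shown in this diff (it is assumed to mirror the existing crawl status response), and Bearer auth is again an assumption.

// Hypothetical poll of the status URL returned by startBulkScrape().
async function getBulkScrapeStatus(statusUrl: string, apiKey: string) {
  const res = await fetch(statusUrl, {
    headers: { Authorization: `Bearer ${apiKey}` }, // assumed Bearer auth
  });
  return res.json();
}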