feat(v1/batch/scrape): add ignoreInvalidURLs option

Gergő Móricz 2024-12-14 01:11:43 +01:00
parent e74e4bcefc
commit 4b5014d7fe
4 changed files with 72 additions and 7 deletions
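
The new ignoreInvalidURLs flag (default false) lets a batch scrape request go through even when some of the submitted URLs fail validation: the invalid entries are skipped instead of rejecting the whole request, and they are reported back in a new invalidURLs field on the response. A minimal sketch of how a client might use it; the host and auth header are placeholders, not part of this commit:

// Sketch only: request/response shape taken from the controller and types below;
// host and API key are illustrative placeholders.
const res = await fetch("https://<host>/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer <api key>",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev", "not a url"],
    ignoreInvalidURLs: true, // new option; defaults to false
  }),
});

const body = await res.json();
// -> { success: true, id: "<uuid>", url: ".../v1/batch/scrape/<uuid>", invalidURLs: ["not a url"] }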

View File

@@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid";
 import {
   BatchScrapeRequest,
   batchScrapeRequestSchema,
-  CrawlResponse,
+  batchScrapeRequestSchemaNoURLValidation,
+  url as urlSchema,
   RequestWithAuth,
   ScrapeOptions,
+  BatchScrapeResponse,
 } from "./types";
 import {
   addCrawlJobs,
@@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
 
 export async function batchScrapeController(
-  req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
-  res: Response<CrawlResponse>,
+  req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
+  res: Response<BatchScrapeResponse>,
 ) {
-  req.body = batchScrapeRequestSchema.parse(req.body);
+  if (req.body?.ignoreInvalidURLs === true) {
+    req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
+  } else {
+    req.body = batchScrapeRequestSchema.parse(req.body);
+  }
 
   const id = req.body.appendToId ?? uuidv4();
   const logger = _logger.child({
@@ -35,8 +41,27 @@ export async function batchScrapeController(
     teamId: req.auth.team_id,
     plan: req.auth.plan,
   });
+  let urls = req.body.urls;
+  let invalidURLs: string[] | undefined = undefined;
+  if (req.body.ignoreInvalidURLs) {
+    invalidURLs = [];
+    let pendingURLs = urls;
+    urls = [];
+    for (const u of pendingURLs) {
+      try {
+        const nu = urlSchema.parse(u);
+        urls.push(nu);
+      } catch (_) {
+        invalidURLs.push(u);
+      }
+    }
+  }
   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: req.body.urls,
+    urlsLength: urls,
     appendToId: req.body.appendToId,
     account: req.account,
   });
@@ -70,7 +95,7 @@
   // If it is over 1000, we need to get the job priority,
   // otherwise we can use the default priority of 20
-  if (req.body.urls.length > 1000) {
+  if (urls.length > 1000) {
     // set base to 21
     jobPriority = await getJobPriority({
       plan: req.auth.plan,
@@ -84,7 +109,7 @@
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;
-  const jobs = req.body.urls.map((x) => {
+  const jobs = urls.map((x) => {
     return {
       data: {
         url: x,
@@ -140,5 +165,6 @@
     success: true,
     id,
     url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
+    invalidURLs,
   });
 }
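
When the flag is set, the controller parses the body with the schema variant that skips per-URL validation and then partitions the raw strings itself: whatever urlSchema accepts becomes a job, whatever it rejects is collected into invalidURLs. The same partitioning pattern in isolation, assuming urlSchema is a zod schema whose parse() throws on bad input (z.string().url() stands in here for the real schema from ./types):

import { z } from "zod";

// Stand-in for the urlSchema imported from ./types (assumption for illustration).
const urlSchema = z.string().url();

function partitionURLs(input: string[]) {
  const valid: string[] = [];
  const invalid: string[] = [];
  for (const u of input) {
    try {
      valid.push(urlSchema.parse(u)); // keep the parsed value
    } catch {
      invalid.push(u); // collect instead of failing the whole request
    }
  }
  return { valid, invalid };
}

// partitionURLs(["https://firecrawl.dev", "not a url"])
//   -> { valid: ["https://firecrawl.dev"], invalid: ["not a url"] }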

View File

@@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions
     origin: z.string().optional().default("api"),
     webhook: webhookSchema.optional(),
     appendToId: z.string().uuid().optional(),
+    ignoreInvalidURLs: z.boolean().default(false),
+  })
+  .strict(strictMessage)
+  .refine(
+    (obj) => {
+      const hasExtractFormat = obj.formats?.includes("extract");
+      const hasExtractOptions = obj.extract !== undefined;
+      return (
+        (hasExtractFormat && hasExtractOptions) ||
+        (!hasExtractFormat && !hasExtractOptions)
+      );
+    },
+    {
+      message:
+        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+    },
+  );
+
+export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
+  .extend({
+    urls: z.string().array(),
+    origin: z.string().optional().default("api"),
+    webhook: webhookSchema.optional(),
+    appendToId: z.string().uuid().optional(),
+    ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
   .refine(
@@ -446,6 +471,15 @@ export type CrawlResponse =
       url: string;
     };
 
+export type BatchScrapeResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      id: string;
+      url: string;
+      invalidURLs?: string[];
+    };
+
 export type MapResponse =
   | ErrorResponse
   | {
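
The two request schemas differ only in the urls field: batchScrapeRequestSchema validates every entry up front (so one bad URL rejects the whole request), while batchScrapeRequestSchemaNoURLValidation accepts any array of strings and leaves the filtering to the controller. A reduced sketch of that difference, assuming zod and omitting the other scrapeOptions fields:

import { z } from "zod";

// Reduced stand-ins; z.string().url() approximates the real url schema.
const strictBody = z.object({
  urls: z.string().url().array(),
  ignoreInvalidURLs: z.boolean().default(false),
});
const lenientBody = z.object({
  urls: z.string().array(),
  ignoreInvalidURLs: z.boolean().default(false),
});

const payload = { urls: ["https://firecrawl.dev", "not a url"], ignoreInvalidURLs: true };
const strictOk = strictBody.safeParse(payload).success;   // false: the invalid entry fails URL validation
const lenientOk = lenientBody.safeParse(payload).success; // true: raw strings pass; the controller filters later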

View File

@@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) {
 }
 
 export async function addCrawlJobs(id: string, job_ids: string[]) {
+  if (job_ids.length === 0) return true;
+
   _logger.debug("Adding crawl jobs to Redis...", {
     jobIds: job_ids,
     module: "crawl-redis",
@@ -261,6 +263,8 @@ export async function lockURLs(
   sc: StoredCrawl,
   urls: string[],
 ): Promise<boolean> {
+  if (urls.length === 0) return true;
+
   urls = urls.map((url) => normalizeURL(url, sc));
   const logger = _logger.child({
     crawlId: id,
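
With ignoreInvalidURLs it is now possible for every URL in a request to be filtered out, so addCrawlJobs and lockURLs can be called with empty arrays; the new guards return early before any Redis command is issued. A sketch of why that matters, assuming the helper spreads the ids into a variadic Redis set command such as SADD (its actual body is not shown in this diff); Redis rejects SADD called with zero members:

import Redis from "ioredis";

const redis = new Redis();

// Hypothetical reduction of addCrawlJobs, for illustration only.
async function addCrawlJobsSketch(id: string, jobIds: string[]) {
  if (jobIds.length === 0) return true; // nothing to record, skip the round trip
  // Without the guard, spreading an empty array here would hit Redis with
  // SADD and no members, which errors with "wrong number of arguments".
  await redis.sadd("crawl:" + id + ":jobs", ...jobIds);
  return true;
}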

View File

@@ -108,6 +108,7 @@ export async function addScrapeJobs(
     };
   }[],
 ) {
+  if (jobs.length === 0) return true;
   // TODO: better
   await Promise.all(
     jobs.map((job) =>
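
addScrapeJobs gets the same empty-input guard, so a batch in which every URL was invalid enqueues nothing and still responds normally. Illustrative end-to-end effect with the guards in place (values are placeholders):

// POST /v1/batch/scrape
// { "urls": ["not a url", "also bad"], "ignoreInvalidURLs": true }
//
// -> { "success": true,
//      "id": "<uuid>",
//      "url": "https://<host>/v1/batch/scrape/<uuid>",
//      "invalidURLs": ["not a url", "also bad"] }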