feat(v1/batch/scrape): add ignoreInvalidURLs option

Gergő Móricz 2024-12-14 01:11:43 +01:00
parent e74e4bcefc
commit 4b5014d7fe
4 changed files with 72 additions and 7 deletions
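
The new ignoreInvalidURLs flag (default false) lets a batch scrape request go through even when some of the submitted URLs fail validation: the invalid entries are skipped instead of rejecting the whole request, and they are reported back in a new invalidURLs field on the response. A minimal sketch of how a client might use it; the host and auth header are placeholders, not part of this commit:

// Sketch only: request/response shape taken from the controller and types below;
// host and API key are illustrative placeholders.
const res = await fetch("https://<host>/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer <api key>",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev", "not a url"],
    ignoreInvalidURLs: true, // new option; defaults to false
  }),
});

const body = await res.json();
// -> { success: true, id: "<uuid>", url: ".../v1/batch/scrape/<uuid>", invalidURLs: ["not a url"] }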

View File

@@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid";
 import {
   BatchScrapeRequest,
   batchScrapeRequestSchema,
-  CrawlResponse,
+  batchScrapeRequestSchemaNoURLValidation,
+  url as urlSchema,
   RequestWithAuth,
   ScrapeOptions,
+  BatchScrapeResponse,
 } from "./types";
 import {
   addCrawlJobs,
@@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
 
 export async function batchScrapeController(
-  req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
-  res: Response<CrawlResponse>,
+  req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
+  res: Response<BatchScrapeResponse>,
 ) {
-  req.body = batchScrapeRequestSchema.parse(req.body);
+  if (req.body?.ignoreInvalidURLs === true) {
+    req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
+  } else {
+    req.body = batchScrapeRequestSchema.parse(req.body);
+  }
 
   const id = req.body.appendToId ?? uuidv4();
   const logger = _logger.child({
@@ -35,8 +41,27 @@ export async function batchScrapeController(
     teamId: req.auth.team_id,
     plan: req.auth.plan,
   });
+  let urls = req.body.urls;
+  let invalidURLs: string[] | undefined = undefined;
+  if (req.body.ignoreInvalidURLs) {
+    invalidURLs = [];
+    let pendingURLs = urls;
+    urls = [];
+    for (const u of pendingURLs) {
+      try {
+        const nu = urlSchema.parse(u);
+        urls.push(nu);
+      } catch (_) {
+        invalidURLs.push(u);
+      }
+    }
+  }
   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: req.body.urls,
+    urlsLength: urls,
     appendToId: req.body.appendToId,
     account: req.account,
   });
@@ -70,7 +95,7 @@
   // If it is over 1000, we need to get the job priority,
   // otherwise we can use the default priority of 20
-  if (req.body.urls.length > 1000) {
+  if (urls.length > 1000) {
     // set base to 21
     jobPriority = await getJobPriority({
       plan: req.auth.plan,
@@ -84,7 +109,7 @@
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;
-  const jobs = req.body.urls.map((x) => {
+  const jobs = urls.map((x) => {
     return {
       data: {
         url: x,
@@ -140,5 +165,6 @@
     success: true,
     id,
     url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
+    invalidURLs,
   });
 }
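
When the flag is set, the controller parses the body with the schema variant that skips per-URL validation and then partitions the raw strings itself: whatever urlSchema accepts becomes a job, whatever it rejects is collected into invalidURLs. The same partitioning pattern in isolation, assuming urlSchema is a zod schema whose parse() throws on bad input (z.string().url() stands in here for the real schema from ./types):

import { z } from "zod";

// Stand-in for the urlSchema imported from ./types (assumption for illustration).
const urlSchema = z.string().url();

function partitionURLs(input: string[]) {
  const valid: string[] = [];
  const invalid: string[] = [];
  for (const u of input) {
    try {
      valid.push(urlSchema.parse(u)); // keep the parsed value
    } catch {
      invalid.push(u); // collect instead of failing the whole request
    }
  }
  return { valid, invalid };
}

// partitionURLs(["https://firecrawl.dev", "not a url"])
//   -> { valid: ["https://firecrawl.dev"], invalid: ["not a url"] }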

View File

@@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions
     origin: z.string().optional().default("api"),
     webhook: webhookSchema.optional(),
     appendToId: z.string().uuid().optional(),
+    ignoreInvalidURLs: z.boolean().default(false),
+  })
+  .strict(strictMessage)
+  .refine(
+    (obj) => {
+      const hasExtractFormat = obj.formats?.includes("extract");
+      const hasExtractOptions = obj.extract !== undefined;
+      return (
+        (hasExtractFormat && hasExtractOptions) ||
+        (!hasExtractFormat && !hasExtractOptions)
+      );
+    },
+    {
+      message:
+        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+    },
+  );
+
+export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
+  .extend({
+    urls: z.string().array(),
+    origin: z.string().optional().default("api"),
+    webhook: webhookSchema.optional(),
+    appendToId: z.string().uuid().optional(),
+    ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
   .refine(
@@ -446,6 +471,15 @@ export type CrawlResponse =
       url: string;
     };
 
+export type BatchScrapeResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      id: string;
+      url: string;
+      invalidURLs?: string[];
+    };
+
 export type MapResponse =
   | ErrorResponse
   | {
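
The two request schemas differ only in the urls field: batchScrapeRequestSchema validates every entry up front (so one bad URL rejects the whole request), while batchScrapeRequestSchemaNoURLValidation accepts any array of strings and leaves the filtering to the controller. A reduced sketch of that difference, assuming zod and omitting the other scrapeOptions fields:

import { z } from "zod";

// Reduced stand-ins; z.string().url() approximates the real url schema.
const strictBody = z.object({
  urls: z.string().url().array(),
  ignoreInvalidURLs: z.boolean().default(false),
});
const lenientBody = z.object({
  urls: z.string().array(),
  ignoreInvalidURLs: z.boolean().default(false),
});

const payload = { urls: ["https://firecrawl.dev", "not a url"], ignoreInvalidURLs: true };
const strictOk = strictBody.safeParse(payload).success;   // false: the invalid entry fails URL validation
const lenientOk = lenientBody.safeParse(payload).success; // true: raw strings pass; the controller filters later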

View File

@@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) {
 }
 
 export async function addCrawlJobs(id: string, job_ids: string[]) {
+  if (job_ids.length === 0) return true;
+
   _logger.debug("Adding crawl jobs to Redis...", {
     jobIds: job_ids,
     module: "crawl-redis",
@@ -261,6 +263,8 @@ export async function lockURLs(
   sc: StoredCrawl,
   urls: string[],
 ): Promise<boolean> {
+  if (urls.length === 0) return true;
+
   urls = urls.map((url) => normalizeURL(url, sc));
   const logger = _logger.child({
     crawlId: id,
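
With ignoreInvalidURLs it is now possible for every URL in a request to be filtered out, so addCrawlJobs and lockURLs can be called with empty arrays; the new guards return early before any Redis command is issued. A sketch of why that matters, assuming the helper spreads the ids into a variadic Redis set command such as SADD (its actual body is not shown in this diff); Redis rejects SADD called with zero members:

import Redis from "ioredis";

const redis = new Redis();

// Hypothetical reduction of addCrawlJobs, for illustration only.
async function addCrawlJobsSketch(id: string, jobIds: string[]) {
  if (jobIds.length === 0) return true; // nothing to record, skip the round trip
  // Without the guard, spreading an empty array here would hit Redis with
  // SADD and no members, which errors with "wrong number of arguments".
  await redis.sadd("crawl:" + id + ":jobs", ...jobIds);
  return true;
}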

View File

@@ -108,6 +108,7 @@ export async function addScrapeJobs(
     };
   }[],
 ) {
+  if (jobs.length === 0) return true;
   // TODO: better
   await Promise.all(
     jobs.map((job) =>
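
addScrapeJobs gets the same empty-input guard, so a batch in which every URL was invalid enqueues nothing and still responds normally. Illustrative end-to-end effect with the guards in place (values are placeholders):

// POST /v1/batch/scrape
// { "urls": ["not a url", "also bad"], "ignoreInvalidURLs": true }
//
// -> { "success": true,
//      "id": "<uuid>",
//      "url": "https://<host>/v1/batch/scrape/<uuid>",
//      "invalidURLs": ["not a url", "also bad"] }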