From a60f3ff645e586d96efc87f61a20a8d6437d467a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 19 Feb 2025 15:01:47 -0300 Subject: [PATCH] Nick: fixes --- .../src/controllers/v1/generate-llmstxt.ts | 33 ++- apps/api/src/controllers/v1/types.ts | 219 +++++++++++------- 2 files changed, 156 insertions(+), 96 deletions(-) diff --git a/apps/api/src/controllers/v1/generate-llmstxt.ts b/apps/api/src/controllers/v1/generate-llmstxt.ts index e7618f72..f4e528ce 100644 --- a/apps/api/src/controllers/v1/generate-llmstxt.ts +++ b/apps/api/src/controllers/v1/generate-llmstxt.ts @@ -1,19 +1,14 @@ import { Response } from "express"; -import { RequestWithAuth } from "./types"; +import { + GenerateLLMsTextRequest, + generateLLMsTextRequestSchema, + RequestWithAuth, +} from "./types"; import { getGenerateLlmsTxtQueue } from "../../services/queue-service"; import * as Sentry from "@sentry/node"; import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis"; import { z } from "zod"; -export const generateLLMsTextRequestSchema = z.object({ - url: z.string().url().describe('The URL to generate text from'), - maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'), - showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'), - __experimental_stream: z.boolean().optional(), -}); - -export type GenerateLLMsTextRequest = z.infer; - export type GenerateLLMsTextResponse = { success: boolean; id: string; @@ -66,14 +61,18 @@ export async function generateLLMsTextController( }, }, async (span) => { - await getGenerateLlmsTxtQueue().add(generationId, { - ...jobData, - sentry: { - trace: Sentry.spanToTraceHeader(span), - baggage: Sentry.spanToBaggageHeader(span), - size, + await getGenerateLlmsTxtQueue().add( + generationId, + { + ...jobData, + sentry: { + trace: Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span), + size, + }, }, - }, { jobId: generationId }); + { jobId: generationId }, + ); }, ); } else { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 9f0150b6..580cf80a 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -34,7 +34,10 @@ export const url = z.preprocess( .url() .regex(/^https?:\/\//, "URL uses unsupported protocol") .refine( - (x) => /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(x), + (x) => + /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test( + x, + ), "URL must have a valid top-level domain or be a valid path", ) .refine((x) => { @@ -69,7 +72,10 @@ export type ExtractOptions = z.infer; const ACTIONS_MAX_WAIT_TIME = 60; const MAX_ACTIONS = 50; -function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): number { +function calculateTotalWaitTime( + actions: any[] = [], + waitFor: number = 0, +): number { const actionWaitTime = actions.reduce((acc, action) => { if (action.type === "wait") { if (action.milliseconds) { @@ -86,63 +92,64 @@ function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): numbe return waitFor + actionWaitTime; } -export const actionsSchema = z.array( - z.union([ - z - .object({ - type: z.literal("wait"), - milliseconds: z.number().int().positive().finite().optional(), +export const actionsSchema = z + .array( + z.union([ + z + .object({ + type: z.literal("wait"), + milliseconds: z.number().int().positive().finite().optional(), + selector: z.string().optional(), + }) + .refine( + (data) => + (data.milliseconds !== undefined || data.selector !== undefined) && + !(data.milliseconds !== undefined && data.selector !== undefined), + { + message: + "Either 'milliseconds' or 'selector' must be provided, but not both.", + }, + ), + z.object({ + type: z.literal("click"), + selector: z.string(), + }), + z.object({ + type: z.literal("screenshot"), + fullPage: z.boolean().default(false), + }), + z.object({ + type: z.literal("write"), + text: z.string(), + }), + z.object({ + type: z.literal("press"), + key: z.string(), + }), + z.object({ + type: z.literal("scroll"), + direction: z.enum(["up", "down"]).optional().default("down"), selector: z.string().optional(), - }) - .refine( - (data) => - (data.milliseconds !== undefined || data.selector !== undefined) && - !(data.milliseconds !== undefined && data.selector !== undefined), - { - message: - "Either 'milliseconds' or 'selector' must be provided, but not both.", - }, - ), - z.object({ - type: z.literal("click"), - selector: z.string(), - }), - z.object({ - type: z.literal("screenshot"), - fullPage: z.boolean().default(false), - }), - z.object({ - type: z.literal("write"), - text: z.string(), - }), - z.object({ - type: z.literal("press"), - key: z.string(), - }), - z.object({ - type: z.literal("scroll"), - direction: z.enum(["up", "down"]).optional().default("down"), - selector: z.string().optional(), - }), - z.object({ - type: z.literal("scrape"), - }), - z.object({ - type: z.literal("executeJavascript"), - script: z.string(), - }), - ]), -).refine( - (actions) => actions.length <= MAX_ACTIONS, - { + }), + z.object({ + type: z.literal("scrape"), + }), + z.object({ + type: z.literal("executeJavascript"), + script: z.string(), + }), + ]), + ) + .refine((actions) => actions.length <= MAX_ACTIONS, { message: `Number of actions cannot exceed ${MAX_ACTIONS}`, - }, -).refine( - (actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000, - { - message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`, - }, -); + }) + .refine( + (actions) => + calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000, + { + message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`, + }, + ); const baseScrapeOptions = z .object({ @@ -169,7 +176,14 @@ const baseScrapeOptions = z excludeTags: z.string().array().optional(), onlyMainContent: z.boolean().default(true), timeout: z.number().int().positive().finite().safe().optional(), - waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0), + waitFor: z + .number() + .int() + .nonnegative() + .finite() + .safe() + .max(60000) + .default(0), // Deprecate this to jsonOptions extract: extractOptions.optional(), // New @@ -184,7 +198,10 @@ const baseScrapeOptions = z .string() .optional() .refine( - (val) => !val || Object.keys(countries).includes(val.toUpperCase()) || val === "US-generic", + (val) => + !val || + Object.keys(countries).includes(val.toUpperCase()) || + val === "US-generic", { message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", @@ -267,18 +284,23 @@ const extractTransform = (obj) => { } return obj; -} +}; -export const scrapeOptions = baseScrapeOptions.refine( - (obj) => { - if (!obj.actions) return true; - return calculateTotalWaitTime(obj.actions, obj.waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000; - }, - { - message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`, - } -).refine(extractRefine, extractRefineOpts) -.transform(extractTransform); +export const scrapeOptions = baseScrapeOptions + .refine( + (obj) => { + if (!obj.actions) return true; + return ( + calculateTotalWaitTime(obj.actions, obj.waitFor) <= + ACTIONS_MAX_WAIT_TIME * 1000 + ); + }, + { + message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`, + }, + ) + .refine(extractRefine, extractRefineOpts) + .transform(extractTransform); export type ScrapeOptions = z.infer; @@ -324,15 +346,26 @@ export const extractV1Options = z __experimental_showSources: z.boolean().default(false), showSources: z.boolean().default(false), __experimental_cacheKey: z.string().optional(), - __experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional() + __experimental_cacheMode: z + .enum(["direct", "save", "load"]) + .default("direct") + .optional(), }) .strict(strictMessage) .transform((obj) => ({ ...obj, allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch, })) - .refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts) - .transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions })); + .refine( + (x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true), + extractRefineOpts, + ) + .transform((x) => ({ + ...x, + scrapeOptions: x.scrapeOptions + ? extractTransform(x.scrapeOptions) + : x.scrapeOptions, + })); export type ExtractV1Options = z.infer; export const extractRequestSchema = extractV1Options; @@ -366,7 +399,9 @@ export const webhookSchema = z.preprocess( url: z.string().url(), headers: z.record(z.string(), z.string()).default({}), metadata: z.record(z.string(), z.string()).default({}), - events: z.array(z.enum(["completed", "failed", "page", "started"])).default(["completed", "failed", "page", "started"]), + events: z + .array(z.enum(["completed", "failed", "page", "started"])) + .default(["completed", "failed", "page", "started"]), }) .strict(strictMessage), ); @@ -435,8 +470,11 @@ export const crawlRequestSchema = crawlerOptions limit: z.number().default(10000), }) .strict(strictMessage) - .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts) - .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) })); + .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts) + .transform((x) => ({ + ...x, + scrapeOptions: extractTransform(x.scrapeOptions), + })); // export type CrawlRequest = { // url: string; @@ -902,7 +940,8 @@ export const searchRequestSchema = z location: z.string().optional(), origin: z.string().optional().default("api"), timeout: z.number().int().positive().finite().safe().default(60000), - scrapeOptions: baseScrapeOptions.extend({ + scrapeOptions: baseScrapeOptions + .extend({ formats: z .array( z.enum([ @@ -922,8 +961,11 @@ export const searchRequestSchema = z .strict( "Unrecognized key in body -- please review the v1 API documentation for request body changes", ) - .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts) - .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) })); + .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts) + .transform((x) => ({ + ...x, + scrapeOptions: extractTransform(x.scrapeOptions), + })); export type SearchRequest = z.infer; export type SearchRequestInput = z.input; @@ -943,3 +985,22 @@ export type TokenUsage = { step?: string; model?: string; }; + +export const generateLLMsTextRequestSchema = z.object({ + url: url.describe("The URL to generate text from"), + maxUrls: z + .number() + .min(1) + .max(100) + .default(10) + .describe("Maximum number of URLs to process"), + showFullText: z + .boolean() + .default(false) + .describe("Whether to show the full LLMs-full.txt in the response"), + __experimental_stream: z.boolean().optional(), +}); + +export type GenerateLLMsTextRequest = z.infer< + typeof generateLLMsTextRequestSchema +>;