Nick: fixes

This commit is contained in:
Nicolas 2025-02-19 15:01:47 -03:00
parent d984b50400
commit a60f3ff645
2 changed files with 156 additions and 96 deletions

View File

@ -1,19 +1,14 @@
import { Response } from "express"; import { Response } from "express";
import { RequestWithAuth } from "./types"; import {
GenerateLLMsTextRequest,
generateLLMsTextRequestSchema,
RequestWithAuth,
} from "./types";
import { getGenerateLlmsTxtQueue } from "../../services/queue-service"; import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis"; import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { z } from "zod"; import { z } from "zod";
export const generateLLMsTextRequestSchema = z.object({
url: z.string().url().describe('The URL to generate text from'),
maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
__experimental_stream: z.boolean().optional(),
});
export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;
export type GenerateLLMsTextResponse = { export type GenerateLLMsTextResponse = {
success: boolean; success: boolean;
id: string; id: string;
@ -66,14 +61,18 @@ export async function generateLLMsTextController(
}, },
}, },
async (span) => { async (span) => {
await getGenerateLlmsTxtQueue().add(generationId, { await getGenerateLlmsTxtQueue().add(
...jobData, generationId,
sentry: { {
trace: Sentry.spanToTraceHeader(span), ...jobData,
baggage: Sentry.spanToBaggageHeader(span), sentry: {
size, trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
}, },
}, { jobId: generationId }); { jobId: generationId },
);
}, },
); );
} else { } else {

View File

@ -34,7 +34,10 @@ export const url = z.preprocess(
.url() .url()
.regex(/^https?:\/\//, "URL uses unsupported protocol") .regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine( .refine(
(x) => /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(x), (x) =>
/\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
x,
),
"URL must have a valid top-level domain or be a valid path", "URL must have a valid top-level domain or be a valid path",
) )
.refine((x) => { .refine((x) => {
@ -69,7 +72,10 @@ export type ExtractOptions = z.infer<typeof extractOptions>;
const ACTIONS_MAX_WAIT_TIME = 60; const ACTIONS_MAX_WAIT_TIME = 60;
const MAX_ACTIONS = 50; const MAX_ACTIONS = 50;
function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): number { function calculateTotalWaitTime(
actions: any[] = [],
waitFor: number = 0,
): number {
const actionWaitTime = actions.reduce((acc, action) => { const actionWaitTime = actions.reduce((acc, action) => {
if (action.type === "wait") { if (action.type === "wait") {
if (action.milliseconds) { if (action.milliseconds) {
@ -86,63 +92,64 @@ function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): numbe
return waitFor + actionWaitTime; return waitFor + actionWaitTime;
} }
export const actionsSchema = z.array( export const actionsSchema = z
z.union([ .array(
z z.union([
.object({ z
type: z.literal("wait"), .object({
milliseconds: z.number().int().positive().finite().optional(), type: z.literal("wait"),
milliseconds: z.number().int().positive().finite().optional(),
selector: z.string().optional(),
})
.refine(
(data) =>
(data.milliseconds !== undefined || data.selector !== undefined) &&
!(data.milliseconds !== undefined && data.selector !== undefined),
{
message:
"Either 'milliseconds' or 'selector' must be provided, but not both.",
},
),
z.object({
type: z.literal("click"),
selector: z.string(),
}),
z.object({
type: z.literal("screenshot"),
fullPage: z.boolean().default(false),
}),
z.object({
type: z.literal("write"),
text: z.string(),
}),
z.object({
type: z.literal("press"),
key: z.string(),
}),
z.object({
type: z.literal("scroll"),
direction: z.enum(["up", "down"]).optional().default("down"),
selector: z.string().optional(), selector: z.string().optional(),
}) }),
.refine( z.object({
(data) => type: z.literal("scrape"),
(data.milliseconds !== undefined || data.selector !== undefined) && }),
!(data.milliseconds !== undefined && data.selector !== undefined), z.object({
{ type: z.literal("executeJavascript"),
message: script: z.string(),
"Either 'milliseconds' or 'selector' must be provided, but not both.", }),
}, ]),
), )
z.object({ .refine((actions) => actions.length <= MAX_ACTIONS, {
type: z.literal("click"),
selector: z.string(),
}),
z.object({
type: z.literal("screenshot"),
fullPage: z.boolean().default(false),
}),
z.object({
type: z.literal("write"),
text: z.string(),
}),
z.object({
type: z.literal("press"),
key: z.string(),
}),
z.object({
type: z.literal("scroll"),
direction: z.enum(["up", "down"]).optional().default("down"),
selector: z.string().optional(),
}),
z.object({
type: z.literal("scrape"),
}),
z.object({
type: z.literal("executeJavascript"),
script: z.string(),
}),
]),
).refine(
(actions) => actions.length <= MAX_ACTIONS,
{
message: `Number of actions cannot exceed ${MAX_ACTIONS}`, message: `Number of actions cannot exceed ${MAX_ACTIONS}`,
}, })
).refine( .refine(
(actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000, (actions) =>
{ calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`, {
}, message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
); },
);
const baseScrapeOptions = z const baseScrapeOptions = z
.object({ .object({
@ -169,7 +176,14 @@ const baseScrapeOptions = z
excludeTags: z.string().array().optional(), excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true), onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().optional(), timeout: z.number().int().positive().finite().safe().optional(),
waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0), waitFor: z
.number()
.int()
.nonnegative()
.finite()
.safe()
.max(60000)
.default(0),
// Deprecate this to jsonOptions // Deprecate this to jsonOptions
extract: extractOptions.optional(), extract: extractOptions.optional(),
// New // New
@ -184,7 +198,10 @@ const baseScrapeOptions = z
.string() .string()
.optional() .optional()
.refine( .refine(
(val) => !val || Object.keys(countries).includes(val.toUpperCase()) || val === "US-generic", (val) =>
!val ||
Object.keys(countries).includes(val.toUpperCase()) ||
val === "US-generic",
{ {
message: message:
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
@ -267,18 +284,23 @@ const extractTransform = (obj) => {
} }
return obj; return obj;
} };
export const scrapeOptions = baseScrapeOptions.refine( export const scrapeOptions = baseScrapeOptions
(obj) => { .refine(
if (!obj.actions) return true; (obj) => {
return calculateTotalWaitTime(obj.actions, obj.waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000; if (!obj.actions) return true;
}, return (
{ calculateTotalWaitTime(obj.actions, obj.waitFor) <=
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`, ACTIONS_MAX_WAIT_TIME * 1000
} );
).refine(extractRefine, extractRefineOpts) },
.transform(extractTransform); {
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
},
)
.refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>; export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
@ -324,15 +346,26 @@ export const extractV1Options = z
__experimental_showSources: z.boolean().default(false), __experimental_showSources: z.boolean().default(false),
showSources: z.boolean().default(false), showSources: z.boolean().default(false),
__experimental_cacheKey: z.string().optional(), __experimental_cacheKey: z.string().optional(),
__experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional() __experimental_cacheMode: z
.enum(["direct", "save", "load"])
.default("direct")
.optional(),
}) })
.strict(strictMessage) .strict(strictMessage)
.transform((obj) => ({ .transform((obj) => ({
...obj, ...obj,
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch, allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
})) }))
.refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts) .refine(
.transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions })); (x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true),
extractRefineOpts,
)
.transform((x) => ({
...x,
scrapeOptions: x.scrapeOptions
? extractTransform(x.scrapeOptions)
: x.scrapeOptions,
}));
export type ExtractV1Options = z.infer<typeof extractV1Options>; export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options; export const extractRequestSchema = extractV1Options;
@ -366,7 +399,9 @@ export const webhookSchema = z.preprocess(
url: z.string().url(), url: z.string().url(),
headers: z.record(z.string(), z.string()).default({}), headers: z.record(z.string(), z.string()).default({}),
metadata: z.record(z.string(), z.string()).default({}), metadata: z.record(z.string(), z.string()).default({}),
events: z.array(z.enum(["completed", "failed", "page", "started"])).default(["completed", "failed", "page", "started"]), events: z
.array(z.enum(["completed", "failed", "page", "started"]))
.default(["completed", "failed", "page", "started"]),
}) })
.strict(strictMessage), .strict(strictMessage),
); );
@ -435,8 +470,11 @@ export const crawlRequestSchema = crawlerOptions
limit: z.number().default(10000), limit: z.number().default(10000),
}) })
.strict(strictMessage) .strict(strictMessage)
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts) .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) })); .transform((x) => ({
...x,
scrapeOptions: extractTransform(x.scrapeOptions),
}));
// export type CrawlRequest = { // export type CrawlRequest = {
// url: string; // url: string;
@ -902,7 +940,8 @@ export const searchRequestSchema = z
location: z.string().optional(), location: z.string().optional(),
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000), timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: baseScrapeOptions.extend({ scrapeOptions: baseScrapeOptions
.extend({
formats: z formats: z
.array( .array(
z.enum([ z.enum([
@ -922,8 +961,11 @@ export const searchRequestSchema = z
.strict( .strict(
"Unrecognized key in body -- please review the v1 API documentation for request body changes", "Unrecognized key in body -- please review the v1 API documentation for request body changes",
) )
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts) .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) })); .transform((x) => ({
...x,
scrapeOptions: extractTransform(x.scrapeOptions),
}));
export type SearchRequest = z.infer<typeof searchRequestSchema>; export type SearchRequest = z.infer<typeof searchRequestSchema>;
export type SearchRequestInput = z.input<typeof searchRequestSchema>; export type SearchRequestInput = z.input<typeof searchRequestSchema>;
@ -943,3 +985,22 @@ export type TokenUsage = {
step?: string; step?: string;
model?: string; model?: string;
}; };
export const generateLLMsTextRequestSchema = z.object({
url: url.describe("The URL to generate text from"),
maxUrls: z
.number()
.min(1)
.max(100)
.default(10)
.describe("Maximum number of URLs to process"),
showFullText: z
.boolean()
.default(false)
.describe("Whether to show the full LLMs-full.txt in the response"),
__experimental_stream: z.boolean().optional(),
});
export type GenerateLLMsTextRequest = z.infer<
typeof generateLLMsTextRequestSchema
>;