Nick: fixes

This commit is contained in:
Nicolas 2025-02-19 15:01:47 -03:00
parent d984b50400
commit a60f3ff645
2 changed files with 156 additions and 96 deletions

View File

@ -1,19 +1,14 @@
import { Response } from "express";
import { RequestWithAuth } from "./types";
import {
GenerateLLMsTextRequest,
generateLLMsTextRequestSchema,
RequestWithAuth,
} from "./types";
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
import { z } from "zod";
export const generateLLMsTextRequestSchema = z.object({
url: z.string().url().describe('The URL to generate text from'),
maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
__experimental_stream: z.boolean().optional(),
});
export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;
export type GenerateLLMsTextResponse = {
success: boolean;
id: string;
@ -66,14 +61,18 @@ export async function generateLLMsTextController(
},
},
async (span) => {
await getGenerateLlmsTxtQueue().add(generationId, {
...jobData,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
await getGenerateLlmsTxtQueue().add(
generationId,
{
...jobData,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
}, { jobId: generationId });
{ jobId: generationId },
);
},
);
} else {

View File

@ -34,7 +34,10 @@ export const url = z.preprocess(
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(x),
(x) =>
/\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
x,
),
"URL must have a valid top-level domain or be a valid path",
)
.refine((x) => {
@ -69,7 +72,10 @@ export type ExtractOptions = z.infer<typeof extractOptions>;
const ACTIONS_MAX_WAIT_TIME = 60;
const MAX_ACTIONS = 50;
function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): number {
function calculateTotalWaitTime(
actions: any[] = [],
waitFor: number = 0,
): number {
const actionWaitTime = actions.reduce((acc, action) => {
if (action.type === "wait") {
if (action.milliseconds) {
@ -86,63 +92,64 @@ function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): numbe
return waitFor + actionWaitTime;
}
export const actionsSchema = z.array(
z.union([
z
.object({
type: z.literal("wait"),
milliseconds: z.number().int().positive().finite().optional(),
export const actionsSchema = z
.array(
z.union([
z
.object({
type: z.literal("wait"),
milliseconds: z.number().int().positive().finite().optional(),
selector: z.string().optional(),
})
.refine(
(data) =>
(data.milliseconds !== undefined || data.selector !== undefined) &&
!(data.milliseconds !== undefined && data.selector !== undefined),
{
message:
"Either 'milliseconds' or 'selector' must be provided, but not both.",
},
),
z.object({
type: z.literal("click"),
selector: z.string(),
}),
z.object({
type: z.literal("screenshot"),
fullPage: z.boolean().default(false),
}),
z.object({
type: z.literal("write"),
text: z.string(),
}),
z.object({
type: z.literal("press"),
key: z.string(),
}),
z.object({
type: z.literal("scroll"),
direction: z.enum(["up", "down"]).optional().default("down"),
selector: z.string().optional(),
})
.refine(
(data) =>
(data.milliseconds !== undefined || data.selector !== undefined) &&
!(data.milliseconds !== undefined && data.selector !== undefined),
{
message:
"Either 'milliseconds' or 'selector' must be provided, but not both.",
},
),
z.object({
type: z.literal("click"),
selector: z.string(),
}),
z.object({
type: z.literal("screenshot"),
fullPage: z.boolean().default(false),
}),
z.object({
type: z.literal("write"),
text: z.string(),
}),
z.object({
type: z.literal("press"),
key: z.string(),
}),
z.object({
type: z.literal("scroll"),
direction: z.enum(["up", "down"]).optional().default("down"),
selector: z.string().optional(),
}),
z.object({
type: z.literal("scrape"),
}),
z.object({
type: z.literal("executeJavascript"),
script: z.string(),
}),
]),
).refine(
(actions) => actions.length <= MAX_ACTIONS,
{
}),
z.object({
type: z.literal("scrape"),
}),
z.object({
type: z.literal("executeJavascript"),
script: z.string(),
}),
]),
)
.refine((actions) => actions.length <= MAX_ACTIONS, {
message: `Number of actions cannot exceed ${MAX_ACTIONS}`,
},
).refine(
(actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
},
);
})
.refine(
(actions) =>
calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
},
);
const baseScrapeOptions = z
.object({
@ -169,7 +176,14 @@ const baseScrapeOptions = z
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().optional(),
waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0),
waitFor: z
.number()
.int()
.nonnegative()
.finite()
.safe()
.max(60000)
.default(0),
// Deprecate this to jsonOptions
extract: extractOptions.optional(),
// New
@ -184,7 +198,10 @@ const baseScrapeOptions = z
.string()
.optional()
.refine(
(val) => !val || Object.keys(countries).includes(val.toUpperCase()) || val === "US-generic",
(val) =>
!val ||
Object.keys(countries).includes(val.toUpperCase()) ||
val === "US-generic",
{
message:
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
@ -267,18 +284,23 @@ const extractTransform = (obj) => {
}
return obj;
}
};
export const scrapeOptions = baseScrapeOptions.refine(
(obj) => {
if (!obj.actions) return true;
return calculateTotalWaitTime(obj.actions, obj.waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000;
},
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
}
).refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export const scrapeOptions = baseScrapeOptions
.refine(
(obj) => {
if (!obj.actions) return true;
return (
calculateTotalWaitTime(obj.actions, obj.waitFor) <=
ACTIONS_MAX_WAIT_TIME * 1000
);
},
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
},
)
.refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
@ -324,15 +346,26 @@ export const extractV1Options = z
__experimental_showSources: z.boolean().default(false),
showSources: z.boolean().default(false),
__experimental_cacheKey: z.string().optional(),
__experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional()
__experimental_cacheMode: z
.enum(["direct", "save", "load"])
.default("direct")
.optional(),
})
.strict(strictMessage)
.transform((obj) => ({
...obj,
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
}))
.refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
.refine(
(x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true),
extractRefineOpts,
)
.transform((x) => ({
...x,
scrapeOptions: x.scrapeOptions
? extractTransform(x.scrapeOptions)
: x.scrapeOptions,
}));
export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
@ -366,7 +399,9 @@ export const webhookSchema = z.preprocess(
url: z.string().url(),
headers: z.record(z.string(), z.string()).default({}),
metadata: z.record(z.string(), z.string()).default({}),
events: z.array(z.enum(["completed", "failed", "page", "started"])).default(["completed", "failed", "page", "started"]),
events: z
.array(z.enum(["completed", "failed", "page", "started"]))
.default(["completed", "failed", "page", "started"]),
})
.strict(strictMessage),
);
@ -435,8 +470,11 @@ export const crawlRequestSchema = crawlerOptions
limit: z.number().default(10000),
})
.strict(strictMessage)
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform((x) => ({
...x,
scrapeOptions: extractTransform(x.scrapeOptions),
}));
// export type CrawlRequest = {
// url: string;
@ -902,7 +940,8 @@ export const searchRequestSchema = z
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: baseScrapeOptions.extend({
scrapeOptions: baseScrapeOptions
.extend({
formats: z
.array(
z.enum([
@ -922,8 +961,11 @@ export const searchRequestSchema = z
.strict(
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
)
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform((x) => ({
...x,
scrapeOptions: extractTransform(x.scrapeOptions),
}));
export type SearchRequest = z.infer<typeof searchRequestSchema>;
export type SearchRequestInput = z.input<typeof searchRequestSchema>;
@ -943,3 +985,22 @@ export type TokenUsage = {
step?: string;
model?: string;
};
export const generateLLMsTextRequestSchema = z.object({
url: url.describe("The URL to generate text from"),
maxUrls: z
.number()
.min(1)
.max(100)
.default(10)
.describe("Maximum number of URLs to process"),
showFullText: z
.boolean()
.default(false)
.describe("Whether to show the full LLMs-full.txt in the response"),
__experimental_stream: z.boolean().optional(),
});
export type GenerateLLMsTextRequest = z.infer<
typeof generateLLMsTextRequestSchema
>;