mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 17:59:07 +08:00
Nick: fixes
This commit is contained in:
parent
d984b50400
commit
a60f3ff645
@ -1,19 +1,14 @@
|
||||
import { Response } from "express";
|
||||
import { RequestWithAuth } from "./types";
|
||||
import {
|
||||
GenerateLLMsTextRequest,
|
||||
generateLLMsTextRequestSchema,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
|
||||
import { z } from "zod";
|
||||
|
||||
export const generateLLMsTextRequestSchema = z.object({
|
||||
url: z.string().url().describe('The URL to generate text from'),
|
||||
maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
|
||||
showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
|
||||
__experimental_stream: z.boolean().optional(),
|
||||
});
|
||||
|
||||
export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;
|
||||
|
||||
export type GenerateLLMsTextResponse = {
|
||||
success: boolean;
|
||||
id: string;
|
||||
@ -66,14 +61,18 @@ export async function generateLLMsTextController(
|
||||
},
|
||||
},
|
||||
async (span) => {
|
||||
await getGenerateLlmsTxtQueue().add(generationId, {
|
||||
...jobData,
|
||||
sentry: {
|
||||
trace: Sentry.spanToTraceHeader(span),
|
||||
baggage: Sentry.spanToBaggageHeader(span),
|
||||
size,
|
||||
await getGenerateLlmsTxtQueue().add(
|
||||
generationId,
|
||||
{
|
||||
...jobData,
|
||||
sentry: {
|
||||
trace: Sentry.spanToTraceHeader(span),
|
||||
baggage: Sentry.spanToBaggageHeader(span),
|
||||
size,
|
||||
},
|
||||
},
|
||||
}, { jobId: generationId });
|
||||
{ jobId: generationId },
|
||||
);
|
||||
},
|
||||
);
|
||||
} else {
|
||||
|
@ -34,7 +34,10 @@ export const url = z.preprocess(
|
||||
.url()
|
||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||
.refine(
|
||||
(x) => /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(x),
|
||||
(x) =>
|
||||
/\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
|
||||
x,
|
||||
),
|
||||
"URL must have a valid top-level domain or be a valid path",
|
||||
)
|
||||
.refine((x) => {
|
||||
@ -69,7 +72,10 @@ export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||
|
||||
const ACTIONS_MAX_WAIT_TIME = 60;
|
||||
const MAX_ACTIONS = 50;
|
||||
function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): number {
|
||||
function calculateTotalWaitTime(
|
||||
actions: any[] = [],
|
||||
waitFor: number = 0,
|
||||
): number {
|
||||
const actionWaitTime = actions.reduce((acc, action) => {
|
||||
if (action.type === "wait") {
|
||||
if (action.milliseconds) {
|
||||
@ -86,63 +92,64 @@ function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): numbe
|
||||
return waitFor + actionWaitTime;
|
||||
}
|
||||
|
||||
export const actionsSchema = z.array(
|
||||
z.union([
|
||||
z
|
||||
.object({
|
||||
type: z.literal("wait"),
|
||||
milliseconds: z.number().int().positive().finite().optional(),
|
||||
export const actionsSchema = z
|
||||
.array(
|
||||
z.union([
|
||||
z
|
||||
.object({
|
||||
type: z.literal("wait"),
|
||||
milliseconds: z.number().int().positive().finite().optional(),
|
||||
selector: z.string().optional(),
|
||||
})
|
||||
.refine(
|
||||
(data) =>
|
||||
(data.milliseconds !== undefined || data.selector !== undefined) &&
|
||||
!(data.milliseconds !== undefined && data.selector !== undefined),
|
||||
{
|
||||
message:
|
||||
"Either 'milliseconds' or 'selector' must be provided, but not both.",
|
||||
},
|
||||
),
|
||||
z.object({
|
||||
type: z.literal("click"),
|
||||
selector: z.string(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("screenshot"),
|
||||
fullPage: z.boolean().default(false),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("write"),
|
||||
text: z.string(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("press"),
|
||||
key: z.string(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("scroll"),
|
||||
direction: z.enum(["up", "down"]).optional().default("down"),
|
||||
selector: z.string().optional(),
|
||||
})
|
||||
.refine(
|
||||
(data) =>
|
||||
(data.milliseconds !== undefined || data.selector !== undefined) &&
|
||||
!(data.milliseconds !== undefined && data.selector !== undefined),
|
||||
{
|
||||
message:
|
||||
"Either 'milliseconds' or 'selector' must be provided, but not both.",
|
||||
},
|
||||
),
|
||||
z.object({
|
||||
type: z.literal("click"),
|
||||
selector: z.string(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("screenshot"),
|
||||
fullPage: z.boolean().default(false),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("write"),
|
||||
text: z.string(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("press"),
|
||||
key: z.string(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("scroll"),
|
||||
direction: z.enum(["up", "down"]).optional().default("down"),
|
||||
selector: z.string().optional(),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("scrape"),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("executeJavascript"),
|
||||
script: z.string(),
|
||||
}),
|
||||
]),
|
||||
).refine(
|
||||
(actions) => actions.length <= MAX_ACTIONS,
|
||||
{
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("scrape"),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("executeJavascript"),
|
||||
script: z.string(),
|
||||
}),
|
||||
]),
|
||||
)
|
||||
.refine((actions) => actions.length <= MAX_ACTIONS, {
|
||||
message: `Number of actions cannot exceed ${MAX_ACTIONS}`,
|
||||
},
|
||||
).refine(
|
||||
(actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
|
||||
{
|
||||
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
||||
},
|
||||
);
|
||||
})
|
||||
.refine(
|
||||
(actions) =>
|
||||
calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
|
||||
{
|
||||
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
||||
},
|
||||
);
|
||||
|
||||
const baseScrapeOptions = z
|
||||
.object({
|
||||
@ -169,7 +176,14 @@ const baseScrapeOptions = z
|
||||
excludeTags: z.string().array().optional(),
|
||||
onlyMainContent: z.boolean().default(true),
|
||||
timeout: z.number().int().positive().finite().safe().optional(),
|
||||
waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0),
|
||||
waitFor: z
|
||||
.number()
|
||||
.int()
|
||||
.nonnegative()
|
||||
.finite()
|
||||
.safe()
|
||||
.max(60000)
|
||||
.default(0),
|
||||
// Deprecate this to jsonOptions
|
||||
extract: extractOptions.optional(),
|
||||
// New
|
||||
@ -184,7 +198,10 @@ const baseScrapeOptions = z
|
||||
.string()
|
||||
.optional()
|
||||
.refine(
|
||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()) || val === "US-generic",
|
||||
(val) =>
|
||||
!val ||
|
||||
Object.keys(countries).includes(val.toUpperCase()) ||
|
||||
val === "US-generic",
|
||||
{
|
||||
message:
|
||||
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
||||
@ -267,18 +284,23 @@ const extractTransform = (obj) => {
|
||||
}
|
||||
|
||||
return obj;
|
||||
}
|
||||
};
|
||||
|
||||
export const scrapeOptions = baseScrapeOptions.refine(
|
||||
(obj) => {
|
||||
if (!obj.actions) return true;
|
||||
return calculateTotalWaitTime(obj.actions, obj.waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000;
|
||||
},
|
||||
{
|
||||
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
||||
}
|
||||
).refine(extractRefine, extractRefineOpts)
|
||||
.transform(extractTransform);
|
||||
export const scrapeOptions = baseScrapeOptions
|
||||
.refine(
|
||||
(obj) => {
|
||||
if (!obj.actions) return true;
|
||||
return (
|
||||
calculateTotalWaitTime(obj.actions, obj.waitFor) <=
|
||||
ACTIONS_MAX_WAIT_TIME * 1000
|
||||
);
|
||||
},
|
||||
{
|
||||
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
||||
},
|
||||
)
|
||||
.refine(extractRefine, extractRefineOpts)
|
||||
.transform(extractTransform);
|
||||
|
||||
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
|
||||
|
||||
@ -324,15 +346,26 @@ export const extractV1Options = z
|
||||
__experimental_showSources: z.boolean().default(false),
|
||||
showSources: z.boolean().default(false),
|
||||
__experimental_cacheKey: z.string().optional(),
|
||||
__experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional()
|
||||
__experimental_cacheMode: z
|
||||
.enum(["direct", "save", "load"])
|
||||
.default("direct")
|
||||
.optional(),
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.transform((obj) => ({
|
||||
...obj,
|
||||
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
|
||||
}))
|
||||
.refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
|
||||
.transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
|
||||
.refine(
|
||||
(x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true),
|
||||
extractRefineOpts,
|
||||
)
|
||||
.transform((x) => ({
|
||||
...x,
|
||||
scrapeOptions: x.scrapeOptions
|
||||
? extractTransform(x.scrapeOptions)
|
||||
: x.scrapeOptions,
|
||||
}));
|
||||
|
||||
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
||||
export const extractRequestSchema = extractV1Options;
|
||||
@ -366,7 +399,9 @@ export const webhookSchema = z.preprocess(
|
||||
url: z.string().url(),
|
||||
headers: z.record(z.string(), z.string()).default({}),
|
||||
metadata: z.record(z.string(), z.string()).default({}),
|
||||
events: z.array(z.enum(["completed", "failed", "page", "started"])).default(["completed", "failed", "page", "started"]),
|
||||
events: z
|
||||
.array(z.enum(["completed", "failed", "page", "started"]))
|
||||
.default(["completed", "failed", "page", "started"]),
|
||||
})
|
||||
.strict(strictMessage),
|
||||
);
|
||||
@ -435,8 +470,11 @@ export const crawlRequestSchema = crawlerOptions
|
||||
limit: z.number().default(10000),
|
||||
})
|
||||
.strict(strictMessage)
|
||||
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
|
||||
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||
.transform((x) => ({
|
||||
...x,
|
||||
scrapeOptions: extractTransform(x.scrapeOptions),
|
||||
}));
|
||||
|
||||
// export type CrawlRequest = {
|
||||
// url: string;
|
||||
@ -902,7 +940,8 @@ export const searchRequestSchema = z
|
||||
location: z.string().optional(),
|
||||
origin: z.string().optional().default("api"),
|
||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||
scrapeOptions: baseScrapeOptions.extend({
|
||||
scrapeOptions: baseScrapeOptions
|
||||
.extend({
|
||||
formats: z
|
||||
.array(
|
||||
z.enum([
|
||||
@ -922,8 +961,11 @@ export const searchRequestSchema = z
|
||||
.strict(
|
||||
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
|
||||
)
|
||||
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
|
||||
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||
.transform((x) => ({
|
||||
...x,
|
||||
scrapeOptions: extractTransform(x.scrapeOptions),
|
||||
}));
|
||||
|
||||
export type SearchRequest = z.infer<typeof searchRequestSchema>;
|
||||
export type SearchRequestInput = z.input<typeof searchRequestSchema>;
|
||||
@ -943,3 +985,22 @@ export type TokenUsage = {
|
||||
step?: string;
|
||||
model?: string;
|
||||
};
|
||||
|
||||
export const generateLLMsTextRequestSchema = z.object({
|
||||
url: url.describe("The URL to generate text from"),
|
||||
maxUrls: z
|
||||
.number()
|
||||
.min(1)
|
||||
.max(100)
|
||||
.default(10)
|
||||
.describe("Maximum number of URLs to process"),
|
||||
showFullText: z
|
||||
.boolean()
|
||||
.default(false)
|
||||
.describe("Whether to show the full LLMs-full.txt in the response"),
|
||||
__experimental_stream: z.boolean().optional(),
|
||||
});
|
||||
|
||||
export type GenerateLLMsTextRequest = z.infer<
|
||||
typeof generateLLMsTextRequestSchema
|
||||
>;
|
||||
|
Loading…
x
Reference in New Issue
Block a user