mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 18:19:00 +08:00
Nick: fixes
This commit is contained in:
parent
d984b50400
commit
a60f3ff645
@ -1,19 +1,14 @@
|
|||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { RequestWithAuth } from "./types";
|
import {
|
||||||
|
GenerateLLMsTextRequest,
|
||||||
|
generateLLMsTextRequestSchema,
|
||||||
|
RequestWithAuth,
|
||||||
|
} from "./types";
|
||||||
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
|
import { getGenerateLlmsTxtQueue } from "../../services/queue-service";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
|
import { saveGeneratedLlmsTxt } from "../../lib/generate-llmstxt/generate-llmstxt-redis";
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
|
|
||||||
export const generateLLMsTextRequestSchema = z.object({
|
|
||||||
url: z.string().url().describe('The URL to generate text from'),
|
|
||||||
maxUrls: z.number().min(1).max(100).default(10).describe('Maximum number of URLs to process'),
|
|
||||||
showFullText: z.boolean().default(false).describe('Whether to show the full LLMs-full.txt in the response'),
|
|
||||||
__experimental_stream: z.boolean().optional(),
|
|
||||||
});
|
|
||||||
|
|
||||||
export type GenerateLLMsTextRequest = z.infer<typeof generateLLMsTextRequestSchema>;
|
|
||||||
|
|
||||||
export type GenerateLLMsTextResponse = {
|
export type GenerateLLMsTextResponse = {
|
||||||
success: boolean;
|
success: boolean;
|
||||||
id: string;
|
id: string;
|
||||||
@ -66,14 +61,18 @@ export async function generateLLMsTextController(
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
await getGenerateLlmsTxtQueue().add(generationId, {
|
await getGenerateLlmsTxtQueue().add(
|
||||||
...jobData,
|
generationId,
|
||||||
sentry: {
|
{
|
||||||
trace: Sentry.spanToTraceHeader(span),
|
...jobData,
|
||||||
baggage: Sentry.spanToBaggageHeader(span),
|
sentry: {
|
||||||
size,
|
trace: Sentry.spanToTraceHeader(span),
|
||||||
|
baggage: Sentry.spanToBaggageHeader(span),
|
||||||
|
size,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}, { jobId: generationId });
|
{ jobId: generationId },
|
||||||
|
);
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
|
@ -34,7 +34,10 @@ export const url = z.preprocess(
|
|||||||
.url()
|
.url()
|
||||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||||
.refine(
|
.refine(
|
||||||
(x) => /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(x),
|
(x) =>
|
||||||
|
/\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
|
||||||
|
x,
|
||||||
|
),
|
||||||
"URL must have a valid top-level domain or be a valid path",
|
"URL must have a valid top-level domain or be a valid path",
|
||||||
)
|
)
|
||||||
.refine((x) => {
|
.refine((x) => {
|
||||||
@ -69,7 +72,10 @@ export type ExtractOptions = z.infer<typeof extractOptions>;
|
|||||||
|
|
||||||
const ACTIONS_MAX_WAIT_TIME = 60;
|
const ACTIONS_MAX_WAIT_TIME = 60;
|
||||||
const MAX_ACTIONS = 50;
|
const MAX_ACTIONS = 50;
|
||||||
function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): number {
|
function calculateTotalWaitTime(
|
||||||
|
actions: any[] = [],
|
||||||
|
waitFor: number = 0,
|
||||||
|
): number {
|
||||||
const actionWaitTime = actions.reduce((acc, action) => {
|
const actionWaitTime = actions.reduce((acc, action) => {
|
||||||
if (action.type === "wait") {
|
if (action.type === "wait") {
|
||||||
if (action.milliseconds) {
|
if (action.milliseconds) {
|
||||||
@ -86,63 +92,64 @@ function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): numbe
|
|||||||
return waitFor + actionWaitTime;
|
return waitFor + actionWaitTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const actionsSchema = z.array(
|
export const actionsSchema = z
|
||||||
z.union([
|
.array(
|
||||||
z
|
z.union([
|
||||||
.object({
|
z
|
||||||
type: z.literal("wait"),
|
.object({
|
||||||
milliseconds: z.number().int().positive().finite().optional(),
|
type: z.literal("wait"),
|
||||||
|
milliseconds: z.number().int().positive().finite().optional(),
|
||||||
|
selector: z.string().optional(),
|
||||||
|
})
|
||||||
|
.refine(
|
||||||
|
(data) =>
|
||||||
|
(data.milliseconds !== undefined || data.selector !== undefined) &&
|
||||||
|
!(data.milliseconds !== undefined && data.selector !== undefined),
|
||||||
|
{
|
||||||
|
message:
|
||||||
|
"Either 'milliseconds' or 'selector' must be provided, but not both.",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("click"),
|
||||||
|
selector: z.string(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("screenshot"),
|
||||||
|
fullPage: z.boolean().default(false),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("write"),
|
||||||
|
text: z.string(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("press"),
|
||||||
|
key: z.string(),
|
||||||
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal("scroll"),
|
||||||
|
direction: z.enum(["up", "down"]).optional().default("down"),
|
||||||
selector: z.string().optional(),
|
selector: z.string().optional(),
|
||||||
})
|
}),
|
||||||
.refine(
|
z.object({
|
||||||
(data) =>
|
type: z.literal("scrape"),
|
||||||
(data.milliseconds !== undefined || data.selector !== undefined) &&
|
}),
|
||||||
!(data.milliseconds !== undefined && data.selector !== undefined),
|
z.object({
|
||||||
{
|
type: z.literal("executeJavascript"),
|
||||||
message:
|
script: z.string(),
|
||||||
"Either 'milliseconds' or 'selector' must be provided, but not both.",
|
}),
|
||||||
},
|
]),
|
||||||
),
|
)
|
||||||
z.object({
|
.refine((actions) => actions.length <= MAX_ACTIONS, {
|
||||||
type: z.literal("click"),
|
|
||||||
selector: z.string(),
|
|
||||||
}),
|
|
||||||
z.object({
|
|
||||||
type: z.literal("screenshot"),
|
|
||||||
fullPage: z.boolean().default(false),
|
|
||||||
}),
|
|
||||||
z.object({
|
|
||||||
type: z.literal("write"),
|
|
||||||
text: z.string(),
|
|
||||||
}),
|
|
||||||
z.object({
|
|
||||||
type: z.literal("press"),
|
|
||||||
key: z.string(),
|
|
||||||
}),
|
|
||||||
z.object({
|
|
||||||
type: z.literal("scroll"),
|
|
||||||
direction: z.enum(["up", "down"]).optional().default("down"),
|
|
||||||
selector: z.string().optional(),
|
|
||||||
}),
|
|
||||||
z.object({
|
|
||||||
type: z.literal("scrape"),
|
|
||||||
}),
|
|
||||||
z.object({
|
|
||||||
type: z.literal("executeJavascript"),
|
|
||||||
script: z.string(),
|
|
||||||
}),
|
|
||||||
]),
|
|
||||||
).refine(
|
|
||||||
(actions) => actions.length <= MAX_ACTIONS,
|
|
||||||
{
|
|
||||||
message: `Number of actions cannot exceed ${MAX_ACTIONS}`,
|
message: `Number of actions cannot exceed ${MAX_ACTIONS}`,
|
||||||
},
|
})
|
||||||
).refine(
|
.refine(
|
||||||
(actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
|
(actions) =>
|
||||||
{
|
calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
|
||||||
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
{
|
||||||
},
|
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
||||||
);
|
},
|
||||||
|
);
|
||||||
|
|
||||||
const baseScrapeOptions = z
|
const baseScrapeOptions = z
|
||||||
.object({
|
.object({
|
||||||
@ -169,7 +176,14 @@ const baseScrapeOptions = z
|
|||||||
excludeTags: z.string().array().optional(),
|
excludeTags: z.string().array().optional(),
|
||||||
onlyMainContent: z.boolean().default(true),
|
onlyMainContent: z.boolean().default(true),
|
||||||
timeout: z.number().int().positive().finite().safe().optional(),
|
timeout: z.number().int().positive().finite().safe().optional(),
|
||||||
waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0),
|
waitFor: z
|
||||||
|
.number()
|
||||||
|
.int()
|
||||||
|
.nonnegative()
|
||||||
|
.finite()
|
||||||
|
.safe()
|
||||||
|
.max(60000)
|
||||||
|
.default(0),
|
||||||
// Deprecate this to jsonOptions
|
// Deprecate this to jsonOptions
|
||||||
extract: extractOptions.optional(),
|
extract: extractOptions.optional(),
|
||||||
// New
|
// New
|
||||||
@ -184,7 +198,10 @@ const baseScrapeOptions = z
|
|||||||
.string()
|
.string()
|
||||||
.optional()
|
.optional()
|
||||||
.refine(
|
.refine(
|
||||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()) || val === "US-generic",
|
(val) =>
|
||||||
|
!val ||
|
||||||
|
Object.keys(countries).includes(val.toUpperCase()) ||
|
||||||
|
val === "US-generic",
|
||||||
{
|
{
|
||||||
message:
|
message:
|
||||||
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
||||||
@ -267,18 +284,23 @@ const extractTransform = (obj) => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return obj;
|
return obj;
|
||||||
}
|
};
|
||||||
|
|
||||||
export const scrapeOptions = baseScrapeOptions.refine(
|
export const scrapeOptions = baseScrapeOptions
|
||||||
(obj) => {
|
.refine(
|
||||||
if (!obj.actions) return true;
|
(obj) => {
|
||||||
return calculateTotalWaitTime(obj.actions, obj.waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000;
|
if (!obj.actions) return true;
|
||||||
},
|
return (
|
||||||
{
|
calculateTotalWaitTime(obj.actions, obj.waitFor) <=
|
||||||
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
ACTIONS_MAX_WAIT_TIME * 1000
|
||||||
}
|
);
|
||||||
).refine(extractRefine, extractRefineOpts)
|
},
|
||||||
.transform(extractTransform);
|
{
|
||||||
|
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.refine(extractRefine, extractRefineOpts)
|
||||||
|
.transform(extractTransform);
|
||||||
|
|
||||||
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
|
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
|
||||||
|
|
||||||
@ -324,15 +346,26 @@ export const extractV1Options = z
|
|||||||
__experimental_showSources: z.boolean().default(false),
|
__experimental_showSources: z.boolean().default(false),
|
||||||
showSources: z.boolean().default(false),
|
showSources: z.boolean().default(false),
|
||||||
__experimental_cacheKey: z.string().optional(),
|
__experimental_cacheKey: z.string().optional(),
|
||||||
__experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional()
|
__experimental_cacheMode: z
|
||||||
|
.enum(["direct", "save", "load"])
|
||||||
|
.default("direct")
|
||||||
|
.optional(),
|
||||||
})
|
})
|
||||||
.strict(strictMessage)
|
.strict(strictMessage)
|
||||||
.transform((obj) => ({
|
.transform((obj) => ({
|
||||||
...obj,
|
...obj,
|
||||||
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
|
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
|
||||||
}))
|
}))
|
||||||
.refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
|
.refine(
|
||||||
.transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
|
(x) => (x.scrapeOptions ? extractRefine(x.scrapeOptions) : true),
|
||||||
|
extractRefineOpts,
|
||||||
|
)
|
||||||
|
.transform((x) => ({
|
||||||
|
...x,
|
||||||
|
scrapeOptions: x.scrapeOptions
|
||||||
|
? extractTransform(x.scrapeOptions)
|
||||||
|
: x.scrapeOptions,
|
||||||
|
}));
|
||||||
|
|
||||||
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
||||||
export const extractRequestSchema = extractV1Options;
|
export const extractRequestSchema = extractV1Options;
|
||||||
@ -366,7 +399,9 @@ export const webhookSchema = z.preprocess(
|
|||||||
url: z.string().url(),
|
url: z.string().url(),
|
||||||
headers: z.record(z.string(), z.string()).default({}),
|
headers: z.record(z.string(), z.string()).default({}),
|
||||||
metadata: z.record(z.string(), z.string()).default({}),
|
metadata: z.record(z.string(), z.string()).default({}),
|
||||||
events: z.array(z.enum(["completed", "failed", "page", "started"])).default(["completed", "failed", "page", "started"]),
|
events: z
|
||||||
|
.array(z.enum(["completed", "failed", "page", "started"]))
|
||||||
|
.default(["completed", "failed", "page", "started"]),
|
||||||
})
|
})
|
||||||
.strict(strictMessage),
|
.strict(strictMessage),
|
||||||
);
|
);
|
||||||
@ -435,8 +470,11 @@ export const crawlRequestSchema = crawlerOptions
|
|||||||
limit: z.number().default(10000),
|
limit: z.number().default(10000),
|
||||||
})
|
})
|
||||||
.strict(strictMessage)
|
.strict(strictMessage)
|
||||||
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
|
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||||
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
|
.transform((x) => ({
|
||||||
|
...x,
|
||||||
|
scrapeOptions: extractTransform(x.scrapeOptions),
|
||||||
|
}));
|
||||||
|
|
||||||
// export type CrawlRequest = {
|
// export type CrawlRequest = {
|
||||||
// url: string;
|
// url: string;
|
||||||
@ -902,7 +940,8 @@ export const searchRequestSchema = z
|
|||||||
location: z.string().optional(),
|
location: z.string().optional(),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||||
scrapeOptions: baseScrapeOptions.extend({
|
scrapeOptions: baseScrapeOptions
|
||||||
|
.extend({
|
||||||
formats: z
|
formats: z
|
||||||
.array(
|
.array(
|
||||||
z.enum([
|
z.enum([
|
||||||
@ -922,8 +961,11 @@ export const searchRequestSchema = z
|
|||||||
.strict(
|
.strict(
|
||||||
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
|
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
|
||||||
)
|
)
|
||||||
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
|
.refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
|
||||||
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
|
.transform((x) => ({
|
||||||
|
...x,
|
||||||
|
scrapeOptions: extractTransform(x.scrapeOptions),
|
||||||
|
}));
|
||||||
|
|
||||||
export type SearchRequest = z.infer<typeof searchRequestSchema>;
|
export type SearchRequest = z.infer<typeof searchRequestSchema>;
|
||||||
export type SearchRequestInput = z.input<typeof searchRequestSchema>;
|
export type SearchRequestInput = z.input<typeof searchRequestSchema>;
|
||||||
@ -943,3 +985,22 @@ export type TokenUsage = {
|
|||||||
step?: string;
|
step?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const generateLLMsTextRequestSchema = z.object({
|
||||||
|
url: url.describe("The URL to generate text from"),
|
||||||
|
maxUrls: z
|
||||||
|
.number()
|
||||||
|
.min(1)
|
||||||
|
.max(100)
|
||||||
|
.default(10)
|
||||||
|
.describe("Maximum number of URLs to process"),
|
||||||
|
showFullText: z
|
||||||
|
.boolean()
|
||||||
|
.default(false)
|
||||||
|
.describe("Whether to show the full LLMs-full.txt in the response"),
|
||||||
|
__experimental_stream: z.boolean().optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type GenerateLLMsTextRequest = z.infer<
|
||||||
|
typeof generateLLMsTextRequestSchema
|
||||||
|
>;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user