[FIR-796] feat(api/types): Add action and wait time validation for scrape requests

- Implement max actions limit (15 actions)
- Add total wait time calculation for actions
- Increase max waitFor time to 60 seconds
- Refactor scrape options schema to include validation
This commit is contained in:
Ademílson F. Tonato 2025-02-07 21:16:38 +00:00
parent 2b7b7400f6
commit 843cec971d
No known key found for this signature in database
GPG Key ID: 169C7BE271C9FA3A
4 changed files with 333 additions and 10 deletions

View File

@ -67,6 +67,25 @@ export const extractOptions = z
export type ExtractOptions = z.infer<typeof extractOptions>;
const ACTIONS_MAX_WAIT_TIME = 60;
const MAX_ACTIONS = 15;
/**
 * Sums the wait time, in milliseconds, requested by `waitFor` plus any `wait` actions.
 *
 * Only entries whose `type` is `"wait"` contribute:
 * - a numeric, non-zero `milliseconds` adds that amount;
 * - otherwise a truthy `selector` is counted as a flat 1000 ms.
 * Every other action type adds nothing.
 *
 * The list is typed as `unknown[]` and narrowed at runtime so that malformed
 * entries (e.g. `null`, or a string `milliseconds`) can neither throw nor turn
 * the numeric sum into a concatenated string.
 *
 * @param actions - Action objects to inspect; non-object entries are ignored.
 * @param waitFor - Base wait in milliseconds added on top of the action total.
 * @returns `waitFor` plus the accumulated wait-action time, in milliseconds.
 */
function calculateTotalWaitTime(
  actions: readonly unknown[] = [],
  waitFor: number = 0,
): number {
  // Selector waits have no explicit duration, so they are budgeted as 1 second.
  const selectorWaitMs = 1000;
  let actionWaitTime = 0;

  for (const action of actions) {
    if (typeof action !== "object" || action === null) {
      continue;
    }
    const entry = action as Record<string, unknown>;
    if (entry.type !== "wait") {
      continue;
    }

    const { milliseconds, selector } = entry;
    if (
      typeof milliseconds === "number" &&
      milliseconds !== 0 &&
      !Number.isNaN(milliseconds)
    ) {
      // An explicit duration takes precedence over a selector.
      actionWaitTime += milliseconds;
    } else if (selector) {
      actionWaitTime += selectorWaitMs;
    }
  }

  return waitFor + actionWaitTime;
}
export const actionsSchema = z.array(
z.union([
z
@ -113,9 +132,19 @@ export const actionsSchema = z.array(
script: z.string(),
}),
]),
).refine(
(actions) => actions.length <= MAX_ACTIONS,
{
message: `Maximum of ${MAX_ACTIONS} actions allowed`,
},
).refine(
(actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
},
);
export const scrapeOptions = z
const baseScrapeOptions = z
.object({
formats: z
.enum([
@ -140,7 +169,7 @@ export const scrapeOptions = z
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().optional(),
waitFor: z.number().int().nonnegative().finite().safe().max(30000).default(0),
waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0),
// Deprecate this to jsonOptions
extract: extractOptions.optional(),
// New
@ -191,7 +220,17 @@ export const scrapeOptions = z
})
.strict(strictMessage);
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
/**
 * Scrape options schema: `baseScrapeOptions` plus a cross-field rule that the
 * combined wait (`waitFor` + `wait` actions, as computed by
 * `calculateTotalWaitTime`) stays within `ACTIONS_MAX_WAIT_TIME` seconds.
 * Objects without `actions` skip the combined check.
 *
 * NOTE(review): the request schemas visible in this change are derived from
 * `baseScrapeOptions`, not from this refined schema — confirm they enforce the
 * combined limit themselves.
 */
export const scrapeOptions = baseScrapeOptions.refine(
  ({ actions, waitFor }) =>
    !actions ||
    calculateTotalWaitTime(actions, waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000,
  {
    message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
  },
);
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
import Ajv from "ajv";
@ -246,7 +285,7 @@ export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;
export const scrapeRequestSchema = scrapeOptions
export const scrapeRequestSchema = baseScrapeOptions
.omit({ timeout: true })
.extend({
url,
@ -325,7 +364,7 @@ export const webhookSchema = z.preprocess(
.strict(strictMessage),
);
export const batchScrapeRequestSchema = scrapeOptions
export const batchScrapeRequestSchema = baseScrapeOptions
.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
@ -349,7 +388,7 @@ export const batchScrapeRequestSchema = scrapeOptions
},
);
export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
.extend({
urls: z.string().array(),
origin: z.string().optional().default("api"),
@ -876,8 +915,7 @@ export const searchRequestSchema = z
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: scrapeOptions
.extend({
scrapeOptions: baseScrapeOptions.extend({
formats: z
.array(
z.enum([

View File

@ -6,7 +6,8 @@
"test:suite": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
"test:load": "artillery run --output ./load-test-results/test-run-report.json load-test.yml",
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts",
"test:schema-validation": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/schema-validation.test.ts"
},
"author": "",
"license": "ISC",
@ -22,9 +23,11 @@
"ts-jest": "^29.1.2"
},
"devDependencies": {
"@jest/globals": "^29.7.0",
"@types/jest": "^29.5.12",
"@types/supertest": "^6.0.2",
"artillery": "^2.0.19",
"typescript": "^5.4.5"
"typescript": "^5.4.5",
"zod": "^3.24.1"
}
}

View File

@ -36,6 +36,9 @@ importers:
specifier: ^29.1.2
version: 29.1.5(@babel/core@7.24.5)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.5))(jest@29.7.0(@types/node@20.14.9)(ts-node@10.9.2(@types/node@20.14.9)(typescript@5.4.5)))(typescript@5.4.5)
devDependencies:
'@jest/globals':
specifier: ^29.7.0
version: 29.7.0
'@types/jest':
specifier: ^29.5.12
version: 29.5.12
@ -48,6 +51,9 @@ importers:
typescript:
specifier: ^5.4.5
version: 5.4.5
zod:
specifier: ^3.24.1
version: 3.24.1
packages:
@ -4315,6 +4321,9 @@ packages:
resolution: {integrity: sha512-9qv4rlDiopXg4E69k+vMHjNN63YFMe9sZMrdlvKnCjlCRWeCBswPPMPUfx+ipsAWq1LXHe70RcbaHdJJpS6hyQ==}
engines: {node: '>= 10'}
zod@3.24.1:
resolution: {integrity: sha512-muH7gBL9sI1nciMZV67X5fTKKBLtwpZ5VBp1vsOQzj1MhrBZ4wlVCm3gedKZWLp0Oyel8sIGfeiz54Su+OVT+A==}
snapshots:
'@alcalzone/ansi-tokenize@0.1.3':
@ -9997,3 +10006,5 @@ snapshots:
archiver-utils: 3.0.4
compress-commons: 4.1.2
readable-stream: 3.6.2
zod@3.24.1: {}

View File

@ -0,0 +1,271 @@
import {actionsSchema, scrapeOptions} from '../../api/src/controllers/v1/types';
import {describe, it, expect} from '@jest/globals';
describe('Schema Validation Tests', () => {
describe('Actions Schema Validation', () => {
// One action of every supported type, well under the count and wait-time limits.
it('should allow valid actions within limits', () => {
  const {success} = actionsSchema.safeParse([
    {type: 'wait', milliseconds: 1000},
    {type: 'click', selector: '#button'},
    {type: 'screenshot', fullPage: false},
    {type: 'write', text: 'olá - hello'},
    {type: 'press', key: 'Enter'},
    {type: 'scroll', direction: 'down'},
    {type: 'scrape'},
    {type: 'executeJavascript', script: 'console.log("test")'},
  ]);
  expect(success).toBe(true);
});
// Sixteen actions is one past the limit, so the count refinement must reject the list.
it('should reject more than 15 actions', () => {
  const oversized = Array.from({length: 16}, () => ({type: 'click', selector: '#button'}));
  const parsed = actionsSchema.safeParse(oversized);
  expect(parsed.success).toBe(false);
  if (parsed.success) {
    return;
  }
  expect(parsed.error.errors[0].message).toBe('Maximum of 15 actions allowed');
});
// Covers the `wait` action: its two mutually exclusive forms (`milliseconds`
// or `selector`) and the 60-second cap on the summed wait time of an action list.
describe('Wait Action Validations', () => {
  it('should validate wait with milliseconds', () => {
    const validWait = [{type: 'wait', milliseconds: 1000}];
    expect(actionsSchema.safeParse(validWait).success).toBe(true);
    const invalidWait = [{type: 'wait', milliseconds: -1000}];
    expect(actionsSchema.safeParse(invalidWait).success).toBe(false);
  });
  it('should validate wait with selector', () => {
    const validWait = [{type: 'wait', selector: '#element'}];
    expect(actionsSchema.safeParse(validWait).success).toBe(true);
  });
  it('should reject wait with both milliseconds and selector', () => {
    const invalidWait = [{type: 'wait', milliseconds: 1000, selector: '#element'}];
    const result = actionsSchema.safeParse(invalidWait);
    expect(result.success).toBe(false);
    if (!result.success) {
      expect(result.error.errors[0].message).toBe(
        "Either 'milliseconds' or 'selector' must be provided, but not both.",
      );
    }
  });
  it('should reject wait without either milliseconds or selector', () => {
    const invalidWait = [{type: 'wait'}];
    const result = actionsSchema.safeParse(invalidWait);
    expect(result.success).toBe(false);
  });
  it('should reject when total wait time exceeds 60 seconds', () => {
    const longWaitActions = [
      {type: 'wait', milliseconds: 50000},
      {type: 'wait', milliseconds: 11000},
    ];
    const result = actionsSchema.safeParse(longWaitActions);
    expect(result.success).toBe(false);
    if (!result.success) {
      expect(result.error.errors[0].message).toBe(
        'Total wait time (waitFor + wait actions) cannot exceed 60 seconds',
      );
    }
  });
  it('should count selector waits as 1 second each', () => {
    // 15 selector waits = 15 seconds total, should pass.
    // These must be `wait` actions: only `wait` entries add to the total, so
    // `click` actions with a selector would not exercise the 1-second rule.
    const maxWaitSelectors = Array(15).fill({type: 'wait', selector: '#element'});
    const result = actionsSchema.safeParse(maxWaitSelectors);
    expect(result.success).toBe(true);
    // 57 s + three 1 s selector waits lands exactly on the 60 s limit, should pass
    const atLimit = [
      {type: 'wait', milliseconds: 57000}, // 57 seconds
      {type: 'wait', selector: '#element'}, // 1 second
      {type: 'wait', selector: '#load-more-button'}, // 1 second
      {type: 'wait', selector: '#last'}, // 1 second, exactly 60 seconds total
    ];
    expect(actionsSchema.safeParse(atLimit).success).toBe(true);
    // Test the time limit with mixed waits
    const mixedWaits = [
      {type: 'wait', milliseconds: 58000}, // 58 seconds
      {type: 'wait', selector: '#element'}, // 1 second
      {type: 'wait', selector: '#load-more-button'}, // 1 second
      {type: 'wait', selector: '#toomuch'}, // 1 second, exceeds 60 seconds total
    ];
    const timeFailResult = actionsSchema.safeParse(mixedWaits);
    expect(timeFailResult.success).toBe(false);
    if (!timeFailResult.success) {
      expect(timeFailResult.error.errors[0].message).toBe(
        'Total wait time (waitFor + wait actions) cannot exceed 60 seconds',
      );
    }
  });
});
// For each non-wait action type: a well-formed action parses, and (where the
// type has a required field) an action missing that field is rejected.
describe('Other Action Type Validations', () => {
  // True when the given raw action list passes actionsSchema.
  const parses = (actions: unknown): boolean => actionsSchema.safeParse(actions).success;

  it('should validate click action', () => {
    expect(parses([{type: 'click', selector: '#button'}])).toBe(true);
    expect(parses([{type: 'click'}])).toBe(false);
  });

  it('should validate screenshot action', () => {
    expect(parses([{type: 'screenshot', fullPage: true}])).toBe(true);
    expect(parses([{type: 'screenshot', fullPage: false}])).toBe(true);
  });

  it('should validate write action', () => {
    expect(parses([{type: 'write', text: 'hello'}])).toBe(true);
    expect(parses([{type: 'write'}])).toBe(false);
  });

  it('should validate press action', () => {
    expect(parses([{type: 'press', key: 'Enter'}])).toBe(true);
    expect(parses([{type: 'press'}])).toBe(false);
  });

  it('should validate scroll action', () => {
    expect(parses([{type: 'scroll', direction: 'up', selector: '#element'}])).toBe(true);
    expect(parses([{type: 'scroll', direction: 'down'}])).toBe(true);
    // Only the directions accepted above are exercised; 'left' must be rejected.
    expect(parses([{type: 'scroll', direction: 'left'}])).toBe(false);
  });

  it('should validate scrape action', () => {
    expect(parses([{type: 'scrape'}])).toBe(true);
  });

  it('should validate executeJavascript action', () => {
    expect(parses([{type: 'executeJavascript', script: 'console.log("test")'}])).toBe(true);
    expect(parses([{type: 'executeJavascript'}])).toBe(false);
  });
});
});
describe('Scrape Options Schema Validation', () => {
// waitFor alone: 60 seconds is the highest accepted value, 61 seconds is rejected.
it('should validate waitFor limit', () => {
  const atLimit = scrapeOptions.safeParse({waitFor: 60000});
  expect(atLimit.success).toBe(true);

  const overLimit = scrapeOptions.safeParse({waitFor: 61000});
  expect(overLimit.success).toBe(false);
});
// Checks the cross-field rule on scrapeOptions: waitFor plus the wait actions
// (selector waits counted as 1 second) may total at most 60 seconds.
describe('Combined Wait Time Validations', () => {
  // Asserts that the options are rejected and that the first reported issue
  // carries the combined wait-time message.
  const expectWaitLimitRejection = (options: unknown): void => {
    const outcome = scrapeOptions.safeParse(options);
    expect(outcome.success).toBe(false);
    if (outcome.success) {
      return;
    }
    expect(outcome.error.errors[0].message).toBe(
      'Total wait time (waitFor + wait actions) cannot exceed 60 seconds',
    );
  };

  it('should validate combined waitFor and actions wait time', () => {
    // 30 s + 29 s + 1 s = 60 s: exactly at the limit, accepted
    const atLimit = scrapeOptions.safeParse({
      waitFor: 30000,
      actions: [
        {type: 'wait', milliseconds: 29000},
        {type: 'wait', selector: '#element'},
      ],
    });
    expect(atLimit.success).toBe(true);

    // 30 s + 29 s + 1 s + 1 s = 61 s: over the limit, rejected
    expectWaitLimitRejection({
      waitFor: 30000,
      actions: [
        {type: 'wait', milliseconds: 29000},
        {type: 'wait', selector: '#element'},
        {type: 'wait', selector: '#another'},
      ],
    });
  });

  it('should handle edge cases of combined wait times', () => {
    // waitFor alone at the limit; the write action adds no wait time
    const onlyWaitFor = scrapeOptions.safeParse({
      waitFor: 60000,
      actions: [{type: 'write', text: 'Olá galera!'}],
    });
    expect(onlyWaitFor.success).toBe(true);

    // action waits alone at the limit: 59 s + 1 s
    const onlyActionWaits = scrapeOptions.safeParse({
      waitFor: 0,
      actions: [
        {type: 'wait', milliseconds: 59000},
        {type: 'wait', selector: '#element'},
      ],
    });
    expect(onlyActionWaits.success).toBe(true);

    // 30 s + 30 s + 1 s = 61 s: one second over, rejected
    expectWaitLimitRejection({
      waitFor: 30000,
      actions: [
        {type: 'wait', milliseconds: 30000},
        {type: 'wait', selector: '#element'},
      ],
    });
  });
});
// Exercises the `formats` option: the two screenshot variants are each valid
// alone but not together, and an omitted `formats` falls back to markdown.
describe('Format Validations', () => {
  it('should validate screenshot format combinations', () => {
    const accepts = (formats: string[]): boolean =>
      scrapeOptions.safeParse({formats}).success;

    expect(accepts(['screenshot'])).toBe(true);
    expect(accepts(['screenshot@fullPage'])).toBe(true);
    expect(accepts(['screenshot', 'screenshot@fullPage'])).toBe(false);
  });

  it('should default to markdown format', () => {
    const parsed = scrapeOptions.safeParse({});
    expect(parsed.success).toBe(true);
    if (!parsed.success) {
      return;
    }
    expect(parsed.data.formats).toEqual(['markdown']);
  });
});
});
});