From f0318453005149335e2f0226d7a32643d058f76b Mon Sep 17 00:00:00 2001 From: Srikanth Chekuri Date: Thu, 8 Aug 2024 17:34:25 +0530 Subject: [PATCH] chore: make eval delay configurable (#5649) --- ee/query-service/app/server.go | 1 + pkg/query-service/app/server.go | 1 + pkg/query-service/constants/constants.go | 9 +++ pkg/query-service/rules/manager.go | 6 +- pkg/query-service/rules/thresholdRule.go | 25 ++++++-- pkg/query-service/rules/thresholdRule_test.go | 59 +++++++++++++++++-- 6 files changed, 89 insertions(+), 12 deletions(-) diff --git a/ee/query-service/app/server.go b/ee/query-service/app/server.go index c41788e971..a016d18d7e 100644 --- a/ee/query-service/app/server.go +++ b/ee/query-service/app/server.go @@ -728,6 +728,7 @@ func makeRulesManager( DisableRules: disableRules, FeatureFlags: fm, Reader: ch, + EvalDelay: baseconst.GetEvalDelay(), } // create Manager diff --git a/pkg/query-service/app/server.go b/pkg/query-service/app/server.go index 99fe166a70..0e6ca010ab 100644 --- a/pkg/query-service/app/server.go +++ b/pkg/query-service/app/server.go @@ -714,6 +714,7 @@ func makeRulesManager( DisableRules: disableRules, FeatureFlags: fm, Reader: ch, + EvalDelay: constants.GetEvalDelay(), } // create Manager diff --git a/pkg/query-service/constants/constants.go b/pkg/query-service/constants/constants.go index e7e3bcf2a7..4b5134c6ee 100644 --- a/pkg/query-service/constants/constants.go +++ b/pkg/query-service/constants/constants.go @@ -152,6 +152,15 @@ func GetContextTimeoutMaxAllowed() time.Duration { return contextTimeoutDuration } +func GetEvalDelay() time.Duration { + evalDelayStr := GetOrDefaultEnv("RULES_EVAL_DELAY", "2m") + evalDelayDuration, err := time.ParseDuration(evalDelayStr) + if err != nil { + return 0 + } + return evalDelayDuration +} + var ContextTimeoutMaxAllowed = GetContextTimeoutMaxAllowed() const ( diff --git a/pkg/query-service/rules/manager.go b/pkg/query-service/rules/manager.go index 6050981de8..d660d863f1 100644 --- a/pkg/query-service/rules/manager.go +++ b/pkg/query-service/rules/manager.go @@ -63,6 +63,8 @@ type ManagerOptions struct { DisableRules bool FeatureFlags interfaces.FeatureLookup Reader interfaces.Reader + + EvalDelay time.Duration } // The Manager manages recording and alerting rules. @@ -524,7 +526,9 @@ func (m *Manager) prepareTask(acquireLock bool, r *PostableRule, taskName string tr, err := NewThresholdRule( ruleId, r, - ThresholdRuleOpts{}, + ThresholdRuleOpts{ + EvalDelay: m.opts.EvalDelay, + }, m.featureFlags, m.reader, ) diff --git a/pkg/query-service/rules/thresholdRule.go b/pkg/query-service/rules/thresholdRule.go index e642fb3a0a..5426c04449 100644 --- a/pkg/query-service/rules/thresholdRule.go +++ b/pkg/query-service/rules/thresholdRule.go @@ -75,6 +75,8 @@ type ThresholdRule struct { querier interfaces.Querier querierV2 interfaces.Querier + + evalDelay time.Duration } type ThresholdRuleOpts struct { @@ -86,6 +88,12 @@ type ThresholdRuleOpts struct { // sendAlways will send alert irresepective of resendDelay // or other params SendAlways bool + + // EvalDelay is the time to wait for data to be available + // before evaluating the rule. This is useful in scenarios + // where data might not be available in the system immediately + // after the timestamp. + EvalDelay time.Duration } func NewThresholdRule( @@ -96,6 +104,8 @@ func NewThresholdRule( reader interfaces.Reader, ) (*ThresholdRule, error) { + zap.L().Info("creating new ThresholdRule", zap.String("id", id), zap.Any("opts", opts)) + if p.RuleCondition == nil { return nil, fmt.Errorf("no rule condition") } else if !p.RuleCondition.IsValid() { @@ -117,6 +127,7 @@ func NewThresholdRule( typ: p.AlertType, version: p.Version, temporalityMap: make(map[string]map[v3.Temporality]bool), + evalDelay: opts.EvalDelay, } if int64(t.evalWindow) == 0 { @@ -402,7 +413,6 @@ func (r *ThresholdRule) ForEachActiveAlert(f func(*Alert)) { } func (r *ThresholdRule) SendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, interval time.Duration, notifyFunc NotifyFunc) { - zap.L().Info("sending alerts", zap.String("rule", r.Name())) alerts := []*Alert{} r.ForEachActiveAlert(func(alert *Alert) { if r.opts.SendAlways || alert.needsSending(ts, resendDelay) { @@ -431,11 +441,14 @@ func (r *ThresholdRule) Unit() string { func (r *ThresholdRule) prepareQueryRange(ts time.Time) *v3.QueryRangeParamsV3 { - // todo(srikanthccv): make this configurable - // 2 minutes is reasonable time to wait for data to be available - // 60 seconds (SDK) + 10 seconds (batch) + rest for n/w + serialization + write to disk etc.. - start := ts.Add(-time.Duration(r.evalWindow)).UnixMilli() - 2*60*1000 - end := ts.UnixMilli() - 2*60*1000 + zap.L().Info("prepareQueryRange", zap.Int64("ts", ts.UnixMilli()), zap.Int64("evalWindow", r.evalWindow.Milliseconds()), zap.Int64("evalDelay", r.evalDelay.Milliseconds())) + + start := ts.Add(-time.Duration(r.evalWindow)).UnixMilli() + end := ts.UnixMilli() + if r.evalDelay > 0 { + start = start - int64(r.evalDelay.Milliseconds()) + end = end - int64(r.evalDelay.Milliseconds()) + } // round to minute otherwise we could potentially miss data start = start - (start % (60 * 1000)) end = end - (end % (60 * 1000)) diff --git a/pkg/query-service/rules/thresholdRule_test.go b/pkg/query-service/rules/thresholdRule_test.go index 115ccddc4c..65f4da088a 100644 --- a/pkg/query-service/rules/thresholdRule_test.go +++ b/pkg/query-service/rules/thresholdRule_test.go @@ -611,7 +611,7 @@ func TestThresholdRuleShouldAlert(t *testing.T) { postableRule.RuleCondition.MatchType = MatchType(c.matchType) postableRule.RuleCondition.Target = &c.target - rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{}, fm, nil) + rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{EvalDelay: 2 * time.Minute}, fm, nil) if err != nil { assert.NoError(t, err) } @@ -697,7 +697,7 @@ func TestPrepareLinksToLogs(t *testing.T) { } fm := featureManager.StartManager() - rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{}, fm, nil) + rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{EvalDelay: 2 * time.Minute}, fm, nil) if err != nil { assert.NoError(t, err) } @@ -739,7 +739,7 @@ func TestPrepareLinksToTraces(t *testing.T) { } fm := featureManager.StartManager() - rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{}, fm, nil) + rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{EvalDelay: 2 * time.Minute}, fm, nil) if err != nil { assert.NoError(t, err) } @@ -815,7 +815,7 @@ func TestThresholdRuleLabelNormalization(t *testing.T) { postableRule.RuleCondition.MatchType = MatchType(c.matchType) postableRule.RuleCondition.Target = &c.target - rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{}, fm, nil) + rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{EvalDelay: 2 * time.Minute}, fm, nil) if err != nil { assert.NoError(t, err) } @@ -834,6 +834,55 @@ func TestThresholdRuleLabelNormalization(t *testing.T) { } } +func TestThresholdRuleEvalDelay(t *testing.T) { + postableRule := PostableRule{ + AlertName: "Test Eval Delay", + AlertType: "METRIC_BASED_ALERT", + RuleType: RuleTypeThreshold, + EvalWindow: Duration(5 * time.Minute), + Frequency: Duration(1 * time.Minute), + RuleCondition: &RuleCondition{ + CompositeQuery: &v3.CompositeQuery{ + QueryType: v3.QueryTypeClickHouseSQL, + ClickHouseQueries: map[string]*v3.ClickHouseQuery{ + "A": { + Query: "SELECT 1 >= {{.start_timestamp_ms}} AND 1 <= {{.end_timestamp_ms}}", + }, + }, + }, + }, + } + + // 01:39:47 + ts := time.Unix(1717205987, 0) + + cases := []struct { + expectedQuery string + }{ + // Test cases for Equals Always + { + // 01:34:00 - 01:39:00 + expectedQuery: "SELECT 1 >= 1717205640000 AND 1 <= 1717205940000", + }, + } + + fm := featureManager.StartManager() + for idx, c := range cases { + rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{}, fm, nil) // no eval delay + if err != nil { + assert.NoError(t, err) + } + + params := rule.prepareQueryRange(ts) + + assert.Equal(t, c.expectedQuery, params.CompositeQuery.ClickHouseQueries["A"].Query, "Test case %d", idx) + + secondTimeParams := rule.prepareQueryRange(ts) + + assert.Equal(t, c.expectedQuery, secondTimeParams.CompositeQuery.ClickHouseQueries["A"].Query, "Test case %d", idx) + } +} + func TestThresholdRuleClickHouseTmpl(t *testing.T) { postableRule := PostableRule{ AlertName: "Tricky Condition Tests", @@ -868,7 +917,7 @@ func TestThresholdRuleClickHouseTmpl(t *testing.T) { fm := featureManager.StartManager() for idx, c := range cases { - rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{}, fm, nil) + rule, err := NewThresholdRule("69", &postableRule, ThresholdRuleOpts{EvalDelay: 2 * time.Minute}, fm, nil) if err != nil { assert.NoError(t, err) }