From b8aba4f935142437aedf8a3ab3df0f1f32105d5c Mon Sep 17 00:00:00 2001 From: Srikanth Chekuri Date: Wed, 5 Jul 2023 10:34:07 +0530 Subject: [PATCH] fix: alert evaluation params and query (#3010) * fix: alert evaluation params and query 1. Update the rate query to not generate intermediary +inf value when the denominator is zero 2. Adjust the start and end time to incorporate data in movement 3. Round the start and end to minute 4. Add log to find the exact query that triggered alert for troubleshooting ; * chore: fix query builder tests --- .../app/metrics/query_builder.go | 2 +- .../app/metrics/v3/query_builder.go | 2 +- .../app/metrics/v3/query_builder_test.go | 2 +- pkg/query-service/rules/thresholdRule.go | 39 +++++++++++++------ 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/pkg/query-service/app/metrics/query_builder.go b/pkg/query-service/app/metrics/query_builder.go index 16620bf7cc..435e011dbd 100644 --- a/pkg/query-service/app/metrics/query_builder.go +++ b/pkg/query-service/app/metrics/query_builder.go @@ -45,7 +45,7 @@ var AggregateOperatorToSQLFunc = map[model.AggregateOperator]string{ } // See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056 -var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))` +var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))` var SupportedFunctions = []string{"exp", "log", "ln", "exp2", "log2", "exp10", "log10", "sqrt", "cbrt", "erf", "erfc", "lgamma", "tgamma", "sin", "cos", "tan", "asin", "acos", "atan", "degrees", "radians"} diff --git a/pkg/query-service/app/metrics/v3/query_builder.go b/pkg/query-service/app/metrics/v3/query_builder.go index 780fbd3b89..f7e3956cca 100644 --- a/pkg/query-service/app/metrics/v3/query_builder.go +++ b/pkg/query-service/app/metrics/v3/query_builder.go @@ -44,7 +44,7 @@ var aggregateOperatorToSQLFunc = map[v3.AggregateOperator]string{ } // See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056 -var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))` +var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))` // buildMetricsTimeSeriesFilterQuery builds the sub-query to be used for filtering // timeseries based on search criteria diff --git a/pkg/query-service/app/metrics/v3/query_builder_test.go b/pkg/query-service/app/metrics/v3/query_builder_test.go index b07a788855..7319236254 100644 --- a/pkg/query-service/app/metrics/v3/query_builder_test.go +++ b/pkg/query-service/app/metrics/v3/query_builder_test.go @@ -238,7 +238,7 @@ func TestBuildQueryOperators(t *testing.T) { func TestBuildQueryXRate(t *testing.T) { t.Run("TestBuildQueryXRate", func(t *testing.T) { - tmpl := `SELECT ts, %s(value) as value FROM (SELECT ts, if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))as value FROM(SELECT fingerprint, toStartOfInterval(toDateTime(intDiv(timestamp_ms, 1000)), INTERVAL 0 SECOND) as ts, max(value) as value FROM signoz_metrics.distributed_samples_v2 GLOBAL INNER JOIN (SELECT fingerprint FROM signoz_metrics.distributed_time_series_v2 WHERE metric_name = 'name') as filtered_time_series USING fingerprint WHERE metric_name = 'name' AND timestamp_ms >= 1650991982000 AND timestamp_ms <= 1651078382000 GROUP BY fingerprint, ts ORDER BY fingerprint, ts) WHERE isNaN(value) = 0) GROUP BY ts ORDER BY ts` + tmpl := `SELECT ts, %s(value) as value FROM (SELECT ts, if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))as value FROM(SELECT fingerprint, toStartOfInterval(toDateTime(intDiv(timestamp_ms, 1000)), INTERVAL 0 SECOND) as ts, max(value) as value FROM signoz_metrics.distributed_samples_v2 GLOBAL INNER JOIN (SELECT fingerprint FROM signoz_metrics.distributed_time_series_v2 WHERE metric_name = 'name') as filtered_time_series USING fingerprint WHERE metric_name = 'name' AND timestamp_ms >= 1650991982000 AND timestamp_ms <= 1651078382000 GROUP BY fingerprint, ts ORDER BY fingerprint, ts) WHERE isNaN(value) = 0) GROUP BY ts ORDER BY ts` cases := []struct { aggregateOperator v3.AggregateOperator diff --git a/pkg/query-service/rules/thresholdRule.go b/pkg/query-service/rules/thresholdRule.go index 39d0aa0cad..f6e79a1643 100644 --- a/pkg/query-service/rules/thresholdRule.go +++ b/pkg/query-service/rules/thresholdRule.go @@ -22,7 +22,6 @@ import ( querytemplate "go.signoz.io/signoz/pkg/query-service/utils/queryTemplate" "go.signoz.io/signoz/pkg/query-service/utils/times" "go.signoz.io/signoz/pkg/query-service/utils/timestamp" - "go.signoz.io/signoz/pkg/query-service/utils/value" logsv3 "go.signoz.io/signoz/pkg/query-service/app/logs/v3" metricsv3 "go.signoz.io/signoz/pkg/query-service/app/metrics/v3" @@ -327,7 +326,7 @@ func (r *ThresholdRule) SendAlerts(ctx context.Context, ts time.Time, resendDela } func (r *ThresholdRule) CheckCondition(v float64) bool { - if value.IsNaN(v) { + if math.IsNaN(v) { zap.S().Debugf("msg:", "found NaN in rule condition", "\t rule name:", r.Name()) return false } @@ -355,21 +354,37 @@ func (r *ThresholdRule) CheckCondition(v float64) bool { func (r *ThresholdRule) prepareQueryRange(ts time.Time) *v3.QueryRangeParamsV3 { // todo(amol): add 30 seconds to evalWindow for rate calc + // todo(srikanthccv): make this configurable + // 2 minutes is reasonable time to wait for data to be available + // 60 seconds (SDK) + 10 seconds (batch) + rest for n/w + serialization + write to disk etc.. + start := ts.Add(-time.Duration(r.evalWindow)).UnixMilli() - 2*60*1000 + end := ts.UnixMilli() - 2*60*1000 + + // round to minute otherwise we could potentially miss data + start = start - (start % (60 * 1000)) + end = end - (end % (60 * 1000)) + if r.ruleCondition.QueryType() == v3.QueryTypeClickHouseSQL { return &v3.QueryRangeParamsV3{ - Start: ts.Add(-time.Duration(r.evalWindow)).UnixMilli(), - End: ts.UnixMilli(), - Step: 30, + Start: start, + End: end, + Step: 60, CompositeQuery: r.ruleCondition.CompositeQuery, Variables: make(map[string]interface{}, 0), } } + if r.ruleCondition.CompositeQuery != nil && r.ruleCondition.CompositeQuery.BuilderQueries != nil { + for _, q := range r.ruleCondition.CompositeQuery.BuilderQueries { + q.StepInterval = 60 + } + } + // default mode return &v3.QueryRangeParamsV3{ - Start: ts.Add(-time.Duration(r.evalWindow)).UnixMilli(), - End: ts.UnixMilli(), - Step: 30, + Start: start, + End: end, + Step: 60, CompositeQuery: r.ruleCondition.CompositeQuery, } } @@ -476,7 +491,7 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer } } - if value.IsNaN(sample.Point.V) { + if math.IsNaN(sample.Point.V) { continue } @@ -521,7 +536,7 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer // we skip the first record to support rate cases correctly // improvement(amol): explore approaches to limit this only for // rate uses cases - if exists, _ := skipFirstRecord[labelHash]; exists { + if exists := skipFirstRecord[labelHash]; exists { resultMap[labelHash] = sample } else { // looks like the first record for this label combo, skip it @@ -545,7 +560,9 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer result = append(result, sample) } } - zap.S().Debugf("ruleid:", r.ID(), "\t result (found alerts):", len(result)) + if len(result) != 0 { + zap.S().Infof("For rule %s, with ClickHouseQuery %s, found %d alerts", r.ID(), query, len(result)) + } return result, nil }