fix: alert evaluation params and query (#3010)

* fix: alert evaluation params and query
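1. Update the rate query so that it no longer generates an intermediate +Inf value when the denominator (the timestamp difference) is zero
2. Shift the evaluation start and end time back by 2 minutes to allow for data that is still in flight (SDK export interval, batching, network, and write latency)
3. Round the start and end time down to the minute
4. Add a log line recording the exact query that triggered an alert, for troubleshooting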

* chore: fix query builder tests
This commit is contained in:
Srikanth Chekuri 2023-07-05 10:34:07 +05:30 committed by GitHub
parent ea89433dc0
commit b8aba4f935
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 31 additions and 14 deletions

View File

@ -45,7 +45,7 @@ var AggregateOperatorToSQLFunc = map[model.AggregateOperator]string{
} }
// See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056 // See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056
var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))` var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))`
var SupportedFunctions = []string{"exp", "log", "ln", "exp2", "log2", "exp10", "log10", "sqrt", "cbrt", "erf", "erfc", "lgamma", "tgamma", "sin", "cos", "tan", "asin", "acos", "atan", "degrees", "radians"} var SupportedFunctions = []string{"exp", "log", "ln", "exp2", "log2", "exp10", "log10", "sqrt", "cbrt", "erf", "erfc", "lgamma", "tgamma", "sin", "cos", "tan", "asin", "acos", "atan", "degrees", "radians"}

View File

@ -44,7 +44,7 @@ var aggregateOperatorToSQLFunc = map[v3.AggregateOperator]string{
} }
// See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056 // See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056
var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))` var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))`
// buildMetricsTimeSeriesFilterQuery builds the sub-query to be used for filtering // buildMetricsTimeSeriesFilterQuery builds the sub-query to be used for filtering
// timeseries based on search criteria // timeseries based on search criteria

View File

@ -238,7 +238,7 @@ func TestBuildQueryOperators(t *testing.T) {
func TestBuildQueryXRate(t *testing.T) { func TestBuildQueryXRate(t *testing.T) {
t.Run("TestBuildQueryXRate", func(t *testing.T) { t.Run("TestBuildQueryXRate", func(t *testing.T) {
tmpl := `SELECT ts, %s(value) as value FROM (SELECT ts, if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))as value FROM(SELECT fingerprint, toStartOfInterval(toDateTime(intDiv(timestamp_ms, 1000)), INTERVAL 0 SECOND) as ts, max(value) as value FROM signoz_metrics.distributed_samples_v2 GLOBAL INNER JOIN (SELECT fingerprint FROM signoz_metrics.distributed_time_series_v2 WHERE metric_name = 'name') as filtered_time_series USING fingerprint WHERE metric_name = 'name' AND timestamp_ms >= 1650991982000 AND timestamp_ms <= 1651078382000 GROUP BY fingerprint, ts ORDER BY fingerprint, ts) WHERE isNaN(value) = 0) GROUP BY ts ORDER BY ts` tmpl := `SELECT ts, %s(value) as value FROM (SELECT ts, if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))as value FROM(SELECT fingerprint, toStartOfInterval(toDateTime(intDiv(timestamp_ms, 1000)), INTERVAL 0 SECOND) as ts, max(value) as value FROM signoz_metrics.distributed_samples_v2 GLOBAL INNER JOIN (SELECT fingerprint FROM signoz_metrics.distributed_time_series_v2 WHERE metric_name = 'name') as filtered_time_series USING fingerprint WHERE metric_name = 'name' AND timestamp_ms >= 1650991982000 AND timestamp_ms <= 1651078382000 GROUP BY fingerprint, ts ORDER BY fingerprint, ts) WHERE isNaN(value) = 0) GROUP BY ts ORDER BY ts`
cases := []struct { cases := []struct {
aggregateOperator v3.AggregateOperator aggregateOperator v3.AggregateOperator

View File

@ -22,7 +22,6 @@ import (
querytemplate "go.signoz.io/signoz/pkg/query-service/utils/queryTemplate" querytemplate "go.signoz.io/signoz/pkg/query-service/utils/queryTemplate"
"go.signoz.io/signoz/pkg/query-service/utils/times" "go.signoz.io/signoz/pkg/query-service/utils/times"
"go.signoz.io/signoz/pkg/query-service/utils/timestamp" "go.signoz.io/signoz/pkg/query-service/utils/timestamp"
"go.signoz.io/signoz/pkg/query-service/utils/value"
logsv3 "go.signoz.io/signoz/pkg/query-service/app/logs/v3" logsv3 "go.signoz.io/signoz/pkg/query-service/app/logs/v3"
metricsv3 "go.signoz.io/signoz/pkg/query-service/app/metrics/v3" metricsv3 "go.signoz.io/signoz/pkg/query-service/app/metrics/v3"
@ -327,7 +326,7 @@ func (r *ThresholdRule) SendAlerts(ctx context.Context, ts time.Time, resendDela
} }
func (r *ThresholdRule) CheckCondition(v float64) bool { func (r *ThresholdRule) CheckCondition(v float64) bool {
if value.IsNaN(v) { if math.IsNaN(v) {
zap.S().Debugf("msg:", "found NaN in rule condition", "\t rule name:", r.Name()) zap.S().Debugf("msg:", "found NaN in rule condition", "\t rule name:", r.Name())
return false return false
} }
@ -355,21 +354,37 @@ func (r *ThresholdRule) CheckCondition(v float64) bool {
func (r *ThresholdRule) prepareQueryRange(ts time.Time) *v3.QueryRangeParamsV3 { func (r *ThresholdRule) prepareQueryRange(ts time.Time) *v3.QueryRangeParamsV3 {
// todo(amol): add 30 seconds to evalWindow for rate calc // todo(amol): add 30 seconds to evalWindow for rate calc
// todo(srikanthccv): make this configurable
// 2 minutes is reasonable time to wait for data to be available
// 60 seconds (SDK) + 10 seconds (batch) + rest for n/w + serialization + write to disk etc..
start := ts.Add(-time.Duration(r.evalWindow)).UnixMilli() - 2*60*1000
end := ts.UnixMilli() - 2*60*1000
// round to minute otherwise we could potentially miss data
start = start - (start % (60 * 1000))
end = end - (end % (60 * 1000))
if r.ruleCondition.QueryType() == v3.QueryTypeClickHouseSQL { if r.ruleCondition.QueryType() == v3.QueryTypeClickHouseSQL {
return &v3.QueryRangeParamsV3{ return &v3.QueryRangeParamsV3{
Start: ts.Add(-time.Duration(r.evalWindow)).UnixMilli(), Start: start,
End: ts.UnixMilli(), End: end,
Step: 30, Step: 60,
CompositeQuery: r.ruleCondition.CompositeQuery, CompositeQuery: r.ruleCondition.CompositeQuery,
Variables: make(map[string]interface{}, 0), Variables: make(map[string]interface{}, 0),
} }
} }
if r.ruleCondition.CompositeQuery != nil && r.ruleCondition.CompositeQuery.BuilderQueries != nil {
for _, q := range r.ruleCondition.CompositeQuery.BuilderQueries {
q.StepInterval = 60
}
}
// default mode // default mode
return &v3.QueryRangeParamsV3{ return &v3.QueryRangeParamsV3{
Start: ts.Add(-time.Duration(r.evalWindow)).UnixMilli(), Start: start,
End: ts.UnixMilli(), End: end,
Step: 30, Step: 60,
CompositeQuery: r.ruleCondition.CompositeQuery, CompositeQuery: r.ruleCondition.CompositeQuery,
} }
} }
@ -476,7 +491,7 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer
} }
} }
if value.IsNaN(sample.Point.V) { if math.IsNaN(sample.Point.V) {
continue continue
} }
@ -521,7 +536,7 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer
// we skip the first record to support rate cases correctly // we skip the first record to support rate cases correctly
// improvement(amol): explore approaches to limit this only for // improvement(amol): explore approaches to limit this only for
// rate uses cases // rate uses cases
if exists, _ := skipFirstRecord[labelHash]; exists { if exists := skipFirstRecord[labelHash]; exists {
resultMap[labelHash] = sample resultMap[labelHash] = sample
} else { } else {
// looks like the first record for this label combo, skip it // looks like the first record for this label combo, skip it
@ -545,7 +560,9 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer
result = append(result, sample) result = append(result, sample)
} }
} }
zap.S().Debugf("ruleid:", r.ID(), "\t result (found alerts):", len(result)) if len(result) != 0 {
zap.S().Infof("For rule %s, with ClickHouseQuery %s, found %d alerts", r.ID(), query, len(result))
}
return result, nil return result, nil
} }