mirror of
https://git.mirrors.martin98.com/https://github.com/SigNoz/signoz
synced 2025-08-14 13:55:54 +08:00
fix: alert evaluation params and query (#3010)
* fix: alert evaluation params and query 1. Update the rate query to not generate intermediary +inf value when the denominator is zero 2. Adjust the start and end time to incorporate data in movement 3. Round the start and end to minute 4. Add log to find the exact query that triggered alert for troubleshooting ; * chore: fix query builder tests
This commit is contained in:
parent
ea89433dc0
commit
b8aba4f935
@ -45,7 +45,7 @@ var AggregateOperatorToSQLFunc = map[model.AggregateOperator]string{
|
||||
}
|
||||
|
||||
// See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056
|
||||
var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))`
|
||||
var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))`
|
||||
|
||||
var SupportedFunctions = []string{"exp", "log", "ln", "exp2", "log2", "exp10", "log10", "sqrt", "cbrt", "erf", "erfc", "lgamma", "tgamma", "sin", "cos", "tan", "asin", "acos", "atan", "degrees", "radians"}
|
||||
|
||||
|
@ -44,7 +44,7 @@ var aggregateOperatorToSQLFunc = map[v3.AggregateOperator]string{
|
||||
}
|
||||
|
||||
// See https://github.com/SigNoz/signoz/issues/2151#issuecomment-1467249056
|
||||
var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))`
|
||||
var rateWithoutNegative = `if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))`
|
||||
|
||||
// buildMetricsTimeSeriesFilterQuery builds the sub-query to be used for filtering
|
||||
// timeseries based on search criteria
|
||||
|
@ -238,7 +238,7 @@ func TestBuildQueryOperators(t *testing.T) {
|
||||
func TestBuildQueryXRate(t *testing.T) {
|
||||
t.Run("TestBuildQueryXRate", func(t *testing.T) {
|
||||
|
||||
tmpl := `SELECT ts, %s(value) as value FROM (SELECT ts, if (runningDifference(value) < 0 OR runningDifference(ts) < 0, nan, runningDifference(value)/runningDifference(ts))as value FROM(SELECT fingerprint, toStartOfInterval(toDateTime(intDiv(timestamp_ms, 1000)), INTERVAL 0 SECOND) as ts, max(value) as value FROM signoz_metrics.distributed_samples_v2 GLOBAL INNER JOIN (SELECT fingerprint FROM signoz_metrics.distributed_time_series_v2 WHERE metric_name = 'name') as filtered_time_series USING fingerprint WHERE metric_name = 'name' AND timestamp_ms >= 1650991982000 AND timestamp_ms <= 1651078382000 GROUP BY fingerprint, ts ORDER BY fingerprint, ts) WHERE isNaN(value) = 0) GROUP BY ts ORDER BY ts`
|
||||
tmpl := `SELECT ts, %s(value) as value FROM (SELECT ts, if (runningDifference(value) < 0 OR runningDifference(ts) <= 0, nan, runningDifference(value)/runningDifference(ts))as value FROM(SELECT fingerprint, toStartOfInterval(toDateTime(intDiv(timestamp_ms, 1000)), INTERVAL 0 SECOND) as ts, max(value) as value FROM signoz_metrics.distributed_samples_v2 GLOBAL INNER JOIN (SELECT fingerprint FROM signoz_metrics.distributed_time_series_v2 WHERE metric_name = 'name') as filtered_time_series USING fingerprint WHERE metric_name = 'name' AND timestamp_ms >= 1650991982000 AND timestamp_ms <= 1651078382000 GROUP BY fingerprint, ts ORDER BY fingerprint, ts) WHERE isNaN(value) = 0) GROUP BY ts ORDER BY ts`
|
||||
|
||||
cases := []struct {
|
||||
aggregateOperator v3.AggregateOperator
|
||||
|
@ -22,7 +22,6 @@ import (
|
||||
querytemplate "go.signoz.io/signoz/pkg/query-service/utils/queryTemplate"
|
||||
"go.signoz.io/signoz/pkg/query-service/utils/times"
|
||||
"go.signoz.io/signoz/pkg/query-service/utils/timestamp"
|
||||
"go.signoz.io/signoz/pkg/query-service/utils/value"
|
||||
|
||||
logsv3 "go.signoz.io/signoz/pkg/query-service/app/logs/v3"
|
||||
metricsv3 "go.signoz.io/signoz/pkg/query-service/app/metrics/v3"
|
||||
@ -327,7 +326,7 @@ func (r *ThresholdRule) SendAlerts(ctx context.Context, ts time.Time, resendDela
|
||||
}
|
||||
func (r *ThresholdRule) CheckCondition(v float64) bool {
|
||||
|
||||
if value.IsNaN(v) {
|
||||
if math.IsNaN(v) {
|
||||
zap.S().Debugf("msg:", "found NaN in rule condition", "\t rule name:", r.Name())
|
||||
return false
|
||||
}
|
||||
@ -355,21 +354,37 @@ func (r *ThresholdRule) CheckCondition(v float64) bool {
|
||||
func (r *ThresholdRule) prepareQueryRange(ts time.Time) *v3.QueryRangeParamsV3 {
|
||||
// todo(amol): add 30 seconds to evalWindow for rate calc
|
||||
|
||||
// todo(srikanthccv): make this configurable
|
||||
// 2 minutes is reasonable time to wait for data to be available
|
||||
// 60 seconds (SDK) + 10 seconds (batch) + rest for n/w + serialization + write to disk etc..
|
||||
start := ts.Add(-time.Duration(r.evalWindow)).UnixMilli() - 2*60*1000
|
||||
end := ts.UnixMilli() - 2*60*1000
|
||||
|
||||
// round to minute otherwise we could potentially miss data
|
||||
start = start - (start % (60 * 1000))
|
||||
end = end - (end % (60 * 1000))
|
||||
|
||||
if r.ruleCondition.QueryType() == v3.QueryTypeClickHouseSQL {
|
||||
return &v3.QueryRangeParamsV3{
|
||||
Start: ts.Add(-time.Duration(r.evalWindow)).UnixMilli(),
|
||||
End: ts.UnixMilli(),
|
||||
Step: 30,
|
||||
Start: start,
|
||||
End: end,
|
||||
Step: 60,
|
||||
CompositeQuery: r.ruleCondition.CompositeQuery,
|
||||
Variables: make(map[string]interface{}, 0),
|
||||
}
|
||||
}
|
||||
|
||||
if r.ruleCondition.CompositeQuery != nil && r.ruleCondition.CompositeQuery.BuilderQueries != nil {
|
||||
for _, q := range r.ruleCondition.CompositeQuery.BuilderQueries {
|
||||
q.StepInterval = 60
|
||||
}
|
||||
}
|
||||
|
||||
// default mode
|
||||
return &v3.QueryRangeParamsV3{
|
||||
Start: ts.Add(-time.Duration(r.evalWindow)).UnixMilli(),
|
||||
End: ts.UnixMilli(),
|
||||
Step: 30,
|
||||
Start: start,
|
||||
End: end,
|
||||
Step: 60,
|
||||
CompositeQuery: r.ruleCondition.CompositeQuery,
|
||||
}
|
||||
}
|
||||
@ -476,7 +491,7 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer
|
||||
}
|
||||
}
|
||||
|
||||
if value.IsNaN(sample.Point.V) {
|
||||
if math.IsNaN(sample.Point.V) {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -521,7 +536,7 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer
|
||||
// we skip the first record to support rate cases correctly
|
||||
// improvement(amol): explore approaches to limit this only for
|
||||
// rate uses cases
|
||||
if exists, _ := skipFirstRecord[labelHash]; exists {
|
||||
if exists := skipFirstRecord[labelHash]; exists {
|
||||
resultMap[labelHash] = sample
|
||||
} else {
|
||||
// looks like the first record for this label combo, skip it
|
||||
@ -545,7 +560,9 @@ func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, quer
|
||||
result = append(result, sample)
|
||||
}
|
||||
}
|
||||
zap.S().Debugf("ruleid:", r.ID(), "\t result (found alerts):", len(result))
|
||||
if len(result) != 0 {
|
||||
zap.S().Infof("For rule %s, with ClickHouseQuery %s, found %d alerts", r.ID(), query, len(result))
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user