mirror of
https://git.mirrors.martin98.com/https://github.com/SigNoz/signoz
synced 2025-08-12 06:39:03 +08:00
Add support for histogram quantiles (#1533)
This commit is contained in:
parent
b675c3cfec
commit
44360ecacf
@ -905,7 +905,8 @@
|
|||||||
<dictionaries_config>*_dictionary.xml</dictionaries_config>
|
<dictionaries_config>*_dictionary.xml</dictionaries_config>
|
||||||
|
|
||||||
<!-- Configuration of user defined executable functions -->
|
<!-- Configuration of user defined executable functions -->
|
||||||
<user_defined_executable_functions_config>*_function.xml</user_defined_executable_functions_config>
|
<user_defined_executable_functions_config>*function.xml</user_defined_executable_functions_config>
|
||||||
|
<user_scripts_path>/var/lib/clickhouse/user_scripts/</user_scripts_path>
|
||||||
|
|
||||||
<!-- Uncomment if you want data to be compressed 30-100% better.
|
<!-- Uncomment if you want data to be compressed 30-100% better.
|
||||||
Don't do that if you just started using ClickHouse.
|
Don't do that if you just started using ClickHouse.
|
||||||
|
21
deploy/docker/clickhouse-setup/custom-function.xml
Normal file
21
deploy/docker/clickhouse-setup/custom-function.xml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
<functions>
|
||||||
|
<function>
|
||||||
|
<type>executable</type>
|
||||||
|
<name>histogramQuantile</name>
|
||||||
|
<return_type>Float64</return_type>
|
||||||
|
<argument>
|
||||||
|
<type>Array(Float64)</type>
|
||||||
|
<name>buckets</name>
|
||||||
|
</argument>
|
||||||
|
<argument>
|
||||||
|
<type>Array(Float64)</type>
|
||||||
|
<name>counts</name>
|
||||||
|
</argument>
|
||||||
|
<argument>
|
||||||
|
<type>Float64</type>
|
||||||
|
<name>quantile</name>
|
||||||
|
</argument>
|
||||||
|
<format>CSV</format>
|
||||||
|
<command>./histogramQuantile</command>
|
||||||
|
</function>
|
||||||
|
</functions>
|
@ -97,9 +97,11 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
|
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
|
||||||
- ./clickhouse-users.xml:/etc/clickhouse-server/users.xml
|
- ./clickhouse-users.xml:/etc/clickhouse-server/users.xml
|
||||||
|
- ./custom-function.xml:/etc/clickhouse-server/custom-function.xml
|
||||||
- ./clickhouse-cluster.xml:/etc/clickhouse-server/config.d/cluster.xml
|
- ./clickhouse-cluster.xml:/etc/clickhouse-server/config.d/cluster.xml
|
||||||
# - ./clickhouse-storage.xml:/etc/clickhouse-server/config.d/storage.xml
|
# - ./clickhouse-storage.xml:/etc/clickhouse-server/config.d/storage.xml
|
||||||
- ./data/clickhouse/:/var/lib/clickhouse/
|
- ./data/clickhouse/:/var/lib/clickhouse/
|
||||||
|
- ./user_scripts:/var/lib/clickhouse/user_scripts/
|
||||||
|
|
||||||
# clickhouse-2:
|
# clickhouse-2:
|
||||||
# <<: *clickhouse-defaults
|
# <<: *clickhouse-defaults
|
||||||
@ -112,9 +114,12 @@ services:
|
|||||||
# volumes:
|
# volumes:
|
||||||
# - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
|
# - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
|
||||||
# - ./clickhouse-users.xml:/etc/clickhouse-server/users.xml
|
# - ./clickhouse-users.xml:/etc/clickhouse-server/users.xml
|
||||||
|
# - ./custom-function.xml:/etc/clickhouse-server/custom-function.xml
|
||||||
# - ./clickhouse-cluster.xml:/etc/clickhouse-server/config.d/cluster.xml
|
# - ./clickhouse-cluster.xml:/etc/clickhouse-server/config.d/cluster.xml
|
||||||
# # - ./clickhouse-storage.xml:/etc/clickhouse-server/config.d/storage.xml
|
# # - ./clickhouse-storage.xml:/etc/clickhouse-server/config.d/storage.xml
|
||||||
# - ./data/clickhouse-2/:/var/lib/clickhouse/
|
# - ./data/clickhouse-2/:/var/lib/clickhouse/
|
||||||
|
# - ./user_scripts:/var/lib/clickhouse/user_scripts/
|
||||||
|
|
||||||
|
|
||||||
# clickhouse-3:
|
# clickhouse-3:
|
||||||
# <<: *clickhouse-defaults
|
# <<: *clickhouse-defaults
|
||||||
@ -127,9 +132,11 @@ services:
|
|||||||
# volumes:
|
# volumes:
|
||||||
# - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
|
# - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
|
||||||
# - ./clickhouse-users.xml:/etc/clickhouse-server/users.xml
|
# - ./clickhouse-users.xml:/etc/clickhouse-server/users.xml
|
||||||
|
# - ./custom-function.xml:/etc/clickhouse-server/custom-function.xml
|
||||||
# - ./clickhouse-cluster.xml:/etc/clickhouse-server/config.d/cluster.xml
|
# - ./clickhouse-cluster.xml:/etc/clickhouse-server/config.d/cluster.xml
|
||||||
# # - ./clickhouse-storage.xml:/etc/clickhouse-server/config.d/storage.xml
|
# # - ./clickhouse-storage.xml:/etc/clickhouse-server/config.d/storage.xml
|
||||||
# - ./data/clickhouse-3/:/var/lib/clickhouse/
|
# - ./data/clickhouse-3/:/var/lib/clickhouse/
|
||||||
|
# - ./user_scripts:/var/lib/clickhouse/user_scripts/
|
||||||
|
|
||||||
alertmanager:
|
alertmanager:
|
||||||
image: signoz/alertmanager:${ALERTMANAGER_TAG:-0.23.0-0.2}
|
image: signoz/alertmanager:${ALERTMANAGER_TAG:-0.23.0-0.2}
|
||||||
|
BIN
deploy/docker/clickhouse-setup/user_scripts/histogramQuantile
Executable file
BIN
deploy/docker/clickhouse-setup/user_scripts/histogramQuantile
Executable file
Binary file not shown.
237
deploy/docker/clickhouse-setup/user_scripts/histogramQuantile.go
Normal file
237
deploy/docker/clickhouse-setup/user_scripts/histogramQuantile.go
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NOTE: executable must be built with target OS and architecture set to linux/amd64
// env GOOS=linux GOARCH=amd64 go build -o histogramQuantile histogramQuantile.go
|
||||||
|
|
||||||
|
// The following code is adapted from the following source:
|
||||||
|
// https://github.com/prometheus/prometheus/blob/main/promql/quantile.go
|
||||||
|
|
||||||
|
// bucket is a single histogram bucket: the inclusive upper bound of its value
// range and the cumulative count of observations falling at or below it.
type bucket struct {
	upperBound float64
	count      float64
}

// buckets implements sort.Interface, ordering buckets by ascending upperBound.
type buckets []bucket

// Compile-time check that buckets satisfies sort.Interface.
var _ sort.Interface = buckets(nil)

func (bs buckets) Len() int           { return len(bs) }
func (bs buckets) Swap(i, j int)      { bs[i], bs[j] = bs[j], bs[i] }
func (bs buckets) Less(i, j int) bool { return bs[i].upperBound < bs[j].upperBound }
|
||||||
|
|
||||||
|
// bucketQuantile calculates the quantile 'q' based on the given buckets. The
|
||||||
|
// buckets will be sorted by upperBound by this function (i.e. no sorting
|
||||||
|
// needed before calling this function). The quantile value is interpolated
|
||||||
|
// assuming a linear distribution within a bucket. However, if the quantile
|
||||||
|
// falls into the highest bucket, the upper bound of the 2nd highest bucket is
|
||||||
|
// returned. A natural lower bound of 0 is assumed if the upper bound of the
|
||||||
|
// lowest bucket is greater 0. In that case, interpolation in the lowest bucket
|
||||||
|
// happens linearly between 0 and the upper bound of the lowest bucket.
|
||||||
|
// However, if the lowest bucket has an upper bound less or equal 0, this upper
|
||||||
|
// bound is returned if the quantile falls into the lowest bucket.
|
||||||
|
//
|
||||||
|
// There are a number of special cases (once we have a way to report errors
|
||||||
|
// happening during evaluations of AST functions, we should report those
|
||||||
|
// explicitly):
|
||||||
|
//
|
||||||
|
// If 'buckets' has 0 observations, NaN is returned.
|
||||||
|
//
|
||||||
|
// If 'buckets' has fewer than 2 elements, NaN is returned.
|
||||||
|
//
|
||||||
|
// If the highest bucket is not +Inf, NaN is returned.
|
||||||
|
//
|
||||||
|
// If q==NaN, NaN is returned.
|
||||||
|
//
|
||||||
|
// If q<0, -Inf is returned.
|
||||||
|
//
|
||||||
|
// If q>1, +Inf is returned.
|
||||||
|
func bucketQuantile(q float64, buckets buckets) float64 {
|
||||||
|
if math.IsNaN(q) {
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
if q < 0 {
|
||||||
|
return math.Inf(-1)
|
||||||
|
}
|
||||||
|
if q > 1 {
|
||||||
|
return math.Inf(+1)
|
||||||
|
}
|
||||||
|
sort.Sort(buckets)
|
||||||
|
if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
|
||||||
|
buckets = coalesceBuckets(buckets)
|
||||||
|
ensureMonotonic(buckets)
|
||||||
|
|
||||||
|
if len(buckets) < 2 {
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
observations := buckets[len(buckets)-1].count
|
||||||
|
if observations == 0 {
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
rank := q * observations
|
||||||
|
b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
|
||||||
|
|
||||||
|
if b == len(buckets)-1 {
|
||||||
|
return buckets[len(buckets)-2].upperBound
|
||||||
|
}
|
||||||
|
if b == 0 && buckets[0].upperBound <= 0 {
|
||||||
|
return buckets[0].upperBound
|
||||||
|
}
|
||||||
|
var (
|
||||||
|
bucketStart float64
|
||||||
|
bucketEnd = buckets[b].upperBound
|
||||||
|
count = buckets[b].count
|
||||||
|
)
|
||||||
|
if b > 0 {
|
||||||
|
bucketStart = buckets[b-1].upperBound
|
||||||
|
count -= buckets[b-1].count
|
||||||
|
rank -= buckets[b-1].count
|
||||||
|
}
|
||||||
|
return bucketStart + (bucketEnd-bucketStart)*(rank/count)
|
||||||
|
}
|
||||||
|
|
||||||
|
// coalesceBuckets merges buckets with the same upper bound.
|
||||||
|
//
|
||||||
|
// The input buckets must be sorted.
|
||||||
|
func coalesceBuckets(buckets buckets) buckets {
|
||||||
|
last := buckets[0]
|
||||||
|
i := 0
|
||||||
|
for _, b := range buckets[1:] {
|
||||||
|
if b.upperBound == last.upperBound {
|
||||||
|
last.count += b.count
|
||||||
|
} else {
|
||||||
|
buckets[i] = last
|
||||||
|
last = b
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buckets[i] = last
|
||||||
|
return buckets[:i+1]
|
||||||
|
}
|
||||||
|
|
||||||
|
// The assumption that bucket counts increase monotonically with increasing
|
||||||
|
// upperBound may be violated during:
|
||||||
|
//
|
||||||
|
// * Recording rule evaluation of histogram_quantile, especially when rate()
|
||||||
|
// has been applied to the underlying bucket timeseries.
|
||||||
|
// * Evaluation of histogram_quantile computed over federated bucket
|
||||||
|
// timeseries, especially when rate() has been applied.
|
||||||
|
//
|
||||||
|
// This is because scraped data is not made available to rule evaluation or
|
||||||
|
// federation atomically, so some buckets are computed with data from the
|
||||||
|
// most recent scrapes, but the other buckets are missing data from the most
|
||||||
|
// recent scrape.
|
||||||
|
//
|
||||||
|
// Monotonicity is usually guaranteed because if a bucket with upper bound
|
||||||
|
// u1 has count c1, then any bucket with a higher upper bound u > u1 must
|
||||||
|
// have counted all c1 observations and perhaps more, so that c >= c1.
|
||||||
|
//
|
||||||
|
// Randomly interspersed partial sampling breaks that guarantee, and rate()
|
||||||
|
// exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
|
||||||
|
// 4 samples but the bucket with le=2000 has a count of 7 from 3 samples. The
|
||||||
|
// monotonicity is broken. It is exacerbated by rate() because under normal
|
||||||
|
// operation, cumulative counting of buckets will cause the bucket counts to
|
||||||
|
// diverge such that small differences from missing samples are not a problem.
|
||||||
|
// rate() removes this divergence.)
|
||||||
|
//
|
||||||
|
// bucketQuantile depends on that monotonicity to do a binary search for the
|
||||||
|
// bucket with the φ-quantile count, so breaking the monotonicity
|
||||||
|
// guarantee causes bucketQuantile() to return undefined (nonsense) results.
|
||||||
|
//
|
||||||
|
// As a somewhat hacky solution until ingestion is atomic per scrape, we
|
||||||
|
// calculate the "envelope" of the histogram buckets, essentially removing
|
||||||
|
// any decreases in the count between successive buckets.
|
||||||
|
|
||||||
|
func ensureMonotonic(buckets buckets) {
|
||||||
|
max := buckets[0].count
|
||||||
|
for i := 1; i < len(buckets); i++ {
|
||||||
|
switch {
|
||||||
|
case buckets[i].count > max:
|
||||||
|
max = buckets[i].count
|
||||||
|
case buckets[i].count < max:
|
||||||
|
buckets[i].count = max
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// End of copied code.
|
||||||
|
|
||||||
|
// readLines consumes os.Stdin until EOF and returns the non-empty,
// whitespace-trimmed lines. ReadLine is used so that lines longer than the
// reader's buffer are reassembled from their isPrefix fragments.
func readLines() []string {
	reader := bufio.NewReader(os.Stdin)
	var (
		lines   []string
		partial []byte
	)
	for {
		chunk, isPrefix, err := reader.ReadLine()
		if err != nil {
			break
		}
		partial = append(partial, chunk...)
		if isPrefix {
			continue // line not finished yet; keep accumulating
		}
		// Only reset the accumulator when a non-blank line was collected
		// (blank fragments are carried forward, as in the original).
		if trimmed := strings.TrimSpace(string(partial)); len(trimmed) > 0 {
			lines = append(lines, trimmed)
			partial = []byte{}
		}
	}
	// A trailing fragment without a final newline is kept untrimmed.
	if len(partial) > 0 {
		lines = append(lines, string(partial))
	}
	return lines
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
lines := readLines()
|
||||||
|
for _, text := range lines {
|
||||||
|
// Example input
|
||||||
|
// "[1, 2, 4, 8, 16]", "[1, 5, 8, 10, 14]", 0.9"
|
||||||
|
// bounds - counts - quantile
|
||||||
|
parts := strings.Split(text, "\",")
|
||||||
|
|
||||||
|
var bucketNumbers []float64
|
||||||
|
// Strip the ends with square brackets
|
||||||
|
text = parts[0][2 : len(parts[0])-1]
|
||||||
|
// Parse the bucket bounds
|
||||||
|
for _, num := range strings.Split(text, ",") {
|
||||||
|
num = strings.TrimSpace(num)
|
||||||
|
number, err := strconv.ParseFloat(num, 64)
|
||||||
|
if err == nil {
|
||||||
|
bucketNumbers = append(bucketNumbers, number)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var bucketCounts []float64
|
||||||
|
// Strip the ends with square brackets
|
||||||
|
text = parts[1][2 : len(parts[1])-1]
|
||||||
|
// Parse the bucket counts
|
||||||
|
for _, num := range strings.Split(text, ",") {
|
||||||
|
num = strings.TrimSpace(num)
|
||||||
|
number, err := strconv.ParseFloat(num, 64)
|
||||||
|
if err == nil {
|
||||||
|
bucketCounts = append(bucketCounts, number)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse the quantile
|
||||||
|
q, err := strconv.ParseFloat(parts[2], 64)
|
||||||
|
var b buckets
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
for i := 0; i < len(bucketNumbers); i++ {
|
||||||
|
b = append(b, bucket{upperBound: bucketNumbers[i], count: bucketCounts[i]})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Println(bucketQuantile(q, b))
|
||||||
|
}
|
||||||
|
}
|
@ -34,6 +34,11 @@ export enum EAggregateOperator {
|
|||||||
RATE_AVG = 23,
|
RATE_AVG = 23,
|
||||||
RATE_MAX = 24,
|
RATE_MAX = 24,
|
||||||
RATE_MIN = 25,
|
RATE_MIN = 25,
|
||||||
|
HIST_QUANTILE_50 = 26,
|
||||||
|
HIST_QUANTILE_75 = 27,
|
||||||
|
HIST_QUANTILE_90 = 28,
|
||||||
|
HIST_QUANTILE_95 = 29,
|
||||||
|
HIST_QUANTILE_99 = 30,
|
||||||
}
|
}
|
||||||
|
|
||||||
export enum EPanelType {
|
export enum EPanelType {
|
||||||
|
@ -26,6 +26,11 @@ var AggregateOperatorToPercentile = map[model.AggregateOperator]float64{
|
|||||||
model.P90: 0.90,
|
model.P90: 0.90,
|
||||||
model.P95: 0.95,
|
model.P95: 0.95,
|
||||||
model.P99: 0.99,
|
model.P99: 0.99,
|
||||||
|
model.HIST_QUANTILE_50: 0.50,
|
||||||
|
model.HIST_QUANTILE_75: 0.75,
|
||||||
|
model.HIST_QUANTILE_90: 0.90,
|
||||||
|
model.HIST_QUANTILE_95: 0.95,
|
||||||
|
model.HIST_QUANTILE_99: 0.99,
|
||||||
}
|
}
|
||||||
|
|
||||||
var AggregateOperatorToSQLFunc = map[model.AggregateOperator]string{
|
var AggregateOperatorToSQLFunc = map[model.AggregateOperator]string{
|
||||||
@ -173,6 +178,16 @@ func BuildMetricQuery(qp *model.QueryRangeParamsV2, mq *model.MetricQuery, table
|
|||||||
" GROUP BY %s" +
|
" GROUP BY %s" +
|
||||||
" ORDER BY %s ts"
|
" ORDER BY %s ts"
|
||||||
|
|
||||||
|
tagsWithoutLe := []string{}
|
||||||
|
for _, tag := range mq.GroupingTags {
|
||||||
|
if tag != "le" {
|
||||||
|
tagsWithoutLe = append(tagsWithoutLe, tag)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
groupByWithoutLe := groupBy(tagsWithoutLe...)
|
||||||
|
groupTagsWithoutLe := groupSelect(tagsWithoutLe...)
|
||||||
|
|
||||||
groupBy := groupBy(mq.GroupingTags...)
|
groupBy := groupBy(mq.GroupingTags...)
|
||||||
groupTags := groupSelect(mq.GroupingTags...)
|
groupTags := groupSelect(mq.GroupingTags...)
|
||||||
|
|
||||||
@ -210,6 +225,20 @@ func BuildMetricQuery(qp *model.QueryRangeParamsV2, mq *model.MetricQuery, table
|
|||||||
op := fmt.Sprintf("quantile(%v)(value)", AggregateOperatorToPercentile[mq.AggregateOperator])
|
op := fmt.Sprintf("quantile(%v)(value)", AggregateOperatorToPercentile[mq.AggregateOperator])
|
||||||
query := fmt.Sprintf(queryTmpl, groupTags, qp.Step, op, filterSubQuery, groupBy, groupTags)
|
query := fmt.Sprintf(queryTmpl, groupTags, qp.Step, op, filterSubQuery, groupBy, groupTags)
|
||||||
return query, nil
|
return query, nil
|
||||||
|
case model.HIST_QUANTILE_50, model.HIST_QUANTILE_75, model.HIST_QUANTILE_90, model.HIST_QUANTILE_95, model.HIST_QUANTILE_99:
|
||||||
|
rateGroupBy := "fingerprint, " + groupBy
|
||||||
|
rateGroupTags := "fingerprint, " + groupTags
|
||||||
|
op := "max(value)"
|
||||||
|
subQuery := fmt.Sprintf(
|
||||||
|
queryTmpl, rateGroupTags, qp.Step, op, filterSubQuery, rateGroupBy, rateGroupTags,
|
||||||
|
) // labels will be same so any should be fine
|
||||||
|
query := `SELECT %s ts, runningDifference(value)/runningDifference(ts) as value FROM(%s) OFFSET 1`
|
||||||
|
query = fmt.Sprintf(query, groupTags, subQuery)
|
||||||
|
query = fmt.Sprintf(`SELECT %s ts, sum(value) as value FROM (%s) GROUP BY %s ORDER BY %s ts`, groupTags, query, groupBy, groupTags)
|
||||||
|
value := AggregateOperatorToPercentile[mq.AggregateOperator]
|
||||||
|
|
||||||
|
query = fmt.Sprintf(`SELECT %s ts, histogramQuantile(arrayMap(x -> toFloat64(x), groupArray(le)), groupArray(value), %.3f) as value FROM (%s) GROUP BY %s ORDER BY %s ts`, groupTagsWithoutLe, value, query, groupByWithoutLe, groupTagsWithoutLe)
|
||||||
|
return query, nil
|
||||||
case model.AVG, model.SUM, model.MIN, model.MAX:
|
case model.AVG, model.SUM, model.MIN, model.MAX:
|
||||||
op := fmt.Sprintf("%s(value)", AggregateOperatorToSQLFunc[mq.AggregateOperator])
|
op := fmt.Sprintf("%s(value)", AggregateOperatorToSQLFunc[mq.AggregateOperator])
|
||||||
query := fmt.Sprintf(queryTmpl, groupTags, qp.Step, op, filterSubQuery, groupBy, groupTags)
|
query := fmt.Sprintf(queryTmpl, groupTags, qp.Step, op, filterSubQuery, groupBy, groupTags)
|
||||||
|
@ -106,6 +106,11 @@ const (
|
|||||||
RATE_AVG
|
RATE_AVG
|
||||||
RATE_MAX
|
RATE_MAX
|
||||||
RATE_MIN
|
RATE_MIN
|
||||||
|
HIST_QUANTILE_50
|
||||||
|
HIST_QUANTILE_75
|
||||||
|
HIST_QUANTILE_90
|
||||||
|
HIST_QUANTILE_95
|
||||||
|
HIST_QUANTILE_99
|
||||||
)
|
)
|
||||||
|
|
||||||
type DataSource int
|
type DataSource int
|
||||||
|
Loading…
x
Reference in New Issue
Block a user