Fix: cheaper query for fetching log attribute values for filter suggestions (#5989)

* chore: change query for fetching multiple log attribs to make sure it is always cheap * chore: get filter suggestions tests passing
2025-10-18 08:11:30 +08:00 · 2024-09-17 15:49:14 +05:30 · 2024-09-17 15:49:14 +05:30 · 8c891f0e87
commit 8c891f0e87
parent 49dd5f2ef7
2 changed files with 58 additions and 37 deletions
--- a/pkg/query-service/app/clickhouseReader/reader.go
+++ b/pkg/query-service/app/clickhouseReader/reader.go
@ -4098,44 +4098,65 @@ func (r *ClickHouseReader) GetQBFilterSuggestionsForLogs(
 func (r *ClickHouseReader) getValuesForLogAttributes(
 	ctx context.Context, attributes []v3.AttributeKey, limit uint64,
 ) ([][]any, *model.ApiError) {
-	// query top `limit` distinct values seen for `tagKey`s of interest
-	// ordered by timestamp when the value was seen
+	/*
+		The query used here needs to be as cheap as possible, and while uncommon, it is possible for
+		a tag to have 100s of millions of values (eg: message, request_id)

-	// we added the settings max_rows_to_group_by=100, group_by_overflow_mode = 'break'
-	// to avoid query from taking up all the resources when value is high cardinality.
-	query := fmt.Sprintf(
-		`
-		select tagKey, stringTagValue, int64TagValue, float64TagValue
-		from (
-			select
-				tagKey,
-				stringTagValue,
-				int64TagValue,
-				float64TagValue,
-				row_number() over (partition by tagKey order by ts desc) as rank
-			from (
-				select
-					tagKey,
-					stringTagValue,
-					int64TagValue,
-					float64TagValue,
-					max(timestamp) as ts
-				from %s.%s
-				where tagKey in $1
-				group by (tagKey, stringTagValue, int64TagValue, float64TagValue) SETTINGS max_rows_to_group_by = 100, group_by_overflow_mode = 'break', max_threads=4
-			)
+		Construct a query to UNION the result of querying first `limit` values for each attribute. For example:
+		```
+			select * from (
+				(
+					select tagKey, stringTagValue, int64TagValue, float64TagValue
+					from signoz_logs.distributed_tag_attributes
+					where tagKey = $1 and (
+						stringTagValue != '' or int64TagValue is not null or float64TagValue is not null
+					)
+					limit 2
+				) UNION DISTINCT (
+					select tagKey, stringTagValue, int64TagValue, float64TagValue
+					from signoz_logs.distributed_tag_attributes
+					where tagKey = $2 and (
+						stringTagValue != '' or int64TagValue is not null or float64TagValue is not null
+					)
+					limit 2
+				)
+			) settings max_threads=2
+		```
+		Since tag_attributes table uses ReplacingMergeTree, the values would be distinct and no order by
+		is being used to ensure the `limit` clause minimizes the amount of data scanned.
+
+		This query scanned ~30k rows per attribute on fiscalnote-v2 for attributes like `message` and `time`
+		that had >~110M values each
+	*/
+
+	if len(attributes) > 10 {
+		zap.L().Error(
+			"log attribute values requested for too many attributes. This can lead to slow and costly queries",
+			zap.Int("count", len(attributes)),
 		)
-		where rank <= %d
-		`,
-		r.logsDB, r.logsTagAttributeTable, limit,
-	)
-
-	attribNames := []string{}
-	for _, attrib := range attributes {
-		attribNames = append(attribNames, attrib.Key)
+		attributes = attributes[:10]
 	}

-	rows, err := r.db.Query(ctx, query, attribNames)
+	tagQueries := []string{}
+	tagKeyQueryArgs := []any{}
+	for idx, attrib := range attributes {
+		tagQueries = append(tagQueries, fmt.Sprintf(`(
+			select tagKey, stringTagValue, int64TagValue, float64TagValue
+			from %s.%s
+			where tagKey = $%d and (
+				stringTagValue != '' or int64TagValue is not null or float64TagValue is not null
+			)
+			limit %d
+		)`, r.logsDB, r.logsTagAttributeTable, idx+1, limit))
+
+		tagKeyQueryArgs = append(tagKeyQueryArgs, attrib.Key)
+	}
+
+	query := fmt.Sprintf(`select * from (
+		%s
+	) settings max_threads=2`, strings.Join(tagQueries, " UNION DISTINCT "))
+
+	rows, err := r.db.Query(ctx, query, tagKeyQueryArgs...)
 	if err != nil {
 		zap.L().Error("couldn't query attrib values for suggestions", zap.Error(err))
 		return nil, model.InternalError(fmt.Errorf(
--- a/pkg/query-service/tests/integration/filter_suggestions_test.go
+++ b/pkg/query-service/tests/integration/filter_suggestions_test.go
@ -186,7 +186,7 @@ func (tb *FilterSuggestionsTestBed) mockAttribValuesQueryResponse(
 		{Type: "Nullable(Float64)", Name: "float64TagValue"},
 	}

-	expectedAttribKeysInQuery := []string{}
+	expectedAttribKeysInQuery := []any{}
 	mockResultRows := [][]any{}
 	for idx, attrib := range expectedAttribs {
 		expectedAttribKeysInQuery = append(expectedAttribKeysInQuery, attrib.Key)
@ -198,8 +198,8 @@ func (tb *FilterSuggestionsTestBed) mockAttribValuesQueryResponse(
 	}

 	tb.mockClickhouse.ExpectQuery(
-		"select.*tagKey.*stringTagValue.*int64TagValue.*float64TagValue.*distributed_tag_attributes.*tagKey.*in.*",
-	).WithArgs(expectedAttribKeysInQuery).WillReturnRows(mockhouse.NewRows(resultCols, mockResultRows))
+		"select.*tagKey.*stringTagValue.*int64TagValue.*float64TagValue.*distributed_tag_attributes.*tagKey",
+	).WithArgs(expectedAttribKeysInQuery...).WillReturnRows(mockhouse.NewRows(resultCols, mockResultRows))
 }

 type FilterSuggestionsTestBed struct {