Fix: cheaper query for fetching log attribute values for filter suggestions (#5989)

* chore: change query for fetching multiple log attribs to make sure it is always cheap

* chore: get filter suggestions tests passing
This commit is contained in:
Raj Kamal Singh 2024-09-17 15:49:14 +05:30 committed by GitHub
parent 49dd5f2ef7
commit 8c891f0e87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 58 additions and 37 deletions

View File

@ -4098,44 +4098,65 @@ func (r *ClickHouseReader) GetQBFilterSuggestionsForLogs(
func (r *ClickHouseReader) getValuesForLogAttributes(
ctx context.Context, attributes []v3.AttributeKey, limit uint64,
) ([][]any, *model.ApiError) {
// query top `limit` distinct values seen for `tagKey`s of interest
// ordered by timestamp when the value was seen
/*
The query used here needs to be as cheap as possible, and while uncommon, it is possible for
a tag to have 100s of millions of values (eg: message, request_id)
// we added the settings max_rows_to_group_by=100, group_by_overflow_mode = 'break'
// to avoid query from taking up all the resources when value is high cardinality.
query := fmt.Sprintf(
`
select tagKey, stringTagValue, int64TagValue, float64TagValue
from (
select
tagKey,
stringTagValue,
int64TagValue,
float64TagValue,
row_number() over (partition by tagKey order by ts desc) as rank
from (
select
tagKey,
stringTagValue,
int64TagValue,
float64TagValue,
max(timestamp) as ts
from %s.%s
where tagKey in $1
group by (tagKey, stringTagValue, int64TagValue, float64TagValue) SETTINGS max_rows_to_group_by = 100, group_by_overflow_mode = 'break', max_threads=4
)
Construct a query to UNION the result of querying first `limit` values for each attribute. For example:
```
select * from (
(
select tagKey, stringTagValue, int64TagValue, float64TagValue
from signoz_logs.distributed_tag_attributes
where tagKey = $1 and (
stringTagValue != '' or int64TagValue is not null or float64TagValue is not null
)
limit 2
) UNION DISTINCT (
select tagKey, stringTagValue, int64TagValue, float64TagValue
from signoz_logs.distributed_tag_attributes
where tagKey = $2 and (
stringTagValue != '' or int64TagValue is not null or float64TagValue is not null
)
limit 2
)
) settings max_threads=2
```
Since tag_attributes table uses ReplacingMergeTree, the values would be distinct and no order by
is being used to ensure the `limit` clause minimizes the amount of data scanned.
This query scanned ~30k rows per attribute on fiscalnote-v2 for attributes like `message` and `time`
that had >~110M values each
*/
if len(attributes) > 10 {
zap.L().Error(
"log attribute values requested for too many attributes. This can lead to slow and costly queries",
zap.Int("count", len(attributes)),
)
where rank <= %d
`,
r.logsDB, r.logsTagAttributeTable, limit,
)
attribNames := []string{}
for _, attrib := range attributes {
attribNames = append(attribNames, attrib.Key)
attributes = attributes[:10]
}
rows, err := r.db.Query(ctx, query, attribNames)
tagQueries := []string{}
tagKeyQueryArgs := []any{}
for idx, attrib := range attributes {
tagQueries = append(tagQueries, fmt.Sprintf(`(
select tagKey, stringTagValue, int64TagValue, float64TagValue
from %s.%s
where tagKey = $%d and (
stringTagValue != '' or int64TagValue is not null or float64TagValue is not null
)
limit %d
)`, r.logsDB, r.logsTagAttributeTable, idx+1, limit))
tagKeyQueryArgs = append(tagKeyQueryArgs, attrib.Key)
}
query := fmt.Sprintf(`select * from (
%s
) settings max_threads=2`, strings.Join(tagQueries, " UNION DISTINCT "))
rows, err := r.db.Query(ctx, query, tagKeyQueryArgs...)
if err != nil {
zap.L().Error("couldn't query attrib values for suggestions", zap.Error(err))
return nil, model.InternalError(fmt.Errorf(

View File

@ -186,7 +186,7 @@ func (tb *FilterSuggestionsTestBed) mockAttribValuesQueryResponse(
{Type: "Nullable(Float64)", Name: "float64TagValue"},
}
expectedAttribKeysInQuery := []string{}
expectedAttribKeysInQuery := []any{}
mockResultRows := [][]any{}
for idx, attrib := range expectedAttribs {
expectedAttribKeysInQuery = append(expectedAttribKeysInQuery, attrib.Key)
@ -198,8 +198,8 @@ func (tb *FilterSuggestionsTestBed) mockAttribValuesQueryResponse(
}
tb.mockClickhouse.ExpectQuery(
"select.*tagKey.*stringTagValue.*int64TagValue.*float64TagValue.*distributed_tag_attributes.*tagKey.*in.*",
).WithArgs(expectedAttribKeysInQuery).WillReturnRows(mockhouse.NewRows(resultCols, mockResultRows))
"select.*tagKey.*stringTagValue.*int64TagValue.*float64TagValue.*distributed_tag_attributes.*tagKey",
).WithArgs(expectedAttribKeysInQuery...).WillReturnRows(mockhouse.NewRows(resultCols, mockResultRows))
}
type FilterSuggestionsTestBed struct {