fix: use proper indexes for full text search (#4787)

* fix: use proper indexes for full text search

* fix: tests updated

* feat: lower support only for body and not attributes

* fix: remove default tolower

* fix: add comment for json key split

* fix: remove ilike only for body searches

* fix: minor fixes

* fix: minor fixes
This commit is contained in:
Nityananda Gohain 2024-07-22 17:46:35 +05:30 committed by GitHub
parent 99c68ddbcd
commit 1585065fff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 102 additions and 19 deletions

View File

@ -17,6 +17,7 @@ const (
ARRAY_INT64 = "Array(Int64)"
ARRAY_FLOAT64 = "Array(Float64)"
ARRAY_BOOL = "Array(Bool)"
NGRAM_SIZE = 4
)
var dataTypeMapping = map[string]string{
@ -72,6 +73,7 @@ func getPath(keyArr []string) string {
func getJSONFilterKey(key v3.AttributeKey, op v3.FilterOperator, isArray bool) (string, error) {
keyArr := strings.Split(key.Key, ".")
// i.e it should be at least body.name, and not something like body
if len(keyArr) < 2 {
return "", fmt.Errorf("incorrect key, should contain at least 2 parts")
}
@ -106,6 +108,29 @@ func getJSONFilterKey(key v3.AttributeKey, op v3.FilterOperator, isArray bool) (
return keyname, nil
}
// getPathIndexFilter turns a dotted JSON path (for example
// "body.nested[*].field") into a coarse `lower(body) like ...` pre-filter.
//
// The first path segment is ignored, a trailing "[*]" is stripped from every
// remaining segment, and segments shorter than NGRAM_SIZE are dropped
// (presumably because they are too short to benefit from the n-gram index on
// body — confirm against the table schema). The surviving segments are
// lower-cased and joined with "%" so they must appear in path order.
//
// It returns an empty string when the path has fewer than two segments or
// when no segment is long enough to be used.
func getPathIndexFilter(path string) string {
	segments := strings.Split(path, ".")
	if len(segments) < 2 {
		return ""
	}

	var tokens []string
	// segments[0] is the leading part of the path and is never searched for.
	for _, segment := range segments[1:] {
		name := strings.TrimSuffix(segment, "[*]")
		if len(name) < NGRAM_SIZE {
			continue
		}
		tokens = append(tokens, strings.ToLower(name))
	}

	if len(tokens) == 0 {
		return ""
	}
	return fmt.Sprintf("lower(body) like lower('%%%s%%')", strings.Join(tokens, "%"))
}
func GetJSONFilter(item v3.FilterItem) (string, error) {
dataType := item.Key.DataType
@ -154,11 +179,28 @@ func GetJSONFilter(item v3.FilterItem) (string, error) {
return "", fmt.Errorf("unsupported operator: %s", op)
}
filters := []string{}
pathFilter := getPathIndexFilter(item.Key.Key)
if pathFilter != "" {
filters = append(filters, pathFilter)
}
if op == v3.FilterOperatorContains ||
op == v3.FilterOperatorEqual ||
op == v3.FilterOperatorHas {
val, ok := item.Value.(string)
if ok && len(val) >= NGRAM_SIZE {
filters = append(filters, fmt.Sprintf("lower(body) like lower('%%%s%%')", utils.QuoteEscapedString(strings.ToLower(val))))
}
}
// add exists check for non array items as default values of int/float/bool will corrupt the results
if !isArray && !(item.Operator == v3.FilterOperatorExists || item.Operator == v3.FilterOperatorNotExists) {
existsFilter := fmt.Sprintf("JSON_EXISTS(body, '$.%s')", getPath(strings.Split(item.Key.Key, ".")[1:]))
filter = fmt.Sprintf("%s AND %s", existsFilter, filter)
}
return filter, nil
filters = append(filters, filter)
return strings.Join(filters, " AND "), nil
}

View File

@ -168,7 +168,7 @@ var testGetJSONFilterData = []struct {
Operator: "has",
Value: "index_service",
},
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service')",
Filter: "lower(body) like lower('%requestor_list%') AND lower(body) like lower('%index_service%') AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service')",
},
{
Name: "Array membership int64",
@ -181,7 +181,7 @@ var testGetJSONFilterData = []struct {
Operator: "has",
Value: 2,
},
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"int_numbers\"[*]'), '" + ARRAY_INT64 + "'), 2)",
Filter: "lower(body) like lower('%int_numbers%') AND has(JSONExtract(JSON_QUERY(body, '$.\"int_numbers\"[*]'), '" + ARRAY_INT64 + "'), 2)",
},
{
Name: "Array membership float64",
@ -194,7 +194,7 @@ var testGetJSONFilterData = []struct {
Operator: "nhas",
Value: 2.2,
},
Filter: "NOT has(JSONExtract(JSON_QUERY(body, '$.\"nested_num\"[*].\"float_nums\"[*]'), '" + ARRAY_FLOAT64 + "'), 2.200000)",
Filter: "lower(body) like lower('%nested_num%float_nums%') AND NOT has(JSONExtract(JSON_QUERY(body, '$.\"nested_num\"[*].\"float_nums\"[*]'), '" + ARRAY_FLOAT64 + "'), 2.200000)",
},
{
Name: "Array membership bool",
@ -207,7 +207,7 @@ var testGetJSONFilterData = []struct {
Operator: "has",
Value: true,
},
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"bool\"[*]'), '" + ARRAY_BOOL + "'), true)",
Filter: "lower(body) like lower('%bool%') AND has(JSONExtract(JSON_QUERY(body, '$.\"bool\"[*]'), '" + ARRAY_BOOL + "'), true)",
},
{
Name: "eq operator",
@ -220,7 +220,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: "hello",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') = 'hello'",
Filter: "lower(body) like lower('%message%') AND lower(body) like lower('%hello%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') = 'hello'",
},
{
Name: "eq operator number",
@ -233,7 +233,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: 1,
},
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') = 1",
Filter: "lower(body) like lower('%status%') AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') = 1",
},
{
Name: "neq operator number",
@ -246,7 +246,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: 1.1,
},
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + FLOAT64 + "') = 1.100000",
Filter: "lower(body) like lower('%status%') AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + FLOAT64 + "') = 1.100000",
},
{
Name: "eq operator bool",
@ -259,7 +259,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: true,
},
Filter: "JSON_EXISTS(body, '$.\"boolkey\"') AND JSONExtract(JSON_VALUE(body, '$.\"boolkey\"'), '" + BOOL + "') = true",
Filter: "lower(body) like lower('%boolkey%') AND JSON_EXISTS(body, '$.\"boolkey\"') AND JSONExtract(JSON_VALUE(body, '$.\"boolkey\"'), '" + BOOL + "') = true",
},
{
Name: "greater than operator",
@ -272,7 +272,7 @@ var testGetJSONFilterData = []struct {
Operator: ">",
Value: 1,
},
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') > 1",
Filter: "lower(body) like lower('%status%') AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') > 1",
},
{
Name: "regex operator",
@ -285,7 +285,7 @@ var testGetJSONFilterData = []struct {
Operator: "regex",
Value: "a*",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND match(JSON_VALUE(body, '$.\"message\"'), 'a*')",
Filter: "lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"') AND match(JSON_VALUE(body, '$.\"message\"'), 'a*')",
},
{
Name: "contains operator",
@ -298,7 +298,7 @@ var testGetJSONFilterData = []struct {
Operator: "contains",
Value: "a",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%'",
Filter: "lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%'",
},
{
Name: "contains operator with quotes",
@ -311,7 +311,7 @@ var testGetJSONFilterData = []struct {
Operator: "contains",
Value: "hello 'world'",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%hello \\'world\\'%'",
Filter: "lower(body) like lower('%message%') AND lower(body) like lower('%hello \\'world\\'%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%hello \\'world\\'%'",
},
{
Name: "exists",
@ -324,7 +324,7 @@ var testGetJSONFilterData = []struct {
Operator: "exists",
Value: "",
},
Filter: "JSON_EXISTS(body, '$.\"message\"')",
Filter: "lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"')",
},
}

View File

@ -51,6 +51,8 @@ var logOperators = map[v3.FilterOperator]string{
v3.FilterOperatorNotExists: "not has(%s_%s_key, '%s')",
}
const BODY = "body"
func getClickhouseLogsColumnType(columnType v3.AttributeKeyType) string {
if columnType == v3.AttributeKeyTypeTag {
return "attributes"
@ -193,10 +195,24 @@ func buildLogsTimeSeriesFilterQuery(fs *v3.FilterSet, groupBy []v3.AttributeKey,
case v3.FilterOperatorContains, v3.FilterOperatorNotContains:
columnName := getClickhouseColumnName(item.Key)
val := utils.QuoteEscapedString(fmt.Sprintf("%v", item.Value))
if columnName == BODY {
logsOp = strings.Replace(logsOp, "ILIKE", "LIKE", 1) // removing i from ilike and not ilike
conditions = append(conditions, fmt.Sprintf("lower(%s) %s lower('%%%s%%')", columnName, logsOp, val))
} else {
conditions = append(conditions, fmt.Sprintf("%s %s '%%%s%%'", columnName, logsOp, val))
}
default:
columnName := getClickhouseColumnName(item.Key)
fmtVal := utils.ClickHouseFormattedValue(value)
// for like and nlike on body, compare lower(body) with a lowered value using LIKE, since ILIKE doesn't support the index
if op == v3.FilterOperatorLike || op == v3.FilterOperatorNotLike {
if columnName == BODY {
logsOp = strings.Replace(logsOp, "ILIKE", "LIKE", 1) // removing i from ilike and not ilike
columnName = fmt.Sprintf("lower(%s)", columnName)
fmtVal = fmt.Sprintf("lower(%s)", fmtVal)
}
}
conditions = append(conditions, fmt.Sprintf("%s %s %s", columnName, logsOp, fmtVal))
}
} else {

View File

@ -130,6 +130,14 @@ var timeSeriesFilterQueryData = []struct {
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'user_name')] = 'john' AND resources_string_value[indexOf(resources_string_key, 'k8s_namespace')] != 'my_service'",
},
{
Name: "Test attribute and resource attribute with different case",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "user_name", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "%JoHn%", Operator: "like"},
{Key: v3.AttributeKey{Key: "k8s_namespace", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeResource}, Value: "%MyService%", Operator: "nlike"},
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'user_name')] ILIKE '%JoHn%' AND resources_string_value[indexOf(resources_string_key, 'k8s_namespace')] NOT ILIKE '%MyService%'",
},
{
Name: "Test materialized column",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
@ -287,6 +295,22 @@ var timeSeriesFilterQueryData = []struct {
}},
ExpectedFilter: "`attribute_int64_status_exists`=false",
},
{
Name: "Test for body contains and ncontains",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "contains", Value: "test"},
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "ncontains", Value: "test1"},
}},
ExpectedFilter: "lower(body) LIKE lower('%test%') AND lower(body) NOT LIKE lower('%test1%')",
},
{
Name: "Test for body like and nlike",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "like", Value: "test"},
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "nlike", Value: "test1"},
}},
ExpectedFilter: "lower(body) LIKE lower('test') AND lower(body) NOT LIKE lower('test1')",
},
}
func TestBuildLogsTimeSeriesFilterQuery(t *testing.T) {
@ -851,7 +875,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND body ILIKE '%test%' AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) LIKE lower('%test%') AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
},
{
Name: "Test attribute with same name as top level key",
@ -981,7 +1005,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%' AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%' AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
},
{
Name: "TABLE: Test count with JSON Filter Array, groupBy, orderBy",
@ -1015,7 +1039,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service') AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) like lower('%requestor_list%') AND lower(body) like lower('%index_service%') AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service') AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
},
}

View File

@ -907,7 +907,8 @@ const (
FilterOperatorNotContains FilterOperator = "ncontains"
FilterOperatorRegex FilterOperator = "regex"
FilterOperatorNotRegex FilterOperator = "nregex"
// (I)LIKE is faster than REGEX and supports index
// (I)LIKE is faster than REGEX
// ilike doesn't support index so internally we use lower(body) like for query
FilterOperatorLike FilterOperator = "like"
FilterOperatorNotLike FilterOperator = "nlike"