fix: use proper indexes for full text search (#4787)

* fix: use proper indexes for full text search

* fix: tests updated

* feat: lower support only for body and not attributes

* fix: remove default tolower

* fix: add comment for json key split

* fix: remove ilike only for body searches

* fix: minor fixes

* fix: minor fixes
This commit is contained in:
Nityananda Gohain 2024-07-22 17:46:35 +05:30 committed by GitHub
parent 99c68ddbcd
commit 1585065fff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 102 additions and 19 deletions

View File

@ -17,6 +17,7 @@ const (
ARRAY_INT64 = "Array(Int64)" ARRAY_INT64 = "Array(Int64)"
ARRAY_FLOAT64 = "Array(Float64)" ARRAY_FLOAT64 = "Array(Float64)"
ARRAY_BOOL = "Array(Bool)" ARRAY_BOOL = "Array(Bool)"
NGRAM_SIZE = 4
) )
var dataTypeMapping = map[string]string{ var dataTypeMapping = map[string]string{
@ -72,6 +73,7 @@ func getPath(keyArr []string) string {
func getJSONFilterKey(key v3.AttributeKey, op v3.FilterOperator, isArray bool) (string, error) { func getJSONFilterKey(key v3.AttributeKey, op v3.FilterOperator, isArray bool) (string, error) {
keyArr := strings.Split(key.Key, ".") keyArr := strings.Split(key.Key, ".")
// i.e it should be at least body.name, and not something like body
if len(keyArr) < 2 { if len(keyArr) < 2 {
return "", fmt.Errorf("incorrect key, should contain at least 2 parts") return "", fmt.Errorf("incorrect key, should contain at least 2 parts")
} }
@ -106,6 +108,29 @@ func getJSONFilterKey(key v3.AttributeKey, op v3.FilterOperator, isArray bool) (
return keyname, nil return keyname, nil
} }
// getPathIndexFilter builds a coarse `lower(body) like lower('%...%')`
// pre-filter from a dotted JSON key path such as "body.nested[*].name", so
// that the query can make better use of the index on the body column.
//
// The first path segment (e.g. "body") is ignored. Every remaining segment has
// a trailing "[*]" array marker removed, is lower-cased, and is kept only if
// it is at least NGRAM_SIZE bytes long (presumably because shorter tokens
// cannot be served by the ngram-based index). The kept segments are joined
// with '%' so that they must appear in the body in path order.
//
// It returns an empty string when the path has fewer than two segments or
// when no segment is long enough to be useful.
func getPathIndexFilter(path string) string {
	keyArr := strings.Split(path, ".")
	if len(keyArr) < 2 {
		return ""
	}

	// Path segments end up inside a single-quoted SQL string literal, so
	// backslashes and single quotes must be escaped; otherwise a key that
	// contains a quote yields a malformed (or injectable) where clause.
	escaper := strings.NewReplacer(`\`, `\\`, `'`, `\'`)

	filters := make([]string, 0, len(keyArr)-1)
	for _, key := range keyArr[1:] {
		key = strings.TrimSuffix(key, "[*]")
		if len(key) >= NGRAM_SIZE {
			filters = append(filters, escaper.Replace(strings.ToLower(key)))
		}
	}
	if len(filters) == 0 {
		return ""
	}
	return fmt.Sprintf("lower(body) like lower('%%%s%%')", strings.Join(filters, "%"))
}
func GetJSONFilter(item v3.FilterItem) (string, error) { func GetJSONFilter(item v3.FilterItem) (string, error) {
dataType := item.Key.DataType dataType := item.Key.DataType
@ -154,11 +179,28 @@ func GetJSONFilter(item v3.FilterItem) (string, error) {
return "", fmt.Errorf("unsupported operator: %s", op) return "", fmt.Errorf("unsupported operator: %s", op)
} }
filters := []string{}
pathFilter := getPathIndexFilter(item.Key.Key)
if pathFilter != "" {
filters = append(filters, pathFilter)
}
if op == v3.FilterOperatorContains ||
op == v3.FilterOperatorEqual ||
op == v3.FilterOperatorHas {
val, ok := item.Value.(string)
if ok && len(val) >= NGRAM_SIZE {
filters = append(filters, fmt.Sprintf("lower(body) like lower('%%%s%%')", utils.QuoteEscapedString(strings.ToLower(val))))
}
}
// add exists check for non array items as default values of int/float/bool will corrupt the results // add exists check for non array items as default values of int/float/bool will corrupt the results
if !isArray && !(item.Operator == v3.FilterOperatorExists || item.Operator == v3.FilterOperatorNotExists) { if !isArray && !(item.Operator == v3.FilterOperatorExists || item.Operator == v3.FilterOperatorNotExists) {
existsFilter := fmt.Sprintf("JSON_EXISTS(body, '$.%s')", getPath(strings.Split(item.Key.Key, ".")[1:])) existsFilter := fmt.Sprintf("JSON_EXISTS(body, '$.%s')", getPath(strings.Split(item.Key.Key, ".")[1:]))
filter = fmt.Sprintf("%s AND %s", existsFilter, filter) filter = fmt.Sprintf("%s AND %s", existsFilter, filter)
} }
return filter, nil filters = append(filters, filter)
return strings.Join(filters, " AND "), nil
} }

View File

@ -168,7 +168,7 @@ var testGetJSONFilterData = []struct {
Operator: "has", Operator: "has",
Value: "index_service", Value: "index_service",
}, },
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service')", Filter: "lower(body) like lower('%requestor_list%') AND lower(body) like lower('%index_service%') AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service')",
}, },
{ {
Name: "Array membership int64", Name: "Array membership int64",
@ -181,7 +181,7 @@ var testGetJSONFilterData = []struct {
Operator: "has", Operator: "has",
Value: 2, Value: 2,
}, },
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"int_numbers\"[*]'), '" + ARRAY_INT64 + "'), 2)", Filter: "lower(body) like lower('%int_numbers%') AND has(JSONExtract(JSON_QUERY(body, '$.\"int_numbers\"[*]'), '" + ARRAY_INT64 + "'), 2)",
}, },
{ {
Name: "Array membership float64", Name: "Array membership float64",
@ -194,7 +194,7 @@ var testGetJSONFilterData = []struct {
Operator: "nhas", Operator: "nhas",
Value: 2.2, Value: 2.2,
}, },
Filter: "NOT has(JSONExtract(JSON_QUERY(body, '$.\"nested_num\"[*].\"float_nums\"[*]'), '" + ARRAY_FLOAT64 + "'), 2.200000)", Filter: "lower(body) like lower('%nested_num%float_nums%') AND NOT has(JSONExtract(JSON_QUERY(body, '$.\"nested_num\"[*].\"float_nums\"[*]'), '" + ARRAY_FLOAT64 + "'), 2.200000)",
}, },
{ {
Name: "Array membership bool", Name: "Array membership bool",
@ -207,7 +207,7 @@ var testGetJSONFilterData = []struct {
Operator: "has", Operator: "has",
Value: true, Value: true,
}, },
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"bool\"[*]'), '" + ARRAY_BOOL + "'), true)", Filter: "lower(body) like lower('%bool%') AND has(JSONExtract(JSON_QUERY(body, '$.\"bool\"[*]'), '" + ARRAY_BOOL + "'), true)",
}, },
{ {
Name: "eq operator", Name: "eq operator",
@ -220,7 +220,7 @@ var testGetJSONFilterData = []struct {
Operator: "=", Operator: "=",
Value: "hello", Value: "hello",
}, },
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') = 'hello'", Filter: "lower(body) like lower('%message%') AND lower(body) like lower('%hello%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') = 'hello'",
}, },
{ {
Name: "eq operator number", Name: "eq operator number",
@ -233,7 +233,7 @@ var testGetJSONFilterData = []struct {
Operator: "=", Operator: "=",
Value: 1, Value: 1,
}, },
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') = 1", Filter: "lower(body) like lower('%status%') AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') = 1",
}, },
{ {
Name: "neq operator number", Name: "neq operator number",
@ -246,7 +246,7 @@ var testGetJSONFilterData = []struct {
Operator: "=", Operator: "=",
Value: 1.1, Value: 1.1,
}, },
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + FLOAT64 + "') = 1.100000", Filter: "lower(body) like lower('%status%') AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + FLOAT64 + "') = 1.100000",
}, },
{ {
Name: "eq operator bool", Name: "eq operator bool",
@ -259,7 +259,7 @@ var testGetJSONFilterData = []struct {
Operator: "=", Operator: "=",
Value: true, Value: true,
}, },
Filter: "JSON_EXISTS(body, '$.\"boolkey\"') AND JSONExtract(JSON_VALUE(body, '$.\"boolkey\"'), '" + BOOL + "') = true", Filter: "lower(body) like lower('%boolkey%') AND JSON_EXISTS(body, '$.\"boolkey\"') AND JSONExtract(JSON_VALUE(body, '$.\"boolkey\"'), '" + BOOL + "') = true",
}, },
{ {
Name: "greater than operator", Name: "greater than operator",
@ -272,7 +272,7 @@ var testGetJSONFilterData = []struct {
Operator: ">", Operator: ">",
Value: 1, Value: 1,
}, },
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') > 1", Filter: "lower(body) like lower('%status%') AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') > 1",
}, },
{ {
Name: "regex operator", Name: "regex operator",
@ -285,7 +285,7 @@ var testGetJSONFilterData = []struct {
Operator: "regex", Operator: "regex",
Value: "a*", Value: "a*",
}, },
Filter: "JSON_EXISTS(body, '$.\"message\"') AND match(JSON_VALUE(body, '$.\"message\"'), 'a*')", Filter: "lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"') AND match(JSON_VALUE(body, '$.\"message\"'), 'a*')",
}, },
{ {
Name: "contains operator", Name: "contains operator",
@ -298,7 +298,7 @@ var testGetJSONFilterData = []struct {
Operator: "contains", Operator: "contains",
Value: "a", Value: "a",
}, },
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%'", Filter: "lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%'",
}, },
{ {
Name: "contains operator with quotes", Name: "contains operator with quotes",
@ -311,7 +311,7 @@ var testGetJSONFilterData = []struct {
Operator: "contains", Operator: "contains",
Value: "hello 'world'", Value: "hello 'world'",
}, },
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%hello \\'world\\'%'", Filter: "lower(body) like lower('%message%') AND lower(body) like lower('%hello \\'world\\'%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%hello \\'world\\'%'",
}, },
{ {
Name: "exists", Name: "exists",
@ -324,7 +324,7 @@ var testGetJSONFilterData = []struct {
Operator: "exists", Operator: "exists",
Value: "", Value: "",
}, },
Filter: "JSON_EXISTS(body, '$.\"message\"')", Filter: "lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"')",
}, },
} }

View File

@ -51,6 +51,8 @@ var logOperators = map[v3.FilterOperator]string{
v3.FilterOperatorNotExists: "not has(%s_%s_key, '%s')", v3.FilterOperatorNotExists: "not has(%s_%s_key, '%s')",
} }
const BODY = "body"
func getClickhouseLogsColumnType(columnType v3.AttributeKeyType) string { func getClickhouseLogsColumnType(columnType v3.AttributeKeyType) string {
if columnType == v3.AttributeKeyTypeTag { if columnType == v3.AttributeKeyTypeTag {
return "attributes" return "attributes"
@ -193,10 +195,24 @@ func buildLogsTimeSeriesFilterQuery(fs *v3.FilterSet, groupBy []v3.AttributeKey,
case v3.FilterOperatorContains, v3.FilterOperatorNotContains: case v3.FilterOperatorContains, v3.FilterOperatorNotContains:
columnName := getClickhouseColumnName(item.Key) columnName := getClickhouseColumnName(item.Key)
val := utils.QuoteEscapedString(fmt.Sprintf("%v", item.Value)) val := utils.QuoteEscapedString(fmt.Sprintf("%v", item.Value))
if columnName == BODY {
logsOp = strings.Replace(logsOp, "ILIKE", "LIKE", 1) // removing i from ilike and not ilike
conditions = append(conditions, fmt.Sprintf("lower(%s) %s lower('%%%s%%')", columnName, logsOp, val))
} else {
conditions = append(conditions, fmt.Sprintf("%s %s '%%%s%%'", columnName, logsOp, val)) conditions = append(conditions, fmt.Sprintf("%s %s '%%%s%%'", columnName, logsOp, val))
}
default: default:
columnName := getClickhouseColumnName(item.Key) columnName := getClickhouseColumnName(item.Key)
fmtVal := utils.ClickHouseFormattedValue(value) fmtVal := utils.ClickHouseFormattedValue(value)
// for like and nlike on body, use lower(body) LIKE lower(value) instead of ILIKE
if op == v3.FilterOperatorLike || op == v3.FilterOperatorNotLike {
if columnName == BODY {
logsOp = strings.Replace(logsOp, "ILIKE", "LIKE", 1) // removing i from ilike and not ilike
columnName = fmt.Sprintf("lower(%s)", columnName)
fmtVal = fmt.Sprintf("lower(%s)", fmtVal)
}
}
conditions = append(conditions, fmt.Sprintf("%s %s %s", columnName, logsOp, fmtVal)) conditions = append(conditions, fmt.Sprintf("%s %s %s", columnName, logsOp, fmtVal))
} }
} else { } else {

View File

@ -130,6 +130,14 @@ var timeSeriesFilterQueryData = []struct {
}}, }},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'user_name')] = 'john' AND resources_string_value[indexOf(resources_string_key, 'k8s_namespace')] != 'my_service'", ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'user_name')] = 'john' AND resources_string_value[indexOf(resources_string_key, 'k8s_namespace')] != 'my_service'",
}, },
{
Name: "Test attribute and resource attribute with different case",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "user_name", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "%JoHn%", Operator: "like"},
{Key: v3.AttributeKey{Key: "k8s_namespace", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeResource}, Value: "%MyService%", Operator: "nlike"},
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'user_name')] ILIKE '%JoHn%' AND resources_string_value[indexOf(resources_string_key, 'k8s_namespace')] NOT ILIKE '%MyService%'",
},
{ {
Name: "Test materialized column", Name: "Test materialized column",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{ FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
@ -287,6 +295,22 @@ var timeSeriesFilterQueryData = []struct {
}}, }},
ExpectedFilter: "`attribute_int64_status_exists`=false", ExpectedFilter: "`attribute_int64_status_exists`=false",
}, },
{
Name: "Test for body contains and ncontains",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "contains", Value: "test"},
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "ncontains", Value: "test1"},
}},
ExpectedFilter: "lower(body) LIKE lower('%test%') AND lower(body) NOT LIKE lower('%test1%')",
},
{
Name: "Test for body like and nlike",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "like", Value: "test"},
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, IsColumn: true}, Operator: "nlike", Value: "test1"},
}},
ExpectedFilter: "lower(body) LIKE lower('test') AND lower(body) NOT LIKE lower('test1')",
},
} }
func TestBuildLogsTimeSeriesFilterQuery(t *testing.T) { func TestBuildLogsTimeSeriesFilterQuery(t *testing.T) {
@ -851,7 +875,7 @@ var testBuildLogsQueryData = []struct {
}, },
}, },
TableName: "logs", TableName: "logs",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND body ILIKE '%test%' AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC", ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) LIKE lower('%test%') AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
}, },
{ {
Name: "Test attribute with same name as top level key", Name: "Test attribute with same name as top level key",
@ -981,7 +1005,7 @@ var testBuildLogsQueryData = []struct {
}, },
}, },
TableName: "logs", TableName: "logs",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%' AND has(attributes_string_key, 'name') group by `name` order by `name` DESC", ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) like lower('%message%') AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%' AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
}, },
{ {
Name: "TABLE: Test count with JSON Filter Array, groupBy, orderBy", Name: "TABLE: Test count with JSON Filter Array, groupBy, orderBy",
@ -1015,7 +1039,7 @@ var testBuildLogsQueryData = []struct {
}, },
}, },
TableName: "logs", TableName: "logs",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service') AND has(attributes_string_key, 'name') group by `name` order by `name` DESC", ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) like lower('%requestor_list%') AND lower(body) like lower('%index_service%') AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service') AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
}, },
} }

View File

@ -907,7 +907,8 @@ const (
FilterOperatorNotContains FilterOperator = "ncontains" FilterOperatorNotContains FilterOperator = "ncontains"
FilterOperatorRegex FilterOperator = "regex" FilterOperatorRegex FilterOperator = "regex"
FilterOperatorNotRegex FilterOperator = "nregex" FilterOperatorNotRegex FilterOperator = "nregex"
// (I)LIKE is faster than REGEX and supports index // (I)LIKE is faster than REGEX
// ilike doesn't support index so internally we use lower(body) like for query
FilterOperatorLike FilterOperator = "like" FilterOperatorLike FilterOperator = "like"
FilterOperatorNotLike FilterOperator = "nlike" FilterOperatorNotLike FilterOperator = "nlike"