From 62fb05ac5a20f62c75d4c0c0cdbb3ebbddcf41fa Mon Sep 17 00:00:00 2001 From: Shivanshu Raj Shrivastava Date: Fri, 30 May 2025 16:07:06 +0530 Subject: [PATCH] fix: further improve clickhouse queries Signed-off-by: Shivanshu Raj Shrivastava --- pkg/modules/tracefunnel/clickhouse_queries.go | 301 +++++------------- 1 file changed, 78 insertions(+), 223 deletions(-) diff --git a/pkg/modules/tracefunnel/clickhouse_queries.go b/pkg/modules/tracefunnel/clickhouse_queries.go index bac34232f4..c28b3c6f92 100644 --- a/pkg/modules/tracefunnel/clickhouse_queries.go +++ b/pkg/modules/tracefunnel/clickhouse_queries.go @@ -559,94 +559,47 @@ func BuildTwoStepFunnelStepOverviewQuery( ) string { const tpl = ` WITH - %[1]d AS contains_error_t1, - %[2]d AS contains_error_t2, - '%[3]s' AS latency_pointer_t1, - '%[4]s' AS latency_pointer_t2, toDateTime64(%[5]d/1e9,9) AS start_ts, toDateTime64(%[6]d/1e9,9) AS end_ts, - (%[6]d - %[5]d)/1e9 AS time_window_sec, - '%[7]s' AS service_name_t1, - '%[8]s' AS span_name_t1, - '%[9]s' AS service_name_t2, - '%[10]s' AS span_name_t2 + (%[6]d - %[5]d)/1e9 AS time_window_sec, -, step1 AS ( - SELECT - trace_id, - argMin(timestamp, timestamp) AS t1_time, - argMin(has_error, timestamp) AS s1_has_error - FROM signoz_traces.signoz_index_v3 - WHERE - timestamp BETWEEN start_ts AND end_ts - AND serviceName = service_name_t1 - AND name = span_name_t1 - AND (contains_error_t1 = 0 OR has_error = true) - %[11]s - GROUP BY trace_id - LIMIT 100000 -) -, step2 AS ( - SELECT - trace_id, - argMin(timestamp, timestamp) AS t2_time, - argMin(has_error, timestamp) AS s2_has_error - FROM signoz_traces.signoz_index_v3 - WHERE - timestamp BETWEEN start_ts AND end_ts - AND serviceName = service_name_t2 - AND name = span_name_t2 - AND (contains_error_t2 = 0 OR has_error = true) - %[12]s - GROUP BY trace_id - LIMIT 100000 -) -, joined AS ( - SELECT - s1.trace_id, - s1.t1_time, - s2.t2_time, - s1.s1_has_error, - s2.s2_has_error - FROM step1 AS s1 - INNER JOIN step2 AS s2 ON s1.trace_id = s2.trace_id - WHERE s2.t2_time > s1.t1_time -) -, errors_step1 AS (SELECT countIf(s1_has_error) AS errors FROM step1) -, errors_step2 AS (SELECT countIf(s2_has_error) AS errors FROM step2) + ('%[7]s','%[8]s') AS step1, + ('%[9]s','%[10]s') AS step2 SELECT round( - count(DISTINCT trace_id)*100.0 - / (SELECT count(DISTINCT trace_id) FROM step1), 2 - ) AS conversion_rate, - count(DISTINCT trace_id) / time_window_sec AS avg_rate, - greatest( - (SELECT errors FROM errors_step1), - (SELECT errors FROM errors_step2) - ) AS errors, + countIf(t1_time > 0 AND t2_time > t1_time) * 100.0 + / countIf(t1_time > 0), + 2 + ) AS conversion_rate, + countIf(t1_time > 0 AND t2_time > t1_time) + / time_window_sec AS avg_rate, + greatest(sum(s1_error), sum(s2_error)) AS errors, avg( - abs( - CAST(t2_time AS Decimal(20,9)) - - CAST(t1_time AS Decimal(20,9)) - ) * 1000 - ) AS avg_duration, - CASE - WHEN '%[13]s' = 'p99' THEN quantile(0.99)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) * 1000 - ) - WHEN '%[13]s' = 'p95' THEN quantile(0.95)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) * 1000 - ) - WHEN '%[13]s' = 'p90' THEN quantile(0.90)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) * 1000 - ) - ELSE quantile(0.99)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) * 1000 - ) - END AS latency -FROM joined; -` + dateDiff('microseconds', t1_time, t2_time) / 1000.0 + ) AS avg_duration_ms, + quantile(0.99)( + dateDiff('microseconds', t1_time, t2_time) / 1000.0 + ) AS p99_duration_ms +FROM ( + SELECT + trace_id, + minIf(timestamp, serviceName = step1.1 AND name = step1.2) AS t1_time, + minIf(timestamp, serviceName = step2.1 AND name = step2.2) AS t2_time, + toUInt8(anyIf(has_error, serviceName = step1.1 AND name = step1.2)) AS s1_error, + toUInt8(anyIf(has_error, serviceName = step2.1 AND name = step2.2)) AS s2_error + FROM signoz_traces.signoz_index_v3 + WHERE + timestamp BETWEEN start_ts AND end_ts + AND serviceName IN (step1.1, step2.1) + AND name IN (step1.2, step2.2) + AND ((%[1]d = 0) OR (has_error AND serviceName = step1.1 AND name = step1.2)) + AND ((%[2]d = 0) OR (has_error AND serviceName = step2.1 AND name = step2.2)) + %[11]s + %[12]s + GROUP BY trace_id +) AS funnel +WHERE t1_time > 0 AND t2_time > t1_time;` return fmt.Sprintf(tpl, containsErrorT1, @@ -688,162 +641,62 @@ func BuildThreeStepFunnelStepOverviewQuery( latencyTypeT2 string, latencyTypeT3 string, ) string { - base := ` + const tpl = ` WITH - %[1]d AS contains_error_t1, - %[2]d AS contains_error_t2, - %[3]d AS contains_error_t3, - '%[4]s' AS latency_pointer_t1, - '%[5]s' AS latency_pointer_t2, - '%[6]s' AS latency_pointer_t3, toDateTime64(%[7]d/1e9,9) AS start_ts, toDateTime64(%[8]d/1e9,9) AS end_ts, - (%[8]d - %[7]d)/1e9 AS time_window_sec, - '%[9]s' AS service_name_t1, - '%[10]s' AS span_name_t1, - '%[11]s' AS service_name_t2, - '%[12]s' AS span_name_t2, - '%[13]s' AS service_name_t3, - '%[14]s' AS span_name_t3 + (%[8]d - %[7]d)/1e9 AS time_window_sec, -, step1 AS ( + ('%[9]s','%[10]s') AS step1, + ('%[11]s','%[12]s') AS step2, + ('%[13]s','%[14]s') AS step3 + +SELECT + round( + countIf( + t1_time > 0 + AND t2_time > t1_time + AND t3_time > t2_time + ) * 100.0 + / countIf(t1_time > 0), + 2 + ) AS conversion_rate, + countIf(t1_time > 0 AND t2_time > t1_time) + / time_window_sec AS avg_rate_step1_to_2, + greatest(sum(s1_error), sum(s2_error), sum(s3_error)) AS errors, + avg( + dateDiff('microseconds', t1_time, t2_time) / 1000.0 + ) AS avg_duration_ms, + quantile(0.99)( + dateDiff('microseconds', t1_time, t2_time) / 1000.0 + ) AS p99_duration_ms +FROM ( SELECT trace_id, - argMin(timestamp, timestamp) AS t1_time, - argMin(has_error, timestamp) AS s1_has_error + minIf(timestamp, serviceName = step1.1 AND name = step1.2) AS t1_time, + minIf(timestamp, serviceName = step2.1 AND name = step2.2) AS t2_time, + minIf(timestamp, serviceName = step3.1 AND name = step3.2) AS t3_time, + toUInt8(anyIf(has_error, serviceName = step1.1 AND name = step1.2)) AS s1_error, + toUInt8(anyIf(has_error, serviceName = step2.1 AND name = step2.2)) AS s2_error, + toUInt8(anyIf(has_error, serviceName = step3.1 AND name = step3.2)) AS s3_error FROM signoz_traces.signoz_index_v3 WHERE timestamp BETWEEN start_ts AND end_ts - AND serviceName = service_name_t1 - AND name = span_name_t1 - AND (contains_error_t1 = 0 OR has_error = true) + AND serviceName IN (step1.1, step2.1, step3.1) + AND name IN (step1.2, step2.2, step3.2) + AND ((%[1]d = 0) OR (has_error AND serviceName = step1.1 AND name = step1.2)) + AND ((%[2]d = 0) OR (has_error AND serviceName = step2.1 AND name = step2.2)) + AND ((%[3]d = 0) OR (has_error AND serviceName = step3.1 AND name = step3.2)) %[15]s - GROUP BY trace_id - LIMIT 100000 -) -, step2 AS ( - SELECT - trace_id, - argMin(timestamp, timestamp) AS t2_time, - argMin(has_error, timestamp) AS s2_has_error - FROM signoz_traces.signoz_index_v3 - WHERE - timestamp BETWEEN start_ts AND end_ts - AND serviceName = service_name_t2 - AND name = span_name_t2 - AND (contains_error_t2 = 0 OR has_error = true) %[16]s - GROUP BY trace_id - LIMIT 100000 -) -, step3 AS ( - SELECT - trace_id, - argMin(timestamp, timestamp) AS t3_time, - argMin(has_error, timestamp) AS s3_has_error - FROM signoz_traces.signoz_index_v3 - WHERE - timestamp BETWEEN start_ts AND end_ts - AND serviceName = service_name_t3 - AND name = span_name_t3 - AND (contains_error_t3 = 0 OR has_error = true) %[17]s GROUP BY trace_id - LIMIT 100000 -) -, joined_t2 AS ( - SELECT - s1.trace_id, - s1.t1_time, - s2.t2_time, - s1.s1_has_error, - s2.s2_has_error - FROM step1 AS s1 - INNER JOIN step2 AS s2 ON s1.trace_id = s2.trace_id - WHERE s2.t2_time > s1.t1_time -) -, joined_t3 AS ( - SELECT - j.trace_id, - j.t1_time, - j.t2_time, - s3.t3_time, - j.s1_has_error, - j.s2_has_error, - s3.s3_has_error - FROM joined_t2 AS j - INNER JOIN step3 AS s3 ON j.trace_id = s3.trace_id - WHERE s3.t3_time > j.t2_time -) -, errors_step1 AS (SELECT countIf(s1_has_error) AS errors FROM step1) -, errors_step2 AS (SELECT countIf(s2_has_error) AS errors FROM step2) -, errors_step3 AS (SELECT countIf(s3_has_error) AS errors FROM step3) -` +) AS funnel +WHERE t1_time > 0 + AND t2_time > t1_time + AND t3_time > t2_time;` - var sel string - if stepStart == 1 && stepEnd == 2 { - sel = ` -SELECT - round( - count(DISTINCT trace_id)*100.0 - / (SELECT count(DISTINCT trace_id) FROM step1), 2 - ) AS conversion_rate, - count(DISTINCT trace_id) / time_window_sec AS avg_rate, - greatest( - (SELECT errors FROM errors_step1), - (SELECT errors FROM errors_step2) - ) AS errors, - avg( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) *1000 - ) AS avg_duration, - CASE - WHEN '%[19]s' = 'p99' THEN quantile(0.99)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) *1000 - ) - WHEN '%[19]s' = 'p95' THEN quantile(0.95)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) *1000 - ) - WHEN '%[19]s' = 'p90' THEN quantile(0.90)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) *1000 - ) - ELSE quantile(0.99)( - abs(CAST(t2_time AS Decimal(20,9)) - CAST(t1_time AS Decimal(20,9))) *1000 - ) - END AS latency -FROM joined_t2;` - } else { - sel = ` -SELECT - round( - count(DISTINCT trace_id)*100.0 - / (SELECT count(DISTINCT trace_id) FROM joined_t2), 2 - ) AS conversion_rate, - count(DISTINCT trace_id) / time_window_sec AS avg_rate, - greatest( - (SELECT errors FROM errors_step2), - (SELECT errors FROM errors_step3) - ) AS errors, - avg( - abs(CAST(t3_time AS Decimal(20,9)) - CAST(t2_time AS Decimal(20,9))) *1000 - ) AS avg_duration, - CASE - WHEN '%[20]s' = 'p99' THEN quantile(0.99)( - abs(CAST(t3_time AS Decimal(20,9)) - CAST(t2_time AS Decimal(20,9))) *1000 - ) - WHEN '%[20]s' = 'p95' THEN quantile(0.95)( - abs(CAST(t3_time AS Decimal(20,9)) - CAST(t2_time AS Decimal(20,9))) *1000 - ) - WHEN '%[20]s' = 'p90' THEN quantile(0.90)( - abs(CAST(t3_time AS Decimal(20,9)) - CAST(t2_time AS Decimal(20,9))) *1000 - ) - ELSE quantile(0.99)( - abs(CAST(t3_time AS Decimal(20,9)) - CAST(t2_time AS Decimal(20,9))) *1000 - ) - END AS latency -FROM joined_t3;` - } - - return fmt.Sprintf(base+sel, + return fmt.Sprintf(tpl, containsErrorT1, containsErrorT2, containsErrorT3, @@ -861,6 +714,8 @@ FROM joined_t3;` clauseStep1, clauseStep2, clauseStep3, + stepStart, + stepEnd, latencyTypeT2, latencyTypeT3, )