fix: handle-large-traces (#4903)

* fix: handle-large-traces

* feat: add isSubTree key to identify subTrees
Show the user a Loom video explaining how to navigate large traces

* chore: update icon to warning

* chore: fire telemetry events for all trace detail API calls, large traces

* chore: update MAX_SPANS_IN_TRACE to 250k
This commit is contained in:
Vishal Sharma 2024-05-27 17:20:45 +05:30 committed by GitHub
parent 3085093130
commit 96162d7949
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 161 additions and 68 deletions

View File

@ -2,10 +2,8 @@ package api
import (
"net/http"
"strconv"
"go.signoz.io/signoz/ee/query-service/app/db"
"go.signoz.io/signoz/ee/query-service/constants"
"go.signoz.io/signoz/ee/query-service/model"
baseapp "go.signoz.io/signoz/pkg/query-service/app"
basemodel "go.signoz.io/signoz/pkg/query-service/model"
@ -19,17 +17,13 @@ func (ah *APIHandler) searchTraces(w http.ResponseWriter, r *http.Request) {
ah.APIHandler.SearchTraces(w, r)
return
}
traceId, spanId, levelUpInt, levelDownInt, err := baseapp.ParseSearchTracesParams(r)
searchTracesParams, err := baseapp.ParseSearchTracesParams(r)
if err != nil {
RespondError(w, &model.ApiError{Typ: model.ErrorBadData, Err: err}, "Error reading params")
return
}
spanLimit, err := strconv.Atoi(constants.SpanLimitStr)
if err != nil {
zap.L().Error("Error during strconv.Atoi() on SPAN_LIMIT env variable", zap.Error(err))
return
}
result, err := ah.opts.DataConnector.SearchTraces(r.Context(), traceId, spanId, levelUpInt, levelDownInt, spanLimit, db.SmartTraceAlgorithm)
result, err := ah.opts.DataConnector.SearchTraces(r.Context(), searchTracesParams, db.SmartTraceAlgorithm)
if ah.HandleError(w, err, http.StatusBadRequest) {
return
}

View File

@ -13,6 +13,11 @@ import (
func SmartTraceAlgorithm(payload []basemodel.SearchSpanResponseItem, targetSpanId string, levelUp int, levelDown int, spanLimit int) ([]basemodel.SearchSpansResult, error) {
var spans []*model.SpanForTraceDetails
// if targetSpanId is empty or "null", fall back to the first span in the payload as targetSpanId
if (targetSpanId == "" || targetSpanId == "null") && len(payload) > 0 {
targetSpanId = payload[0].SpanID
}
// Build a slice of spans from the payload
for _, spanItem := range payload {
var parentID string
@ -115,6 +120,7 @@ func SmartTraceAlgorithm(payload []basemodel.SearchSpanResponseItem, targetSpanI
searchSpansResult := []basemodel.SearchSpansResult{{
Columns: []string{"__time", "SpanId", "TraceId", "ServiceName", "Name", "Kind", "DurationNano", "TagsKeys", "TagsValues", "References", "Events", "HasError"},
Events: make([][]interface{}, len(resultSpansSet)),
IsSubTree: true,
},
}

View File

@ -11,7 +11,8 @@ const (
var LicenseSignozIo = "https://license.signoz.io/api/v1"
var LicenseAPIKey = GetOrDefaultEnv("SIGNOZ_LICENSE_API_KEY", "")
var SaasSegmentKey = GetOrDefaultEnv("SIGNOZ_SAAS_SEGMENT_KEY", "")
var SpanLimitStr = GetOrDefaultEnv("SPAN_LIMIT", "5000")
var SpanRenderLimitStr = GetOrDefaultEnv("SPAN_RENDER_LIMIT", "2500")
var MaxSpansInTraceStr = GetOrDefaultEnv("MAX_SPANS_IN_TRACE", "250000")
func GetOrDefaultEnv(key string, fallback string) string {
v := os.Getenv(key)

View File

@ -1,30 +1,21 @@
import { volcano } from '@ant-design/colors';
import { InfoCircleOutlined } from '@ant-design/icons';
import { Popover, Typography } from 'antd';
function PopOverContent(): JSX.Element {
return (
<div>
More details on missing spans{' '}
<a
href="https://signoz.io/docs/userguide/traces/#missing-spans"
rel="noopener noreferrer"
target="_blank"
>
here
</a>
</div>
);
}
import { WarningOutlined } from '@ant-design/icons';
import { Typography } from 'antd';
function MissingSpansMessage(): JSX.Element {
return (
<Popover content={PopOverContent} trigger="hover" placement="bottom">
<Typography>
<InfoCircleOutlined style={{ color: volcano[6], marginRight: '0.3rem' }} />
This trace has missing spans
<WarningOutlined style={{ color: volcano[6], marginRight: '0.3rem' }} />
This trace has missing spans, more details{' '}
<a
href="https://signoz.io/docs/userguide/traces/?utm_source=product&utm_medium=trace-details#missing-spans"
target="_blank"
style={{ textDecoration: 'underline' }}
rel="noreferrer"
>
here
</a>
</Typography>
</Popover>
);
}

View File

@ -0,0 +1,22 @@
import { volcano } from '@ant-design/colors';
import { WarningOutlined } from '@ant-design/icons';
import { Typography } from 'antd';
// Walkthrough video linked from the partial-trace notice.
const SUB_TREE_HELP_URL =
	'https://www.loom.com/share/3a26d398278f49919dd185d9c4344b05';

/**
 * Inline warning telling the user that only part of the trace is shown,
 * followed by a link (opened in a new tab) to a video with more information.
 */
function SubTreeMessage(): JSX.Element {
	const iconStyle = { color: volcano[6], marginRight: '0.3rem' };
	const linkStyle = { textDecoration: 'underline' };

	return (
		<Typography>
			<WarningOutlined style={iconStyle} />
			Only part of trace is shown, for more info{' '}
			<a
				href={SUB_TREE_HELP_URL}
				target="_blank"
				style={linkStyle}
				rel="noreferrer"
			>
				watch this
			</a>
		</Typography>
	);
}
export default SubTreeMessage;

View File

@ -33,6 +33,7 @@ import MissingSpansMessage from './Missingtrace';
import SelectedSpanDetails from './SelectedSpanDetails';
import * as styles from './styles';
import { FlameGraphMissingSpansContainer, GanttChartWrapper } from './styles';
import SubTreeMessage from './SubTree';
import {
formUrlParams,
getSortedData,
@ -142,9 +143,10 @@ function TraceDetail({ response }: TraceDetailProps): JSX.Element {
Trace Details
</StyledTypography.Title>
<StyledTypography.Text styledclass={[styles.removeMargin]}>
{traceMetaData.totalSpans} Span
{traceMetaData.totalSpans} Spans
</StyledTypography.Text>
{hasMissingSpans && <MissingSpansMessage />}
{response[0]?.isSubTree && <SubTreeMessage />}
</StyledCol>
<Col flex="auto">
{map(tree.spanTree, (tree) => (

View File

@ -14,6 +14,7 @@ export interface PayloadProps {
events: Span[];
segmentID: string;
columns: string[];
isSubTree: boolean;
};
}

View File

@ -1953,7 +1953,39 @@ func (r *ClickHouseReader) GetUsage(ctx context.Context, queryParams *model.GetU
return &usageItems, nil
}
func (r *ClickHouseReader) SearchTraces(ctx context.Context, traceId string, spanId string, levelUp int, levelDown int, spanLimit int, smartTraceAlgorithm func(payload []model.SearchSpanResponseItem, targetSpanId string, levelUp int, levelDown int, spanLimit int) ([]model.SearchSpansResult, error)) (*[]model.SearchSpansResult, error) {
func (r *ClickHouseReader) SearchTraces(ctx context.Context, params *model.SearchTracesParams,
smartTraceAlgorithm func(payload []model.SearchSpanResponseItem, targetSpanId string,
levelUp int, levelDown int, spanLimit int) ([]model.SearchSpansResult, error)) (*[]model.SearchSpansResult, error) {
var countSpans uint64
countQuery := fmt.Sprintf("SELECT count() as count from %s.%s WHERE traceID=$1", r.TraceDB, r.SpansTable)
err := r.db.QueryRow(ctx, countQuery, params.TraceID).Scan(&countSpans)
if err != nil {
zap.L().Error("Error in processing sql query", zap.Error(err))
return nil, fmt.Errorf("error in processing sql query")
}
if countSpans > uint64(params.MaxSpansInTrace) {
zap.L().Error("Max spans allowed in a trace limit reached", zap.Int("MaxSpansInTrace", params.MaxSpansInTrace),
zap.Uint64("Count", countSpans))
userEmail, err := auth.GetEmailFromJwt(ctx)
if err == nil {
data := map[string]interface{}{
"traceSize": countSpans,
"maxSpansInTraceLimit": params.MaxSpansInTrace,
}
telemetry.GetInstance().SendEvent(telemetry.TELEMETRY_EVENT_MAX_SPANS_ALLOWED_LIMIT_REACHED, data, userEmail, true, false)
}
return nil, fmt.Errorf("Max spans allowed in trace limit reached, please contact support for more details")
}
userEmail, err := auth.GetEmailFromJwt(ctx)
if err == nil {
data := map[string]interface{}{
"traceSize": countSpans,
}
telemetry.GetInstance().SendEvent(telemetry.TELEMETRY_EVENT_TRACE_DETAIL_API, data, userEmail, true, false)
}
var searchScanResponses []model.SearchSpanDBResponseItem
@ -1961,7 +1993,7 @@ func (r *ClickHouseReader) SearchTraces(ctx context.Context, traceId string, spa
start := time.Now()
err := r.db.Select(ctx, &searchScanResponses, query, traceId)
err = r.db.Select(ctx, &searchScanResponses, query, params.TraceID)
zap.L().Info(query)
@ -1974,6 +2006,7 @@ func (r *ClickHouseReader) SearchTraces(ctx context.Context, traceId string, spa
searchSpansResult := []model.SearchSpansResult{{
Columns: []string{"__time", "SpanId", "TraceId", "ServiceName", "Name", "Kind", "DurationNano", "TagsKeys", "TagsValues", "References", "Events", "HasError"},
Events: make([][]interface{}, len(searchScanResponses)),
IsSubTree: false,
},
}
@ -1990,14 +2023,22 @@ func (r *ClickHouseReader) SearchTraces(ctx context.Context, traceId string, spa
err = r.featureFlags.CheckFeature(model.SmartTraceDetail)
smartAlgoEnabled := err == nil
if len(searchScanResponses) > spanLimit && spanId != "" && smartAlgoEnabled {
if len(searchScanResponses) > params.SpansRenderLimit && smartAlgoEnabled {
start = time.Now()
searchSpansResult, err = smartTraceAlgorithm(searchSpanResponses, spanId, levelUp, levelDown, spanLimit)
searchSpansResult, err = smartTraceAlgorithm(searchSpanResponses, params.SpanID, params.LevelUp, params.LevelDown, params.SpansRenderLimit)
if err != nil {
return nil, err
}
end = time.Now()
zap.L().Debug("smartTraceAlgo took: ", zap.Duration("duration", end.Sub(start)))
userEmail, err := auth.GetEmailFromJwt(ctx)
if err == nil {
data := map[string]interface{}{
"traceSize": len(searchScanResponses),
"spansRenderLimit": params.SpansRenderLimit,
}
telemetry.GetInstance().SendEvent(telemetry.TELEMETRY_EVENT_LARGE_TRACE_OPENED, data, userEmail, true, false)
}
} else {
for i, item := range searchSpanResponses {
spanEvents := item.GetValues()

View File

@ -1321,13 +1321,13 @@ func (aH *APIHandler) getServicesList(w http.ResponseWriter, r *http.Request) {
func (aH *APIHandler) SearchTraces(w http.ResponseWriter, r *http.Request) {
traceId, spanId, levelUpInt, levelDownInt, err := ParseSearchTracesParams(r)
params, err := ParseSearchTracesParams(r)
if err != nil {
RespondError(w, &model.ApiError{Typ: model.ErrorBadData, Err: err}, "Error reading params")
return
}
result, err := aH.reader.SearchTraces(r.Context(), traceId, spanId, levelUpInt, levelDownInt, 0, nil)
result, err := aH.reader.SearchTraces(r.Context(), params, nil)
if aH.HandleError(w, err, http.StatusBadRequest) {
return
}

View File

@ -17,11 +17,12 @@ import (
promModel "github.com/prometheus/common/model"
"go.uber.org/multierr"
"go.signoz.io/signoz/ee/query-service/constants"
"go.signoz.io/signoz/pkg/query-service/app/metrics"
"go.signoz.io/signoz/pkg/query-service/app/queryBuilder"
"go.signoz.io/signoz/pkg/query-service/auth"
baseconstants "go.signoz.io/signoz/pkg/query-service/constants"
"go.signoz.io/signoz/pkg/query-service/common"
"go.signoz.io/signoz/pkg/query-service/constants"
"go.signoz.io/signoz/pkg/query-service/model"
v3 "go.signoz.io/signoz/pkg/query-service/model/v3"
"go.signoz.io/signoz/pkg/query-service/postprocess"
@ -250,28 +251,46 @@ func parseGetServicesRequest(r *http.Request) (*model.GetServicesParams, error)
return postData, nil
}
func ParseSearchTracesParams(r *http.Request) (string, string, int, int, error) {
func ParseSearchTracesParams(r *http.Request) (*model.SearchTracesParams, error) {
vars := mux.Vars(r)
traceId := vars["traceId"]
spanId := r.URL.Query().Get("spanId")
levelUp := r.URL.Query().Get("levelUp")
levelDown := r.URL.Query().Get("levelDown")
if levelUp == "" || levelUp == "null" {
levelUp = "0"
params := &model.SearchTracesParams{}
params.TraceID = vars["traceId"]
params.SpanID = r.URL.Query().Get("spanId")
levelUpStr := r.URL.Query().Get("levelUp")
levelDownStr := r.URL.Query().Get("levelDown")
SpanRenderLimitStr := r.URL.Query().Get("spanRenderLimit")
if levelUpStr == "" || levelUpStr == "null" {
levelUpStr = "0"
}
if levelDown == "" || levelDown == "null" {
levelDown = "0"
if levelDownStr == "" || levelDownStr == "null" {
levelDownStr = "0"
}
if SpanRenderLimitStr == "" || SpanRenderLimitStr == "null" {
SpanRenderLimitStr = constants.SpanRenderLimitStr
}
levelUpInt, err := strconv.Atoi(levelUp)
levelUpInt, err := strconv.Atoi(levelUpStr)
if err != nil {
return "", "", 0, 0, err
return nil, err
}
levelDownInt, err := strconv.Atoi(levelDown)
levelDownInt, err := strconv.Atoi(levelDownStr)
if err != nil {
return "", "", 0, 0, err
return nil, err
}
return traceId, spanId, levelUpInt, levelDownInt, nil
SpanRenderLimitInt, err := strconv.Atoi(SpanRenderLimitStr)
if err != nil {
return nil, err
}
MaxSpansInTraceInt, err := strconv.Atoi(constants.MaxSpansInTraceStr)
if err != nil {
return nil, err
}
params.LevelUp = levelUpInt
params.LevelDown = levelDownInt
params.SpansRenderLimit = SpanRenderLimitInt
params.MaxSpansInTrace = MaxSpansInTraceInt
return params, nil
}
func DoesExistInSlice(item string, list []string) bool {
@ -327,16 +346,16 @@ func parseFilteredSpansRequest(r *http.Request, aH *APIHandler) (*model.GetFilte
}
if len(postData.Order) != 0 {
if postData.Order != constants.Ascending && postData.Order != constants.Descending {
if postData.Order != baseconstants.Ascending && postData.Order != baseconstants.Descending {
return nil, errors.New("order param is not in correct format")
}
if postData.OrderParam != constants.Duration && postData.OrderParam != constants.Timestamp {
if postData.OrderParam != baseconstants.Duration && postData.OrderParam != baseconstants.Timestamp {
return nil, errors.New("order param is not in correct format")
}
if postData.OrderParam == constants.Duration && !aH.CheckFeature(constants.DurationSort) {
return nil, model.ErrFeatureUnavailable{Key: constants.DurationSort}
} else if postData.OrderParam == constants.Timestamp && !aH.CheckFeature(constants.TimestampSort) {
return nil, model.ErrFeatureUnavailable{Key: constants.TimestampSort}
if postData.OrderParam == baseconstants.Duration && !aH.CheckFeature(baseconstants.DurationSort) {
return nil, model.ErrFeatureUnavailable{Key: baseconstants.DurationSort}
} else if postData.OrderParam == baseconstants.Timestamp && !aH.CheckFeature(baseconstants.TimestampSort) {
return nil, model.ErrFeatureUnavailable{Key: baseconstants.TimestampSort}
}
}
tags, err := extractTagKeys(postData.Tags)
@ -676,7 +695,7 @@ func parseTTLParams(r *http.Request) (*model.TTLParams, error) {
}
// Validate the type parameter
if typeTTL != constants.TraceTTL && typeTTL != constants.MetricsTTL && typeTTL != constants.LogsTTL {
if typeTTL != baseconstants.TraceTTL && typeTTL != baseconstants.MetricsTTL && typeTTL != baseconstants.LogsTTL {
return nil, fmt.Errorf("type param should be metrics|traces|logs, got %v", typeTTL)
}
@ -715,7 +734,7 @@ func parseGetTTL(r *http.Request) (*model.GetTTLParams, error) {
return nil, fmt.Errorf("type param cannot be empty from the query")
} else {
// Validate the type parameter
if typeTTL != constants.TraceTTL && typeTTL != constants.MetricsTTL && typeTTL != constants.LogsTTL {
if typeTTL != baseconstants.TraceTTL && typeTTL != baseconstants.MetricsTTL && typeTTL != baseconstants.LogsTTL {
return nil, fmt.Errorf("type param should be metrics|traces|logs, got %v", typeTTL)
}
}

View File

@ -52,7 +52,7 @@ type Reader interface {
GetNextPrevErrorIDs(ctx context.Context, params *model.GetErrorParams) (*model.NextPrevErrorIDs, *model.ApiError)
// Search Interfaces
SearchTraces(ctx context.Context, traceID string, spanId string, levelUp int, levelDown int, spanLimit int, smartTraceAlgorithm func(payload []model.SearchSpanResponseItem, targetSpanId string, levelUp int, levelDown int, spanLimit int) ([]model.SearchSpansResult, error)) (*[]model.SearchSpansResult, error)
SearchTraces(ctx context.Context, params *model.SearchTracesParams, smartTraceAlgorithm func(payload []model.SearchSpanResponseItem, targetSpanId string, levelUp int, levelDown int, spanLimit int) ([]model.SearchSpansResult, error)) (*[]model.SearchSpansResult, error)
// Setter Interfaces
SetTTL(ctx context.Context, ttlParams *model.TTLParams) (*model.SetTTLResponseItem, *model.ApiError)

View File

@ -420,6 +420,15 @@ type GetFilteredSpanAggregatesParams struct {
End *time.Time
}
// SearchTracesParams bundles the inputs of a trace-detail search.
// ParseSearchTracesParams builds it from the HTTP request, and the reader's
// SearchTraces method consumes it.
type SearchTracesParams struct {
// TraceID is the trace to load; it is read from the "traceId" route variable.
TraceID string `json:"traceId"`
// LevelUp is the "levelUp" query value (0 when empty or "null"). It is passed
// through to the smart trace algorithm.
// NOTE(review): presumably the number of ancestor levels above SpanID to keep — confirm.
LevelUp int `json:"levelUp"`
// LevelDown is the "levelDown" query value (0 when empty or "null"). It is
// passed through to the smart trace algorithm.
// NOTE(review): presumably the number of descendant levels below SpanID to keep — confirm.
LevelDown int `json:"levelDown"`
// SpanID is the "spanId" query value and is handed to the smart trace
// algorithm as its target span. It may be empty.
SpanID string `json:"spanId"`
// SpansRenderLimit is the "spanRenderLimit" query value, falling back to
// constants.SpanRenderLimitStr when empty or "null". When a trace holds more
// spans than this and the smart trace feature is enabled, the result is
// produced by the smart trace algorithm instead of returning every span.
SpansRenderLimit int `json:"spansRenderLimit"`
// MaxSpansInTrace is parsed from constants.MaxSpansInTraceStr. SearchTraces
// returns an error when the trace's span count exceeds it.
MaxSpansInTrace int `json:"maxSpansInTrace"`
}
type SpanFilterParams struct {
TraceID []string `json:"traceID"`
Status []string `json:"status"`

View File

@ -214,6 +214,7 @@ type ServiceOverviewItem struct {
// SearchSpansResult is the payload returned by a trace-detail search.
type SearchSpansResult struct {
// Columns lists the names of the span attributes carried in the result
// (e.g. "__time", "SpanId", "TraceId", ...).
Columns []string `json:"columns"`
// Events holds one entry per returned span.
// NOTE(review): each entry presumably follows the order of Columns — confirm.
Events [][]interface{} `json:"events"`
// IsSubTree is set to true by the smart trace algorithm, which returns only
// a subset of the trace, and to false when the full trace is returned.
IsSubTree bool `json:"isSubTree"`
}
type GetFilterSpansResponseItem struct {

View File

@ -37,6 +37,9 @@ const (
TELEMETRY_EVENT_LANGUAGE = "Language"
TELEMETRY_EVENT_SERVICE = "ServiceName"
TELEMETRY_EVENT_LOGS_FILTERS = "Logs Filters"
TELEMETRY_EVENT_LARGE_TRACE_OPENED = "Large Trace Opened"
TELEMETRY_EVENT_TRACE_DETAIL_API = "Trace Detail API"
TELEMETRY_EVENT_MAX_SPANS_ALLOWED_LIMIT_REACHED = "Max spans in a trace limit reached"
TELEMETRY_EVENT_DISTRIBUTED = "Distributed"
TELEMETRY_EVENT_QUERY_RANGE_API = "Query Range API"
TELEMETRY_EVENT_DASHBOARDS_ALERTS = "Dashboards/Alerts Info"
@ -61,6 +64,9 @@ var SAAS_EVENTS_LIST = map[string]struct{}{
TELEMETRY_EVENT_SUCCESSFUL_DASHBOARD_PANEL_QUERY: {},
TELEMETRY_EVENT_SUCCESSFUL_ALERT_QUERY: {},
TELEMETRY_EVENT_QUERY_RANGE_API: {},
TELEMETRY_EVENT_MAX_SPANS_ALLOWED_LIMIT_REACHED: {},
TELEMETRY_EVENT_LARGE_TRACE_OPENED: {},
TELEMETRY_EVENT_TRACE_DETAIL_API: {},
}
const api_key = "4Gmoa4ixJAUHx2BpJxsjwA1bEfnwEeRz"