package alertmanagerserver

import (
	"context"
	"log/slog"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/alertmanager/dispatch"
	"github.com/prometheus/alertmanager/featurecontrol"
	"github.com/prometheus/alertmanager/inhibit"
	"github.com/prometheus/alertmanager/nflog"
	"github.com/prometheus/alertmanager/notify"
	"github.com/prometheus/alertmanager/provider/mem"
	"github.com/prometheus/alertmanager/silence"
	"github.com/prometheus/alertmanager/template"
	"github.com/prometheus/alertmanager/timeinterval"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/model"
	"go.signoz.io/signoz/pkg/errors"
	"go.signoz.io/signoz/pkg/types/alertmanagertypes"
)

var (
	// This is not a real file and will never be used. We need this placeholder to ensure maintenance runs on shutdown. See
	// https://github.com/prometheus/alertmanager/blob/3ee2cd0f1271e277295c02b6160507b4d193dde2/silence/silence.go#L435-L438
	// and https://github.com/prometheus/alertmanager/blob/3b06b97af4d146e141af92885a185891eb79a5b0/nflog/nflog.go#L362.
	snapfnoop string = "snapfnoop"
)
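
// Server is an embedded alertmanager instance scoped to a single organization. It wires the upstream
// alertmanager primitives (alert provider, silences, notification log, inhibitor and dispatcher) to an
// organization-specific state store.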
type Server struct {
	// logger is the logger for the alertmanager
	logger *slog.Logger

	// registry is the prometheus registry for the alertmanager
	registry prometheus.Registerer

	// srvConfig is the server config for the alertmanager
	srvConfig Config

	// alertmanagerConfig is the config of the alertmanager
	alertmanagerConfig *alertmanagertypes.Config

	// orgID is the orgID for the alertmanager
	orgID string

	// stateStore is the backing store for the alertmanager state
	stateStore alertmanagertypes.StateStore

	// alertmanager primitives from upstream alertmanager
	alerts            *mem.Alerts
	nflog             *nflog.Log
	dispatcher        *dispatch.Dispatcher
	dispatcherMetrics *dispatch.DispatcherMetrics
	inhibitor         *inhibit.Inhibitor
	silencer          *silence.Silencer
	silences          *silence.Silences
	timeIntervals     map[string][]timeinterval.TimeInterval
	pipelineBuilder   *notify.PipelineBuilder
	marker            *alertmanagertypes.MemMarker
	tmpl              *template.Template

	wg    sync.WaitGroup
	stopc chan struct{}
}
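
// New creates a Server for the given orgID. It restores silence and notification-log state from the
// state store (if any), starts the periodic maintenance goroutines for both, and initializes the
// in-memory alert provider. The dispatcher and inhibitor are started later by SetConfig.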
func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registerer, srvConfig Config, orgID string, stateStore alertmanagertypes.StateStore) (*Server, error) {
	server := &Server{
		logger:     logger.With("pkg", "go.signoz.io/pkg/alertmanager/alertmanagerserver"),
		registry:   registry,
		srvConfig:  srvConfig,
		orgID:      orgID,
		stateStore: stateStore,
		stopc:      make(chan struct{}),
	}

	// initialize marker
	server.marker = alertmanagertypes.NewMarker(server.registry)

	// get silences for initial state
	state, err := server.stateStore.Get(ctx, server.orgID)
	if err != nil && !errors.Ast(err, errors.TypeNotFound) {
		return nil, err
	}

	silencesSnapshot := ""
	if state != nil {
		silencesSnapshot, err = state.Get(alertmanagertypes.SilenceStateName)
		if err != nil && !errors.Ast(err, errors.TypeNotFound) {
			return nil, err
		}
	}

	// Initialize silences
	server.silences, err = silence.New(silence.Options{
		SnapshotReader: strings.NewReader(silencesSnapshot),
		Retention:      srvConfig.Silences.Retention,
		Limits: silence.Limits{
			MaxSilences:         func() int { return srvConfig.Silences.Max },
			MaxSilenceSizeBytes: func() int { return srvConfig.Silences.MaxSizeBytes },
		},
		Metrics: server.registry,
		Logger:  server.logger,
	})
	if err != nil {
		return nil, err
	}

	nflogSnapshot := ""
	if state != nil {
		nflogSnapshot, err = state.Get(alertmanagertypes.NFLogStateName)
		if err != nil && !errors.Ast(err, errors.TypeNotFound) {
			return nil, err
		}
	}

	// Initialize notification log
	server.nflog, err = nflog.New(nflog.Options{
		SnapshotReader: strings.NewReader(nflogSnapshot),
		Retention:      server.srvConfig.NFLog.Retention,
		Metrics:        server.registry,
		Logger:         server.logger,
	})
	if err != nil {
		return nil, err
	}

	// Start maintenance for silences
	server.wg.Add(1)
	go func() {
		defer server.wg.Done()
		server.silences.Maintenance(server.srvConfig.Silences.MaintenanceInterval, snapfnoop, server.stopc, func() (int64, error) {
			// Delete silences older than the retention period.
			if _, err := server.silences.GC(); err != nil {
				server.logger.ErrorContext(ctx, "silence garbage collection", "error", err)
				// Don't return here - we need to snapshot our state first.
			}

			storableSilences, err := server.stateStore.Get(ctx, server.orgID)
			if err != nil && !errors.Ast(err, errors.TypeNotFound) {
				return 0, err
			}

			if storableSilences == nil {
				storableSilences = alertmanagertypes.NewStoreableState(server.orgID)
			}

			c, err := storableSilences.Set(alertmanagertypes.SilenceStateName, server.silences)
			if err != nil {
				return 0, err
			}

			return c, server.stateStore.Set(ctx, server.orgID, storableSilences)
		})
	}()

	// Start maintenance for notification logs
	server.wg.Add(1)
	go func() {
		defer server.wg.Done()
		server.nflog.Maintenance(server.srvConfig.NFLog.MaintenanceInterval, snapfnoop, server.stopc, func() (int64, error) {
			if _, err := server.nflog.GC(); err != nil {
				server.logger.ErrorContext(ctx, "notification log garbage collection", "error", err)
				// Don't return without saving the current state.
			}

			storableNFLog, err := server.stateStore.Get(ctx, server.orgID)
			if err != nil && !errors.Ast(err, errors.TypeNotFound) {
				return 0, err
			}

			if storableNFLog == nil {
				storableNFLog = alertmanagertypes.NewStoreableState(server.orgID)
			}

			c, err := storableNFLog.Set(alertmanagertypes.NFLogStateName, server.nflog)
			if err != nil {
				return 0, err
			}

			return c, server.stateStore.Set(ctx, server.orgID, storableNFLog)
		})
	}()
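
	// Initialize the in-memory alert provider. Resolved alerts are garbage collected every GCInterval.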
	server.alerts, err = mem.NewAlerts(ctx, server.marker, server.srvConfig.Alerts.GCInterval, nil, server.logger, server.registry)
	if err != nil {
		return nil, err
	}
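
	// The pipeline builder and dispatcher metrics are created once here; the notification pipeline
	// and dispatcher themselves are (re)built on every SetConfig.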
	server.pipelineBuilder = notify.NewPipelineBuilder(server.registry, featurecontrol.NoopFlags{})
	server.dispatcherMetrics = dispatch.NewDispatcherMetrics(false, server.registry)

	return server, nil
}
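
// GetAlerts returns the alerts matching params from the alert provider, with silenced and inhibited
// status resolved before filtering.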
func (server *Server) GetAlerts(ctx context.Context, params alertmanagertypes.GettableAlertsParams) (alertmanagertypes.GettableAlerts, error) {
	return alertmanagertypes.NewGettableAlertsFromAlertProvider(server.alerts, server.alertmanagerConfig, server.marker.Status, func(labels model.LabelSet) {
		server.inhibitor.Mutes(labels)
		server.silencer.Mutes(labels)
	}, params)
}
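
// PutAlerts converts the postable alerts and stores them in the alert provider. Valid alerts are
// stored even when some alerts fail validation; the validation errors are returned afterwards.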
func (server *Server) PutAlerts(ctx context.Context, postableAlerts alertmanagertypes.PostableAlerts) error {
	alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts(postableAlerts, time.Duration(server.srvConfig.Global.ResolveTimeout), time.Now())

	// Notification sending alert takes precedence over validation errors.
	if err := server.alerts.Put(alerts...); err != nil {
		return err
	}

	if err != nil {
		return errors.Join(err...)
	}

	return nil
}
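
// SetConfig applies a new alertmanager configuration. It rebuilds the templates, routing tree,
// receiver integrations and time intervals, then replaces the running inhibitor and dispatcher
// with ones built from the new configuration.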
func (server *Server) SetConfig(ctx context.Context, alertmanagerConfig *alertmanagertypes.Config) error {
	config := alertmanagerConfig.AlertmanagerConfig()

	var err error
	server.tmpl, err = alertmanagertypes.FromGlobs(config.Templates)
	if err != nil {
		return err
	}
	server.tmpl.ExternalURL = server.srvConfig.ExternalURL

	// Build the routing tree and record which receivers are used.
	routes := dispatch.NewRoute(config.Route, nil)
	activeReceivers := make(map[string]struct{})
	routes.Walk(func(r *dispatch.Route) {
		activeReceivers[r.RouteOpts.Receiver] = struct{}{}
	})

	// Build the map of receiver to integrations.
	receivers := make(map[string][]notify.Integration, len(activeReceivers))
	var integrationsNum int
	for _, rcv := range config.Receivers {
		if _, found := activeReceivers[rcv.Name]; !found {
			// No need to build a receiver if no route is using it.
			server.logger.InfoContext(ctx, "skipping creation of receiver not referenced by any route", "receiver", rcv.Name)
			continue
		}

		integrations, err := alertmanagertypes.NewReceiverIntegrations(rcv, server.tmpl, server.logger)
		if err != nil {
			return err
		}

		// rcv.Name is guaranteed to be unique across all receivers.
		receivers[rcv.Name] = integrations
		integrationsNum += len(integrations)
	}

	// Build the map of time interval names to time interval definitions.
	timeIntervals := make(map[string][]timeinterval.TimeInterval, len(config.MuteTimeIntervals)+len(config.TimeIntervals))
	for _, ti := range config.MuteTimeIntervals {
		timeIntervals[ti.Name] = ti.TimeIntervals
	}
	for _, ti := range config.TimeIntervals {
		timeIntervals[ti.Name] = ti.TimeIntervals
	}
	intervener := timeinterval.NewIntervener(timeIntervals)
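
	// Stop the inhibitor and dispatcher built from the previous configuration (if any) before
	// creating new ones.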
	if server.inhibitor != nil {
		server.inhibitor.Stop()
	}
	if server.dispatcher != nil {
		server.dispatcher.Stop()
	}

	server.inhibitor = inhibit.NewInhibitor(server.alerts, config.InhibitRules, server.marker, server.logger)
	server.timeIntervals = timeIntervals
	server.silencer = silence.NewSilencer(server.silences, server.marker, server.logger)
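
	// pipelinePeer stays nil: this alertmanager runs standalone, without a cluster peer.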
	var pipelinePeer notify.Peer
	pipeline := server.pipelineBuilder.New(
		receivers,
		func() time.Duration { return 0 },
		server.inhibitor,
		server.silencer,
		intervener,
		server.marker,
		server.nflog,
		pipelinePeer,
	)
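
	// Never let a notification time out below the notifier's minimum timeout.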
	timeoutFunc := func(d time.Duration) time.Duration {
		if d < notify.MinTimeout {
			d = notify.MinTimeout
		}
		return d
	}

	server.dispatcher = dispatch.NewDispatcher(
		server.alerts,
		routes,
		pipeline,
		server.marker,
		timeoutFunc,
		nil,
		server.logger,
		server.dispatcherMetrics,
	)

	// Do not try to add these to server.wg as there seems to be a race condition if
	// we call Start() and Stop() in quick succession.
	// Both these goroutines will run indefinitely.
	go server.dispatcher.Run()
	go server.inhibitor.Run()

	server.alertmanagerConfig = alertmanagerConfig

	return nil
}
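
// TestReceiver sends a synthetic test alert through the given receiver using the currently applied
// configuration and templates.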
func (server *Server) TestReceiver(ctx context.Context, receiver alertmanagertypes.Receiver) error {
	return alertmanagertypes.TestReceiver(ctx, receiver, server.alertmanagerConfig, server.tmpl, server.logger, alertmanagertypes.NewTestAlert(receiver, time.Now(), time.Now()))
}
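
// TestAlert sends the given alert to each of the named receivers concurrently and joins any errors.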
func (server *Server) TestAlert(ctx context.Context, postableAlert *alertmanagertypes.PostableAlert, receivers []string) error {
	alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts(alertmanagertypes.PostableAlerts{postableAlert}, time.Duration(server.srvConfig.Global.ResolveTimeout), time.Now())
	if err != nil {
		return errors.Join(err...)
	}

	if len(alerts) != 1 {
		return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, "expected 1 alert, got %d", len(alerts))
	}

	ch := make(chan error, len(receivers))
	for _, receiverName := range receivers {
		go func(receiverName string) {
			receiver, err := server.alertmanagerConfig.GetReceiver(receiverName)
			if err != nil {
				ch <- err
				return
			}

			ch <- alertmanagertypes.TestReceiver(ctx, receiver, server.alertmanagerConfig, server.tmpl, server.logger, alerts[0])
		}(receiverName)
	}

	var errs []error
	for i := 0; i < len(receivers); i++ {
		if err := <-ch; err != nil {
			errs = append(errs, err)
		}
	}

	if errs != nil {
		return errors.Join(errs...)
	}

	return nil
}
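
// Hash returns the hash of the currently applied configuration, or an empty string if no
// configuration has been applied yet.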
func (server *Server) Hash() string {
	if server.alertmanagerConfig == nil {
		return ""
	}

	return server.alertmanagerConfig.StoreableConfig().Hash
}
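
// Stop shuts down the dispatcher, inhibitor and alert provider, signals the maintenance goroutines
// to stop, and waits for them to finish.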
func (server *Server) Stop(ctx context.Context) error {
	if server.dispatcher != nil {
		server.dispatcher.Stop()
	}

	if server.inhibitor != nil {
		server.inhibitor.Stop()
	}

	// Close the alert provider.
	server.alerts.Close()

	// Signals maintenance goroutines of server states to stop.
	close(server.stopc)

	// Wait for all goroutines to finish.
	server.wg.Wait()

	return nil
}