mirror of
https://git.mirrors.martin98.com/https://github.com/SigNoz/signoz
synced 2025-06-04 11:25:52 +08:00
384 lines
12 KiB
Go
384 lines
12 KiB
Go
package alertmanagerserver
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/alertmanager/dispatch"
|
|
"github.com/prometheus/alertmanager/featurecontrol"
|
|
"github.com/prometheus/alertmanager/inhibit"
|
|
"github.com/prometheus/alertmanager/nflog"
|
|
"github.com/prometheus/alertmanager/notify"
|
|
"github.com/prometheus/alertmanager/provider/mem"
|
|
"github.com/prometheus/alertmanager/silence"
|
|
"github.com/prometheus/alertmanager/template"
|
|
"github.com/prometheus/alertmanager/timeinterval"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/common/model"
|
|
"go.signoz.io/signoz/pkg/errors"
|
|
"go.signoz.io/signoz/pkg/types/alertmanagertypes"
|
|
)
|
|
|
|
// snapfnoop is a placeholder snapshot file name. It is not a real file and
// will never be used; it is handed to the silences and notification log
// Maintenance loops only so that maintenance still runs on shutdown. See
// https://github.com/prometheus/alertmanager/blob/3ee2cd0f1271e277295c02b6160507b4d193dde2/silence/silence.go#L435-L438
// and https://github.com/prometheus/alertmanager/blob/3b06b97af4d146e141af92885a185891eb79a5b0/nflog/nflog.go#L362.
var snapfnoop = "snapfnoop"
|
|
|
|
type Server struct {
|
|
// logger is the logger for the alertmanager
|
|
logger *slog.Logger
|
|
|
|
// registry is the prometheus registry for the alertmanager
|
|
registry prometheus.Registerer
|
|
|
|
// srvConfig is the server config for the alertmanager
|
|
srvConfig Config
|
|
|
|
// alertmanagerConfig is the config of the alertmanager
|
|
alertmanagerConfig *alertmanagertypes.Config
|
|
|
|
// orgID is the orgID for the alertmanager
|
|
orgID string
|
|
|
|
// store is the backing store for the alertmanager
|
|
stateStore alertmanagertypes.StateStore
|
|
|
|
// alertmanager primitives from upstream alertmanager
|
|
alerts *mem.Alerts
|
|
nflog *nflog.Log
|
|
dispatcher *dispatch.Dispatcher
|
|
dispatcherMetrics *dispatch.DispatcherMetrics
|
|
inhibitor *inhibit.Inhibitor
|
|
silencer *silence.Silencer
|
|
silences *silence.Silences
|
|
timeIntervals map[string][]timeinterval.TimeInterval
|
|
pipelineBuilder *notify.PipelineBuilder
|
|
marker *alertmanagertypes.MemMarker
|
|
tmpl *template.Template
|
|
wg sync.WaitGroup
|
|
stopc chan struct{}
|
|
}
|
|
|
|
func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registerer, srvConfig Config, orgID string, stateStore alertmanagertypes.StateStore) (*Server, error) {
|
|
server := &Server{
|
|
logger: logger.With("pkg", "go.signoz.io/pkg/alertmanager/alertmanagerserver"),
|
|
registry: registry,
|
|
srvConfig: srvConfig,
|
|
orgID: orgID,
|
|
stateStore: stateStore,
|
|
stopc: make(chan struct{}),
|
|
}
|
|
// initialize marker
|
|
server.marker = alertmanagertypes.NewMarker(server.registry)
|
|
|
|
// get silences for initial state
|
|
state, err := server.stateStore.Get(ctx, server.orgID)
|
|
if err != nil && !errors.Ast(err, errors.TypeNotFound) {
|
|
return nil, err
|
|
}
|
|
|
|
silencesSnapshot := ""
|
|
if state != nil {
|
|
silencesSnapshot, err = state.Get(alertmanagertypes.SilenceStateName)
|
|
if err != nil && !errors.Ast(err, errors.TypeNotFound) {
|
|
return nil, err
|
|
}
|
|
}
|
|
// Initialize silences
|
|
server.silences, err = silence.New(silence.Options{
|
|
SnapshotReader: strings.NewReader(silencesSnapshot),
|
|
Retention: srvConfig.Silences.Retention,
|
|
Limits: silence.Limits{
|
|
MaxSilences: func() int { return srvConfig.Silences.Max },
|
|
MaxSilenceSizeBytes: func() int { return srvConfig.Silences.MaxSizeBytes },
|
|
},
|
|
Metrics: server.registry,
|
|
Logger: server.logger,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
nflogSnapshot := ""
|
|
if state != nil {
|
|
nflogSnapshot, err = state.Get(alertmanagertypes.NFLogStateName)
|
|
if err != nil && !errors.Ast(err, errors.TypeNotFound) {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// Initialize notification log
|
|
server.nflog, err = nflog.New(nflog.Options{
|
|
SnapshotReader: strings.NewReader(nflogSnapshot),
|
|
Retention: server.srvConfig.NFLog.Retention,
|
|
Metrics: server.registry,
|
|
Logger: server.logger,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Start maintenance for silences
|
|
server.wg.Add(1)
|
|
go func() {
|
|
defer server.wg.Done()
|
|
server.silences.Maintenance(server.srvConfig.Silences.MaintenanceInterval, snapfnoop, server.stopc, func() (int64, error) {
|
|
// Delete silences older than the retention period.
|
|
if _, err := server.silences.GC(); err != nil {
|
|
server.logger.ErrorContext(ctx, "silence garbage collection", "error", err)
|
|
// Don't return here - we need to snapshot our state first.
|
|
}
|
|
|
|
storableSilences, err := server.stateStore.Get(ctx, server.orgID)
|
|
if err != nil && !errors.Ast(err, errors.TypeNotFound) {
|
|
return 0, err
|
|
}
|
|
|
|
if storableSilences == nil {
|
|
storableSilences = alertmanagertypes.NewStoreableState(server.orgID)
|
|
}
|
|
|
|
c, err := storableSilences.Set(alertmanagertypes.SilenceStateName, server.silences)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
return c, server.stateStore.Set(ctx, server.orgID, storableSilences)
|
|
})
|
|
|
|
}()
|
|
|
|
// Start maintenance for notification logs
|
|
server.wg.Add(1)
|
|
go func() {
|
|
defer server.wg.Done()
|
|
server.nflog.Maintenance(server.srvConfig.NFLog.MaintenanceInterval, snapfnoop, server.stopc, func() (int64, error) {
|
|
if _, err := server.nflog.GC(); err != nil {
|
|
server.logger.ErrorContext(ctx, "notification log garbage collection", "error", err)
|
|
// Don't return without saving the current state.
|
|
}
|
|
|
|
storableNFLog, err := server.stateStore.Get(ctx, server.orgID)
|
|
if err != nil && !errors.Ast(err, errors.TypeNotFound) {
|
|
return 0, err
|
|
}
|
|
|
|
if storableNFLog == nil {
|
|
storableNFLog = alertmanagertypes.NewStoreableState(server.orgID)
|
|
}
|
|
|
|
c, err := storableNFLog.Set(alertmanagertypes.NFLogStateName, server.nflog)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
return c, server.stateStore.Set(ctx, server.orgID, storableNFLog)
|
|
})
|
|
}()
|
|
|
|
server.alerts, err = mem.NewAlerts(ctx, server.marker, server.srvConfig.Alerts.GCInterval, nil, server.logger, server.registry)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
server.pipelineBuilder = notify.NewPipelineBuilder(server.registry, featurecontrol.NoopFlags{})
|
|
server.dispatcherMetrics = dispatch.NewDispatcherMetrics(false, server.registry)
|
|
|
|
return server, nil
|
|
}
|
|
|
|
func (server *Server) GetAlerts(ctx context.Context, params alertmanagertypes.GettableAlertsParams) (alertmanagertypes.GettableAlerts, error) {
|
|
return alertmanagertypes.NewGettableAlertsFromAlertProvider(server.alerts, server.alertmanagerConfig, server.marker.Status, func(labels model.LabelSet) {
|
|
server.inhibitor.Mutes(labels)
|
|
server.silencer.Mutes(labels)
|
|
}, params)
|
|
}
|
|
|
|
func (server *Server) PutAlerts(ctx context.Context, postableAlerts alertmanagertypes.PostableAlerts) error {
|
|
alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts(postableAlerts, time.Duration(server.srvConfig.Global.ResolveTimeout), time.Now())
|
|
|
|
// Notification sending alert takes precedence over validation errors.
|
|
if err := server.alerts.Put(alerts...); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err != nil {
|
|
return errors.Join(err...)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (server *Server) SetConfig(ctx context.Context, alertmanagerConfig *alertmanagertypes.Config) error {
|
|
config := alertmanagerConfig.AlertmanagerConfig()
|
|
|
|
var err error
|
|
server.tmpl, err = alertmanagertypes.FromGlobs(config.Templates)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
server.tmpl.ExternalURL = server.srvConfig.ExternalURL
|
|
|
|
// Build the routing tree and record which receivers are used.
|
|
routes := dispatch.NewRoute(config.Route, nil)
|
|
activeReceivers := make(map[string]struct{})
|
|
routes.Walk(func(r *dispatch.Route) {
|
|
activeReceivers[r.RouteOpts.Receiver] = struct{}{}
|
|
})
|
|
|
|
// Build the map of receiver to integrations.
|
|
receivers := make(map[string][]notify.Integration, len(activeReceivers))
|
|
var integrationsNum int
|
|
for _, rcv := range config.Receivers {
|
|
if _, found := activeReceivers[rcv.Name]; !found {
|
|
// No need to build a receiver if no route is using it.
|
|
server.logger.InfoContext(ctx, "skipping creation of receiver not referenced by any route", "receiver", rcv.Name)
|
|
continue
|
|
}
|
|
integrations, err := alertmanagertypes.NewReceiverIntegrations(rcv, server.tmpl, server.logger)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// rcv.Name is guaranteed to be unique across all receivers.
|
|
receivers[rcv.Name] = integrations
|
|
integrationsNum += len(integrations)
|
|
}
|
|
|
|
// Build the map of time interval names to time interval definitions.
|
|
timeIntervals := make(map[string][]timeinterval.TimeInterval, len(config.MuteTimeIntervals)+len(config.TimeIntervals))
|
|
for _, ti := range config.MuteTimeIntervals {
|
|
timeIntervals[ti.Name] = ti.TimeIntervals
|
|
}
|
|
|
|
for _, ti := range config.TimeIntervals {
|
|
timeIntervals[ti.Name] = ti.TimeIntervals
|
|
}
|
|
|
|
intervener := timeinterval.NewIntervener(timeIntervals)
|
|
|
|
if server.inhibitor != nil {
|
|
server.inhibitor.Stop()
|
|
}
|
|
if server.dispatcher != nil {
|
|
server.dispatcher.Stop()
|
|
}
|
|
|
|
server.inhibitor = inhibit.NewInhibitor(server.alerts, config.InhibitRules, server.marker, server.logger)
|
|
server.timeIntervals = timeIntervals
|
|
server.silencer = silence.NewSilencer(server.silences, server.marker, server.logger)
|
|
|
|
var pipelinePeer notify.Peer
|
|
pipeline := server.pipelineBuilder.New(
|
|
receivers,
|
|
func() time.Duration { return 0 },
|
|
server.inhibitor,
|
|
server.silencer,
|
|
intervener,
|
|
server.marker,
|
|
server.nflog,
|
|
pipelinePeer,
|
|
)
|
|
|
|
timeoutFunc := func(d time.Duration) time.Duration {
|
|
if d < notify.MinTimeout {
|
|
d = notify.MinTimeout
|
|
}
|
|
return d
|
|
}
|
|
|
|
server.dispatcher = dispatch.NewDispatcher(
|
|
server.alerts,
|
|
routes,
|
|
pipeline,
|
|
server.marker,
|
|
timeoutFunc,
|
|
nil,
|
|
server.logger,
|
|
server.dispatcherMetrics,
|
|
)
|
|
|
|
// Do not try to add these to server.wg as there seems to be a race condition if
|
|
// we call Start() and Stop() in quick succession.
|
|
// Both these goroutines will run indefinitely.
|
|
go server.dispatcher.Run()
|
|
go server.inhibitor.Run()
|
|
|
|
server.alertmanagerConfig = alertmanagerConfig
|
|
return nil
|
|
}
|
|
|
|
func (server *Server) TestReceiver(ctx context.Context, receiver alertmanagertypes.Receiver) error {
|
|
return alertmanagertypes.TestReceiver(ctx, receiver, server.alertmanagerConfig, server.tmpl, server.logger, alertmanagertypes.NewTestAlert(receiver, time.Now(), time.Now()))
|
|
}
|
|
|
|
func (server *Server) TestAlert(ctx context.Context, postableAlert *alertmanagertypes.PostableAlert, receivers []string) error {
|
|
alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts(alertmanagertypes.PostableAlerts{postableAlert}, time.Duration(server.srvConfig.Global.ResolveTimeout), time.Now())
|
|
if err != nil {
|
|
return errors.Join(err...)
|
|
}
|
|
|
|
if len(alerts) != 1 {
|
|
return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, "expected 1 alert, got %d", len(alerts))
|
|
}
|
|
|
|
ch := make(chan error, len(receivers))
|
|
for _, receiverName := range receivers {
|
|
go func(receiverName string) {
|
|
receiver, err := server.alertmanagerConfig.GetReceiver(receiverName)
|
|
if err != nil {
|
|
ch <- err
|
|
return
|
|
}
|
|
ch <- alertmanagertypes.TestReceiver(ctx, receiver, server.alertmanagerConfig, server.tmpl, server.logger, alerts[0])
|
|
}(receiverName)
|
|
}
|
|
|
|
var errs []error
|
|
for i := 0; i < len(receivers); i++ {
|
|
if err := <-ch; err != nil {
|
|
errs = append(errs, err)
|
|
}
|
|
}
|
|
|
|
if errs != nil {
|
|
return errors.Join(errs...)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (server *Server) Hash() string {
|
|
if server.alertmanagerConfig == nil {
|
|
return ""
|
|
}
|
|
|
|
return server.alertmanagerConfig.StoreableConfig().Hash
|
|
}
|
|
|
|
func (server *Server) Stop(ctx context.Context) error {
|
|
if server.dispatcher != nil {
|
|
server.dispatcher.Stop()
|
|
}
|
|
|
|
if server.inhibitor != nil {
|
|
server.inhibitor.Stop()
|
|
}
|
|
|
|
// Close the alert provider.
|
|
server.alerts.Close()
|
|
|
|
// Signals maintenance goroutines of server states to stop.
|
|
close(server.stopc)
|
|
|
|
// Wait for all goroutines to finish.
|
|
server.wg.Wait()
|
|
|
|
return nil
|
|
}
|