signoz/conf/example.yaml
Vibhu Pandey 1f33928bf9
feat(alertmanager): integrate with ruler (#7222)
### Summary

Integrate the new implementations of the alertmanager along with changes to the ruler. This change can be broadly categoried into 3 parts:

#### Frontend
- The earlier `/api/v1/alerts` api was double encoding the response in json and sending it to the frontend. This PR fixes the json response object. 

For instance, we have gone from the response `{
    "status": "success",
    "data": "{\"status\":\"success\",\"data\":[{\"labels\":{\"alertname\":\"[platform][consumer] consumer is above 100% memory utilization\",\"bu\":\"platform\",\"......
}` to the response `{"status":"success","data":[{"labels":{"alertname":"[Metrics] Pod CP......`

- `msteams` has been changed to `msteamsv2` wherever applicable

#### Ruler
The following changes have been done in the ruler component:

- Removal of the old alertmanager and notifier
- The RuleDB methods `Create`, `Edit` and `Delete` have been made transactional
- Introduction of a new `testPrepareNotifyFunc` for sending test notifications
- Integration with the new alertmanager

#### Alertmanager
Although a huge chunk of the alertmanagers have been merged in previous PRs (the list can be found at https://github.com/SigNoz/platform-pod/issues/404), this PR takes care of changes needed in order to incorporate it with the ruler

- Addition of ruleId based matching
- Support for marshalling the global configuration directly from the upstream alertmanager
- Addition of orgId to the legacy alertmanager
- Support for always adding defaults to both routes and receivers while creating them
- Migration to create the required alertmanager tables
- Migration for msteams to msteamsv2 has been added. We will start using msteamv2 config for the new alertmanager and keep using msteams for the old one.

#### Related Issues / PR's

Closes https://github.com/SigNoz/platform-pod/issues/404
Closes https://github.com/SigNoz/platform-pod/issues/176
2025-03-09 20:00:42 +00:00

144 lines
4.8 KiB
YAML

##################### SigNoz Configuration Example #####################
#
# Do not modify this file
#
##################### Instrumentation #####################
instrumentation:
logs:
# The log level to use.
level: info
traces:
# Whether to enable tracing.
enabled: false
processors:
batch:
exporter:
otlp:
endpoint: localhost:4317
metrics:
# Whether to enable metrics.
enabled: true
readers:
pull:
exporter:
prometheus:
host: "0.0.0.0"
port: 9090
##################### Web #####################
web:
# Whether to enable the web frontend
enabled: true
# The prefix to serve web on
prefix: /
# The directory containing the static build files.
directory: /etc/signoz/web
##################### Cache #####################
cache:
# specifies the caching provider to use.
provider: memory
# memory: Uses in-memory caching.
memory:
# Time-to-live for cache entries in memory. Specify the duration in ns
ttl: 60000000000
# The interval at which the cache will be cleaned up
cleanupInterval: 1m
# redis: Uses Redis as the caching backend.
redis:
# The hostname or IP address of the Redis server.
host: localhost
# The port on which the Redis server is running. Default is usually 6379.
port: 6379
# The password for authenticating with the Redis server, if required.
password:
# The Redis database number to use
db: 0
##################### SQLStore #####################
sqlstore:
# specifies the SQLStore provider to use.
provider: sqlite
# The maximum number of open connections to the database.
max_open_conns: 100
sqlite:
# The path to the SQLite database file.
path: /var/lib/signoz/signoz.db
##################### APIServer #####################
apiserver:
timeout:
# Default request timeout.
default: 60s
# Maximum request timeout.
max: 600s
# List of routes to exclude from request timeout.
excluded_routes:
- /api/v1/logs/tail
- /api/v3/logs/livetail
logging:
# List of routes to exclude from request responselogging.
excluded_routes:
- /api/v1/health
##################### TelemetryStore #####################
telemetrystore:
# Specifies the telemetrystore provider to use.
provider: clickhouse
# Maximum number of idle connections in the connection pool.
max_idle_conns: 50
# Maximum number of open connections to the database.
max_open_conns: 100
# Maximum time to wait for a connection to be established.
dial_timeout: 5s
clickhouse:
# The DSN to use for ClickHouse.
dsn: http://localhost:9000
##################### Alertmanager #####################
alertmanager:
# Specifies the alertmanager provider to use.
provider: legacy
legacy:
# The API URL (with prefix) of the legacy Alertmanager instance.
api_url: http://localhost:9093/api
signoz:
# The poll interval for periodically syncing the alertmanager with the config in the store.
poll_interval: 1m
# The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself.
external_url: http://localhost:9093
# The global configuration for the alertmanager. All the exahustive fields can be found in the upstream: https://github.com/prometheus/alertmanager/blob/efa05feffd644ba4accb526e98a8c6545d26a783/config/config.go#L833
global:
# ResolveTimeout is the time after which an alert is declared resolved if it has not been updated.
resolve_timeout: 5m
route:
# GroupByStr is the list of labels to group alerts by.
group_by:
- alertname
# GroupInterval is the interval at which alerts are grouped.
group_interval: 1m
# GroupWait is the time to wait before sending alerts to receivers.
group_wait: 1m
# RepeatInterval is the interval at which alerts are repeated.
repeat_interval: 1h
alerts:
# Interval between garbage collection of alerts.
gc_interval: 30m
silences:
# Maximum number of silences, including expired silences. If negative or zero, no limit is set.
max: 0
# Maximum size of the silences in bytes. If negative or zero, no limit is set.
max_size_bytes: 0
# Interval between garbage collection and snapshotting of the silences. The snapshot will be stored in the state store.
maintenance_interval: 15m
# Retention of the silences.
retention: 120h
nflog:
# Interval between garbage collection and snapshotting of the notification logs. The snapshot will be stored in the state store.
maintenance_interval: 15m
# Retention of the notification logs.
retention: 120h