chore: deployment config changes (#869)

* chore(install-script): 🔧 include missing sudo_cmd variable

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore: 🔧 add .gitkeep in folders to mount

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore(docker-swarm): 🔧 Update deploy configurations

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore(compose-yaml): 🔧 expose OTLP ports and add restart-on-failure policy

Signed-off-by: Prashant Shahi <prashant@signoz.io>

Co-authored-by: Ankit Nayan <ankit@signoz.io>
Prashant Shahi, 2022-03-21 20:43:43 +05:30, committed via GitHub
parent 044f02c7c7, commit 86bdb9a5ad
15 changed files with 223 additions and 156 deletions
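
A recurring change across the Compose files below is the restart policy: the standalone files gain "restart: on-failure", while the swarm file expresses the same intent under deploy.restart_policy. A minimal sketch of the two forms (the service name is hypothetical; historically, plain docker-compose ignores the deploy key, and swarm mode ignores restart):

services:
  some-service:                  # hypothetical service name
    image: signoz/query-service:0.7.1
    restart: on-failure          # standalone docker-compose form
    deploy:
      restart_policy:            # docker swarm form
        condition: on-failure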

.gitignore

@@ -42,4 +42,7 @@ frontend/cypress.env.json
frontend/*.env
pkg/query-service/signoz.db
# local data
/deploy/docker/clickhouse-setup/data/
/deploy/docker-swarm/clickhouse-setup/data/
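
The companion .gitkeep change keeps these now-ignored mount directories present in a fresh clone. A common pattern, sketched here for one directory, is to ignore the directory's contents while re-including the placeholder:

/deploy/docker/clickhouse-setup/data/*
!/deploy/docker/clickhouse-setup/data/.gitkeep

Ignoring the whole directory, as this diff does, works too: files already tracked by git, such as a committed .gitkeep, are unaffected by .gitignore.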

alertmanager.yml (new file)

@@ -0,0 +1,35 @@
global:
resolve_timeout: 1m
slack_api_url: 'https://hooks.slack.com/services/xxx'
route:
receiver: 'slack-notifications'
receivers:
- name: 'slack-notifications'
slack_configs:
- channel: '#alerts'
send_resolved: true
icon_url: https://avatars3.githubusercontent.com/u/3380462
title: |-
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
{{- if gt (len .CommonLabels) (len .GroupLabels) -}}
{{" "}}(
{{- with .CommonLabels.Remove .GroupLabels.Names }}
{{- range $index, $label := .SortedPairs -}}
{{ if $index }}, {{ end }}
{{- $label.Name }}="{{ $label.Value -}}"
{{- end }}
{{- end -}}
)
{{- end }}
text: >-
{{ range .Alerts -}}
*Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
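
As a sketch of how this file typically grows (the route and receiver below are hypothetical, not part of this commit), severity-based routing can be layered on top of the default receiver:

route:
  receiver: 'slack-notifications'
  routes:
    - match:
        severity: critical         # hypothetical: escalate critical alerts
      receiver: 'pagerduty-critical'
receivers:
  - name: 'pagerduty-critical'     # hypothetical receiver
    pagerduty_configs:
      - service_key: '<integration-key>'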

alerts.yml (new file)

@@ -0,0 +1,11 @@
groups:
- name: ExampleCPULoadGroup
rules:
- alert: HighCpuLoad
expr: system_cpu_load_average_1m > 0.1
for: 0m
labels:
severity: warning
annotations:
summary: High CPU load
description: "CPU load is > 0.1\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

docker-compose.yaml (docker swarm)

@@ -1,19 +1,14 @@
version: "3"
version: "3.9"
services:
clickhouse:
image: yandex/clickhouse-server
expose:
- 8123
- 9000
ports:
- 9001:9000
- 8123:8123
image: yandex/clickhouse-server:21.12.3.32
volumes:
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
- ./docker-entrypoint-initdb.d/init-db.sql:/docker-entrypoint-initdb.d/init-db.sql
- ./data/clickhouse/:/var/lib/clickhouse/
deploy:
restart_policy:
condition: on-failure
healthcheck:
# "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -21,10 +16,20 @@ services:
timeout: 5s
retries: 3
alertmanager:
image: signoz/alertmanager:0.5.0
volumes:
- ./alertmanager.yml:/prometheus/alertmanager.yml
- ./data/alertmanager:/data
command:
- '--config.file=/prometheus/alertmanager.yml'
- '--storage.path=/data'
deploy:
restart_policy:
condition: on-failure
query-service:
image: signoz/query-service:0.4.1
container_name: query-service
restart: always
image: signoz/query-service:0.7.1
command: ["-config=/root/config/prometheus.yml"]
ports:
- "8080:8080"
@@ -35,77 +40,75 @@ services:
environment:
- ClickHouseUrl=tcp://clickhouse:9000
- STORAGE=clickhouse
- POSTHOG_API_KEY=H-htDCae7CR3RV57gUzmol6IAKtm5IMCvbcm_fwnL-w
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm
deploy:
restart_policy:
condition: on-failure
depends_on:
- clickhouse
frontend:
image: signoz/frontend:0.4.1
container_name: frontend
image: signoz/frontend:0.7.1
depends_on:
- query-service
links:
- "query-service"
ports:
- "3301:3301"
volumes:
- ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf
otel-collector:
image: signoz/otelcontribcol:0.4.0
command: ["--config=/etc/otel-collector-config.yaml", "--mem-ballast-size-mib=2000"]
image: signoz/otelcontribcol:0.43.0
command: ["--config=/etc/otel-collector-config.yaml"]
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports:
- "1777:1777" # pprof extension
- "8887:8888" # Prometheus metrics exposed by the agent
- "14268:14268" # Jaeger receiver
- "55678" # OpenCensus receiver
- "55680:55680" # OTLP HTTP/2.0 legacy port
- "55681:55681" # OTLP HTTP/1.0 receiver
- "4317:4317" # OTLP GRPC receiver
- "55679:55679" # zpages extension
- "13133" # health_check
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP HTTP receiver
# - "8889:8889" # Prometheus metrics exposed by the agent
# - "13133" # health_check
# - "14268:14268" # Jaeger receiver
# - "55678:55678" # OpenCensus receiver
# - "55679:55679" # zpages extension
# - "55680:55680" # OTLP gRPC legacy receiver
# - "55681:55681" # OTLP HTTP legacy receiver
deploy:
mode: replicated
replicas: 3
restart_policy:
condition: on-failure
resources:
limits:
memory: 2000m
depends_on:
- clickhouse
otel-collector-hostmetrics:
image: signoz/otelcontribcol:0.4.0
command: ["--config=/etc/otel-collector-config-hostmetrics.yaml", "--mem-ballast-size-mib=683"]
otel-collector-metrics:
image: signoz/otelcontribcol:0.43.0
command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes:
- ./otel-collector-config-hostmetrics.yaml:/etc/otel-collector-config-hostmetrics.yaml
- ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
deploy:
restart_policy:
condition: on-failure
depends_on:
- clickhouse
hotrod:
image: jaegertracing/example-hotrod:latest
container_name: hotrod
ports:
- "9000:8080"
image: jaegertracing/example-hotrod:1.30
command: ["all"]
environment:
- JAEGER_ENDPOINT=http://otel-collector:14268/api/traces
logging:
options:
max-size: 50m
max-file: "3"
load-hotrod:
image: "grubykarol/locust:1.2.3-python3.9-alpine3.12"
container_name: load-hotrod
hostname: load-hotrod
ports:
- "8089:8089"
environment:
ATTACKED_HOST: http://hotrod:8080
LOCUST_MODE: standalone
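
With 4317 (gRPC) and 4318 (HTTP) now published, an instrumented application can be pointed at the collector through the standard OTLP environment variable — a hedged sketch, with a hypothetical service:

  my-app:
    image: my-org/my-app:latest        # hypothetical application image
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317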

otel-collector-config-hostmetrics.yaml (deleted)

@@ -1,72 +0,0 @@
receivers:
otlp:
protocols:
grpc:
http:
jaeger:
protocols:
grpc:
thrift_http:
hostmetrics:
collection_interval: 60s
scrapers:
cpu:
load:
memory:
disk:
filesystem:
network:
# Data sources: metrics
prometheus:
config:
scrape_configs:
- job_name: "otel-collector"
dns_sd_configs:
- names:
- 'tasks.signoz_otel-collector'
type: 'A'
port: 8888
- job_name: "otel-collector-hostmetrics"
scrape_interval: 10s
static_configs:
- targets: ["otel-collector-hostmetrics:8888"]
processors:
batch:
send_batch_size: 1000
timeout: 10s
memory_limiter:
# Same as --mem-ballast-size-mib CLI argument
ballast_size_mib: 683
# 80% of maximum memory up to 2G
limit_mib: 1500
# 25% of limit up to 2G
spike_limit_mib: 512
check_interval: 5s
# queued_retry:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
extensions:
health_check: {}
zpages: {}
exporters:
clickhouse:
datasource: tcp://clickhouse:9000
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
resource_to_telemetry_conversion:
enabled: true
service:
extensions: [health_check, zpages]
pipelines:
traces:
receivers: [jaeger, otlp]
processors: [batch]
exporters: [clickhouse]
metrics:
receivers: [otlp, prometheus, hostmetrics]
processors: [batch]
exporters: [clickhousemetricswrite]
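
Note that deleting this file does not drop its responsibilities: the hostmetrics scrapers reappear in otel-collector-config.yaml below, and the collector self-scrape job moves into the new otel-collector-metrics-config.yaml.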

otel-collector-config.yaml

@@ -1,4 +1,8 @@
receivers:
otlp/spanmetrics:
protocols:
grpc:
endpoint: "localhost:12345"
otlp:
protocols:
grpc:
@@ -7,18 +11,30 @@ receivers:
protocols:
grpc:
thrift_http:
hostmetrics:
collection_interval: 30s
scrapers:
cpu:
load:
memory:
disk:
filesystem:
network:
processors:
batch:
send_batch_size: 1000
timeout: 10s
memory_limiter:
# Same as --mem-ballast-size-mib CLI argument
ballast_size_mib: 683
# 80% of maximum memory up to 2G
limit_mib: 1500
# 25% of limit up to 2G
spike_limit_mib: 512
check_interval: 5s
signozspanmetrics/prometheus:
metrics_exporter: prometheus
latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ]
# memory_limiter:
# # Same as --mem-ballast-size-mib CLI argument
# ballast_size_mib: 683
# # 80% of maximum memory up to 2G
# limit_mib: 1500
# # 25% of limit up to 2G
# spike_limit_mib: 512
# check_interval: 5s
# queued_retry:
# num_workers: 4
# queue_size: 100
@@ -33,15 +49,19 @@ exporters:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
resource_to_telemetry_conversion:
enabled: true
prometheus:
endpoint: "0.0.0.0:8889"
service:
extensions: [health_check, zpages]
pipelines:
traces:
receivers: [jaeger, otlp]
processors: [batch]
processors: [signozspanmetrics/prometheus, batch]
exporters: [clickhouse]
metrics:
receivers: [otlp]
receivers: [otlp, hostmetrics]
processors: [batch]
exporters: [clickhousemetricswrite]
metrics/spanmetrics:
receivers: [otlp/spanmetrics]
exporters: [prometheus]
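
The span-metrics flow added here is intentionally circular: the signozspanmetrics/prometheus processor derives request and latency metrics from the trace pipeline and hands them to the prometheus exporter on 0.0.0.0:8889, which the separate metrics collector (next file) scrapes. The otlp/spanmetrics receiver bound to localhost:12345 appears to act only as a placeholder, since every pipeline must declare at least one receiver.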

otel-collector-metrics-config.yaml (new file)

@@ -0,0 +1,44 @@
receivers:
otlp:
protocols:
grpc:
http:
# Data sources: metrics
prometheus:
config:
scrape_configs:
- job_name: "otel-collector"
scrape_interval: 30s
static_configs:
- targets: ["otel-collector:8889"]
processors:
batch:
send_batch_size: 1000
timeout: 10s
# memory_limiter:
# # Same as --mem-ballast-size-mib CLI argument
# ballast_size_mib: 683
# # 80% of maximum memory up to 2G
# limit_mib: 1500
# # 25% of limit up to 2G
# spike_limit_mib: 512
# check_interval: 5s
# queued_retry:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
extensions:
health_check: {}
zpages: {}
exporters:
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
service:
extensions: [health_check, zpages]
pipelines:
metrics:
receivers: [otlp, prometheus]
processors: [batch]
exporters: [clickhousemetricswrite]
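
This collector is also the natural home for extra scrape targets; a hedged example of adding one more job under scrape_configs (the target is hypothetical):

      - job_name: "my-service"
        scrape_interval: 30s
        static_configs:
          - targets: ["my-service:9100"]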

prometheus.yml

@@ -9,12 +9,13 @@ alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
- alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- 'alerts.yml'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.

docker-compose (docker standalone, arm)

@@ -6,6 +6,7 @@ services:
volumes:
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
- ./data/clickhouse/:/var/lib/clickhouse/
restart: on-failure
healthcheck:
# "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -36,6 +37,7 @@ services:
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-standalone-arm
restart: on-failure
depends_on:
clickhouse:
@@ -57,9 +59,17 @@ services:
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports:
- "4317:4317" # OTLP GRPC receiver
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP HTTP receiver
# - "8889:8889" # Prometheus metrics exposed by the agent
# - "13133" # health_check
# - "14268:14268" # Jaeger receiver
# - "55678:55678" # OpenCensus receiver
# - "55679:55679" # zpages extension
# - "55680:55680" # OTLP gRPC legacy port
# - "55681:55681" # OTLP HTTP legacy receiver
mem_limit: 2000m
restart: always
restart: on-failure
depends_on:
clickhouse:
condition: service_healthy
@@ -69,6 +79,7 @@ services:
command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes:
- ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
restart: on-failure
depends_on:
clickhouse:
condition: service_healthy

docker-compose (docker standalone, amd)

@@ -6,6 +6,7 @@ services:
volumes:
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
- ./data/clickhouse/:/var/lib/clickhouse/
restart: on-failure
healthcheck:
# "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -39,6 +40,7 @@ services:
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-standalone-amd
restart: on-failure
depends_on:
clickhouse:
@@ -60,9 +62,17 @@ services:
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports:
- "4317:4317" # OTLP GRPC receiver
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP HTTP receiver
# - "8889:8889" # Prometheus metrics exposed by the agent
# - "13133" # health_check
# - "14268:14268" # Jaeger receiver
# - "55678:55678" # OpenCensus receiver
# - "55679:55679" # zpages extension
# - "55680:55680" # OTLP gRPC legacy port
# - "55681:55681" # OTLP HTTP legacy receiver
mem_limit: 2000m
restart: always
restart: on-failure
depends_on:
clickhouse:
condition: service_healthy
@@ -72,6 +82,7 @@ services:
command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes:
- ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
restart: on-failure
depends_on:
clickhouse:
condition: service_healthy

install.sh

@@ -143,7 +143,7 @@ install_docker() {
echo "Installing docker"
$apt_cmd install docker-ce docker-ce-cli containerd.io
elif [[ $package_manager == zypper ]]; then
zypper_cmd="zypper --quiet --no-gpg-checks --non-interactive"
zypper_cmd="$sudo_cmd zypper --quiet --no-gpg-checks --non-interactive"
echo "Installing docker"
if [[ $os == sles ]]; then
os_sp="$(cat /etc/*-release | awk -F= '$1 == "VERSION_ID" { gsub(/"/, ""); print $2; exit }')"
@@ -151,19 +151,19 @@ install_docker() {
SUSEConnect -p sle-module-containers/$os_sp/$os_arch -r ''
fi
$zypper_cmd install docker docker-runc containerd
systemctl enable docker.service
$sudo_cmd systemctl enable docker.service
elif [[ $package_manager == yum && $os == 'amazon linux' ]]; then
echo
echo "Amazon Linux detected ... "
echo
# yum install docker
# service docker start
amazon-linux-extras install docker
$sudo_cmd amazon-linux-extras install docker
else
yum_cmd="yum --assumeyes --quiet"
yum_cmd="$sudo_cmd yum --assumeyes --quiet"
$yum_cmd install yum-utils
yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo
$sudo_cmd yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo
echo "Installing docker"
$yum_cmd install docker-ce docker-ce-cli containerd.io
@@ -176,9 +176,9 @@ install_docker_compose() {
if [[ ! -f /usr/bin/docker-compose ]];then
echo "++++++++++++++++++++++++"
echo "Installing docker-compose"
curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
$sudo_cmd curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
$sudo_cmd chmod +x /usr/local/bin/docker-compose
$sudo_cmd ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
echo "docker-compose installed!"
echo ""
fi
@@ -198,9 +198,9 @@ start_docker() {
if [ $os = "Mac" ]; then
open --background -a Docker && while ! docker system info > /dev/null 2>&1; do sleep 1; done
else
if ! systemctl is-active docker.service > /dev/null; then
if ! $sudo_cmd systemctl is-active docker.service > /dev/null; then
echo "Starting docker service"
systemctl start docker.service
$sudo_cmd systemctl start docker.service
fi
if [ -z $sudo_cmd ]; then
docker ps > /dev/null && true
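
For context on the fix itself: sudo_cmd in scripts like this is typically set once near the top, depending on whether the script already runs as root — a hedged sketch, not the script's verbatim definition:

sudo_cmd=""
if [[ $EUID -ne 0 ]]; then
    sudo_cmd="sudo"    # prefix privileged commands only when not running as root
fi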