chore: deployment config changes (#869)

* chore(install-script): 🔧 include missed sudo_cmd variable

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore: 🔧 add .gitkeep in folders to mount

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore(docker-swarm): 🔧 Update deploy configurations

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore(compose-yaml): 🔧 expose otlp ports and restart on failure policy

Signed-off-by: Prashant Shahi <prashant@signoz.io>

Co-authored-by: Ankit Nayan <ankit@signoz.io>
Author: Prashant Shahi <prashant@signoz.io> (committed via GitHub)
Date:   2022-03-21 20:43:43 +05:30
Parent: 044f02c7c7
Commit: 86bdb9a5ad
15 changed files with 223 additions and 156 deletions

.gitignore

@@ -42,4 +42,7 @@ frontend/cypress.env.json
 frontend/*.env
 pkg/query-service/signoz.db
+
+# local data
 /deploy/docker/clickhouse-setup/data/
+/deploy/docker-swarm/clickhouse-setup/data/
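(The .gitkeep files mentioned in the commit message are what keep these now-ignored data directories present as empty mount points in a fresh checkout.)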

alertmanager.yml (new file, docker swarm)

@@ -0,0 +1,35 @@
global:
  resolve_timeout: 1m
  slack_api_url: 'https://hooks.slack.com/services/xxx'

route:
  receiver: 'slack-notifications'

receivers:
- name: 'slack-notifications'
  slack_configs:
  - channel: '#alerts'
    send_resolved: true
    icon_url: https://avatars3.githubusercontent.com/u/3380462
    title: |-
      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
        {{" "}}(
        {{- with .CommonLabels.Remove .GroupLabels.Names }}
          {{- range $index, $label := .SortedPairs -}}
            {{ if $index }}, {{ end }}
            {{- $label.Name }}="{{ $label.Value -}}"
          {{- end }}
        {{- end -}}
        )
      {{- end }}
    text: >-
      {{ range .Alerts -}}
      *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

      *Description:* {{ .Annotations.description }}

      *Details:*
        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
        {{ end }}
      {{ end }}
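One way to exercise this route end to end is to POST a synthetic alert to Alertmanager's v1 API and watch for the Slack message; a minimal sketch, assuming you run it from a container on the stack network (the swarm compose file below does not publish port 9093):

    curl -XPOST -H 'Content-Type: application/json' http://alertmanager:9093/api/v1/alerts -d '[
      {
        "labels": {"alertname": "TestAlert", "severity": "warning", "job": "smoke-test"},
        "annotations": {"title": "Test alert", "description": "Fired by hand to verify Slack routing"}
      }
    ]'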

alerts.yml (new file, docker swarm)

@@ -0,0 +1,11 @@
groups:
- name: ExampleCPULoadGroup
  rules:
  - alert: HighCpuLoad
    expr: system_cpu_load_average_1m > 0.1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: High CPU load
      description: "CPU load is > 0.1\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
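If promtool is available, the rule file can be checked for syntax errors before it ships; a quick sanity check, assuming the file is in the current directory:

    promtool check rules alerts.yml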

docker-compose.yaml (docker swarm)

@@ -1,30 +1,35 @@
version: "3" version: "3.9"
services: services:
clickhouse: clickhouse:
image: yandex/clickhouse-server image: yandex/clickhouse-server:21.12.3.32
expose: volumes:
- 8123 - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
- 9000 - ./data/clickhouse/:/var/lib/clickhouse/
ports: deploy:
- 9001:9000 restart_policy:
- 8123:8123 condition: on-failure
volumes: healthcheck:
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
- ./docker-entrypoint-initdb.d/init-db.sql:/docker-entrypoint-initdb.d/init-db.sql test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
- ./data/clickhouse/:/var/lib/clickhouse/ interval: 30s
timeout: 5s
retries: 3
healthcheck: alertmanager:
# "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'" image: signoz/alertmanager:0.5.0
test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"] volumes:
interval: 30s - ./alertmanager.yml:/prometheus/alertmanager.yml
timeout: 5s - ./data/alertmanager:/data
retries: 3 command:
- '--config.file=/prometheus/alertmanager.yml'
- '--storage.path=/data'
deploy:
restart_policy:
condition: on-failure
query-service: query-service:
image: signoz/query-service:0.4.1 image: signoz/query-service:0.7.1
container_name: query-service
restart: always
command: ["-config=/root/config/prometheus.yml"] command: ["-config=/root/config/prometheus.yml"]
ports: ports:
- "8080:8080" - "8080:8080"
@ -35,77 +40,75 @@ services:
environment: environment:
- ClickHouseUrl=tcp://clickhouse:9000 - ClickHouseUrl=tcp://clickhouse:9000
- STORAGE=clickhouse - STORAGE=clickhouse
- POSTHOG_API_KEY=H-htDCae7CR3RV57gUzmol6IAKtm5IMCvbcm_fwnL-w
- GODEBUG=netdns=go - GODEBUG=netdns=go
- TELEMETRY_ENABLED=true - TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm - DEPLOYMENT_TYPE=docker-swarm
deploy: deploy:
restart_policy: restart_policy:
condition: on-failure condition: on-failure
depends_on: depends_on:
- clickhouse - clickhouse
frontend: frontend:
image: signoz/frontend:0.4.1 image: signoz/frontend:0.7.1
container_name: frontend
depends_on: depends_on:
- query-service - query-service
links:
- "query-service"
ports: ports:
- "3301:3301" - "3301:3301"
volumes: volumes:
- ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf - ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf
otel-collector: otel-collector:
image: signoz/otelcontribcol:0.4.0 image: signoz/otelcontribcol:0.43.0
command: ["--config=/etc/otel-collector-config.yaml", "--mem-ballast-size-mib=2000"] command: ["--config=/etc/otel-collector-config.yaml"]
volumes: volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports: ports:
- "1777:1777" # pprof extension - "4317:4317" # OTLP gRPC receiver
- "8887:8888" # Prometheus metrics exposed by the agent - "4318:4318" # OTLP HTTP receiver
- "14268:14268" # Jaeger receiver # - "8889:8889" # Prometheus metrics exposed by the agent
- "55678" # OpenCensus receiver # - "13133" # health_check
- "55680:55680" # OTLP HTTP/2.0 legacy port # - "14268:14268" # Jaeger receiver
- "55681:55681" # OTLP HTTP/1.0 receiver # - "55678:55678" # OpenCensus receiver
- "4317:4317" # OTLP GRPC receiver # - "55679:55679" # zpages extension
- "55679:55679" # zpages extension # - "55680:55680" # OTLP gRPC legacy receiver
- "13133" # health_check # - "55681:55681" # OTLP HTTP legacy receiver
deploy: deploy:
mode: replicated mode: replicated
replicas: 3 replicas: 3
restart_policy:
condition: on-failure
resources:
limits:
memory: 2000m
depends_on: depends_on:
- clickhouse - clickhouse
otel-collector-hostmetrics: otel-collector-metrics:
image: signoz/otelcontribcol:0.4.0 image: signoz/otelcontribcol:0.43.0
command: ["--config=/etc/otel-collector-config-hostmetrics.yaml", "--mem-ballast-size-mib=683"] command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes: volumes:
- ./otel-collector-config-hostmetrics.yaml:/etc/otel-collector-config-hostmetrics.yaml - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
deploy:
restart_policy:
condition: on-failure
depends_on: depends_on:
- clickhouse - clickhouse
hotrod: hotrod:
image: jaegertracing/example-hotrod:latest image: jaegertracing/example-hotrod:1.30
container_name: hotrod
ports:
- "9000:8080"
command: ["all"] command: ["all"]
environment: environment:
- JAEGER_ENDPOINT=http://otel-collector:14268/api/traces - JAEGER_ENDPOINT=http://otel-collector:14268/api/traces
logging:
options:
max-size: 50m
max-file: "3"
load-hotrod: load-hotrod:
image: "grubykarol/locust:1.2.3-python3.9-alpine3.12" image: "grubykarol/locust:1.2.3-python3.9-alpine3.12"
container_name: load-hotrod
hostname: load-hotrod hostname: load-hotrod
ports:
- "8089:8089"
environment: environment:
ATTACKED_HOST: http://hotrod:8080 ATTACKED_HOST: http://hotrod:8080
LOCUST_MODE: standalone LOCUST_MODE: standalone
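For reference, a file like this is deployed as a swarm stack; a minimal sketch, assuming the stack is named signoz (the name implied by the tasks.signoz_otel-collector scrape target in the deleted hostmetrics config below):

    docker stack deploy --compose-file docker-compose.yaml signoz
    docker stack services signoz              # otel-collector should converge to 3/3 replicas
    docker service logs signoz_query-service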

otel-collector-config-hostmetrics.yaml (deleted, docker swarm)

@@ -1,72 +0,0 @@
receivers:
  otlp:
    protocols:
      grpc:
      http:
  jaeger:
    protocols:
      grpc:
      thrift_http:
  hostmetrics:
    collection_interval: 60s
    scrapers:
      cpu:
      load:
      memory:
      disk:
      filesystem:
      network:
  # Data sources: metrics
  prometheus:
    config:
      scrape_configs:
        - job_name: "otel-collector"
          dns_sd_configs:
            - names:
                - 'tasks.signoz_otel-collector'
              type: 'A'
              port: 8888
        - job_name: "otel-collector-hostmetrics"
          scrape_interval: 10s
          static_configs:
            - targets: ["otel-collector-hostmetrics:8888"]

processors:
  batch:
    send_batch_size: 1000
    timeout: 10s
  memory_limiter:
    # Same as --mem-ballast-size-mib CLI argument
    ballast_size_mib: 683
    # 80% of maximum memory up to 2G
    limit_mib: 1500
    # 25% of limit up to 2G
    spike_limit_mib: 512
    check_interval: 5s
  # queued_retry:
  #   num_workers: 4
  #   queue_size: 100
  #   retry_on_failure: true

extensions:
  health_check: {}
  zpages: {}

exporters:
  clickhouse:
    datasource: tcp://clickhouse:9000
  clickhousemetricswrite:
    endpoint: tcp://clickhouse:9000/?database=signoz_metrics
    resource_to_telemetry_conversion:
      enabled: true

service:
  extensions: [health_check, zpages]
  pipelines:
    traces:
      receivers: [jaeger, otlp]
      processors: [batch]
      exporters: [clickhouse]
    metrics:
      receivers: [otlp, prometheus, hostmetrics]
      processors: [batch]
      exporters: [clickhousemetricswrite]
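Nothing here is lost outright: the hostmetrics receiver reappears in otel-collector-config.yaml below, and the Prometheus scrape job moves into the new otel-collector-metrics-config.yaml.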

otel-collector-config.yaml (docker swarm)

@@ -1,4 +1,8 @@
 receivers:
+  otlp/spanmetrics:
+    protocols:
+      grpc:
+        endpoint: "localhost:12345"
   otlp:
     protocols:
       grpc:
@@ -7,18 +11,30 @@ receivers:
     protocols:
       grpc:
       thrift_http:
+  hostmetrics:
+    collection_interval: 30s
+    scrapers:
+      cpu:
+      load:
+      memory:
+      disk:
+      filesystem:
+      network:
 
 processors:
   batch:
     send_batch_size: 1000
     timeout: 10s
-  memory_limiter:
-    # Same as --mem-ballast-size-mib CLI argument
-    ballast_size_mib: 683
-    # 80% of maximum memory up to 2G
-    limit_mib: 1500
-    # 25% of limit up to 2G
-    spike_limit_mib: 512
-    check_interval: 5s
+  signozspanmetrics/prometheus:
+    metrics_exporter: prometheus
+    latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
+  # memory_limiter:
+  #   # Same as --mem-ballast-size-mib CLI argument
+  #   ballast_size_mib: 683
+  #   # 80% of maximum memory up to 2G
+  #   limit_mib: 1500
+  #   # 25% of limit up to 2G
+  #   spike_limit_mib: 512
+  #   check_interval: 5s
   # queued_retry:
   #   num_workers: 4
   #   queue_size: 100
@@ -33,15 +49,19 @@ exporters:
     endpoint: tcp://clickhouse:9000/?database=signoz_metrics
     resource_to_telemetry_conversion:
       enabled: true
+  prometheus:
+    endpoint: "0.0.0.0:8889"
 
 service:
   extensions: [health_check, zpages]
   pipelines:
     traces:
       receivers: [jaeger, otlp]
-      processors: [batch]
+      processors: [signozspanmetrics/prometheus, batch]
       exporters: [clickhouse]
     metrics:
-      receivers: [otlp]
+      receivers: [otlp, hostmetrics]
       processors: [batch]
       exporters: [clickhousemetricswrite]
+    metrics/spanmetrics:
+      receivers: [otlp/spanmetrics]
+      exporters: [prometheus]
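The new pieces form a small loop: the signozspanmetrics/prometheus processor derives span metrics from the traces pipeline and hands them to the prometheus exporter on 0.0.0.0:8889, while the otlp/spanmetrics receiver on localhost:12345 is only a placeholder, since every pipeline must declare at least one receiver. The companion collector below then scrapes port 8889. A quick way to confirm the exporter is serving metrics, assuming you curl it from a container on the same network:

    curl -s http://otel-collector:8889/metrics | head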

otel-collector-metrics-config.yaml (new file, docker swarm)

@@ -0,0 +1,44 @@
receivers:
  otlp:
    protocols:
      grpc:
      http:
  # Data sources: metrics
  prometheus:
    config:
      scrape_configs:
        - job_name: "otel-collector"
          scrape_interval: 30s
          static_configs:
            - targets: ["otel-collector:8889"]

processors:
  batch:
    send_batch_size: 1000
    timeout: 10s
  # memory_limiter:
  #   # Same as --mem-ballast-size-mib CLI argument
  #   ballast_size_mib: 683
  #   # 80% of maximum memory up to 2G
  #   limit_mib: 1500
  #   # 25% of limit up to 2G
  #   spike_limit_mib: 512
  #   check_interval: 5s
  # queued_retry:
  #   num_workers: 4
  #   queue_size: 100
  #   retry_on_failure: true

extensions:
  health_check: {}
  zpages: {}

exporters:
  clickhousemetricswrite:
    endpoint: tcp://clickhouse:9000/?database=signoz_metrics

service:
  extensions: [health_check, zpages]
  pipelines:
    metrics:
      receivers: [otlp, prometheus]
      processors: [batch]
      exporters: [clickhousemetricswrite]

prometheus.yml (docker swarm)

@@ -9,12 +9,13 @@ alerting:
   alertmanagers:
     - static_configs:
         - targets:
-          # - alertmanager:9093
+          - alertmanager:9093
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   # - "first_rules.yml"
   # - "second_rules.yml"
+  - 'alerts.yml'
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
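Together with the two files added above, this closes the alerting loop: rules from alerts.yml are evaluated here, and firing alerts are pushed to the alertmanager service defined in the swarm compose file.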

docker-compose.arm.yaml (docker standalone)

@@ -6,6 +6,7 @@ services:
     volumes:
       - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
       - ./data/clickhouse/:/var/lib/clickhouse/
+    restart: on-failure
     healthcheck:
       # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
       test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -36,6 +37,7 @@ services:
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
+      - DEPLOYMENT_TYPE=docker-standalone-arm
     restart: on-failure
     depends_on:
       clickhouse:
@@ -57,9 +59,17 @@ services:
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "4317:4317"     # OTLP GRPC receiver
+      - "4317:4317"     # OTLP gRPC receiver
+      - "4318:4318"     # OTLP HTTP receiver
+      # - "8889:8889"   # Prometheus metrics exposed by the agent
+      # - "13133"       # health_check
+      # - "14268:14268" # Jaeger receiver
+      # - "55678:55678" # OpenCensus receiver
+      # - "55679:55679" # zpages extension
+      # - "55680:55680" # OTLP gRPC legacy port
+      # - "55681:55681" # OTLP HTTP legacy receiver
     mem_limit: 2000m
-    restart: always
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
@@ -69,6 +79,7 @@ services:
     command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
       - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy

docker-compose.yaml (docker standalone)

@@ -6,6 +6,7 @@ services:
     volumes:
       - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
       - ./data/clickhouse/:/var/lib/clickhouse/
+    restart: on-failure
     healthcheck:
       # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
       test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -39,6 +40,7 @@ services:
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
+      - DEPLOYMENT_TYPE=docker-standalone-amd
     restart: on-failure
     depends_on:
       clickhouse:
@@ -60,9 +62,17 @@ services:
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "4317:4317"     # OTLP GRPC receiver
+      - "4317:4317"     # OTLP gRPC receiver
+      - "4318:4318"     # OTLP HTTP receiver
+      # - "8889:8889"   # Prometheus metrics exposed by the agent
+      # - "13133"       # health_check
+      # - "14268:14268" # Jaeger receiver
+      # - "55678:55678" # OpenCensus receiver
+      # - "55679:55679" # zpages extension
+      # - "55680:55680" # OTLP gRPC legacy port
+      # - "55681:55681" # OTLP HTTP legacy receiver
     mem_limit: 2000m
-    restart: always
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
@@ -72,6 +82,7 @@ services:
     command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
       - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
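Once the stack is up, the effect of the healthcheck and the new restart policy can be inspected directly; a minimal sketch, assuming docker-compose 1.x as installed by the script below (substitute the container name that docker ps reports):

    docker-compose ps                                              # clickhouse should show Up (healthy)
    docker inspect --format '{{ .State.Health.Status }}' <clickhouse-container>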

install.sh

@@ -143,7 +143,7 @@ install_docker() {
echo "Installing docker" echo "Installing docker"
$apt_cmd install docker-ce docker-ce-cli containerd.io $apt_cmd install docker-ce docker-ce-cli containerd.io
elif [[ $package_manager == zypper ]]; then elif [[ $package_manager == zypper ]]; then
zypper_cmd="zypper --quiet --no-gpg-checks --non-interactive" zypper_cmd="$sudo_cmd zypper --quiet --no-gpg-checks --non-interactive"
echo "Installing docker" echo "Installing docker"
if [[ $os == sles ]]; then if [[ $os == sles ]]; then
os_sp="$(cat /etc/*-release | awk -F= '$1 == "VERSION_ID" { gsub(/"/, ""); print $2; exit }')" os_sp="$(cat /etc/*-release | awk -F= '$1 == "VERSION_ID" { gsub(/"/, ""); print $2; exit }')"
@ -151,19 +151,19 @@ install_docker() {
SUSEConnect -p sle-module-containers/$os_sp/$os_arch -r '' SUSEConnect -p sle-module-containers/$os_sp/$os_arch -r ''
fi fi
$zypper_cmd install docker docker-runc containerd $zypper_cmd install docker docker-runc containerd
systemctl enable docker.service $sudo_cmd systemctl enable docker.service
elif [[ $package_manager == yum && $os == 'amazon linux' ]]; then elif [[ $package_manager == yum && $os == 'amazon linux' ]]; then
echo echo
echo "Amazon Linux detected ... " echo "Amazon Linux detected ... "
echo echo
# yum install docker # yum install docker
# service docker start # service docker start
amazon-linux-extras install docker $sudo_cmd amazon-linux-extras install docker
else else
yum_cmd="yum --assumeyes --quiet" yum_cmd="$sudo_cmd yum --assumeyes --quiet"
$yum_cmd install yum-utils $yum_cmd install yum-utils
yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo $sudo_cmd yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo
echo "Installing docker" echo "Installing docker"
$yum_cmd install docker-ce docker-ce-cli containerd.io $yum_cmd install docker-ce docker-ce-cli containerd.io
@ -176,9 +176,9 @@ install_docker_compose() {
if [[ ! -f /usr/bin/docker-compose ]];then if [[ ! -f /usr/bin/docker-compose ]];then
echo "++++++++++++++++++++++++" echo "++++++++++++++++++++++++"
echo "Installing docker-compose" echo "Installing docker-compose"
curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose $sudo_cmd curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose $sudo_cmd chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose $sudo_cmd ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
echo "docker-compose installed!" echo "docker-compose installed!"
echo "" echo ""
fi fi
@ -198,9 +198,9 @@ start_docker() {
if [ $os = "Mac" ]; then if [ $os = "Mac" ]; then
open --background -a Docker && while ! docker system info > /dev/null 2>&1; do sleep 1; done open --background -a Docker && while ! docker system info > /dev/null 2>&1; do sleep 1; done
else else
if ! systemctl is-active docker.service > /dev/null; then if ! $sudo_cmd systemctl is-active docker.service > /dev/null; then
echo "Starting docker service" echo "Starting docker service"
systemctl start docker.service $sudo_cmd systemctl start docker.service
fi fi
if [ -z $sudo_cmd ]; then if [ -z $sudo_cmd ]; then
docker ps > /dev/null && true docker ps > /dev/null && true
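These fixes assume sudo_cmd is initialized near the top of the script; a minimal sketch of that convention (illustrative, not the script's exact code): it stays empty when the script already runs as root, so $sudo_cmd expands to nothing:

    # illustrative only; the real definition lives earlier in install.sh
    sudo_cmd=""
    if [[ $EUID -ne 0 ]]; then
        sudo_cmd="sudo"
    fi
    $sudo_cmd systemctl start docker.service    # plain when root, via sudo otherwise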