chore: deployment config changes (#869)

* chore(install-script): 🔧 include missed sudo_cmd variable

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore: 🔧 add .gitkeep in folders to mount

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore(docker-swarm): 🔧 Update deploy configurations

Signed-off-by: Prashant Shahi <prashant@signoz.io>

* chore(compose-yaml): 🔧 expose otlp ports and restart on failure policy

Signed-off-by: Prashant Shahi <prashant@signoz.io>

Co-authored-by: Ankit Nayan <ankit@signoz.io>
Author: Prashant Shahi <prashant@signoz.io> (committed via GitHub)
Date:   2022-03-21 20:43:43 +05:30
Parent: 044f02c7c7
Commit: 86bdb9a5ad
15 changed files with 223 additions and 156 deletions

.gitignore

@@ -42,4 +42,7 @@ frontend/cypress.env.json
 frontend/*.env
 pkg/query-service/signoz.db
+
+# local data
 /deploy/docker/clickhouse-setup/data/
+/deploy/docker-swarm/clickhouse-setup/data/
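(The .gitkeep files mentioned in the commit message are what keep these now-ignored data directories present as empty mount points in a fresh checkout.)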

alertmanager.yml (new file, docker swarm)

@@ -0,0 +1,35 @@
global:
  resolve_timeout: 1m
  slack_api_url: 'https://hooks.slack.com/services/xxx'

route:
  receiver: 'slack-notifications'

receivers:
- name: 'slack-notifications'
  slack_configs:
  - channel: '#alerts'
    send_resolved: true
    icon_url: https://avatars3.githubusercontent.com/u/3380462
    title: |-
      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
        {{" "}}(
        {{- with .CommonLabels.Remove .GroupLabels.Names }}
          {{- range $index, $label := .SortedPairs -}}
            {{ if $index }}, {{ end }}
            {{- $label.Name }}="{{ $label.Value -}}"
          {{- end }}
        {{- end -}}
        )
      {{- end }}
    text: >-
      {{ range .Alerts -}}
      *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

      *Description:* {{ .Annotations.description }}

      *Details:*
        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
        {{ end }}
      {{ end }}
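One way to exercise this route end to end is to POST a synthetic alert to Alertmanager's v1 API and watch for the Slack message; a minimal sketch, assuming you run it from a container on the stack network (the swarm compose file below does not publish port 9093):

    curl -XPOST -H 'Content-Type: application/json' http://alertmanager:9093/api/v1/alerts -d '[
      {
        "labels": {"alertname": "TestAlert", "severity": "warning", "job": "smoke-test"},
        "annotations": {"title": "Test alert", "description": "Fired by hand to verify Slack routing"}
      }
    ]'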

alerts.yml (new file, docker swarm)

@@ -0,0 +1,11 @@
groups:
- name: ExampleCPULoadGroup
  rules:
  - alert: HighCpuLoad
    expr: system_cpu_load_average_1m > 0.1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: High CPU load
      description: "CPU load is > 0.1\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
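If promtool is available, the rule file can be checked for syntax errors before it ships; a quick sanity check, assuming the file is in the current directory:

    promtool check rules alerts.yml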

docker-compose.yaml (docker swarm)

@@ -1,30 +1,35 @@
version: "3" version: "3.9"
services: services:
clickhouse: clickhouse:
image: yandex/clickhouse-server image: yandex/clickhouse-server:21.12.3.32
expose: volumes:
- 8123 - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
- 9000 - ./data/clickhouse/:/var/lib/clickhouse/
ports: deploy:
- 9001:9000 restart_policy:
- 8123:8123 condition: on-failure
volumes: healthcheck:
- ./clickhouse-config.xml:/etc/clickhouse-server/config.xml # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
- ./docker-entrypoint-initdb.d/init-db.sql:/docker-entrypoint-initdb.d/init-db.sql test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
- ./data/clickhouse/:/var/lib/clickhouse/ interval: 30s
timeout: 5s
retries: 3
healthcheck: alertmanager:
# "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'" image: signoz/alertmanager:0.5.0
test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"] volumes:
interval: 30s - ./alertmanager.yml:/prometheus/alertmanager.yml
timeout: 5s - ./data/alertmanager:/data
retries: 3 command:
- '--config.file=/prometheus/alertmanager.yml'
- '--storage.path=/data'
deploy:
restart_policy:
condition: on-failure
query-service: query-service:
image: signoz/query-service:0.4.1 image: signoz/query-service:0.7.1
container_name: query-service
restart: always
command: ["-config=/root/config/prometheus.yml"] command: ["-config=/root/config/prometheus.yml"]
ports: ports:
- "8080:8080" - "8080:8080"
@ -35,77 +40,75 @@ services:
environment: environment:
- ClickHouseUrl=tcp://clickhouse:9000 - ClickHouseUrl=tcp://clickhouse:9000
- STORAGE=clickhouse - STORAGE=clickhouse
- POSTHOG_API_KEY=H-htDCae7CR3RV57gUzmol6IAKtm5IMCvbcm_fwnL-w
- GODEBUG=netdns=go - GODEBUG=netdns=go
- TELEMETRY_ENABLED=true - TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm - DEPLOYMENT_TYPE=docker-swarm
deploy: deploy:
restart_policy: restart_policy:
condition: on-failure condition: on-failure
depends_on: depends_on:
- clickhouse - clickhouse
frontend: frontend:
image: signoz/frontend:0.4.1 image: signoz/frontend:0.7.1
container_name: frontend
depends_on: depends_on:
- query-service - query-service
links:
- "query-service"
ports: ports:
- "3301:3301" - "3301:3301"
volumes: volumes:
- ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf - ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf
otel-collector: otel-collector:
image: signoz/otelcontribcol:0.4.0 image: signoz/otelcontribcol:0.43.0
command: ["--config=/etc/otel-collector-config.yaml", "--mem-ballast-size-mib=2000"] command: ["--config=/etc/otel-collector-config.yaml"]
volumes: volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports: ports:
- "1777:1777" # pprof extension - "4317:4317" # OTLP gRPC receiver
- "8887:8888" # Prometheus metrics exposed by the agent - "4318:4318" # OTLP HTTP receiver
- "14268:14268" # Jaeger receiver # - "8889:8889" # Prometheus metrics exposed by the agent
- "55678" # OpenCensus receiver # - "13133" # health_check
- "55680:55680" # OTLP HTTP/2.0 legacy port # - "14268:14268" # Jaeger receiver
- "55681:55681" # OTLP HTTP/1.0 receiver # - "55678:55678" # OpenCensus receiver
- "4317:4317" # OTLP GRPC receiver # - "55679:55679" # zpages extension
- "55679:55679" # zpages extension # - "55680:55680" # OTLP gRPC legacy receiver
- "13133" # health_check # - "55681:55681" # OTLP HTTP legacy receiver
deploy: deploy:
mode: replicated mode: replicated
replicas: 3 replicas: 3
restart_policy:
condition: on-failure
resources:
limits:
memory: 2000m
depends_on: depends_on:
- clickhouse - clickhouse
otel-collector-hostmetrics: otel-collector-metrics:
image: signoz/otelcontribcol:0.4.0 image: signoz/otelcontribcol:0.43.0
command: ["--config=/etc/otel-collector-config-hostmetrics.yaml", "--mem-ballast-size-mib=683"] command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes: volumes:
- ./otel-collector-config-hostmetrics.yaml:/etc/otel-collector-config-hostmetrics.yaml - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
deploy:
restart_policy:
condition: on-failure
depends_on: depends_on:
- clickhouse - clickhouse
hotrod: hotrod:
image: jaegertracing/example-hotrod:latest image: jaegertracing/example-hotrod:1.30
container_name: hotrod
ports:
- "9000:8080"
command: ["all"] command: ["all"]
environment: environment:
- JAEGER_ENDPOINT=http://otel-collector:14268/api/traces - JAEGER_ENDPOINT=http://otel-collector:14268/api/traces
logging:
options:
max-size: 50m
max-file: "3"
load-hotrod: load-hotrod:
image: "grubykarol/locust:1.2.3-python3.9-alpine3.12" image: "grubykarol/locust:1.2.3-python3.9-alpine3.12"
container_name: load-hotrod
hostname: load-hotrod hostname: load-hotrod
ports:
- "8089:8089"
environment: environment:
ATTACKED_HOST: http://hotrod:8080 ATTACKED_HOST: http://hotrod:8080
LOCUST_MODE: standalone LOCUST_MODE: standalone
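For reference, a file like this is deployed as a swarm stack; a minimal sketch, assuming the stack is named signoz (the name implied by the tasks.signoz_otel-collector scrape target in the deleted hostmetrics config below):

    docker stack deploy --compose-file docker-compose.yaml signoz
    docker stack services signoz              # otel-collector should converge to 3/3 replicas
    docker service logs signoz_query-service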

otel-collector-config-hostmetrics.yaml (deleted, docker swarm)

@@ -1,72 +0,0 @@
receivers:
  otlp:
    protocols:
      grpc:
      http:
  jaeger:
    protocols:
      grpc:
      thrift_http:
  hostmetrics:
    collection_interval: 60s
    scrapers:
      cpu:
      load:
      memory:
      disk:
      filesystem:
      network:
  # Data sources: metrics
  prometheus:
    config:
      scrape_configs:
        - job_name: "otel-collector"
          dns_sd_configs:
            - names:
                - 'tasks.signoz_otel-collector'
              type: 'A'
              port: 8888
        - job_name: "otel-collector-hostmetrics"
          scrape_interval: 10s
          static_configs:
            - targets: ["otel-collector-hostmetrics:8888"]

processors:
  batch:
    send_batch_size: 1000
    timeout: 10s
  memory_limiter:
    # Same as --mem-ballast-size-mib CLI argument
    ballast_size_mib: 683
    # 80% of maximum memory up to 2G
    limit_mib: 1500
    # 25% of limit up to 2G
    spike_limit_mib: 512
    check_interval: 5s
  # queued_retry:
  #   num_workers: 4
  #   queue_size: 100
  #   retry_on_failure: true

extensions:
  health_check: {}
  zpages: {}

exporters:
  clickhouse:
    datasource: tcp://clickhouse:9000
  clickhousemetricswrite:
    endpoint: tcp://clickhouse:9000/?database=signoz_metrics
    resource_to_telemetry_conversion:
      enabled: true

service:
  extensions: [health_check, zpages]
  pipelines:
    traces:
      receivers: [jaeger, otlp]
      processors: [batch]
      exporters: [clickhouse]
    metrics:
      receivers: [otlp, prometheus, hostmetrics]
      processors: [batch]
      exporters: [clickhousemetricswrite]
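Nothing here is lost outright: the hostmetrics receiver reappears in otel-collector-config.yaml below, and the Prometheus scrape job moves into the new otel-collector-metrics-config.yaml.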

otel-collector-config.yaml (docker swarm)

@@ -1,4 +1,8 @@
 receivers:
+  otlp/spanmetrics:
+    protocols:
+      grpc:
+        endpoint: "localhost:12345"
   otlp:
     protocols:
       grpc:
@@ -7,18 +11,30 @@ receivers:
     protocols:
       grpc:
       thrift_http:
+  hostmetrics:
+    collection_interval: 30s
+    scrapers:
+      cpu:
+      load:
+      memory:
+      disk:
+      filesystem:
+      network:
 
 processors:
   batch:
     send_batch_size: 1000
     timeout: 10s
-  memory_limiter:
-    # Same as --mem-ballast-size-mib CLI argument
-    ballast_size_mib: 683
-    # 80% of maximum memory up to 2G
-    limit_mib: 1500
-    # 25% of limit up to 2G
-    spike_limit_mib: 512
-    check_interval: 5s
+  signozspanmetrics/prometheus:
+    metrics_exporter: prometheus
+    latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
+  # memory_limiter:
+  #   # Same as --mem-ballast-size-mib CLI argument
+  #   ballast_size_mib: 683
+  #   # 80% of maximum memory up to 2G
+  #   limit_mib: 1500
+  #   # 25% of limit up to 2G
+  #   spike_limit_mib: 512
+  #   check_interval: 5s
   # queued_retry:
   #   num_workers: 4
   #   queue_size: 100
@@ -33,15 +49,19 @@ exporters:
     endpoint: tcp://clickhouse:9000/?database=signoz_metrics
     resource_to_telemetry_conversion:
       enabled: true
+  prometheus:
+    endpoint: "0.0.0.0:8889"
 
 service:
   extensions: [health_check, zpages]
   pipelines:
     traces:
       receivers: [jaeger, otlp]
-      processors: [batch]
+      processors: [signozspanmetrics/prometheus, batch]
       exporters: [clickhouse]
     metrics:
-      receivers: [otlp]
+      receivers: [otlp, hostmetrics]
       processors: [batch]
       exporters: [clickhousemetricswrite]
+    metrics/spanmetrics:
+      receivers: [otlp/spanmetrics]
+      exporters: [prometheus]
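The new pieces form a small loop: the signozspanmetrics/prometheus processor derives span metrics from the traces pipeline and hands them to the prometheus exporter on 0.0.0.0:8889, while the otlp/spanmetrics receiver on localhost:12345 is only a placeholder, since every pipeline must declare at least one receiver. The companion collector below then scrapes port 8889. A quick way to confirm the exporter is serving metrics, assuming you curl it from a container on the same network:

    curl -s http://otel-collector:8889/metrics | head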

otel-collector-metrics-config.yaml (new file, docker swarm)

@@ -0,0 +1,44 @@
receivers:
  otlp:
    protocols:
      grpc:
      http:
  # Data sources: metrics
  prometheus:
    config:
      scrape_configs:
        - job_name: "otel-collector"
          scrape_interval: 30s
          static_configs:
            - targets: ["otel-collector:8889"]

processors:
  batch:
    send_batch_size: 1000
    timeout: 10s
  # memory_limiter:
  #   # Same as --mem-ballast-size-mib CLI argument
  #   ballast_size_mib: 683
  #   # 80% of maximum memory up to 2G
  #   limit_mib: 1500
  #   # 25% of limit up to 2G
  #   spike_limit_mib: 512
  #   check_interval: 5s
  # queued_retry:
  #   num_workers: 4
  #   queue_size: 100
  #   retry_on_failure: true

extensions:
  health_check: {}
  zpages: {}

exporters:
  clickhousemetricswrite:
    endpoint: tcp://clickhouse:9000/?database=signoz_metrics

service:
  extensions: [health_check, zpages]
  pipelines:
    metrics:
      receivers: [otlp, prometheus]
      processors: [batch]
      exporters: [clickhousemetricswrite]

prometheus.yml (docker swarm)

@@ -9,12 +9,13 @@ alerting:
   alertmanagers:
     - static_configs:
         - targets:
-          # - alertmanager:9093
+          - alertmanager:9093
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   # - "first_rules.yml"
   # - "second_rules.yml"
+  - 'alerts.yml'
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
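Together with the two files added above, this closes the alerting loop: rules from alerts.yml are evaluated here, and firing alerts are pushed to the alertmanager service defined in the swarm compose file.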

docker-compose.arm.yaml (docker standalone)

@@ -6,6 +6,7 @@ services:
     volumes:
       - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
       - ./data/clickhouse/:/var/lib/clickhouse/
+    restart: on-failure
     healthcheck:
       # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
       test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -36,6 +37,7 @@ services:
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
+      - DEPLOYMENT_TYPE=docker-standalone-arm
     restart: on-failure
     depends_on:
       clickhouse:
@@ -57,9 +59,17 @@ services:
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "4317:4317"     # OTLP GRPC receiver
+      - "4317:4317"     # OTLP gRPC receiver
+      - "4318:4318"     # OTLP HTTP receiver
+      # - "8889:8889"   # Prometheus metrics exposed by the agent
+      # - "13133"       # health_check
+      # - "14268:14268" # Jaeger receiver
+      # - "55678:55678" # OpenCensus receiver
+      # - "55679:55679" # zpages extension
+      # - "55680:55680" # OTLP gRPC legacy port
+      # - "55681:55681" # OTLP HTTP legacy receiver
     mem_limit: 2000m
-    restart: always
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
@@ -69,6 +79,7 @@ services:
     command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
       - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy

docker-compose.yaml (docker standalone)

@@ -6,6 +6,7 @@ services:
     volumes:
       - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
       - ./data/clickhouse/:/var/lib/clickhouse/
+    restart: on-failure
     healthcheck:
       # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
       test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -39,6 +40,7 @@ services:
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
+      - DEPLOYMENT_TYPE=docker-standalone-amd
     restart: on-failure
     depends_on:
       clickhouse:
@@ -60,9 +62,17 @@ services:
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "4317:4317"     # OTLP GRPC receiver
+      - "4317:4317"     # OTLP gRPC receiver
+      - "4318:4318"     # OTLP HTTP receiver
+      # - "8889:8889"   # Prometheus metrics exposed by the agent
+      # - "13133"       # health_check
+      # - "14268:14268" # Jaeger receiver
+      # - "55678:55678" # OpenCensus receiver
+      # - "55679:55679" # zpages extension
+      # - "55680:55680" # OTLP gRPC legacy port
+      # - "55681:55681" # OTLP HTTP legacy receiver
     mem_limit: 2000m
-    restart: always
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
@@ -72,6 +82,7 @@ services:
     command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
       - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
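Once the stack is up, the effect of the healthcheck and the new restart policy can be inspected directly; a minimal sketch, assuming docker-compose 1.x as installed by the script below (substitute the container name that docker ps reports):

    docker-compose ps                                              # clickhouse should show Up (healthy)
    docker inspect --format '{{ .State.Health.Status }}' <clickhouse-container>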

install.sh

@@ -143,7 +143,7 @@ install_docker() {
echo "Installing docker" echo "Installing docker"
$apt_cmd install docker-ce docker-ce-cli containerd.io $apt_cmd install docker-ce docker-ce-cli containerd.io
elif [[ $package_manager == zypper ]]; then elif [[ $package_manager == zypper ]]; then
zypper_cmd="zypper --quiet --no-gpg-checks --non-interactive" zypper_cmd="$sudo_cmd zypper --quiet --no-gpg-checks --non-interactive"
echo "Installing docker" echo "Installing docker"
if [[ $os == sles ]]; then if [[ $os == sles ]]; then
os_sp="$(cat /etc/*-release | awk -F= '$1 == "VERSION_ID" { gsub(/"/, ""); print $2; exit }')" os_sp="$(cat /etc/*-release | awk -F= '$1 == "VERSION_ID" { gsub(/"/, ""); print $2; exit }')"
@ -151,19 +151,19 @@ install_docker() {
SUSEConnect -p sle-module-containers/$os_sp/$os_arch -r '' SUSEConnect -p sle-module-containers/$os_sp/$os_arch -r ''
fi fi
$zypper_cmd install docker docker-runc containerd $zypper_cmd install docker docker-runc containerd
systemctl enable docker.service $sudo_cmd systemctl enable docker.service
elif [[ $package_manager == yum && $os == 'amazon linux' ]]; then elif [[ $package_manager == yum && $os == 'amazon linux' ]]; then
echo echo
echo "Amazon Linux detected ... " echo "Amazon Linux detected ... "
echo echo
# yum install docker # yum install docker
# service docker start # service docker start
amazon-linux-extras install docker $sudo_cmd amazon-linux-extras install docker
else else
yum_cmd="yum --assumeyes --quiet" yum_cmd="$sudo_cmd yum --assumeyes --quiet"
$yum_cmd install yum-utils $yum_cmd install yum-utils
yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo $sudo_cmd yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo
echo "Installing docker" echo "Installing docker"
$yum_cmd install docker-ce docker-ce-cli containerd.io $yum_cmd install docker-ce docker-ce-cli containerd.io
@ -176,9 +176,9 @@ install_docker_compose() {
if [[ ! -f /usr/bin/docker-compose ]];then if [[ ! -f /usr/bin/docker-compose ]];then
echo "++++++++++++++++++++++++" echo "++++++++++++++++++++++++"
echo "Installing docker-compose" echo "Installing docker-compose"
curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose $sudo_cmd curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose $sudo_cmd chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose $sudo_cmd ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
echo "docker-compose installed!" echo "docker-compose installed!"
echo "" echo ""
fi fi
@ -198,9 +198,9 @@ start_docker() {
if [ $os = "Mac" ]; then if [ $os = "Mac" ]; then
open --background -a Docker && while ! docker system info > /dev/null 2>&1; do sleep 1; done open --background -a Docker && while ! docker system info > /dev/null 2>&1; do sleep 1; done
else else
if ! systemctl is-active docker.service > /dev/null; then if ! $sudo_cmd systemctl is-active docker.service > /dev/null; then
echo "Starting docker service" echo "Starting docker service"
systemctl start docker.service $sudo_cmd systemctl start docker.service
fi fi
if [ -z $sudo_cmd ]; then if [ -z $sudo_cmd ]; then
docker ps > /dev/null && true docker ps > /dev/null && true
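These fixes assume sudo_cmd is initialized near the top of the script; a minimal sketch of that convention (illustrative, not the script's exact code): it stays empty when the script already runs as root, so $sudo_cmd expands to nothing:

    # illustrative only; the real definition lives earlier in install.sh
    sudo_cmd=""
    if [[ $EUID -ne 0 ]]; then
        sudo_cmd="sudo"
    fi
    $sudo_cmd systemctl start docker.service    # plain when root, via sudo otherwise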