diff --git a/.gitignore b/.gitignore
index 1556564080..cb49cc98ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,4 +42,7 @@ frontend/cypress.env.json
 frontend/*.env
 
 pkg/query-service/signoz.db
+
+# local data
 /deploy/docker/clickhouse-setup/data/
+/deploy/docker-swarm/clickhouse-setup/data/
diff --git a/deploy/docker-swarm/clickhouse-setup/alertmanager.yml b/deploy/docker-swarm/clickhouse-setup/alertmanager.yml
new file mode 100644
index 0000000000..d69357f9dd
--- /dev/null
+++ b/deploy/docker-swarm/clickhouse-setup/alertmanager.yml
@@ -0,0 +1,35 @@
+global:
+  resolve_timeout: 1m
+  slack_api_url: 'https://hooks.slack.com/services/xxx'
+
+route:
+  receiver: 'slack-notifications'
+
+receivers:
+- name: 'slack-notifications'
+  slack_configs:
+  - channel: '#alerts'
+    send_resolved: true
+    icon_url: https://avatars3.githubusercontent.com/u/3380462
+    title: |-
+      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+        {{" "}}(
+        {{- with .CommonLabels.Remove .GroupLabels.Names }}
+          {{- range $index, $label := .SortedPairs -}}
+            {{ if $index }}, {{ end }}
+            {{- $label.Name }}="{{ $label.Value -}}"
+          {{- end }}
+        {{- end -}}
+        )
+      {{- end }}
+    text: >-
+      {{ range .Alerts -}}
+      *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+      *Description:* {{ .Annotations.description }}
+
+      *Details:*
+        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+        {{ end }}
+      {{ end }}
\ No newline at end of file
diff --git a/deploy/docker-swarm/clickhouse-setup/alerts.yml b/deploy/docker-swarm/clickhouse-setup/alerts.yml
new file mode 100644
index 0000000000..810a20750c
--- /dev/null
+++ b/deploy/docker-swarm/clickhouse-setup/alerts.yml
@@ -0,0 +1,11 @@
+groups:
+- name: ExampleCPULoadGroup
+  rules:
+  - alert: HighCpuLoad
+    expr: system_cpu_load_average_1m > 0.1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: High CPU load
+      description: "CPU load is > 0.1\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/deploy/docker-swarm/clickhouse-setup/data/alertmanager/.gitkeep b/deploy/docker-swarm/clickhouse-setup/data/alertmanager/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml b/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml
index 46980c9a61..f00a6d3ca0 100644
--- a/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml
+++ b/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml
@@ -1,30 +1,35 @@
-version: "3"
+version: "3.9"
 
 services:
   clickhouse:
-    image: yandex/clickhouse-server
-    expose:
-      - 8123
-      - 9000
-    ports:
-      - 9001:9000
-      - 8123:8123
-    volumes:
-      - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
-      - ./docker-entrypoint-initdb.d/init-db.sql:/docker-entrypoint-initdb.d/init-db.sql
-      - ./data/clickhouse/:/var/lib/clickhouse/
+    image: yandex/clickhouse-server:21.12.3.32
+    volumes:
+      - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
+      - ./data/clickhouse/:/var/lib/clickhouse/
+    deploy:
+      restart_policy:
+        condition: on-failure
+    healthcheck:
+      # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
+      test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
 
-    healthcheck:
-      # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
-      test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
+  alertmanager:
+    image: signoz/alertmanager:0.5.0
+    volumes:
+      - ./alertmanager.yml:/prometheus/alertmanager.yml
+      - ./data/alertmanager:/data
+    command:
+      - '--config.file=/prometheus/alertmanager.yml'
+      - '--storage.path=/data'
+    deploy:
+      restart_policy:
+        condition: on-failure
 
   query-service:
-    image: signoz/query-service:0.4.1
-    container_name: query-service
-    restart: always
+    image: signoz/query-service:0.7.1
     command: ["-config=/root/config/prometheus.yml"]
     ports:
       - "8080:8080"
@@ -35,77 +40,75 @@ services:
     environment:
       - ClickHouseUrl=tcp://clickhouse:9000
       - STORAGE=clickhouse
-      - POSTHOG_API_KEY=H-htDCae7CR3RV57gUzmol6IAKtm5IMCvbcm_fwnL-w
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
       - DEPLOYMENT_TYPE=docker-swarm
+
     deploy:
       restart_policy:
         condition: on-failure
     depends_on:
       - clickhouse
-
 
   frontend:
-    image: signoz/frontend:0.4.1
-    container_name: frontend
-
+    image: signoz/frontend:0.7.1
     depends_on:
       - query-service
-    links:
-      - "query-service"
     ports:
       - "3301:3301"
     volumes:
       - ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf
-
 
   otel-collector:
-    image: signoz/otelcontribcol:0.4.0
-    command: ["--config=/etc/otel-collector-config.yaml", "--mem-ballast-size-mib=2000"]
+    image: signoz/otelcontribcol:0.43.0
+    command: ["--config=/etc/otel-collector-config.yaml"]
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "1777:1777"      # pprof extension
-      - "8887:8888"      # Prometheus metrics exposed by the agent
-      - "14268:14268"    # Jaeger receiver
-      - "55678"          # OpenCensus receiver
-      - "55680:55680"    # OTLP HTTP/2.0 legacy port
-      - "55681:55681"    # OTLP HTTP/1.0 receiver
-      - "4317:4317"      # OTLP GRPC receiver
-      - "55679:55679"    # zpages extension
-      - "13133"          # health_check
+      - "4317:4317"      # OTLP gRPC receiver
+      - "4318:4318"      # OTLP HTTP receiver
+      # - "8889:8889"    # Prometheus metrics exposed by the agent
+      # - "13133"        # health_check
+      # - "14268:14268"  # Jaeger receiver
+      # - "55678:55678"  # OpenCensus receiver
+      # - "55679:55679"  # zpages extension
+      # - "55680:55680"  # OTLP gRPC legacy receiver
+      # - "55681:55681"  # OTLP HTTP legacy receiver
     deploy:
       mode: replicated
       replicas: 3
+      restart_policy:
+        condition: on-failure
+      resources:
+        limits:
+          memory: 2000m
     depends_on:
-      - clickhouse
+      - clickhouse
 
-  otel-collector-hostmetrics:
-    image: signoz/otelcontribcol:0.4.0
-    command: ["--config=/etc/otel-collector-config-hostmetrics.yaml", "--mem-ballast-size-mib=683"]
+  otel-collector-metrics:
+    image: signoz/otelcontribcol:0.43.0
+    command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
-      - ./otel-collector-config-hostmetrics.yaml:/etc/otel-collector-config-hostmetrics.yaml
+      - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    deploy:
+      restart_policy:
+        condition: on-failure
     depends_on:
-      - clickhouse
-
+      - clickhouse
 
   hotrod:
-    image: jaegertracing/example-hotrod:latest
-    container_name: hotrod
-    ports:
-      - "9000:8080"
+    image: jaegertracing/example-hotrod:1.30
     command: ["all"]
     environment:
-      - JAEGER_ENDPOINT=http://otel-collector:14268/api/traces
-
+      - JAEGER_ENDPOINT=http://otel-collector:14268/api/traces
+    logging:
+      options:
+        max-size: 50m
+        max-file: "3"
 
   load-hotrod:
     image: "grubykarol/locust:1.2.3-python3.9-alpine3.12"
-    container_name: load-hotrod
     hostname: load-hotrod
-    ports:
-      - "8089:8089"
     environment:
       ATTACKED_HOST: http://hotrod:8080
       LOCUST_MODE: standalone
@@ -115,4 +118,4 @@ services:
       QUIET_MODE: "${QUIET_MODE:-false}"
       LOCUST_OPTS: "--headless -u 10 -r 1"
     volumes:
-      - ../common/locust-scripts:/locust
\ No newline at end of file
+      - ../common/locust-scripts:/locust
diff --git a/deploy/docker-swarm/clickhouse-setup/otel-collector-config-hostmetrics.yaml b/deploy/docker-swarm/clickhouse-setup/otel-collector-config-hostmetrics.yaml
deleted file mode 100644
index 28033e9f2e..0000000000
--- a/deploy/docker-swarm/clickhouse-setup/otel-collector-config-hostmetrics.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-receivers:
-  otlp:
-    protocols:
-      grpc:
-      http:
-  jaeger:
-    protocols:
-      grpc:
-      thrift_http:
-
-  hostmetrics:
-    collection_interval: 60s
-    scrapers:
-      cpu:
-      load:
-      memory:
-      disk:
-      filesystem:
-      network:
-
-  # Data sources: metrics
-  prometheus:
-    config:
-      scrape_configs:
-        - job_name: "otel-collector"
-          dns_sd_configs:
-          - names:
-              - 'tasks.signoz_otel-collector'
-            type: 'A'
-            port: 8888
-        - job_name: "otel-collector-hostmetrics"
-          scrape_interval: 10s
-          static_configs:
-            - targets: ["otel-collector-hostmetrics:8888"]
-processors:
-  batch:
-    send_batch_size: 1000
-    timeout: 10s
-  memory_limiter:
-    # Same as --mem-ballast-size-mib CLI argument
-    ballast_size_mib: 683
-    # 80% of maximum memory up to 2G
-    limit_mib: 1500
-    # 25% of limit up to 2G
-    spike_limit_mib: 512
-    check_interval: 5s
-  # queued_retry:
-  #   num_workers: 4
-  #   queue_size: 100
-  #   retry_on_failure: true
-extensions:
-  health_check: {}
-  zpages: {}
-exporters:
-  clickhouse:
-    datasource: tcp://clickhouse:9000
-  clickhousemetricswrite:
-    endpoint: tcp://clickhouse:9000/?database=signoz_metrics
-    resource_to_telemetry_conversion:
-      enabled: true
-
-service:
-  extensions: [health_check, zpages]
-  pipelines:
-    traces:
-      receivers: [jaeger, otlp]
-      processors: [batch]
-      exporters: [clickhouse]
-    metrics:
-      receivers: [otlp, prometheus, hostmetrics]
-      processors: [batch]
-      exporters: [clickhousemetricswrite]
\ No newline at end of file
diff --git a/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml b/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml
index 7d41a4cb83..a4a2641daa 100644
--- a/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml
+++ b/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml
@@ -1,4 +1,8 @@
 receivers:
+  otlp/spanmetrics:
+    protocols:
+      grpc:
+        endpoint: "localhost:12345"
   otlp:
     protocols:
       grpc:
@@ -7,18 +11,30 @@ receivers:
   jaeger:
     protocols:
       grpc:
       thrift_http:
+  hostmetrics:
+    collection_interval: 30s
+    scrapers:
+      cpu:
+      load:
+      memory:
+      disk:
+      filesystem:
+      network:
 processors:
   batch:
     send_batch_size: 1000
     timeout: 10s
-  memory_limiter:
-    # Same as --mem-ballast-size-mib CLI argument
-    ballast_size_mib: 683
-    # 80% of maximum memory up to 2G
-    limit_mib: 1500
-    # 25% of limit up to 2G
-    spike_limit_mib: 512
-    check_interval: 5s
+  signozspanmetrics/prometheus:
+    metrics_exporter: prometheus
+    latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ]
+  # memory_limiter:
+  #   # Same as --mem-ballast-size-mib CLI argument
+  #   ballast_size_mib: 683
+  #   # 80% of maximum memory up to 2G
+  #   limit_mib: 1500
+  #   # 25% of limit up to 2G
+  #   spike_limit_mib: 512
+  #   check_interval: 5s
   # queued_retry:
   #   num_workers: 4
   #   queue_size: 100
   #   retry_on_failure: true
@@ -33,15 +49,19 @@ exporters:
     endpoint: tcp://clickhouse:9000/?database=signoz_metrics
     resource_to_telemetry_conversion:
      enabled: true
-
+  prometheus:
+    endpoint: "0.0.0.0:8889"
 service:
   extensions: [health_check, zpages]
   pipelines:
     traces:
       receivers: [jaeger, otlp]
-      processors: [batch]
+      processors: [signozspanmetrics/prometheus, batch]
       exporters: [clickhouse]
     metrics:
-      receivers: [otlp]
+      receivers: [otlp, hostmetrics]
       processors: [batch]
-      exporters: [clickhousemetricswrite]
\ No newline at end of file
+      exporters: [clickhousemetricswrite]
+    metrics/spanmetrics:
+      receivers: [otlp/spanmetrics]
+      exporters: [prometheus]
\ No newline at end of file
diff --git a/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml b/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml
new file mode 100644
index 0000000000..3af039268c
--- /dev/null
+++ b/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml
@@ -0,0 +1,44 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+      http:
+
+  # Data sources: metrics
+  prometheus:
+    config:
+      scrape_configs:
+        - job_name: "otel-collector"
+          scrape_interval: 30s
+          static_configs:
+            - targets: ["otel-collector:8889"]
+processors:
+  batch:
+    send_batch_size: 1000
+    timeout: 10s
+  # memory_limiter:
+  #   # Same as --mem-ballast-size-mib CLI argument
+  #   ballast_size_mib: 683
+  #   # 80% of maximum memory up to 2G
+  #   limit_mib: 1500
+  #   # 25% of limit up to 2G
+  #   spike_limit_mib: 512
+  #   check_interval: 5s
+  # queued_retry:
+  #   num_workers: 4
+  #   queue_size: 100
+  #   retry_on_failure: true
+extensions:
+  health_check: {}
+  zpages: {}
+exporters:
+  clickhousemetricswrite:
+    endpoint: tcp://clickhouse:9000/?database=signoz_metrics
+
+service:
+  extensions: [health_check, zpages]
+  pipelines:
+    metrics:
+      receivers: [otlp, prometheus]
+      processors: [batch]
+      exporters: [clickhousemetricswrite]
\ No newline at end of file
diff --git a/deploy/docker-swarm/clickhouse-setup/prometheus.yml b/deploy/docker-swarm/clickhouse-setup/prometheus.yml
index 7d04428a42..16e65ff18c 100644
--- a/deploy/docker-swarm/clickhouse-setup/prometheus.yml
+++ b/deploy/docker-swarm/clickhouse-setup/prometheus.yml
@@ -9,12 +9,13 @@ alerting:
   alertmanagers:
   - static_configs:
     - targets:
-      # - alertmanager:9093
+      - alertmanager:9093
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   # - "first_rules.yml"
   # - "second_rules.yml"
+  - 'alerts.yml'
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
diff --git a/deploy/docker/clickhouse-setup/data/alertmanager/.gitkeep b/deploy/docker/clickhouse-setup/data/alertmanager/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deploy/docker/clickhouse-setup/data/clickhouse/.gitkeep b/deploy/docker/clickhouse-setup/data/clickhouse/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deploy/docker/clickhouse-setup/data/signoz/.gitkeep b/deploy/docker/clickhouse-setup/data/signoz/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deploy/docker/clickhouse-setup/docker-compose.arm.yaml b/deploy/docker/clickhouse-setup/docker-compose.arm.yaml
index 32375e83c9..c6616a10bb 100644
--- a/deploy/docker/clickhouse-setup/docker-compose.arm.yaml
+++ b/deploy/docker/clickhouse-setup/docker-compose.arm.yaml
@@ -6,6 +6,7 @@ services:
     volumes:
       - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
       - ./data/clickhouse/:/var/lib/clickhouse/
+    restart: on-failure
     healthcheck:
       # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
       test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -36,6 +37,7 @@ services:
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
      - DEPLOYMENT_TYPE=docker-standalone-arm
+    restart: on-failure
 
     depends_on:
       clickhouse:
@@ -57,9 +59,17 @@ services:
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "4317:4317"      # OTLP GRPC receiver
+      - "4317:4317"      # OTLP gRPC receiver
+      - "4318:4318"      # OTLP HTTP receiver
+      # - "8889:8889"    # Prometheus metrics exposed by the agent
+      # - "13133"        # health_check
+      # - "14268:14268"  # Jaeger receiver
+      # - "55678:55678"  # OpenCensus receiver
+      # - "55679:55679"  # zpages extension
+      # - "55680:55680"  # OTLP gRPC legacy port
+      # - "55681:55681"  # OTLP HTTP legacy receiver
     mem_limit: 2000m
-    restart: always
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
@@ -69,6 +79,7 @@ services:
     command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
       - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
diff --git a/deploy/docker/clickhouse-setup/docker-compose.yaml b/deploy/docker/clickhouse-setup/docker-compose.yaml
index 2a73859312..94f579bc84 100644
--- a/deploy/docker/clickhouse-setup/docker-compose.yaml
+++ b/deploy/docker/clickhouse-setup/docker-compose.yaml
@@ -6,6 +6,7 @@ services:
     volumes:
       - ./clickhouse-config.xml:/etc/clickhouse-server/config.xml
       - ./data/clickhouse/:/var/lib/clickhouse/
+    restart: on-failure
     healthcheck:
       # "clickhouse", "client", "-u ${CLICKHOUSE_USER}", "--password ${CLICKHOUSE_PASSWORD}", "-q 'SELECT 1'"
       test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"]
@@ -39,6 +40,7 @@ services:
       - GODEBUG=netdns=go
       - TELEMETRY_ENABLED=true
       - DEPLOYMENT_TYPE=docker-standalone-amd
+    restart: on-failure
 
     depends_on:
       clickhouse:
@@ -60,9 +62,17 @@ services:
     volumes:
       - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
     ports:
-      - "4317:4317"      # OTLP GRPC receiver
+      - "4317:4317"      # OTLP gRPC receiver
+      - "4318:4318"      # OTLP HTTP receiver
+      # - "8889:8889"    # Prometheus metrics exposed by the agent
+      # - "13133"        # health_check
+      # - "14268:14268"  # Jaeger receiver
+      # - "55678:55678"  # OpenCensus receiver
+      # - "55679:55679"  # zpages extension
+      # - "55680:55680"  # OTLP gRPC legacy port
+      # - "55681:55681"  # OTLP HTTP legacy receiver
     mem_limit: 2000m
-    restart: always
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
@@ -72,6 +82,7 @@ services:
     command: ["--config=/etc/otel-collector-metrics-config.yaml"]
     volumes:
       - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
+    restart: on-failure
     depends_on:
       clickhouse:
         condition: service_healthy
diff --git a/deploy/install.sh b/deploy/install.sh
index 6fc59ceba1..27f80d4331 100755
--- a/deploy/install.sh
+++ b/deploy/install.sh
@@ -143,7 +143,7 @@ install_docker() {
         echo "Installing docker"
         $apt_cmd install docker-ce docker-ce-cli containerd.io
     elif [[ $package_manager == zypper ]]; then
-        zypper_cmd="zypper --quiet --no-gpg-checks --non-interactive"
+        zypper_cmd="$sudo_cmd zypper --quiet --no-gpg-checks --non-interactive"
         echo "Installing docker"
         if [[ $os == sles ]]; then
             os_sp="$(cat /etc/*-release | awk -F= '$1 == "VERSION_ID" { gsub(/"/, ""); print $2; exit }')"
@@ -151,19 +151,19 @@ install_docker() {
             SUSEConnect -p sle-module-containers/$os_sp/$os_arch -r ''
         fi
         $zypper_cmd install docker docker-runc containerd
-        systemctl enable docker.service
+        $sudo_cmd systemctl enable docker.service
     elif [[ $package_manager == yum && $os == 'amazon linux' ]]; then
         echo
         echo "Amazon Linux detected ... "
         echo
         # yum install docker
         # service docker start
-        amazon-linux-extras install docker
+        $sudo_cmd amazon-linux-extras install docker
     else
 
-        yum_cmd="yum --assumeyes --quiet"
+        yum_cmd="$sudo_cmd yum --assumeyes --quiet"
         $yum_cmd install yum-utils
-        yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo
+        $sudo_cmd yum-config-manager --add-repo https://download.docker.com/linux/$os/docker-ce.repo
 
         echo "Installing docker"
         $yum_cmd install docker-ce docker-ce-cli containerd.io
@@ -176,9 +176,9 @@ install_docker_compose() {
     if [[ ! -f /usr/bin/docker-compose ]];then
         echo "++++++++++++++++++++++++"
         echo "Installing docker-compose"
-        curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
-        chmod +x /usr/local/bin/docker-compose
-        ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
+        $sudo_cmd curl -L "https://github.com/docker/compose/releases/download/1.26.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+        $sudo_cmd chmod +x /usr/local/bin/docker-compose
+        $sudo_cmd ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
         echo "docker-compose installed!"
         echo ""
     fi
@@ -198,9 +198,9 @@ start_docker() {
     if [ $os = "Mac" ]; then
         open --background -a Docker && while ! docker system info > /dev/null 2>&1; do sleep 1; done
     else
-        if ! systemctl is-active docker.service > /dev/null; then
+        if ! $sudo_cmd systemctl is-active docker.service > /dev/null; then
             echo "Starting docker service"
-            systemctl start docker.service
+            $sudo_cmd systemctl start docker.service
         fi
         if [ -z $sudo_cmd ]; then
             docker ps > /dev/null && true