From 6fb7e34dbc7a5b6380f16c3792d81bada64bb193 Mon Sep 17 00:00:00 2001 From: Prashant Shahi Date: Thu, 14 Jul 2022 19:36:19 +0530 Subject: [PATCH] =?UTF-8?q?chore:=20=F0=9F=94=A7=20otel-collector=20config?= =?UTF-8?q?=20changes=20(#1388)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: 🔧 otel-collector config changes * chore: 🗑️ remove redundant users.xml * chore: 🔧 otel-config changes - separate scraper job for otel-collector and otel-collector-metrics internal metrics - use resourcedetection only for hostmetrics - add swarm service name and task name in resource attributes env Signed-off-by: Prashant Shahi --- .../clickhouse-setup/docker-compose.yaml | 21 ++- .../otel-collector-config.yaml | 59 +++++++-- .../otel-collector-metrics-config.yaml | 44 +++++-- .../clickhouse-setup/docker-compose.yaml | 21 ++- .../otel-collector-config.yaml | 58 +++++++-- .../otel-collector-metrics-config.yaml | 39 +++++- deploy/docker/clickhouse-setup/users.xml | 123 ------------------ .../tests/test-deploy/docker-compose.yaml | 2 + .../test-deploy/otel-collector-config.yaml | 71 +++++++--- .../otel-collector-metrics-config.yaml | 53 ++++++-- 10 files changed, 281 insertions(+), 210 deletions(-) delete mode 100644 deploy/docker/clickhouse-setup/users.xml diff --git a/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml b/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml index 2bd2a48bde..148f0aa77b 100644 --- a/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml +++ b/deploy/docker-swarm/clickhouse-setup/docker-compose.yaml @@ -86,15 +86,19 @@ services: volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml ports: + # - "1777:1777" # pprof extension - "4317:4317" # OTLP gRPC receiver - "4318:4318" # OTLP HTTP receiver - # - "8889:8889" # Prometheus metrics exposed by the agent - # - "13133:13133" # health_check - # - "14268:14268" # Jaeger receiver + # - "8888:8888" # OtelCollector internal metrics + # 
- "8889:8889" # signoz spanmetrics exposed by the agent + # - "9411:9411" # Zipkin port + # - "13133:13133" # Health check extension + # - "14250:14250" # Jaeger gRPC + # - "14268:14268" # Jaeger thrift HTTP # - "55678:55678" # OpenCensus receiver - # - "55679:55679" # zpages extension - # - "55680:55680" # OTLP gRPC legacy receiver - # - "55681:55681" # OTLP HTTP legacy receiver + # - "55679:55679" # zPages extension + environment: + - OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}},dockerswarm.service.name={{.Service.Name}},dockerswarm.task.name={{.Task.Name}} deploy: mode: replicated replicas: 3 @@ -111,6 +115,11 @@ services: command: ["--config=/etc/otel-collector-metrics-config.yaml"] volumes: - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml + # ports: + # - "1777:1777" # pprof extension + # - "8888:8888" # OtelCollector internal metrics + # - "13133:13133" # Health check extension + # - "55679:55679" # zPages extension deploy: restart_policy: condition: on-failure diff --git a/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml b/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml index a998d93ab9..61292c5781 100644 --- a/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml +++ b/deploy/docker-swarm/clickhouse-setup/otel-collector-config.yaml @@ -1,30 +1,46 @@ receivers: + opencensus: + endpoint: 0.0.0.0:55678 otlp/spanmetrics: protocols: grpc: - endpoint: "localhost:12345" + endpoint: localhost:12345 otlp: protocols: grpc: + endpoint: 0.0.0.0:4317 http: + endpoint: 0.0.0.0:4318 jaeger: protocols: grpc: + endpoint: 0.0.0.0:14250 thrift_http: + endpoint: 0.0.0.0:14268 + # thrift_compact: + # endpoint: 0.0.0.0:6831 + # thrift_binary: + # endpoint: 0.0.0.0:6832 hostmetrics: collection_interval: 60s scrapers: - cpu: - load: - memory: - disk: - filesystem: - network: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + processors: batch: 
send_batch_size: 10000 send_batch_max_size: 11000 timeout: 10s + resourcedetection: + # Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels. + detectors: [env, system] # include ec2 for AWS, gce for GCP and azure for Azure. + timeout: 2s + override: false signozspanmetrics/prometheus: metrics_exporter: prometheus latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ] @@ -49,9 +65,7 @@ processors: # num_workers: 4 # queue_size: 100 # retry_on_failure: true -extensions: - health_check: {} - zpages: {} + exporters: clickhousetraces: datasource: tcp://clickhouse:9000/?database=signoz_traces @@ -60,18 +74,35 @@ exporters: resource_to_telemetry_conversion: enabled: true prometheus: - endpoint: "0.0.0.0:8889" + endpoint: 0.0.0.0:8889 + # logging: {} + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + pprof: + endpoint: 0.0.0.0:1777 + service: - extensions: [health_check, zpages] + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages, pprof] pipelines: traces: receivers: [jaeger, otlp] processors: [signozspanmetrics/prometheus, batch] exporters: [clickhousetraces] metrics: - receivers: [otlp, hostmetrics] + receivers: [otlp] processors: [batch] exporters: [clickhousemetricswrite] + metrics/hostmetrics: + receivers: [hostmetrics] + processors: [resourcedetection, batch] + exporters: [clickhousemetricswrite] metrics/spanmetrics: receivers: [otlp/spanmetrics] - exporters: [prometheus] \ No newline at end of file + exporters: [prometheus] diff --git a/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml b/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml index 3aa39b5f7e..a01f356437 100644 --- a/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml +++ b/deploy/docker-swarm/clickhouse-setup/otel-collector-metrics-config.yaml @@ -1,17 +1,26 @@ receivers: 
- otlp: - protocols: - grpc: - http: - - # Data sources: metrics prometheus: config: scrape_configs: + # otel-collector internal metrics - job_name: "otel-collector" scrape_interval: 60s static_configs: - - targets: ["otel-collector:8889"] + - targets: + - otel-collector:8888 + # otel-collector-metrics internal metrics + - job_name: "otel-collector-metrics" + scrape_interval: 60s + static_configs: + - targets: + - localhost:8888 + # SigNoz span metrics + - job_name: "signozspanmetrics-collector" + scrape_interval: 60s + static_configs: + - targets: + - otel-collector:8889 + processors: batch: send_batch_size: 10000 @@ -32,17 +41,26 @@ processors: # num_workers: 4 # queue_size: 100 # retry_on_failure: true -extensions: - health_check: {} - zpages: {} + exporters: clickhousemetricswrite: endpoint: tcp://clickhouse:9000/?database=signoz_metrics +extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + pprof: + endpoint: 0.0.0.0:1777 + service: - extensions: [health_check, zpages] + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages, pprof] pipelines: metrics: - receivers: [otlp, prometheus] + receivers: [prometheus] processors: [batch] - exporters: [clickhousemetricswrite] \ No newline at end of file + exporters: [clickhousemetricswrite] diff --git a/deploy/docker/clickhouse-setup/docker-compose.yaml b/deploy/docker/clickhouse-setup/docker-compose.yaml index f8c2954446..5a47b6a461 100644 --- a/deploy/docker/clickhouse-setup/docker-compose.yaml +++ b/deploy/docker/clickhouse-setup/docker-compose.yaml @@ -82,16 +82,20 @@ services: command: ["--config=/etc/otel-collector-config.yaml"] volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + environment: + - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux ports: + # - "1777:1777" # pprof extension - "4317:4317" # OTLP gRPC receiver - "4318:4318" # OTLP HTTP receiver - # - "8889:8889" # Prometheus metrics exposed by the agent - 
# - "13133:13133" # health_check - # - "14268:14268" # Jaeger receiver + # - "8888:8888" # OtelCollector internal metrics + # - "8889:8889" # signoz spanmetrics exposed by the agent + # - "9411:9411" # Zipkin port + # - "13133:13133" # health check extension + # - "14250:14250" # Jaeger gRPC + # - "14268:14268" # Jaeger thrift HTTP # - "55678:55678" # OpenCensus receiver - # - "55679:55679" # zpages extension - # - "55680:55680" # OTLP gRPC legacy receiver - # - "55681:55681" # OTLP HTTP legacy receiver + # - "55679:55679" # zPages extension mem_limit: 2000m restart: on-failure depends_on: @@ -103,6 +107,11 @@ services: command: ["--config=/etc/otel-collector-metrics-config.yaml"] volumes: - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml + # ports: + # - "1777:1777" # pprof extension + # - "8888:8888" # OtelCollector internal metrics + # - "13133:13133" # Health check extension + # - "55679:55679" # zPages extension restart: on-failure depends_on: clickhouse: diff --git a/deploy/docker/clickhouse-setup/otel-collector-config.yaml b/deploy/docker/clickhouse-setup/otel-collector-config.yaml index e363f015df..0717cf4c45 100644 --- a/deploy/docker/clickhouse-setup/otel-collector-config.yaml +++ b/deploy/docker/clickhouse-setup/otel-collector-config.yaml @@ -1,25 +1,36 @@ receivers: + opencensus: + endpoint: 0.0.0.0:55678 otlp/spanmetrics: protocols: grpc: - endpoint: "localhost:12345" + endpoint: localhost:12345 otlp: protocols: grpc: + endpoint: 0.0.0.0:4317 http: + endpoint: 0.0.0.0:4318 jaeger: protocols: grpc: + endpoint: 0.0.0.0:14250 thrift_http: + endpoint: 0.0.0.0:14268 + # thrift_compact: + # endpoint: 0.0.0.0:6831 + # thrift_binary: + # endpoint: 0.0.0.0:6832 hostmetrics: collection_interval: 60s scrapers: - cpu: - load: - memory: - disk: - filesystem: - network: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + processors: batch: send_batch_size: 10000 @@ -49,9 +60,20 @@ processors: # num_workers: 4 # 
queue_size: 100 # retry_on_failure: true + resourcedetection: + # Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels. + detectors: [env, system] # include ec2 for AWS, gce for GCP and azure for Azure. + timeout: 2s + override: false + extensions: - health_check: {} - zpages: {} + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + pprof: + endpoint: 0.0.0.0:1777 + exporters: clickhousetraces: datasource: tcp://clickhouse:9000/?database=signoz_traces @@ -60,18 +82,30 @@ exporters: resource_to_telemetry_conversion: enabled: true prometheus: - endpoint: "0.0.0.0:8889" + endpoint: 0.0.0.0:8889 + # logging: {} + service: - extensions: [health_check, zpages] + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - zpages + - pprof pipelines: traces: receivers: [jaeger, otlp] processors: [signozspanmetrics/prometheus, batch] exporters: [clickhousetraces] metrics: - receivers: [otlp, hostmetrics] + receivers: [otlp] processors: [batch] exporters: [clickhousemetricswrite] + metrics/hostmetrics: + receivers: [hostmetrics] + processors: [resourcedetection, batch] + exporters: [clickhousemetricswrite] metrics/spanmetrics: receivers: [otlp/spanmetrics] exporters: [prometheus] diff --git a/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml b/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml index 26c629ba60..fdc5830f57 100644 --- a/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml +++ b/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml @@ -3,15 +3,28 @@ receivers: protocols: grpc: http: - - # Data sources: metrics prometheus: config: scrape_configs: + # otel-collector internal metrics - job_name: "otel-collector" scrape_interval: 60s static_configs: - - targets: ["otel-collector:8889"] + - targets: + - otel-collector:8888 + # otel-collector-metrics internal metrics + - job_name: "otel-collector-metrics" + scrape_interval: 60s + static_configs: + 
- targets: + - localhost:8888 + # SigNoz span metrics + - job_name: "signozspanmetrics-collector" + scrape_interval: 60s + static_configs: + - targets: + - otel-collector:8889 + processors: batch: send_batch_size: 10000 @@ -32,17 +45,29 @@ processors: # num_workers: 4 # queue_size: 100 # retry_on_failure: true + extensions: - health_check: {} - zpages: {} + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + pprof: + endpoint: 0.0.0.0:1777 + exporters: clickhousemetricswrite: endpoint: tcp://clickhouse:9000/?database=signoz_metrics service: - extensions: [health_check, zpages] + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - zpages + - pprof pipelines: metrics: - receivers: [otlp, prometheus] + receivers: [prometheus] processors: [batch] exporters: [clickhousemetricswrite] diff --git a/deploy/docker/clickhouse-setup/users.xml b/deploy/docker/clickhouse-setup/users.xml deleted file mode 100644 index f18562071d..0000000000 --- a/deploy/docker/clickhouse-setup/users.xml +++ /dev/null @@ -1,123 +0,0 @@ - - - - - - - - - - 10000000000 - - - random - - - - - 1 - - - - - - - - - - - - - ::/0 - - - - default - - - default - - - - - - - - - - - - - - 3600 - - - 0 - 0 - 0 - 0 - 0 - - - - diff --git a/pkg/query-service/tests/test-deploy/docker-compose.yaml b/pkg/query-service/tests/test-deploy/docker-compose.yaml index 6191c18fa0..9ef7cb1bfc 100644 --- a/pkg/query-service/tests/test-deploy/docker-compose.yaml +++ b/pkg/query-service/tests/test-deploy/docker-compose.yaml @@ -63,6 +63,8 @@ services: command: ["--config=/etc/otel-collector-config.yaml"] volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + environment: + - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux ports: - "4317:4317" # OTLP GRPC receiver mem_limit: 2000m diff --git a/pkg/query-service/tests/test-deploy/otel-collector-config.yaml b/pkg/query-service/tests/test-deploy/otel-collector-config.yaml index 
d6c12ddcc1..b343350a34 100644 --- a/pkg/query-service/tests/test-deploy/otel-collector-config.yaml +++ b/pkg/query-service/tests/test-deploy/otel-collector-config.yaml @@ -1,28 +1,40 @@ receivers: + opencensus: + endpoint: 0.0.0.0:55678 otlp/spanmetrics: protocols: grpc: - endpoint: "localhost:12345" + endpoint: localhost:12345 otlp: protocols: grpc: + endpoint: 0.0.0.0:4317 http: + endpoint: 0.0.0.0:4318 jaeger: protocols: grpc: + endpoint: 0.0.0.0:14250 thrift_http: + endpoint: 0.0.0.0:14268 + # thrift_compact: + # endpoint: 0.0.0.0:6831 + # thrift_binary: + # endpoint: 0.0.0.0:6832 hostmetrics: - collection_interval: 30s + collection_interval: 60s scrapers: - cpu: - load: - memory: - disk: - filesystem: - network: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + processors: batch: - send_batch_size: 1000 + send_batch_size: 10000 + send_batch_max_size: 11000 timeout: 10s signozspanmetrics/prometheus: metrics_exporter: prometheus @@ -34,20 +46,33 @@ processors: - name: deployment.environment default: default # memory_limiter: - # # Same as --mem-ballast-size-mib CLI argument - # ballast_size_mib: 683 # # 80% of maximum memory up to 2G # limit_mib: 1500 # # 25% of limit up to 2G # spike_limit_mib: 512 # check_interval: 5s + # + # # 50% of the maximum memory + # limit_percentage: 50 + # # 20% of max memory usage spike expected + # spike_limit_percentage: 20 # queued_retry: # num_workers: 4 # queue_size: 100 # retry_on_failure: true + resourcedetection: + detectors: [env, system] + timeout: 2s + override: false + extensions: - health_check: {} - zpages: {} + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + pprof: + endpoint: 0.0.0.0:1777 + exporters: clickhousetraces: datasource: tcp://clickhouse:9000/?database=signoz_traces @@ -56,18 +81,30 @@ exporters: resource_to_telemetry_conversion: enabled: true prometheus: - endpoint: "0.0.0.0:8889" + endpoint: 0.0.0.0:8889 + # logging: {} + service: - extensions: 
[health_check, zpages] + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - zpages + - pprof pipelines: traces: receivers: [jaeger, otlp] processors: [signozspanmetrics/prometheus, batch] exporters: [clickhousetraces] metrics: - receivers: [otlp, hostmetrics] + receivers: [otlp] processors: [batch] exporters: [clickhousemetricswrite] + metrics/hostmetrics: + receivers: [hostmetrics] + processors: [resourcedetection, batch] + exporters: [clickhousemetricswrite] metrics/spanmetrics: receivers: [otlp/spanmetrics] - exporters: [prometheus] \ No newline at end of file + exporters: [prometheus] diff --git a/pkg/query-service/tests/test-deploy/otel-collector-metrics-config.yaml b/pkg/query-service/tests/test-deploy/otel-collector-metrics-config.yaml index 3af039268c..fdc5830f57 100644 --- a/pkg/query-service/tests/test-deploy/otel-collector-metrics-config.yaml +++ b/pkg/query-service/tests/test-deploy/otel-collector-metrics-config.yaml @@ -3,42 +3,71 @@ receivers: protocols: grpc: http: - - # Data sources: metrics prometheus: config: scrape_configs: + # otel-collector internal metrics - job_name: "otel-collector" - scrape_interval: 30s + scrape_interval: 60s static_configs: - - targets: ["otel-collector:8889"] + - targets: + - otel-collector:8888 + # otel-collector-metrics internal metrics + - job_name: "otel-collector-metrics" + scrape_interval: 60s + static_configs: + - targets: + - localhost:8888 + # SigNoz span metrics + - job_name: "signozspanmetrics-collector" + scrape_interval: 60s + static_configs: + - targets: + - otel-collector:8889 + processors: batch: - send_batch_size: 1000 + send_batch_size: 10000 + send_batch_max_size: 11000 timeout: 10s # memory_limiter: - # # Same as --mem-ballast-size-mib CLI argument - # ballast_size_mib: 683 # # 80% of maximum memory up to 2G # limit_mib: 1500 # # 25% of limit up to 2G # spike_limit_mib: 512 # check_interval: 5s + # + # # 50% of the maximum memory + # limit_percentage: 50 + # # 20% of max 
memory usage spike expected + # spike_limit_percentage: 20 # queued_retry: # num_workers: 4 # queue_size: 100 # retry_on_failure: true + extensions: - health_check: {} - zpages: {} + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + pprof: + endpoint: 0.0.0.0:1777 + exporters: clickhousemetricswrite: endpoint: tcp://clickhouse:9000/?database=signoz_metrics service: - extensions: [health_check, zpages] + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - zpages + - pprof pipelines: metrics: - receivers: [otlp, prometheus] + receivers: [prometheus] processors: [batch] - exporters: [clickhousemetricswrite] \ No newline at end of file + exporters: [clickhousemetricswrite]