chore: 🔧 otel-collector config changes (#1388)

* chore: 🔧 otel-collector config changes

* chore: 🗑️  remove redundant users.xml

* chore: 🔧 otel-config changes

- separate scrape jobs for otel-collector and otel-collector-metrics internal metrics
- use resourcedetection only for hostmetrics
- add Swarm service name and task name to the OTEL_RESOURCE_ATTRIBUTES env

Signed-off-by: Prashant Shahi <prashant@signoz.io>
Prashant Shahi 2022-07-14 19:36:19 +05:30 committed by GitHub
parent a2e1c41343
commit 6fb7e34dbc
10 changed files with 281 additions and 210 deletions
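
The net effect, condensed from the collector configs in the diffs below: metrics end up in three pipelines so that only host metrics pass through resourcedetection, while span metrics keep a dedicated pipeline feeding the prometheus exporter. A rough sketch of the resulting service section (abridged, not the literal file contents):

service:
  pipelines:
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [clickhousemetricswrite]
    metrics/hostmetrics:
      receivers: [hostmetrics]
      processors: [resourcedetection, batch]
      exporters: [clickhousemetricswrite]
    metrics/spanmetrics:
      receivers: [otlp/spanmetrics]
      exporters: [prometheus]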

View File

@@ -86,15 +86,19 @@ services:
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports:
# - "1777:1777" # pprof extension
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP HTTP receiver
# - "8889:8889" # Prometheus metrics exposed by the agent
# - "13133:13133" # health_check
# - "14268:14268" # Jaeger receiver
# - "8888:8888" # OtelCollector internal metrics
# - "8889:8889" # signoz spanmetrics exposed by the agent
# - "9411:9411" # Zipkin port
# - "13133:13133" # Health check extension
# - "14250:14250" # Jaeger gRPC
# - "14268:14268" # Jaeger thrift HTTP
# - "55678:55678" # OpenCensus receiver
# - "55679:55679" # zpages extension
# - "55680:55680" # OTLP gRPC legacy receiver
# - "55681:55681" # OTLP HTTP legacy receiver
# - "55679:55679" # zPages extension
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}},dockerswarm.service.name={{.Service.Name}},dockerswarm.task.name={{.Task.Name}}
deploy:
mode: replicated
replicas: 3
@@ -111,6 +115,11 @@ services:
command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes:
- ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
# ports:
# - "1777:1777" # pprof extension
# - "8888:8888" # OtelCollector internal metrics
# - "13133:13133" # Health check extension
# - "55679:55679" # zPages extension
deploy:
restart_policy:
condition: on-failure
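
The Go template placeholders in OTEL_RESOURCE_ATTRIBUTES above ({{.Node.Hostname}}, {{.Node.Platform.OS}}, {{.Service.Name}}, {{.Task.Name}}) are resolved by Docker Swarm per task at deploy time. As an illustration only, one replica of the service might end up with an environment along these lines (the rendered values are placeholders, not taken from this commit):

environment:
  # hypothetical rendered values for a single task
  - OTEL_RESOURCE_ATTRIBUTES=host.name=node-1,os.type=linux,dockerswarm.service.name=signoz_otel-collector,dockerswarm.task.name=signoz_otel-collector.1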

View File

@@ -1,30 +1,46 @@
receivers:
opencensus:
endpoint: 0.0.0.0:55678
otlp/spanmetrics:
protocols:
grpc:
endpoint: "localhost:12345"
endpoint: localhost:12345
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
jaeger:
protocols:
grpc:
endpoint: 0.0.0.0:14250
thrift_http:
endpoint: 0.0.0.0:14268
# thrift_compact:
# endpoint: 0.0.0.0:6831
# thrift_binary:
# endpoint: 0.0.0.0:6832
hostmetrics:
collection_interval: 60s
scrapers:
cpu:
load:
memory:
disk:
filesystem:
network:
cpu: {}
load: {}
memory: {}
disk: {}
filesystem: {}
network: {}
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors: [env, system] # include ec2 for AWS, gce for GCP and azure for Azure.
timeout: 2s
override: false
signozspanmetrics/prometheus:
metrics_exporter: prometheus
latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ]
@@ -49,9 +65,7 @@ processors:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
extensions:
health_check: {}
zpages: {}
exporters:
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
@@ -60,18 +74,35 @@ exporters:
resource_to_telemetry_conversion:
enabled: true
prometheus:
endpoint: "0.0.0.0:8889"
endpoint: 0.0.0.0:8889
# logging: {}
extensions:
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
pprof:
endpoint: 0.0.0.0:1777
service:
extensions: [health_check, zpages]
telemetry:
metrics:
address: 0.0.0.0:8888
extensions: [health_check, zpages, pprof]
pipelines:
traces:
receivers: [jaeger, otlp]
processors: [signozspanmetrics/prometheus, batch]
exporters: [clickhousetraces]
metrics:
receivers: [otlp, hostmetrics]
receivers: [otlp]
processors: [batch]
exporters: [clickhousemetricswrite]
metrics/hostmetrics:
receivers: [hostmetrics]
processors: [resourcedetection, batch]
exporters: [clickhousemetricswrite]
metrics/spanmetrics:
receivers: [otlp/spanmetrics]
exporters: [prometheus]

View File

@@ -1,17 +1,26 @@
receivers:
otlp:
protocols:
grpc:
http:
# Data sources: metrics
prometheus:
config:
scrape_configs:
# otel-collector internal metrics
- job_name: "otel-collector"
scrape_interval: 60s
static_configs:
- targets: ["otel-collector:8889"]
- targets:
- otel-collector:8888
# otel-collector-metrics internal metrics
- job_name: "otel-collector-metrics"
scrape_interval: 60s
static_configs:
- targets:
- localhost:8888
# SigNoz span metrics
- job_name: "signozspanmetrics-collector"
scrape_interval: 60s
static_configs:
- targets:
- otel-collector:8889
processors:
batch:
send_batch_size: 10000
@@ -32,17 +41,26 @@ processors:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
extensions:
health_check: {}
zpages: {}
exporters:
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
extensions:
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
pprof:
endpoint: 0.0.0.0:1777
service:
extensions: [health_check, zpages]
telemetry:
metrics:
address: 0.0.0.0:8888
extensions: [health_check, zpages, pprof]
pipelines:
metrics:
receivers: [otlp, prometheus]
receivers: [prometheus]
processors: [batch]
exporters: [clickhousemetricswrite]
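
Taken together, the three scrape jobs above map onto the ports defined in the collector configs of this commit: otel-collector:8888 is the main collector's own telemetry endpoint, localhost:8888 is this metrics collector's own telemetry, and otel-collector:8889 is the prometheus exporter carrying the span metrics. In comment form:

scrape_configs:
  # otel-collector              -> otel-collector:8888  (main collector internal metrics, service.telemetry)
  # otel-collector-metrics      -> localhost:8888       (this collector's internal metrics)
  # signozspanmetrics-collector -> otel-collector:8889  (prometheus exporter fed by signozspanmetrics/prometheus)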

View File

@@ -82,16 +82,20 @@ services:
command: ["--config=/etc/otel-collector-config.yaml"]
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
ports:
# - "1777:1777" # pprof extension
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP HTTP receiver
# - "8889:8889" # Prometheus metrics exposed by the agent
# - "13133:13133" # health_check
# - "14268:14268" # Jaeger receiver
# - "8888:8888" # OtelCollector internal metrics
# - "8889:8889" # signoz spanmetrics exposed by the agent
# - "9411:9411" # Zipkin port
# - "13133:13133" # health check extension
# - "14250:14250" # Jaeger gRPC
# - "14268:14268" # Jaeger thrift HTTP
# - "55678:55678" # OpenCensus receiver
# - "55679:55679" # zpages extension
# - "55680:55680" # OTLP gRPC legacy receiver
# - "55681:55681" # OTLP HTTP legacy receiver
# - "55679:55679" # zPages extension
mem_limit: 2000m
restart: on-failure
depends_on:
@@ -103,6 +107,11 @@ services:
command: ["--config=/etc/otel-collector-metrics-config.yaml"]
volumes:
- ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml
# ports:
# - "1777:1777" # pprof extension
# - "8888:8888" # OtelCollector internal metrics
# - "13133:13133" # Health check extension
# - "55679:55679" # zPages extension
restart: on-failure
depends_on:
clickhouse:

View File

@@ -1,25 +1,36 @@
receivers:
opencensus:
endpoint: 0.0.0.0:55678
otlp/spanmetrics:
protocols:
grpc:
endpoint: "localhost:12345"
endpoint: localhost:12345
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
jaeger:
protocols:
grpc:
endpoint: 0.0.0.0:14250
thrift_http:
endpoint: 0.0.0.0:14268
# thrift_compact:
# endpoint: 0.0.0.0:6831
# thrift_binary:
# endpoint: 0.0.0.0:6832
hostmetrics:
collection_interval: 60s
scrapers:
cpu:
load:
memory:
disk:
filesystem:
network:
cpu: {}
load: {}
memory: {}
disk: {}
filesystem: {}
network: {}
processors:
batch:
send_batch_size: 10000
@@ -49,9 +60,20 @@ processors:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors: [env, system] # include ec2 for AWS, gce for GCP and azure for Azure.
timeout: 2s
override: false
extensions:
health_check: {}
zpages: {}
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
@@ -60,18 +82,30 @@ exporters:
resource_to_telemetry_conversion:
enabled: true
prometheus:
endpoint: "0.0.0.0:8889"
endpoint: 0.0.0.0:8889
# logging: {}
service:
extensions: [health_check, zpages]
telemetry:
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- zpages
- pprof
pipelines:
traces:
receivers: [jaeger, otlp]
processors: [signozspanmetrics/prometheus, batch]
exporters: [clickhousetraces]
metrics:
receivers: [otlp, hostmetrics]
receivers: [otlp]
processors: [batch]
exporters: [clickhousemetricswrite]
metrics/hostmetrics:
receivers: [hostmetrics]
processors: [resourcedetection, batch]
exporters: [clickhousemetricswrite]
metrics/spanmetrics:
receivers: [otlp/spanmetrics]
exporters: [prometheus]

View File

@@ -3,15 +3,28 @@ receivers:
protocols:
grpc:
http:
# Data sources: metrics
prometheus:
config:
scrape_configs:
# otel-collector internal metrics
- job_name: "otel-collector"
scrape_interval: 60s
static_configs:
- targets: ["otel-collector:8889"]
- targets:
- otel-collector:8888
# otel-collector-metrics internal metrics
- job_name: "otel-collector-metrics"
scrape_interval: 60s
static_configs:
- targets:
- localhost:8888
# SigNoz span metrics
- job_name: "signozspanmetrics-collector"
scrape_interval: 60s
static_configs:
- targets:
- otel-collector:8889
processors:
batch:
send_batch_size: 10000
@@ -32,17 +45,29 @@ processors:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
extensions:
health_check: {}
zpages: {}
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
service:
extensions: [health_check, zpages]
telemetry:
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- zpages
- pprof
pipelines:
metrics:
receivers: [otlp, prometheus]
receivers: [prometheus]
processors: [batch]
exporters: [clickhousemetricswrite]

View File

@@ -1,123 +0,0 @@
<?xml version="1.0"?>
<clickhouse>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
<default>
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica
with minimum number of different symbols between replica's hostname and local hostname
(Hamming distance).
in_order - first live replica is chosen in specified order.
first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
-->
<load_balancing>random</load_balancing>
</default>
<!-- Profile that allows only read queries. -->
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<!-- Users and ACL. -->
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
Password could be empty.
If you want to specify SHA256, place it in 'password_sha256_hex' element.
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
place its name in 'server' element inside 'ldap' element.
Example: <ldap><server>my_ldap_server</server></ldap>
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
How to generate decent password:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
In first line will be password and in second - corresponding SHA256.
How to generate double SHA1:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
In first line will be password and in second - corresponding double SHA1.
-->
<password></password>
<!-- List of networks with open access.
To open access from everywhere, specify:
<ip>::/0</ip>
To open access only from localhost, specify:
<ip>::1</ip>
<ip>127.0.0.1</ip>
Each element of list has one of the following forms:
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
<host> Hostname. Example: server01.clickhouse.com.
To check access, DNS query is performed, and all received addresses compared to peer address.
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
To check access, DNS PTR query is performed for peer address and then regexp is applied.
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
Strongly recommended that regexp is ends with $
All results of DNS requests are cached till server restart.
-->
<networks>
<ip>::/0</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
</users>
<!-- Quotas. -->
<quotas>
<!-- Name of quota. -->
<default>
<!-- Limits for time interval. You could specify many intervals with different limits. -->
<interval>
<!-- Length of interval. -->
<duration>3600</duration>
<!-- No limits. Just calculate resource usage for time interval. -->
<queries>0</queries>
<errors>0</errors>
<result_rows>0</result_rows>
<read_rows>0</read_rows>
<execution_time>0</execution_time>
</interval>
</default>
</quotas>
</clickhouse>

View File

@@ -63,6 +63,8 @@ services:
command: ["--config=/etc/otel-collector-config.yaml"]
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
ports:
- "4317:4317" # OTLP GRPC receiver
mem_limit: 2000m
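
host.name is set to a static default (signoz-host) here; when running this compose file on a real machine, one might replace it with the actual hostname so host metrics are attributed to the right host. A hypothetical example (my-vm-01 is a placeholder, not from this commit):

environment:
  - OTEL_RESOURCE_ATTRIBUTES=host.name=my-vm-01,os.type=linux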

View File

@@ -1,28 +1,40 @@
receivers:
opencensus:
endpoint: 0.0.0.0:55678
otlp/spanmetrics:
protocols:
grpc:
endpoint: "localhost:12345"
endpoint: localhost:12345
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
jaeger:
protocols:
grpc:
endpoint: 0.0.0.0:14250
thrift_http:
endpoint: 0.0.0.0:14268
# thrift_compact:
# endpoint: 0.0.0.0:6831
# thrift_binary:
# endpoint: 0.0.0.0:6832
hostmetrics:
collection_interval: 30s
collection_interval: 60s
scrapers:
cpu:
load:
memory:
disk:
filesystem:
network:
cpu: {}
load: {}
memory: {}
disk: {}
filesystem: {}
network: {}
processors:
batch:
send_batch_size: 1000
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
signozspanmetrics/prometheus:
metrics_exporter: prometheus
@@ -34,20 +46,33 @@ processors:
- name: deployment.environment
default: default
# memory_limiter:
# # Same as --mem-ballast-size-mib CLI argument
# ballast_size_mib: 683
# # 80% of maximum memory up to 2G
# limit_mib: 1500
# # 25% of limit up to 2G
# spike_limit_mib: 512
# check_interval: 5s
#
# # 50% of the maximum memory
# limit_percentage: 50
# # 20% of max memory usage spike expected
# spike_limit_percentage: 20
# queued_retry:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
resourcedetection:
detectors: [env, system]
timeout: 2s
override: false
extensions:
health_check: {}
zpages: {}
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
@@ -56,18 +81,30 @@ exporters:
resource_to_telemetry_conversion:
enabled: true
prometheus:
endpoint: "0.0.0.0:8889"
endpoint: 0.0.0.0:8889
# logging: {}
service:
extensions: [health_check, zpages]
telemetry:
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- zpages
- pprof
pipelines:
traces:
receivers: [jaeger, otlp]
processors: [signozspanmetrics/prometheus, batch]
exporters: [clickhousetraces]
metrics:
receivers: [otlp, hostmetrics]
receivers: [otlp]
processors: [batch]
exporters: [clickhousemetricswrite]
metrics/hostmetrics:
receivers: [hostmetrics]
processors: [resourcedetection, batch]
exporters: [clickhousemetricswrite]
metrics/spanmetrics:
receivers: [otlp/spanmetrics]
exporters: [prometheus]

View File

@@ -3,42 +3,71 @@ receivers:
protocols:
grpc:
http:
# Data sources: metrics
prometheus:
config:
scrape_configs:
# otel-collector internal metrics
- job_name: "otel-collector"
scrape_interval: 30s
scrape_interval: 60s
static_configs:
- targets: ["otel-collector:8889"]
- targets:
- otel-collector:8888
# otel-collector-metrics internal metrics
- job_name: "otel-collector-metrics"
scrape_interval: 60s
static_configs:
- targets:
- localhost:8888
# SigNoz span metrics
- job_name: "signozspanmetrics-collector"
scrape_interval: 60s
static_configs:
- targets:
- otel-collector:8889
processors:
batch:
send_batch_size: 1000
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
# memory_limiter:
# # Same as --mem-ballast-size-mib CLI argument
# ballast_size_mib: 683
# # 80% of maximum memory up to 2G
# limit_mib: 1500
# # 25% of limit up to 2G
# spike_limit_mib: 512
# check_interval: 5s
#
# # 50% of the maximum memory
# limit_percentage: 50
# # 20% of max memory usage spike expected
# spike_limit_percentage: 20
# queued_retry:
# num_workers: 4
# queue_size: 100
# retry_on_failure: true
extensions:
health_check: {}
zpages: {}
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
service:
extensions: [health_check, zpages]
telemetry:
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- zpages
- pprof
pipelines:
metrics:
receivers: [otlp, prometheus]
receivers: [prometheus]
processors: [batch]
exporters: [clickhousemetricswrite]