Calico Monitoring
bash-5.0# cat calico.yml
groups:
  # Alerting rules for the Calico components (calico-node/Felix and Typha).
  # Every alert carries a `type` label naming the component it belongs to and
  # interpolates the `cluster` label into its annotations.
  - name: calico.rules
    rules:
      # ----------------------------- Calico Node -----------------------------
      # Felix Prometheus statistics:
      # https://projectcalico.docs.tigera.io/reference/felix/prometheus

      # Share (in percent) of metric-handler HTTP responses with a 4xx/5xx
      # status code, per (instance, job, cluster, host). `offset 5m` shifts
      # every range so that it ends five minutes before evaluation time.
      - alert: PromHTTPRequestErrors
        expr: |-
          (
            sum(
              rate(promhttp_metric_handler_requests_total{job="calico-node",code=~"(4|5).."}[5m] offset 5m)
            ) by (instance, job, cluster, host)
            /
            sum(
              rate(promhttp_metric_handler_requests_total{job="calico-node"}[5m] offset 5m)
            ) by (instance, job, cluster, host)
          ) * 100 > 1
        for: 10m
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Cluster {{ $labels.cluster }}: HTTP requests errors on host {{ $labels.host }}.'
          summary: 'Calico HTTP requests errors on cluster {{ $labels.cluster }}'

      # The four Felix alerts below share one shape: more than 5 errors over a
      # one-hour window (ending 5 minutes ago), and the condition must keep
      # holding for a further hour (`for: 1h`) before the alert fires.

      # NOTE(review): the alert name is spelled "Datapane" (probably meant
      # "Dataplane"); it is kept as-is because routing rules, silences or
      # dashboards may match on the existing name.
      - alert: CalicoDatapaneFailuresHigh
        expr: increase(felix_int_dataplane_failures[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} dataplane failures within the last hour'
          summary: 'A high number of dataplane failures within Felix are happening'

      - alert: CalicoIpsetErrorsHigh
        expr: increase(felix_ipset_errors[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} ipset errors within the last hour'
          summary: 'A high number of ipset errors within Felix are happening'

      - alert: CalicoIptableSaveErrorsHigh
        expr: increase(felix_iptables_save_errors[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} iptable save errors within the last hour'
          summary: 'A high number of iptable save errors within Felix are happening'

      - alert: CalicoIptableRestoreErrorsHigh
        expr: increase(felix_iptables_restore_errors[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} iptable restore errors within the last hour'
          summary: 'A high number of iptable restore errors within Felix are happening'

      # ----------------------- Calico Kube Controllers -----------------------
      # kube-controllers Prometheus statistics:
      # https://projectcalico.docs.tigera.io/reference/kube-controllers/prometheus
      # Inspect the raw metrics with: curl http://<pod_ip>:9094/metrics
      # (No alerts are defined for kube-controllers in this group.)

      # ---------------------------- Calico Typha -----------------------------
      # Typha Prometheus statistics:
      # https://projectcalico.docs.tigera.io/reference/typha/prometheus

      # Mean ping latency over one minute (sum / count). The threshold 0.1 is
      # described as 100ms in the annotation, i.e. the metric is presumably in
      # seconds - confirm against the Typha metric reference. The `and` clause
      # keeps only series that actually recorded pings in the window.
      - alert: TyphaPingLatency
        expr: |-
          rate(typha_ping_latency_sum[1m] offset 5m) / rate(typha_ping_latency_count[1m] offset 5m) > 0.1
          and
          rate(typha_ping_latency_count[1m] offset 5m) > 0
        for: 2m
        labels:
          severity: warning
          type: calico-typha
        annotations:
          summary: 'Typha Round-trip ping latency to client (cluster {{ $labels.cluster }})'
          description: 'Typha latency is growing (ping operations > 100ms)<br> VALUE = {{ $value }}<br> LABELS = {{ $labels }}'

      # Same construction as TyphaPingLatency, applied to client write latency.
      # The summary names the cluster because that is the label it interpolates.
      - alert: TyphaClientWriteLatency
        expr: |-
          rate(typha_client_write_latency_secs_sum[1m] offset 5m) / rate(typha_client_write_latency_secs_count[1m] offset 5m) > 0.1
          and
          rate(typha_client_write_latency_secs_count[1m] offset 5m) > 0
        for: 2m
        labels:
          severity: warning
          type: calico-typha
        annotations:
          summary: 'Typha unusual write latency (cluster {{ $labels.cluster }})'
          description: 'Typha client latency is growing (write operations > 100ms)<br> VALUE = {{ $value }}<br> LABELS = {{ $labels }}'
bash-5.0#
Last updated
Was this helpful?
