Kubernetes Monitoring
bash-5.0# cat k8s.yml
# Prometheus alerting rules for the Kubernetes control plane and kube-proxy.
#
# Every expression below uses `offset 5m`, i.e. it evaluates samples from five
# minutes before the evaluation time; the `for:` duration is applied on top.
# NOTE(review): presumably intentional (e.g. to tolerate late-arriving
# samples) - confirm, because it also delays detection of "down" targets.
#
# Annotations read the `cluster` and `host` labels.
# NOTE(review): assumes every matched series carries both labels - verify
# against the scrape/relabel configuration.
groups:
  # NOTE(review): the group is named after the API server but also contains
  # the scheduler, controller and kube-proxy rules.
  - name: kube-apiserver
    rules:
      # ----------------------------- API server -----------------------------
      # Fires when the kube-apiserver target reported up == 0 for 1 minute.
      - alert: KubeAPIServerDown
        expr: 'up{job="kube-apiserver"} offset 5m == 0'
        for: 1m
        labels:
          severity: critical
          type: k8s
        annotations:
          # Short summary / detailed description, matching the other rules.
          summary: "Kubernetes API server down on cluster {{ $labels.cluster }}"
          description: >-
            Cluster {{ $labels.cluster }}: Kube api server down
            on host {{ $labels.host }}

      # Share of API server requests with a 4xx/5xx code, per (cluster, host),
      # above 3 percent for 2 minutes.
      # NOTE(review): 4xx responses are counted as errors here - confirm that
      # this is intended.
      - alert: KubernetesApiServerErrors
        expr: >-
          (
          sum(rate(apiserver_request_total{code=~"(4|5).."}[1m] offset 5m)) by (cluster, host)
          /
          sum(rate(apiserver_request_total[1m] offset 5m)) by (cluster, host)
          ) * 100 > 3
        for: 2m
        labels:
          severity: critical
          type: k8s
        annotations:
          summary: "Kubernetes API server errors on cluster {{ $labels.cluster }}"
          description: >-
            Cluster {{ $labels.cluster }}: Kubernetes API server is experiencing
            high error rate on host {{ $labels.host }}

      # Share of REST client requests with a 4xx/5xx code, per (cluster, host),
      # above 1 percent for 2 minutes.
      - alert: KubernetesApiClientErrors
        expr: >-
          (
          sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m] offset 5m)) by (cluster, host)
          /
          sum(rate(rest_client_requests_total[1m] offset 5m)) by (cluster, host)
          ) * 100 > 1
        for: 2m
        labels:
          severity: critical
          type: k8s
        annotations:
          summary: "Kubernetes API client errors on cluster {{ $labels.cluster }}"
          description: >-
            Cluster {{ $labels.cluster }}: Kubernetes API client is experiencing
            high error rate on host {{ $labels.host }}

      # ------------------------------ Scheduler ------------------------------
      # Fires when the kube-scheduler target reported up == 0 for 1 minute.
      - alert: KubeSchedulerDown
        expr: 'up{job="kube-scheduler"} offset 5m == 0'
        for: 1m
        labels:
          severity: critical
          type: k8s
        annotations:
          summary: "Kubernetes Scheduler down on cluster {{ $labels.cluster }}"
          description: >-
            Cluster {{ $labels.cluster }}: Kube scheduler down
            on host {{ $labels.host }}

      # ----------------------------- Controller ------------------------------
      # Fires when the kube-controller target reported up == 0 for 1 minute.
      - alert: KubeControllerManagerDown
        expr: 'up{job="kube-controller"} offset 5m == 0'
        for: 1m
        labels:
          severity: critical
          type: k8s
        annotations:
          summary: "Kubernetes Controller down on cluster {{ $labels.cluster }}"
          description: >-
            Cluster {{ $labels.cluster }}: Kube controller down
            on host {{ $labels.host }}

      # ------------------------------ KubeProxy ------------------------------
      # Fires when the kube-proxy target reported up == 0 for 1 minute.
      - alert: KubeProxyDown
        expr: 'up{job="kube-proxy"} offset 5m == 0'
        for: 1m
        labels:
          severity: critical
          type: k8s
        annotations:
          summary: "KubeProxy down on cluster {{ $labels.cluster }}"
          description: >-
            Cluster {{ $labels.cluster }}: kube proxy down
            on host {{ $labels.host }}

      # 0.99 quantile of the proxy-rules sync duration histogram, per
      # (cluster, host), above 60 for 2 minutes.
      # NOTE(review): the summary text says "on average" while the expression
      # is a 0.99 quantile - consider rewording the annotation.
      - alert: KubeProxyRuleSyncLatency
        expr: >-
          histogram_quantile(0.99,
          sum by (le, cluster, host) (rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket[5m] offset 5m))
          ) > 60
        for: 2m
        labels:
          severity: warning
          type: k8s
        annotations:
          summary: >-
            Cluster {{ $labels.cluster }} is taking too long, on average,
            to apply kubernetes service rules to iptables.
          description: >-
            Cluster {{ $labels.cluster }}: network rules synchronization slowing down,
            VALUE = {{ $value }} on host {{ $labels.host }}
bash-5.0#
Last updated
Was this helpful?
