Node OS Monitoring
bash-5.0# cat kube-state-metrics.yml
groups:
  - name: kube-state-metrics-nodes
    rules:
      #----------------------------------- k8s node -------------------------------------
      - alert: KubeNodeNotReady
        expr: |
          kube_node_status_condition{job="ksm",condition="Ready",status="true"} == 0
        for: 1m
        labels:
          severity: warning
          type: ksm
        annotations:
          summary: Kubernetes Node not ready for cluster {{ $labels.cluster }}
          description: "Cluster {{ $labels.cluster }}: Node {{ $labels.node }} not ready"
      - alert: KubernetesMemoryPressure
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
          type: ksm
        annotations:
          summary: Kubernetes memory pressure on a node of cluster {{ $labels.cluster }}
          description: "Cluster {{ $labels.cluster }}: Node {{ $labels.node }} has MemoryPressure condition"
      - alert: KubernetesDiskPressure
        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
          type: ksm
        annotations:
          summary: Kubernetes disk pressure on a node of cluster {{ $labels.cluster }}
          description: "Cluster {{ $labels.cluster }}: Node {{ $labels.node }} has DiskPressure condition"
  #----------------------------------- Pods -------------------------------------
  - name: kube-state-metrics-pods
    rules:
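      # Note: `offset 5m` in the expressions below evaluates the series as of five
      # minutes ago; presumably this keeps short-lived failures visible when the rule
      # is evaluated. Drop the offset to alert on live values instead.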
      - alert: KubernetesJobFailed
        expr: kube_job_status_failed offset 5m > 0
        for: 0m
        labels:
          severity: warning
          type: ksm
        annotations:
          summary: Kubernetes Job failed for cluster {{ $labels.cluster }}
          description: "Cluster {{ $labels.cluster }}: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete, reason: {{ $labels.reason }}"
      - alert: KubernetesContainerTerminated
        expr: increase(kube_pod_container_status_last_terminated_reason[10m] offset 5m) > 0
        for: 1m
        labels:
          severity: warning
          type: ksm
        annotations:
          summary: Kubernetes container terminated on cluster {{ $labels.cluster }}
          description: "Cluster {{ $labels.cluster }}: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been terminated, reason: {{ $labels.reason }}"
      - alert: KubePodCrashLooping
        expr: |
          increase(kube_pod_container_status_restarts_total{job="ksm"}[10m] offset 5m) > 0
        for: 1m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: Pod crash looping on cluster {{ $labels.cluster }}
          message: "Cluster {{ $labels.cluster }}: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been restarting within the last 10 minutes."
      - alert: KubePodNotReady
        expr: |
          sum by (cluster, namespace, pod) (kube_pod_status_phase{job="ksm", phase=~"Failed|Pending|Unknown"} offset 5m) > 0
        for: 1m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: Kubernetes pod not ready on cluster {{ $labels.cluster }}
          message: "Cluster {{ $labels.cluster }}: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state (phase Failed, Pending, or Unknown)."
      #----------------------------------- Deployment -------------------------------------
      - alert: KubeDeploymentReplicasMismatch
        expr: kube_deployment_spec_replicas{job="ksm"} offset 5m != kube_deployment_status_replicas_available{job="ksm"} offset 5m
        for: 1m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: Deployment has not converged on cluster {{ $labels.cluster }}
          message: "Cluster {{ $labels.cluster }}: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas."
      #----------------------------------- Statefulset -------------------------------------
      - alert: KubeStatefulSetReplicasMismatch
        expr: kube_statefulset_status_replicas_ready{job="ksm"} offset 5m != kube_statefulset_status_replicas{job="ksm"} offset 5m
        for: 1m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: StatefulSet not ready on cluster {{ $labels.cluster }}
          message: "Cluster {{ $labels.cluster }}: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} does not have the expected number of ready replicas."
      #----------------------------------- Daemonset -------------------------------------
      - alert: KubeDaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready{job="ksm"} offset 5m / kube_daemonset_status_desired_number_scheduled{job="ksm"} offset 5m * 100 < 100
        for: 1m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: Kubernetes DaemonSet rollout stuck on cluster {{ $labels.cluster }}
          message: "Cluster {{ $labels.cluster }}: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready"
  #----------------------------------- Storage -------------------------------------
  - name: kube-state-metrics-storage
    rules:
      - alert: KubernetesPersistentvolumeclaimPending
        expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} offset 5m == 1
        for: 2m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: Kubernetes PersistentVolumeClaim pending on cluster {{ $labels.cluster }}
          description: "Cluster {{ $labels.cluster }}: PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending"
      - alert: KubernetesPersistentvolumeError
        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="ksm"} > 0
        for: 0m
        labels:
          severity: high
          type: ksm
        annotations:
          summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
          description: "PersistentVolume {{ $labels.persistentvolume }} is in a bad state<br> VALUE = {{ $value }}<br> LABELS = {{ $labels }}"
bash-5.0#
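Before reloading Prometheus, the file can be syntax-checked with promtool, which ships with the Prometheus distribution. It parses the YAML and every PromQL expression, and reports the rule count per group or the first error it finds:

bash-5.0# promtool check rules kube-state-metrics.yml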
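To activate the rules, reference the file from the Prometheus server configuration and make sure kube-state-metrics is scraped under the job name ksm, since most expressions above filter on job="ksm". A minimal sketch; the file path and target address below are assumptions for this setup, not fixed values:

# prometheus.yml (excerpt); file path and target address are assumptions
rule_files:
  - "kube-state-metrics.yml"

scrape_configs:
  - job_name: "ksm"                           # must match job="ksm" in the rules above
    static_configs:
      - targets: ["kube-state-metrics:8080"]  # assumed in-cluster service address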

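The severity and type: ksm labels attached to every rule are there for routing. A minimal Alertmanager route sketch, with hypothetical receiver names:

# alertmanager.yml (excerpt); receiver names are hypothetical
route:
  receiver: default
  routes:
    - match:
        type: ksm
        severity: critical
      receiver: oncall-pager
    - match:
        type: ksm
      receiver: k8s-notifications

Routes are matched top to bottom, so critical kube-state-metrics alerts page the on-call receiver while everything else with type: ksm goes to the general channel.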