Node OS Monitoring
bash-5.0# cat node-exporter.yml
# Prometheus alerting rules for host-level metrics (all selectors use job="ne").
# Every rule attaches `type: node` plus a `severity` label, and the annotations
# render the `host` and `cluster` labels of the alerting series, so aggregations
# must keep both labels.
# NOTE(review): every range selector uses `offset 5m`, i.e. the rules evaluate
# data from five minutes ago -- presumably to tolerate late samples; confirm.
groups:
  - name: node-exporter
    rules:
      # ----------------------------------- CPU -----------------------------------
      - alert: HostHighCpuLoad
        # 100 minus the average idle percentage over all CPUs of a host.
        expr: 100 - (avg by (host, cluster) (rate(node_cpu_seconds_total{mode="idle",job="ne"}[2m] offset 5m)) * 100) > 80
        # No pending period: fires on the first evaluation above the threshold.
        for: 0m
        labels:
          severity: warning
          type: node
        annotations:
          summary: "High CPU load on host {{ $labels.host }}"
          description: "Cluster {{ $labels.cluster }}: CPU load is greater than 80% on host {{ $labels.host }}"

      # ---------------------------------- Memory ----------------------------------
      - alert: HostOutOfMemory
        # Percentage of memory still available; alert below 10%.
        expr: node_memory_MemAvailable_bytes{job="ne"} / node_memory_MemTotal_bytes{job="ne"} * 100 < 10
        for: 1m
        labels:
          severity: high
          type: node
        annotations:
          summary: "Host {{ $labels.host }} out of memory"
          description: "Cluster {{ $labels.cluster }}: Node memory is filling up (less than 10% left) on host {{ $labels.host }}"

      - alert: HostMemoryUnderMemoryPressure
        # Per-second rate of major page faults over a 10m window.
        expr: rate(node_vmstat_pgmajfault{job="ne"}[10m] offset 5m) > 100
        for: 2m
        labels:
          severity: high
          type: node
        annotations:
          summary: "Host memory for {{ $labels.host }} under memory pressure"
          description: "Cluster {{ $labels.cluster }}: The host {{ $labels.host }} is under heavy memory pressure. High rate of major page faults VALUE = {{ $value }}"

      # ------------------------------------ FS ------------------------------------
      - alert: NodeFilesystemAlmostOutOfSpace
        # Only the root mountpoint ("/") is checked.
        expr: node_filesystem_avail_bytes{mountpoint="/",job="ne"} / node_filesystem_size_bytes{mountpoint="/",job="ne"} * 100 < 5
        for: 2m
        labels:
          severity: high
          type: node
        annotations:
          description: 'Node {{ $labels.host }} has only {{ printf "%.2f" $value }}% available space left on {{ $labels.device }}.'
          summary: "Cluster {{ $labels.cluster }}: Filesystem on Node {{ $labels.host }} has less than 5% space left."

      - alert: NodeFilesystemAlmostOutOfFiles
        # Percentage of free inodes on the root mountpoint ("/").
        expr: node_filesystem_files_free{mountpoint="/",job="ne"} / node_filesystem_files{mountpoint="/",job="ne"} * 100 < 5
        for: 2m
        labels:
          severity: high
          type: node
        annotations:
          # The value is an inode ratio, so the text says "inodes", not "space".
          description: 'Node {{ $labels.host }} has only {{ printf "%.2f" $value }}% available inodes left on {{ $labels.device }}.'
          summary: "Cluster {{ $labels.cluster }}: Filesystem on Node {{ $labels.host }} has less than 5% inodes left."

      # --------------------------------- Network ----------------------------------
      - alert: HostUnusualNetworkThroughputIn
        # Bytes/s summed over all interfaces of a host, divided by 1024 twice.
        # `cluster` is kept in the grouping because the description renders it.
        expr: sum by (host, cluster) (rate(node_network_receive_bytes_total{job="ne"}[2m] offset 5m)) / 1024 / 1024 > 50
        for: 1m
        labels:
          severity: warning
          type: node
        annotations:
          summary: "Unusual network throughput In on host {{ $labels.host }}"
          # NOTE(review): text aligned with the `> 50` threshold in expr; raise
          # both together if 100 MB/s was the intended limit.
          description: "Cluster {{ $labels.cluster }}: Host network interfaces are probably receiving too much data (greater than 50 MB/s)<br> VALUE = {{ $value }}"

      - alert: HostUnusualNetworkThroughputOut
        # Bytes/s summed over all interfaces of a host, divided by 1024 twice.
        # `cluster` is kept in the grouping because the description renders it.
        expr: sum by (host, cluster) (rate(node_network_transmit_bytes_total{job="ne"}[2m] offset 5m)) / 1024 / 1024 > 50
        for: 1m
        labels:
          severity: warning
          type: node
        annotations:
          summary: "Unusual network throughput Out on host {{ $labels.host }}"
          # NOTE(review): text aligned with the `> 50` threshold in expr; raise
          # both together if 100 MB/s was the intended limit.
          description: "Cluster {{ $labels.cluster }}: Host network interfaces are probably sending too much data (greater than 50 MB/s)<br> VALUE = {{ $value }}"

      - alert: NodeNetworkReceiveErrs
        # Not aggregated, so each matching series is evaluated on its own.
        # NOTE(review): the 2m window is shifted by `offset 5m`, so "the last two
        # minutes" in the description is approximate.
        expr: increase(node_network_receive_errs_total{job="ne"}[2m] offset 5m) > 10
        for: 1m
        labels:
          severity: high
          type: node
        annotations:
          description: >-
            Cluster {{ $labels.cluster }}: {{ $labels.host }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} receive errors in the last two minutes.
          summary: "Network interface is reporting many receive errors on {{ $labels.host }}"

      - alert: NodeNetworkTransmitErrs
        # Not aggregated, so each matching series is evaluated on its own.
        # NOTE(review): the 2m window is shifted by `offset 5m`, so "the last two
        # minutes" in the description is approximate.
        expr: increase(node_network_transmit_errs_total{job="ne"}[2m] offset 5m) > 10
        for: 1m
        labels:
          severity: high
          type: node
        annotations:
          description: >-
            Cluster {{ $labels.cluster }}: {{ $labels.host }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} transmit errors in the last two minutes.
          summary: "Network interface is reporting many transmit errors on {{ $labels.host }}"

      # ----------------------------------- Disk -----------------------------------
      - alert: HostUnusualDiskWriteRate
        # Bytes/s written, summed over all disks of a host, divided by 1024 twice.
        # `cluster` is kept in the grouping because the description renders it.
        expr: sum by (host, cluster) (rate(node_disk_written_bytes_total{job="ne"}[2m] offset 5m)) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
          type: node
        annotations:
          summary: "Unusual disk write rate on host {{ $labels.host }}"
          description: "Cluster {{ $labels.cluster }}: Disk is probably writing too much data (greater than 50 MB/s)<br> VALUE = {{ $value }}"

      - alert: HostUnusualDiskReadRate
        # Bytes/s read, summed over all disks of a host, divided by 1024 twice.
        # `cluster` is kept in the grouping because the description renders it.
        expr: sum by (host, cluster) (rate(node_disk_read_bytes_total{job="ne"}[2m] offset 5m)) / 1024 / 1024 > 50
        for: 1m
        labels:
          severity: warning
          type: node
        annotations:
          summary: "Unusual disk read rate on host {{ $labels.host }}"
          description: "Cluster {{ $labels.cluster }}: Disk is probably reading too much data (greater than 50 MB/s)<br> VALUE = {{ $value }}<br> LABELS = {{ $labels }}"
bash-5.0#
Last updated
Was this helpful?
