diff --git a/cluster/longhorn-system/monitoring.yaml b/cluster/longhorn-system/monitoring.yaml
new file mode 100644
index 000000000..6eaa9b2ed
--- /dev/null
+++ b/cluster/longhorn-system/monitoring.yaml
@@ -0,0 +1,142 @@
+---
+# Scrape Longhorn manager metrics: Services labelled app=longhorn-manager in
+# the longhorn-system namespace, via their port named "manager".
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: longhorn-prometheus-servicemonitor
+  namespace: longhorn-system
+  labels:
+    name: longhorn-prometheus-servicemonitor
+spec:
+  selector:
+    matchLabels:
+      app: longhorn-manager
+  namespaceSelector:
+    matchNames:
+    - longhorn-system
+  endpoints:
+  - port: manager
+---
+# Alerting rules over the Longhorn metrics (volumes, nodes, disks, CPU).
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    prometheus: longhorn
+    role: alert-rules
+  name: prometheus-longhorn-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: longhorn.rules
+    rules:
+    # Volume actual size above 90% of the volume capacity for 5m.
+    - alert: LonghornVolumeActualSpaceUsedWarning
+      annotations:
+        description: >-
+          The actual space used by Longhorn volume {{$labels.volume}} on
+          {{$labels.node}} is at {{$value}}% capacity for more than 5 minutes.
+        summary: The actual used space of Longhorn volume is over 90% of the capacity.
+      expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) * 100 > 90
+      for: 5m
+      labels:
+        issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
+        severity: warning
+    # Volume robustness value 3 (reported below as "Fault") held for 5m.
+    - alert: LonghornVolumeStatusCritical
+      annotations:
+        description: >-
+          Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
+          more than 5 minutes.
+        summary: Longhorn volume {{$labels.volume}} is Fault
+      expr: longhorn_volume_robustness == 3
+      for: 5m
+      labels:
+        issue: Longhorn volume {{$labels.volume}} is Fault.
+        severity: critical
+    # Volume robustness value 2 (reported below as "Degraded") held for 5m.
+    - alert: LonghornVolumeStatusWarning
+      annotations:
+        description: >-
+          Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
+          more than 5 minutes.
+        summary: Longhorn volume {{$labels.volume}} is Degraded
+      expr: longhorn_volume_robustness == 2
+      for: 5m
+      labels:
+        issue: Longhorn volume {{$labels.volume}} is Degraded.
+        severity: warning
+    # Node storage usage above 70% of the node storage capacity for 5m.
+    - alert: LonghornNodeStorageWarning
+      annotations:
+        description: >-
+          The used storage of node {{$labels.node}} is at {{$value}}% capacity
+          for more than 5 minutes.
+        summary: The used storage of node is over 70% of the capacity.
+      expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+      for: 5m
+      labels:
+        issue: The used storage of node {{$labels.node}} is high.
+        severity: warning
+    # Disk usage above 70% of the disk capacity for 5m.
+    - alert: LonghornDiskStorageWarning
+      annotations:
+        description: >-
+          The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at
+          {{$value}}% capacity for more than 5 minutes.
+        summary: The used storage of disk is over 70% of the capacity.
+      expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+      for: 5m
+      labels:
+        issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
+        severity: warning
+    # Fires when fewer nodes report condition "ready" than the reported total.
+    # The total is aggregated with max() so both operands are label-less and
+    # the subtraction can match; "> 0" keeps the alert silent while every node
+    # is ready.
+    # NOTE(review): newer Longhorn docs appear to name the total metric
+    # longhorn_node_count_total - confirm which name this cluster exports.
+    - alert: LonghornNodeDown
+      annotations:
+        description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+        summary: Longhorn nodes are offline
+      expr: >-
+        max(longhorn_node_total)
+        - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0))
+        > 0
+      for: 5m
+      labels:
+        # No {{$value}} here: a label that changes with the value changes the
+        # identity of the alert each time the count of offline nodes changes.
+        issue: One or more Longhorn nodes are offline.
+        severity: critical
+    # Instance manager CPU usage above 300% of its CPU request for 5m.
+    # NOTE(review): the alert name is misspelled ("Intance"); kept unchanged in
+    # case routes or silences match on it - confirm before renaming.
+    - alert: LonghornIntanceManagerCPUUsageWarning
+      annotations:
+        description: >-
+          Longhorn instance manager {{$labels.instance_manager}} on
+          {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
+          more than 5 minutes.
+        summary: >-
+          Longhorn instance manager {{$labels.instance_manager}} on
+          {{$labels.node}} has CPU Usage / CPU request is over 300%.
+      expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+      for: 5m
+      labels:
+        issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
+        severity: warning
+    # Node CPU usage above 90% of the node CPU capacity for 5m.
+    - alert: LonghornNodeCPUUsageWarning
+      annotations:
+        description: >-
+          Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is
+          {{$value}}% for more than 5 minutes.
+        summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
+      expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+      for: 5m
+      labels:
+        issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
+        severity: warning
diff --git a/cluster/longhorn-system/serviceMonitor.yaml b/cluster/longhorn-system/serviceMonitor.yaml
deleted file mode 100644
index 88cb3b696..000000000
--- a/cluster/longhorn-system/serviceMonitor.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: longhorn-prometheus-servicemonitor
-  namespace: longhorn-system
-  labels:
-    name: longhorn-prometheus-servicemonitor
-spec:
-  selector:
-    matchLabels:
-      app: longhorn-manager
-  namespaceSelector:
-    matchNames:
-    - longhorn-system
-  endpoints:
-  - port: manager