# Mirror of https://github.com/auricom/home-cluster.git
# (synced 2025-09-17 18:24:14 +02:00; 131 lines, 4.1 KiB, YAML)
---
# ConfigMap carrying log-based alerting rules under the single data key
# `loki-alerting-rules.yaml`.
# NOTE(review): presumably mounted into the Loki ruler -- confirm against the
# Loki deployment that consumes this ConfigMap.
#
# Every rule in the embedded file follows the same shape:
#   * `expr` counts the log lines that match a case-insensitive pattern over
#     a 2-minute range (`[2m]`) and is true while that count is above zero;
#   * `for: 2m` requires the expression to stay true for 2 minutes;
#   * each alert is labelled `severity: critical` and `category: logs`.
#
# Rules that aggregate with `sum by (<label>)` reuse that label in their
# summary (`{{ $labels.<label> }}`) where one alert per host/app is wanted.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-alerting-rules
  namespace: monitoring
data:
  # `|-` keeps the embedded rules file verbatim and strips its trailing
  # newline. Everything indented below this key -- including the `#` banner
  # lines -- is literal text of the value, not comments of this manifest.
  loki-alerting-rules.yaml: |-
    groups:
      #
      # SMART Failures
      #
      - name: smart-failure
        rules:
          - alert: SmartFailures
            expr: |
              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "SMART has reported failures on host {{ $labels.hostname }}"
      #
      # zigbee2mqtt
      #
      - name: zigbee2mqtt
        rules:
          - alert: ZigbeeUnableToReachMQTT
            expr: |
              sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Zigbee2mqtt is unable to reach MQTT"
      #
      # zwavejs2mqtt
      #
      - name: zwavejs2mqtt
        rules:
          - alert: ZwaveUnableToReachMQTT
            expr: |
              sum(count_over_time({app="zwavejs2mqtt"} |~ "(?i)error while connecting mqtt"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Zwavejs2mqtt is unable to reach MQTT"
      #
      # frigate
      #
      - name: frigate
        rules:
          - alert: FrigateUnableToReachMQTT
            expr: |
              sum(count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Frigate is unable to reach MQTT"
      #
      # *arr
      #
      - name: arr
        rules:
          - alert: ArrDatabaseIsLocked
            expr: |
              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "{{ $labels.app }} is experiencing locked database issues"
          - alert: ArrDatabaseIsMalformed
            expr: |
              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "{{ $labels.app }} is experiencing malformed database disk image issues"
      #
      # home-assistant
      #
      - name: home-assistant
        rules:
          - alert: HomeAssistantUnableToReachPostgresql
            expr: |
              sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Home Assistant is unable to connect to postgresql"
      #
      # valetudo
      #
      - name: valetudo
        rules:
          - alert: ValetudoUnableToReachMQTT
            expr: |
              sum by (hostname) (count_over_time({hostname="valetudo"} |~ "(?i).*error.*mqtt.*"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Valetudo is unable to connect to mqtt"
      #
      # node-red
      #
      - name: node-red
        rules:
          - alert: NodeRedUnableToReachHomeAssistant
            expr: |
              sum by (app) (count_over_time({app="node-red"} |~ "(?i)home assistant.*connecting to undefined"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Node-Red is unable to connect to Home Assistant"