---
# Log-based alerting rules. Every `expr` is a LogQL metric query that counts
# matching log lines over a 2m range and alerts when the count is > 0 and the
# condition has held for the rule's `for:` duration (2m).
#
# Each query aggregates with `sum by (<label>)` so that the label referenced
# through `{{ $labels.<label> }}` in the annotations survives aggregation; a
# bare `sum(...)` would drop all labels and render those templates empty.
groups:
  # Disk health: error/fail lines from the smartmontools systemd unit.
  - name: smart
    rules:
      - alert: SMARTFailure
        # Parses each line as JSON, keeps entries from smartmontools.service,
        # drops the two known-benign messages, then matches error/fail
        # case-insensitively. Grouped per hostname.
        expr: |
          sum by (hostname) (
            count_over_time(
              {hostname=~".+"}
                | json
                | _SYSTEMD_UNIT = "smartmontools.service"
                !~ "(?i)previous self-test completed without error"
                !~ "(?i)Prefailure"
                |~ "(?i)(error|fail)" [2m]
            )
          ) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          hostname: "{{ $labels.hostname }}"
          summary: "{{ $labels.hostname }} has reported SMART failures"

  # zigbee2mqtt logging that it is not connected to the MQTT server.
  - name: zigbee2mqtt
    rules:
      - alert: ZigbeeMQTTUnreachable
        expr: |
          sum by (app) (
            count_over_time(
              {app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server" [2m]
            )
          ) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"

  # zwave-js-ui logging an error while connecting to MQTT.
  - name: zwave-js-ui
    rules:
      - alert: ZwaveMQTTUnreachable
        expr: |
          sum by (app) (
            count_over_time(
              {app="zwave-js-ui"} |~ "(?i)error while connecting mqtt" [2m]
            )
          ) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"

  # frigate logging that it is unable to connect to the MQTT server.
  - name: frigate
    rules:
      - alert: FrigateMQTTUnreachable
        expr: |
          sum by (app) (
            count_over_time(
              {app="frigate"} |~ "(?i)unable to connect to mqtt server" [2m]
            )
          ) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"

  # home-assistant logging a database connectivity error.
  - name: home-assistant
    rules:
      - alert: HomeAssistantPostgresUnreachable
        expr: |
          sum by (app) (
            count_over_time(
              {app="home-assistant"} |~ "(?i)error in database connectivity" [2m]
            )
          ) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to connect to postgres"

  # bazarr logging that a scheduled "Update ... from ..." job raised an
  # exception. Warning severity, unlike the connectivity alerts above.
  - name: bazarr
    rules:
      - alert: BazarrJobRaisedException
        expr: |
          sum by (app) (
            count_over_time(
              {app="bazarr"} |~ "(?i)Job(.+)Update(.+)from(.+)raised an exception" [2m]
            )
          ) > 0
        for: 2m
        labels:
          severity: warning
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is raising job exceptions"