Files
auricom-home-cluster/cluster/apps/monitoring/thanos/prometheus-rule.yaml
2021-08-02 01:34:57 +02:00

40 lines
1.4 KiB
YAML

---
# PrometheusRule defining alerts for the Thanos compactor / object-store
# metrics. All three alerts are labelled `severity: critical` and fire as soon
# as their expression is true (`for: 0m`).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: thanos.rules
  namespace: monitoring
spec:
  groups:
    - name: thanos.rules
      rules:
        # Fires while the `thanos_compactor_halted` gauge equals 1.
        - alert: ThanosCompactionHalted
          expr: |
            thanos_compactor_halted == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compaction halted on {{ $labels.instance }}"
            description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Fires when the object-store failure counter increased within the
        # rate window.
        # The window is 5m rather than 1m: rate() needs at least two samples
        # inside the range, and a 1m window does not reliably contain two
        # scrapes at typical 30s-60s scrape intervals.
        # NOTE(review): this metric is presumably exported by every Thanos
        # component that talks to the bucket, not only the compactor; consider
        # adding a job selector once the job label values are confirmed.
        - alert: ThanosCompactBucketOperationFailure
          expr: |
            rate(thanos_objstore_bucket_operation_failures_total[5m])
              > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
            description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Fires when the most recent successful upload is older than 24 hours
        # (24*60*60 seconds).
        # max_over_time() takes the newest timestamp seen during the last 24h
        # instead of the instantaneous value, so a gauge that restarts from 0
        # does not trigger the alert while an earlier upload is still inside
        # the window. This only helps when the series keeps the same labels
        # across the restart.
        # NOTE(review): assumes the gauge is exported as 0 until the first
        # upload after a restart - confirm against the running Thanos version.
        - alert: ThanosCompactNotRun
          expr: |
            (
              time()
              - max_over_time(thanos_objstore_bucket_last_successful_upload_time[24h])
            ) > 24*60*60
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact not run on {{ $labels.instance }}"
            description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"