feat: prometheus rules

This commit is contained in:
auricom
2021-08-02 01:25:16 +02:00
parent 57b78a9c39
commit f009843300
6 changed files with 217 additions and 0 deletions

View File

@@ -1,4 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- prometheus-rule.yaml

View File

@@ -0,0 +1,109 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      rules:
        - alert: LokiRequestErrors
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors."
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical
        - alert: LokiRequestPanics
          annotations:
            message: "{{ $labels.job }} is experiencing an increase of {{ printf \"%.0f\" $value }} panics."
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical
        - alert: LokiRequestLatency
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical
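        # Recording rules: pre-computed request-duration quantiles, averages, and
        # rates, aggregated by (job), (job, route), and (namespace, job, route).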
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate

View File

@@ -1,4 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
# - prometheus-rule.yaml

View File

@@ -0,0 +1,39 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: thanos.rules
  namespace: monitoring
spec:
  groups:
    - name: thanos.rules
      rules:
        - alert: ThanosCompactionHalted
          expr: |
            thanos_compactor_halted == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compaction halted on {{ $labels.instance }}"
            description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: ThanosCompactBucketOperationFailure
          expr: |
            rate(thanos_objstore_bucket_operation_failures_total[1m])
              > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
            description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: ThanosCompactNotRun
          expr: |
            (time() - thanos_objstore_bucket_last_successful_upload_time)
              > 24*60*60
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact not run on {{ $labels.instance }}"
            description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -1,3 +1,4 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
@@ -7,4 +8,5 @@ resources:
- cert-manager-webhook-ovh-helm-release.yaml
- letsencrypt-production.yaml
- letsencrypt-staging.yaml
- prometheus-rule.yaml
- secret.enc.yaml

View File

@@ -0,0 +1,63 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cert-manager.rules
  namespace: cert-manager
spec:
  groups:
    - name: cert-manager
      rules:
        - alert: CertManagerAbsent
          expr: |
            absent(up{job="cert-manager"})
          for: 10m
          labels:
            severity: critical
          annotations:
            description: "New certificates will not be able to be minted, and existing
              ones can't be renewed until cert-manager is back."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
            summary: "Cert Manager has disappeared from Prometheus service discovery."
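    # Certificate lifecycle alerts: upcoming expiry, not-ready certificates, and
    # ACME (LetsEncrypt) rate-limit hits.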
    - name: certificates
      rules:
        - alert: CertManagerCertExpirySoon
          expr: |
            avg by (exported_namespace, namespace, name) (
              certmanager_certificate_expiration_timestamp_seconds - time())
              < (21 * 24 * 3600)
          for: 1h
          labels:
            severity: warning
          annotations:
            description: "The domain that this cert covers will be unavailable after
              {{ $value | humanizeDuration }}. Clients using endpoints that this cert
              protects will start to fail in {{ $value | humanizeDuration }}."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon
            summary: "The cert {{ $labels.name }} is {{ $value | humanizeDuration }}
              from expiry; it should have renewed over a week ago."
        - alert: CertManagerCertNotReady
          expr: |
            max by (name, exported_namespace, namespace, condition) (
              certmanager_certificate_ready_status{condition!="True"} == 1)
          for: 10m
          labels:
            severity: critical
          annotations:
            description: "This certificate has not been ready to serve traffic for at least
              10m. If the cert is being renewed or there is another valid cert, the ingress
              controller _may_ be able to serve that instead."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready
            summary: "The cert {{ $labels.name }} is not ready to serve traffic."
        - alert: CertManagerHittingRateLimits
          expr: |
            sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m]))
              > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            description: "Depending on the rate limit, cert-manager may be unable to generate
              certificates for up to a week."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits
            summary: "Cert manager hitting LetsEncrypt rate limits."