Mirror of https://github.com/auricom/home-cluster.git (synced 2025-09-27 12:33:58 +02:00)
feat: prometheus rules

cluster/apps/monitoring/loki-stack/kustomization.yaml
@@ -1,4 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - prometheus-rule.yaml

cluster/apps/monitoring/loki-stack/prometheus-rule.yaml (new file, 109 lines)
@@ -0,0 +1,109 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      rules:
        - alert: LokiRequestErrors
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical
        - alert: LokiRequestPanics
          annotations:
            message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical
        - alert: LokiRequestLatency
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
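
The alert expressions in this file can be exercised offline with promtool's rule unit tests. A minimal sketch for the LokiRequestPanics alert follows; the rule file name and the input series values are illustrative only, and loki.rules.yaml is assumed to contain the spec.groups block above in plain Prometheus rule-file form:

# Hypothetical unit test, run with: promtool test rules loki-rules-test.yaml
rule_files:
  - loki.rules.yaml          # assumed extract of the `groups:` section above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # One panic occurs at t=3m; the counter stays at 1 afterwards.
      - series: 'loki_panic_total{namespace="monitoring", job="loki"}'
        values: '0 0 0 1 1 1 1 1 1 1 1'
    alert_rule_test:
      # increase(loki_panic_total[10m]) > 0 at t=10m, and the alert has no
      # `for:` clause, so it should be firing immediately.
      - eval_time: 10m
        alertname: LokiRequestPanics
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: monitoring
              job: loki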

cluster/apps/monitoring/thanos/kustomization.yaml
@@ -1,4 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  # - prometheus-rule.yaml
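
Note that the Thanos rule file added below is committed but left commented out in the kustomization above, so it will not be built or applied until the entry is re-enabled. A sketch of the same kustomization with the rule enabled (not part of this commit):

---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - prometheus-rule.yaml   # uncommented to actually deploy the rules below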

cluster/apps/monitoring/thanos/prometheus-rule.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: thanos.rules
  namespace: monitoring
spec:
  groups:
    - name: thanos.rules
      rules:
        - alert: ThanosCompactionHalted
          expr: |
            thanos_compactor_halted == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compaction halted on {{ $labels.instance }}"
            description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: ThanosCompactBucketOperationFailure
          expr: |
            rate(thanos_objstore_bucket_operation_failures_total[1m])
              > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
            description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: ThanosCompactNotRun
          expr: |
            (time() - thanos_objstore_bucket_last_successful_upload_time)
              > 24*60*60
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact not run on {{ $labels.instance }}"
            description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

cluster/core/cert-manager/kustomization.yaml
@@ -1,3 +1,4 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
@@ -7,4 +8,5 @@ resources:
  - cert-manager-webhook-ovh-helm-release.yaml
  - letsencrypt-production.yaml
  - letsencrypt-staging.yaml
  - prometheus-rule.yaml
  - secret.enc.yaml

cluster/core/cert-manager/prometheus-rule.yaml (new file, 63 lines)
@@ -0,0 +1,63 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cert-manager.rules
  namespace: cert-manager
spec:
  groups:
    - name: cert-manager
      rules:
        - alert: CertManagerAbsent
          expr: |
            absent(up{job="cert-manager"})
          for: 10m
          labels:
            severity: critical
          annotations:
            description: "New certificates will not be able to be minted, and existing
              ones can't be renewed until cert-manager is back."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
            summary: "Cert Manager has dissapeared from Prometheus service discovery."
    - name: certificates
      rules:
        - alert: CertManagerCertExpirySoon
          expr: |
            avg by (exported_namespace, namespace, name) (
              certmanager_certificate_expiration_timestamp_seconds - time())
            < (21 * 24 * 3600)
          for: 1h
          labels:
            severity: warning
          annotations:
            description: "The domain that this cert covers will be unavailable after
              {{ $value | humanizeDuration }}. Clients using endpoints that this cert
              protects will start to fail in {{ $value | humanizeDuration }}."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon
            summary: "The cert {{ $labels.name }} is {{ $value | humanizeDuration }}
              from expiry, it should have renewed over a week ago."
        - alert: CertManagerCertNotReady
          expr: |
            max by (name, exported_namespace, namespace, condition) (
              certmanager_certificate_ready_status{condition!="True"} == 1)
          for: 10m
          labels:
            severity: critical
          annotations:
            description: "This certificate has not been ready to serve traffic for at least
              10m. If the cert is being renewed or there is another valid cert, the ingress
              controller _may_ be able to serve that instead."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready
            summary: "The cert {{ $labels.name }} is not ready to serve traffic."
        - alert: CertManagerHittingRateLimits
          expr: |
            sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m]))
              > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            description: "Depending on the rate limit, cert-manager may be unable to generate
              certificates for up to a week."
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits
            summary: "Cert manager hitting LetsEncrypt rate limits."
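
These PrometheusRule objects live in two different namespaces (monitoring and cert-manager), so the prometheus-operator instance must be configured to discover rules outside its own namespace. A minimal sketch of the relevant Prometheus custom-resource fields; the resource name and exact selector values are assumptions, not part of this commit:

# Hypothetical fragment of the Prometheus custom resource (e.g. as managed
# by kube-prometheus-stack). An empty label selector matches all objects,
# while a nil ruleNamespaceSelector would restrict discovery to the
# Prometheus object's own namespace.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus           # assumed name
  namespace: monitoring
spec:
  ruleSelector: {}           # select every PrometheusRule ...
  ruleNamespaceSelector: {}  # ... in every namespace, including cert-manager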