mirror of
https://github.com/auricom/home-cluster.git
synced 2025-09-27 04:21:27 +02:00
feat: prometheus rules
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
---
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- helm-release.yaml
|
||||
- prometheus-rule.yaml
|
||||
|
109
cluster/apps/monitoring/loki-stack/prometheus-rule.yaml
Normal file
109
cluster/apps/monitoring/loki-stack/prometheus-rule.yaml
Normal file
@@ -0,0 +1,109 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: loki.rules
|
||||
namespace: monitoring
|
||||
spec:
|
||||
groups:
|
||||
- name: loki.rules
|
||||
rules:
|
||||
- alert: LokiRequestErrors
|
||||
annotations:
|
||||
message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
|
||||
expr: |
|
||||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
|
||||
/
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
|
||||
> 10
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: LokiRequestPanics
|
||||
annotations:
|
||||
message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
|
||||
expr: |
|
||||
sum(increase(loki_panic_total[10m])) by (namespace, job)
|
||||
> 0
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: LokiRequestLatency
|
||||
annotations:
|
||||
message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
|
||||
expr: |
|
||||
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:loki_request_duration_seconds:99quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:loki_request_duration_seconds:50quantile
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
|
||||
/
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (job)
|
||||
record: job:loki_request_duration_seconds:avg
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job)
|
||||
record: job:loki_request_duration_seconds_bucket:sum_rate
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
|
||||
record: job:loki_request_duration_seconds_sum:sum_rate
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (job)
|
||||
record: job:loki_request_duration_seconds_count:sum_rate
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, route))
|
||||
record: job_route:loki_request_duration_seconds:99quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, route))
|
||||
record: job_route:loki_request_duration_seconds:50quantile
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
|
||||
/
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
|
||||
record: job_route:loki_request_duration_seconds:avg
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, route)
|
||||
record: job_route:loki_request_duration_seconds_bucket:sum_rate
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
|
||||
record: job_route:loki_request_duration_seconds_sum:sum_rate
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
|
||||
record: job_route:loki_request_duration_seconds_count:sum_rate
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, namespace, job, route))
|
||||
record: namespace_job_route:loki_request_duration_seconds:99quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, namespace, job, route))
|
||||
record: namespace_job_route:loki_request_duration_seconds:50quantile
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
|
||||
/
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
|
||||
record: namespace_job_route:loki_request_duration_seconds:avg
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, namespace, job, route)
|
||||
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_sum[1m]))
|
||||
by (namespace, job, route)
|
||||
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
|
||||
- expr: |
|
||||
sum(rate(loki_request_duration_seconds_count[1m]))
|
||||
by (namespace, job, route)
|
||||
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
|
Reference in New Issue
Block a user