# Mirror of https://github.com/auricom/home-cluster.git
# Synced 2025-09-17 18:24:14 +02:00
# Original file: 110 lines, 4.9 KiB, YAML
---
# PrometheusRule for Loki.
#
# Defines three critical alerts (request errors, panics, request latency) and
# a set of recording rules that pre-aggregate the
# loki_request_duration_seconds histogram at three label granularities:
# job, job/route and namespace/job/route.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      rules:
        # Fires when, per (namespace, job, route), more than 10% of requests
        # returned a 5xx status code, sustained for 15 minutes.
        - alert: LokiRequestErrors
          annotations:
            # The expression already scales the ratio to 0-100, so the value
            # is printed as a plain number followed by a literal "%".
            # humanizePercentage would multiply by 100 a second time.
            message: >-
              {{ $labels.job }} {{ $labels.route }} is experiencing
              {{ printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical

        # Fires as soon as loki_panic_total increased for any (namespace, job)
        # within the last 10 minutes; there is no "for" hold-off.
        - alert: LokiRequestPanics
          annotations:
            # The value is a panic count, not a ratio, so it is rendered with
            # humanize rather than as a percentage.
            message: >-
              {{ $labels.job }} is experiencing {{ $value | humanize }}
              panics over the last 10 minutes.
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical

        # Fires when the recorded 99th percentile request latency stays above
        # 1 second for 15 minutes. Routes whose name contains "tail"
        # (case-insensitive) are excluded.
        # NOTE(review): presumably because tail requests are long-lived
        # streams - confirm.
        - alert: LokiRequestLatency
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical

        # --- Recording rules aggregated by job ---------------------------
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate

        # --- Recording rules aggregated by job and route -----------------
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate

        # --- Recording rules aggregated by namespace, job and route ------
        # The 99quantile series recorded here feeds the LokiRequestLatency
        # alert above.
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate