feat: prometheus rules

This commit is contained in:
auricom
2021-08-02 01:25:16 +02:00
parent 57b78a9c39
commit f009843300
6 changed files with 217 additions and 0 deletions

View File

@@ -1,4 +1,6 @@
---
# Kustomize entry point for this directory: lists the manifests to include.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - prometheus-rule.yaml

View File

@@ -0,0 +1,109 @@
---
# Recording rules and alerts for Loki, derived from the
# loki_request_duration_seconds histogram and the loki_panic_total counter.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      # Rules inside one group are evaluated sequentially, in order. The
      # recording rules are listed first so that the alerts further down
      # (LokiRequestLatency reads a recorded series) see the sample produced
      # in the same evaluation cycle rather than the previous one.
      rules:
        # ---- Recording rules aggregated by (job) ----
        - record: job:loki_request_duration_seconds:99quantile
          expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
        - record: job:loki_request_duration_seconds:50quantile
          expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
        - record: job:loki_request_duration_seconds:avg
          expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
        - record: job:loki_request_duration_seconds_bucket:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job)
        - record: job:loki_request_duration_seconds_sum:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
        - record: job:loki_request_duration_seconds_count:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)

        # ---- Recording rules aggregated by (job, route) ----
        - record: job_route:loki_request_duration_seconds:99quantile
          expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
        - record: job_route:loki_request_duration_seconds:50quantile
          expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
        - record: job_route:loki_request_duration_seconds:avg
          expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
        - record: job_route:loki_request_duration_seconds_bucket:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route)
        - record: job_route:loki_request_duration_seconds_sum:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
        - record: job_route:loki_request_duration_seconds_count:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)

        # ---- Recording rules aggregated by (namespace, job, route) ----
        - record: namespace_job_route:loki_request_duration_seconds:99quantile
          expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
        - record: namespace_job_route:loki_request_duration_seconds:50quantile
          expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
        - record: namespace_job_route:loki_request_duration_seconds:avg
          expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
        - record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route)
        - record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_sum[1m]))
              by (namespace, job, route)
        - record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
          expr: |
            sum(rate(loki_request_duration_seconds_count[1m]))
              by (namespace, job, route)

        # ---- Alerts ----
        # Fires when more than 10% of requests on a (namespace, job, route)
        # returned a 5xx status code, sustained for 15 minutes.
        - alert: LokiRequestErrors
          annotations:
            # The expression is already scaled to 0-100, so the value is
            # printed as-is with a literal "%". humanizePercentage expects a
            # 0-1 ratio and would render 15 as "1500%".
            message: >-
              {{ $labels.job }} {{ $labels.route }} is experiencing
              {{ printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical

        # Fires as soon as loki_panic_total increased within the last 10
        # minutes for a (namespace, job); there is no `for` hold-off.
        - alert: LokiRequestPanics
          annotations:
            # The value is a panic count, not a ratio, so it is reported as a
            # plain number instead of being formatted as a percentage.
            message: >-
              {{ $labels.job }} is experiencing {{ $value | humanize }}
              panics over the last 10 minutes.
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical

        # Fires when the recorded 99th percentile request latency stays above
        # 1 (seconds, per the metric name) for 15 minutes. Routes whose name
        # contains "tail" (case-insensitive) are excluded.
        - alert: LokiRequestLatency
          annotations:
            message: >-
              {{ $labels.job }} {{ $labels.route }} is experiencing
              {{ $value }}s 99th percentile latency.
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical