---
# PrometheusRule for Loki: three alerts on request errors, panics and latency,
# followed by recording rules that pre-aggregate the
# loki_request_duration_seconds histogram at three label granularities
# (job; job + route; namespace + job + route).
#
# NOTE(review): every rate() below uses a 1m window. rate() needs at least two
# samples inside the window, so this presumably assumes a scrape interval of
# 30s or less -- confirm against the scrape configuration for Loki.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      rules:
        # More than 10% of requests for a namespace/job/route answered with a
        # 5xx status code, sustained for 15 minutes.
        - alert: LokiRequestErrors
          annotations:
            # The expression is already scaled to 0-100, so the value is
            # printed as-is. humanizePercentage expects a 0-1 ratio and would
            # multiply by 100 a second time (15% would be shown as 1500%).
            message: >-
              {{ $labels.job }} {{ $labels.route }} is experiencing
              {{ printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical

        # Any increase of loki_panic_total over the last 10 minutes. There is
        # no `for` clause, so the alert fires on the first evaluation that
        # matches.
        - alert: LokiRequestPanics
          annotations:
            # $value is a count produced by increase(), not a ratio, so it is
            # rendered with humanize rather than humanizePercentage.
            message: >-
              {{ $labels.job }} is experiencing an increase of
              {{ $value | humanize }} panics in the last 10 minutes.
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
          labels:
            severity: critical

        # 99th percentile latency above 1s for 15 minutes. Reads the
        # namespace_job_route 99quantile recording rule defined further down
        # in this group; routes matching "tail" (case-insensitive) are
        # excluded.
        - alert: LokiRequestLatency
          annotations:
            message: >-
              {{ $labels.job }} {{ $labels.route }} is experiencing
              {{ $value }}s 99th percentile latency.
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
          for: 15m
          labels:
            severity: critical

        # --- Recording rules aggregated by job ---
        - expr: |
            histogram_quantile(
              0.99,
              sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
            )
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(
              0.50,
              sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
            )
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate

        # --- Recording rules aggregated by job and route ---
        - expr: |
            histogram_quantile(
              0.99,
              sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
            )
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(
              0.50,
              sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
            )
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate

        # --- Recording rules aggregated by namespace, job and route ---
        - expr: |
            histogram_quantile(
              0.99,
              sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)
            )
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(
              0.50,
              sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)
            )
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate