Files
auricom-home-cluster/cluster/apps/networking/traefik/prometheus-rules.yaml
2021-08-10 16:42:40 +02:00

73 lines
2.5 KiB
YAML

---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app: traefik
name: traefik.rules
namespace: networking
spec:
groups:
- name: traefik.rules
rules:
- alert: TraefikAbsent
annotations:
summary: "Traefik has disappeared from Prometheus service discovery."
description: "Ingresses will be down until the Traefik reverse proxy is back up."
expr: |
absent(up{job="traefik"})
for: 5m
labels:
severity: critical
- alert: TraefikConfigError
annotations:
summary: "Traefik config error."
description:
"Traefik has failed to load the config file. Check Traefik
logs for exact parsing error."
expr: |
traefik_config_last_reload_failure{job="traefik"} == 1
for: 5m
labels:
severity: critical
- alert: TraefikHighHttp4xxErrorRateService
annotations:
summary: "Traefik has a high HTTP 4xx error rate."
description:
"Traefik is reporting {{ $value | humanizePercentage }} of 4xx
errors on {{ $labels.exported_service }}"
expr: |
sum(rate(traefik_service_requests_total{code=~"4.*"}[1m])) by (exported_service)
/
sum(rate(traefik_service_requests_total[1m])) by (exported_service)
> .10
for: 5m
labels:
severity: critical
- alert: TraefikHighHttp5xxErrorRateService
annotations:
summary: "Traefik has a high HTTP 5xx error rate."
description:
"Traefik is reporting {{ $value | humanizePercentage }} of 5xx
errors on {{ $labels.exported_service }}"
expr: |
sum(rate(traefik_service_requests_total{code=~"5.*"}[1m])) by (exported_service)
/
sum(rate(traefik_service_requests_total[1m])) by (exported_service)
> .10
for: 5m
labels:
severity: critical
- alert: TraefikTooManyRequest
annotations:
summary: "Traefik has too many open connections"
description:
"Traefik is reporting {{ $value }} of open connections on entrypoint
{{ $labels.entrypoint }}"
expr: |
avg(traefik_entrypoint_open_connections{job="traefik"})
> 5
for: 5m
labels:
severity: critical