---
# PrometheusRule defining alerting rules for the Traefik reverse proxy.
# All alerts must hold for 5 minutes before firing and carry
# `severity: critical`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app: traefik
  name: traefik.rules
  namespace: networking
spec:
  groups:
    - name: traefik.rules
      rules:
        # Fires when no `up` series with job="traefik" exists at all.
        - alert: TraefikAbsent
          annotations:
            summary: "Traefik has disappeared from Prometheus service discovery."
            description: "Ingresses will be down until the Traefik reverse proxy is back up."
          expr: |
            absent(up{job="traefik"})
          for: 5m
          labels:
            severity: critical

        # Fires while the config-reload failure gauge reports 1.
        - alert: TraefikConfigError
          annotations:
            summary: "Traefik config error."
            description: "Traefik has failed to load the config file. Check Traefik logs for exact parsing error."
          expr: |
            traefik_config_last_reload_failure{job="traefik"} == 1
          for: 5m
          labels:
            severity: critical

        # Per-service ratio of 4xx responses to all responses over a 1m rate
        # window; fires above 10%.
        # NOTE(review): unlike the rules above, this selector has no
        # job="traefik" matcher -- confirm whether that is intentional.
        - alert: TraefikHighHttp4xxErrorRateService
          annotations:
            summary: "Traefik has a high HTTP 4xx error rate."
            description: "Traefik is reporting {{ $value | humanizePercentage }} of 4xx errors on {{ $labels.exported_service }}"
          expr: |
            sum(rate(traefik_service_requests_total{code=~"4.*"}[1m])) by (exported_service)
            /
            sum(rate(traefik_service_requests_total[1m])) by (exported_service)
            > .10
          for: 5m
          labels:
            severity: critical

        # Per-service ratio of 5xx responses to all responses over a 1m rate
        # window; fires above 10%.
        # NOTE(review): no job="traefik" matcher here either -- confirm.
        - alert: TraefikHighHttp5xxErrorRateService
          annotations:
            summary: "Traefik has a high HTTP 5xx error rate."
            description: "Traefik is reporting {{ $value | humanizePercentage }} of 5xx errors on {{ $labels.exported_service }}"
          expr: |
            sum(rate(traefik_service_requests_total{code=~"5.*"}[1m])) by (exported_service)
            /
            sum(rate(traefik_service_requests_total[1m])) by (exported_service)
            > .10
          for: 5m
          labels:
            severity: critical

        # Fires when the average open-connection count of an entrypoint
        # exceeds 5. The aggregation keeps the `entrypoint` label: a bare
        # avg() would drop every label, leaving {{ $labels.entrypoint }} in
        # the description empty and merging all entrypoints into one series.
        # NOTE(review): a threshold of 5 looks low for a critical alert --
        # confirm it matches the expected load.
        - alert: TraefikTooManyRequest
          annotations:
            summary: "Traefik has too many open connections"
            description: "Traefik is reporting {{ $value }} of open connections on entrypoint {{ $labels.entrypoint }}"
          expr: |
            avg by (entrypoint) (traefik_entrypoint_open_connections{job="traefik"}) > 5
          for: 5m
          labels:
            severity: critical