---
# PrometheusRule defining alerting rules for cert-manager.
# Two groups are declared:
#   - cert-manager: detects the cert-manager scrape target being absent.
#   - certificates: detects certificates close to expiry, certificates that are
#     not ready, and HTTP 429 responses seen by the ACME client.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cert-manager.rules
  namespace: cert-manager
spec:
  groups:
    - name: cert-manager
      rules:
        # Fires when no `up` series with job="cert-manager" has existed for 10m.
        - alert: CertManagerAbsent
          expr: |
            absent(up{job="cert-manager"})
          for: 10m
          labels:
            severity: critical
          annotations:
            description: "New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back."
            runbook_url: "https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent"
            summary: "Cert Manager has disappeared from Prometheus service discovery."

    - name: certificates
      rules:
        # Fires when the time remaining until a certificate's expiration
        # timestamp has been below 21 days (21 * 24 * 3600 seconds) for 1h.
        # $value is the remaining time in seconds, rendered in the annotations
        # through humanizeDuration.
        - alert: CertManagerCertExpirySoon
          expr: |
            avg by (exported_namespace, namespace, name) (
              certmanager_certificate_expiration_timestamp_seconds - time())
            < (21 * 24 * 3600)
          for: 1h
          labels:
            severity: warning
          annotations:
            description: "The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}."
            runbook_url: "https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon"
            summary: "The cert {{ $labels.name }} is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago."

        # Fires when a ready-status series whose `condition` label is anything
        # other than "True" has held the value 1 for 10m.
        - alert: CertManagerCertNotReady
          expr: |
            max by (name, exported_namespace, namespace, condition) (
              certmanager_certificate_ready_status{condition!="True"} == 1)
          for: 10m
          labels:
            severity: critical
          annotations:
            description: "This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead."
            runbook_url: "https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready"
            summary: "The cert {{ $labels.name }} is not ready to serve traffic."

        # Fires when, for any `host`, the 5m rate of ACME client requests with
        # status="429" has been above zero for 5m.
        - alert: CertManagerHittingRateLimits
          expr: |
            sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            description: "Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week."
            runbook_url: "https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits"
            summary: "Cert manager hitting LetsEncrypt rate limits."