mirror of
https://github.com/auricom/home-cluster.git
synced 2025-09-28 21:12:35 +02:00
feat: kube scrape stuff
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: miscellaneous-rules
|
||||
spec:
|
||||
groups:
|
||||
- name: dockerhub
|
||||
rules:
|
||||
- alert: BootstrapRateLimitRisk
|
||||
annotations:
|
||||
summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
|
||||
expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: oom
|
||||
rules:
|
||||
- alert: OOMKilled
|
||||
annotations:
|
||||
summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
|
||||
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
|
||||
labels:
|
||||
severity: critical
|
||||
- name: zfs
|
||||
rules:
|
||||
- alert: ZfsUnexpectedPoolState
|
||||
annotations:
|
||||
summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in a unexpected state {{$labels.state}}
|
||||
expr: node_zfs_zpool_state{state!="online"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
Reference in New Issue
Block a user