diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml
index b03aba357..2a992f32e 100644
--- a/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml
@@ -153,28 +153,6 @@ spec:
             existingSecret:
               name: thanos-objstore-secret
               key: objstore.yml
-        additionalScrapeConfigs:
-          - job_name: "opnsense"
-            scrape_interval: 60s
-            metrics_path: "/metrics"
-            static_configs:
-              - targets: ["${LOCAL_LAN_OPNSENSE}:9273"]
-                labels:
-                  app: "opnsense"
-          - job_name: "truenas"
-            scrape_interval: 60s
-            metrics_path: "/metrics"
-            static_configs:
-              - targets: ["192.168.9.10:9273"]
-                labels:
-                  app: "truenas"
-          - job_name: "truenas-remote"
-            scrape_interval: 60s
-            metrics_path: "/metrics"
-            static_configs:
-              - targets: ["${LOCAL_LAN_TRUENAS_REMOTE}:9273"]
-                labels:
-                  app: "truenas-remote"
       thanosService:
         enabled: true
       thanosServiceMonitor:
diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml
index 72c948232..6080e4e51 100644
--- a/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml
@@ -5,3 +5,5 @@ kind: Kustomization
 namespace: monitoring
 resources:
   - ./helmrelease.yaml
+  - ./prometheusrule.yaml
+  - ./scrapeconfig.yaml
diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/prometheusrule.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/prometheusrule.yaml
new file mode 100644
index 000000000..fb877abf1
--- /dev/null
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/prometheusrule.yaml
@@ -0,0 +1,39 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
+# Miscellaneous alerting rules: Docker Hub rate-limit risk, OOM-killed containers, ZFS pool state.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: miscellaneous-rules
+spec:
+  groups:
+    - name: dockerhub
+      rules:
+        # Fires when more than 100 container series with a docker.io image were last seen within 30s, for 15m.
+        - alert: BootstrapRateLimitRisk
+          annotations:
+            summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
+          expr: count(time() - container_last_seen{image=~"docker\\.io.*",container!=""} < 30) > 100
+          for: 15m
+          labels:
+            severity: critical
+    - name: oom
+      rules:
+        # Fires when a container's restart count grew by at least 1 over the last 10m and its
+        # OOMKilled last-terminated-reason series never dropped below 1 during that window.
+        - alert: OOMKilled
+          annotations:
+            summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
+          expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
+          labels:
+            severity: critical
+    - name: zfs
+      rules:
+        # Fires when a ZFS pool reports any state other than "online" for 15m.
+        - alert: ZfsUnexpectedPoolState
+          annotations:
+            summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in an unexpected state {{$labels.state}}
+          expr: node_zfs_zpool_state{state!="online"} > 0
+          for: 15m
+          labels:
+            severity: critical
diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/scrapeconfig.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/scrapeconfig.yaml
new file mode 100644
index 000000000..cbad724c5
--- /dev/null
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/scrapeconfig.yaml
@@ -0,0 +1,65 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
+# Static targets for the "node-exporter" job; the relabeling overwrites the job label with the resource name.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: &name node-exporter
+spec:
+  staticConfigs:
+    - targets:
+        - pikvm.${SECRET_INTERNAL_DOMAIN}:9100
+        # NOTE(review): port 9273 differs from the 9100 used by the other targets - confirm it belongs in this job.
+        - opnsense.${SECRET_INTERNAL_DOMAIN}:9273
+        - storage.${SECRET_INTERNAL_DOMAIN}:9100
+  metricsPath: /metrics
+  relabelings:
+    - action: replace
+      targetLabel: job
+      replacement: *name
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
+# Static target for the "podman-exporter" job; the relabeling overwrites the job label with the resource name.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: &name podman-exporter
+spec:
+  staticConfigs:
+    - targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9882"]
+  metricsPath: /metrics
+  relabelings:
+    - action: replace
+      targetLabel: job
+      replacement: *name
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
+# Static target for the "pikvm" job, scraped at a non-default metrics path.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: &name pikvm
+spec:
+  staticConfigs:
+    # NOTE(review): no port is given and no scheme is set - confirm the endpoint answers with the scrape defaults.
+    - targets: ["pikvm.${SECRET_INTERNAL_DOMAIN}"]
+  metricsPath: /api/export/prometheus/metrics
+  relabelings:
+    - action: replace
+      targetLabel: job
+      replacement: *name
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
+# Static target for the "zrepl" job; the relabeling overwrites the job label with the resource name.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: &name zrepl
+spec:
+  staticConfigs:
+    - targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9811"]
+  metricsPath: /metrics
+  relabelings:
+    - action: replace
+      targetLabel: job
+      replacement: *name