feat: kube scrape stuff

This commit is contained in:
auricom
2024-05-14 00:37:33 +02:00
parent dfae7dc573
commit ee2d4180cd
4 changed files with 95 additions and 22 deletions

View File

@@ -153,28 +153,6 @@ spec:
existingSecret:
name: thanos-objstore-secret
key: objstore.yml
additionalScrapeConfigs:
- job_name: "opnsense"
scrape_interval: 60s
metrics_path: "/metrics"
static_configs:
- targets: ["${LOCAL_LAN_OPNSENSE}:9273"]
labels:
app: "opnsense"
- job_name: "truenas"
scrape_interval: 60s
metrics_path: "/metrics"
static_configs:
- targets: ["192.168.9.10:9273"]
labels:
app: "truenas"
- job_name: "truenas-remote"
scrape_interval: 60s
metrics_path: "/metrics"
static_configs:
- targets: ["${LOCAL_LAN_TRUENAS_REMOTE}:9273"]
labels:
app: "truenas-remote"
thanosService:
enabled: true
thanosServiceMonitor:

View File

@@ -5,3 +5,5 @@ kind: Kustomization
namespace: monitoring
resources:
- ./helmrelease.yaml
- ./prometheusrule.yaml
- ./scrapeconfig.yaml

View File

@@ -0,0 +1,34 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
# Miscellaneous alerting rules, grouped by concern: Docker Hub pull pressure,
# OOM-killed containers, and ZFS pool state.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: miscellaneous-rules
spec:
  groups:
    - name: dockerhub
      rules:
        # Fires when more than 100 `container_last_seen` series with a
        # docker.io image were updated within the last 30 seconds, and that
        # condition has held for 15 minutes.
        - alert: BootstrapRateLimitRisk
          annotations:
            summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
          # `[.]` matches a literal dot; a bare `.` would match any character.
          expr: count(time() - container_last_seen{image=~"docker[.]io.*",container!=""} < 30) > 100
          for: 15m
          labels:
            severity: critical
    - name: oom
      rules:
        # Fires when a container's restart counter grew by at least 1 over the
        # last 10 minutes AND its last-terminated reason was OOMKilled for the
        # whole 10-minute window. No `for:` is set, so it fires immediately.
        - alert: OOMKilled
          annotations:
            # Folded scalar: the lines below are joined with single spaces.
            summary: >-
              Container {{ $labels.container }} in pod
              {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled
              {{ $value }} times in the last 10 minutes.
          # `ignoring (reason)` lets the restart-count series (no `reason`
          # label) match the last-terminated-reason series.
          expr: >-
            (kube_pod_container_status_restarts_total
            - kube_pod_container_status_restarts_total offset 10m >= 1)
            and ignoring (reason)
            min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          labels:
            severity: critical
    - name: zfs
      rules:
        # Fires when any `node_zfs_zpool_state` series whose `state` label is
        # not "online" has been above 0 for 15 minutes.
        - alert: ZfsUnexpectedPoolState
          annotations:
            summary: ZFS pool {{ $labels.zpool }} on {{ $labels.instance }} is in an unexpected state {{ $labels.state }}
          expr: node_zfs_zpool_state{state!="online"} > 0
          for: 15m
          labels:
            severity: critical

View File

@@ -0,0 +1,59 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
# Scrapes three static targets at /metrics and forces the `job` label of the
# resulting series to this resource's name.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  # Anchored so the relabeling below can reuse the exact same string.
  name: &name node-exporter
spec:
  metricsPath: /metrics
  staticConfigs:
    - targets:
        - "pikvm.${SECRET_INTERNAL_DOMAIN}:9100"
        - "opnsense.${SECRET_INTERNAL_DOMAIN}:9273"
        - "storage.${SECRET_INTERNAL_DOMAIN}:9100"
  relabelings:
    # Overwrite `job` with the resource name (alias of metadata.name).
    - targetLabel: job
      replacement: *name
      action: replace
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
# Scrapes a single static target on port 9882 at /metrics and forces the
# `job` label of the resulting series to this resource's name.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  # Anchored so the relabeling below can reuse the exact same string.
  name: &name podman-exporter
spec:
  metricsPath: /metrics
  staticConfigs:
    - targets:
        - "storage.${SECRET_INTERNAL_DOMAIN}:9882"
  relabelings:
    # Overwrite `job` with the resource name (alias of metadata.name).
    - targetLabel: job
      replacement: *name
      action: replace
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
# Scrapes a single static target at /api/export/prometheus/metrics and forces
# the `job` label of the resulting series to this resource's name.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  # Anchored so the relabeling below can reuse the exact same string.
  name: &name pikvm
spec:
  metricsPath: /api/export/prometheus/metrics
  staticConfigs:
    - targets:
        # NOTE(review): no port and no `scheme` are given here, so the scrape
        # relies on defaults -- confirm this endpoint is reachable that way.
        - "pikvm.${SECRET_INTERNAL_DOMAIN}"
  relabelings:
    # Overwrite `job` with the resource name (alias of metadata.name).
    - targetLabel: job
      replacement: *name
      action: replace
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
# Scrapes a single static target on port 9811 at /metrics and forces the
# `job` label of the resulting series to this resource's name.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  # Anchored so the relabeling below can reuse the exact same string.
  name: &name zrepl
spec:
  metricsPath: /metrics
  staticConfigs:
    - targets:
        - "storage.${SECRET_INTERNAL_DOMAIN}:9811"
  relabelings:
    # Overwrite `job` with the resource name (alias of metadata.name).
    - targetLabel: job
      replacement: *name
      action: replace