feat: overhaul

auricom · 2025-01-04 00:00:04 +01:00
parent b14022014b · commit 0c9529c7a2
408 changed files with 3187 additions and 2380 deletions

externalsecret.yaml

@@ -0,0 +1,24 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: kube-prometheus-stack
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: onepassword-connect
  target:
    name: kube-prometheus-stack-secret
    template:
      engineVersion: v2
      data:
        # App
        MINIO_PROM_TOKEN: "{{ .MINIO__PROMETHEUS_TOKEN }}"
        PIKVM_USERNAME: "{{ .username }}"
        PIKVM_PASSWORD: "{{ .password }}"
  dataFrom:
    - extract:
        key: minio
    - extract:
        key: PiKVM
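
For reference, a minimal sketch of the Secret that external-secrets should render from this manifest once it resolves the minio and PiKVM items in 1Password (the values shown are placeholders, not real data):

apiVersion: v1
kind: Secret
metadata:
  name: kube-prometheus-stack-secret
type: Opaque
stringData:
  MINIO_PROM_TOKEN: <token from the "minio" item>
  PIKVM_USERNAME: <username from the "PiKVM" item>
  PIKVM_PASSWORD: <password from the "PiKVM" item>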

helmrelease.yaml

@@ -0,0 +1,257 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
spec:
  interval: 30m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 67.5.0
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
      interval: 5m
  install:
    crds: Skip
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: Skip
    remediation:
      strategy: rollback
      retries: 3
  dependsOn:
    - name: kube-prometheus-stack-crds
      namespace: observability
    - name: rook-ceph-cluster
      namespace: rook-ceph
  values:
    crds:
      enabled: false
    ###
    ### Component values
    ###
    kubeApiServer:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          - action: replace
            sourceLabels:
              - node
            targetLabel: instance
    kubeProxy:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeScheduler:
      enabled: false
    kube-state-metrics:
      metricLabelsAllowlist:
        - persistentvolumeclaims=[*]
      prometheus:
        monitor:
          enabled: true
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node
    grafana:
      enabled: false
      forceDeployDashboards: true
    prometheus-node-exporter:
      resources:
        requests:
          cpu: 23m
          memory: 64M
        limits:
          memory: 64M
      prometheus:
        monitor:
          enabled: true
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node
    ###
    ### Prometheus operator values
    ###
    prometheusOperator:
      prometheusConfigReloader:
        resources:
          requests:
            cpu: 100m
            memory: 50Mi
          limits:
            cpu: 300m
            memory: 100Mi
    ###
    ### Prometheus instance values
    ###
    prometheus:
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: internal
        annotations:
          nginx.ingress.kubernetes.io/auth-method: GET
          nginx.ingress.kubernetes.io/auth-url: http://authelia.default.svc.cluster.local.:8888/api/verify
          nginx.ingress.kubernetes.io/auth-signin: https://auth.${SECRET_EXTERNAL_DOMAIN}?rm=$request_method
          nginx.ingress.kubernetes.io/auth-response-headers: Remote-User,Remote-Name,Remote-Groups,Remote-Email
          nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Method $request_method;
          hajimari.io/appName: Prometheus
          hajimari.io/icon: simple-icons:prometheus
          gethomepage.dev/enabled: "true"
          gethomepage.dev/name: Prometheus
          gethomepage.dev/description: Systems and service monitoring system.
          gethomepage.dev/group: Infrastructure
          gethomepage.dev/icon: prometheus.png
          gethomepage.dev/pod-selector: >-
            app in (
              prometheus-kube-prometheus-stack-prometheus
            )
        hosts: ["prometheus.${SECRET_EXTERNAL_DOMAIN}"]
        tls:
          - hosts:
              - "prometheus.${SECRET_EXTERNAL_DOMAIN}"
      prometheusSpec:
        replicas: 2
        replicaExternalLabelName: replica
        scrapeInterval: 1m # Must match interval in Grafana Helm chart
        podMonitorSelector: &selector
          matchLabels: null
        probeSelector: *selector
        ruleSelector: *selector
        scrapeConfigSelector: *selector
        serviceMonitorSelector: *selector
        retention: 14d
        retentionSize: 50GB
        enableAdminAPI: true
        walCompression: true
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 20Gi
    alertmanager:
      config:
        global:
          resolve_timeout: 5m
        receivers:
          - name: "null"
          - name: pushover
            pushover_configs:
              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
                send_resolved: true
                html: true
                priority: |-
                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
                url_title: View in Alert Manager
                title: |-
                  [{{ .Status | toUpper -}}
                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
                  ] {{ .CommonLabels.alertname }}
                message: |-
                  {{- range .Alerts }}
                  {{- if ne .Labels.severity "" }}
                  <b>Severity:</b> <i>{{ .Labels.severity }}</i>
                  {{- else }}
                  <b>Severity:</b> <i>N/A</i>
                  {{- end }}
                  {{- if ne .Annotations.description "" }}
                  <b>Description:</b> <i>{{ .Annotations.description }}</i>
                  {{- else if ne .Annotations.summary "" }}
                  <b>Summary:</b> <i>{{ .Annotations.summary }}</i>
                  {{- else if ne .Annotations.message "" }}
                  <b>Message:</b> <i>{{ .Annotations.message }}</i>
                  {{- else }}
                  <b>Description:</b> <i>N/A</i>
                  {{- end }}
                  {{- if gt (len .Labels.SortedPairs) 0 }}
                  <b>Details:</b>
                  {{- range .Labels.SortedPairs }}
                  • <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
                  {{- end }}
                  {{- end }}
                  {{- end }}
        route:
          receiver: pushover
          routes:
            - receiver: "null"
              matchers:
                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
            - receiver: pushover
              matchers:
                - severity = "critical"
              continue: true
        inhibit_rules:
          - source_matchers:
              - severity = "critical"
            target_matchers:
              - severity = "warning"
            equal: [alertname, namespace]
      alertmanagerSpec:
        replicas: 1
        podAntiAffinity: hard
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 1Gi
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: internal
        annotations:
          nginx.ingress.kubernetes.io/auth-method: GET
          nginx.ingress.kubernetes.io/auth-url: http://authelia.default.svc.cluster.local.:8888/api/verify
          nginx.ingress.kubernetes.io/auth-signin: https://auth.${SECRET_EXTERNAL_DOMAIN}?rm=$request_method
          nginx.ingress.kubernetes.io/auth-response-headers: Remote-User,Remote-Name,Remote-Groups,Remote-Email
          nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Method $request_method;
          hajimari.io/appName: Alert Manager
          hajimari.io/icon: mdi:alert-decagram-outline
          gethomepage.dev/enabled: "true"
          gethomepage.dev/name: Alert-Manager
          gethomepage.dev/description: Handles alerts sent by Prometheus.
          gethomepage.dev/group: Infrastructure
          gethomepage.dev/icon: alertmanager.png
          gethomepage.dev/pod-selector: >-
            app in (
              alertmanager-kube-prometheus-stack-alertmanager
            )
        hosts: ["alert-manager.${SECRET_EXTERNAL_DOMAIN}"]
        tls:
          - hosts:
              - "alert-manager.${SECRET_EXTERNAL_DOMAIN}"
      prometheus:
        monitor:
          enabled: true
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node
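
Note the selector pattern in prometheusSpec: matchLabels: null is an empty selector, and the &selector/*selector alias reuses it for probes, rules, scrape configs, and service monitors, so this Prometheus matches every such resource regardless of its labels. A hypothetical ServiceMonitor it would therefore discover with no extra labels required (name and port are illustrative):

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: example-app
  endpoints:
    - port: metrics
      interval: 1m # aligned with the global scrapeInterval above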

kustomization.yaml

@@ -0,0 +1,9 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/SchemaStore/schemastore/master/src/schemas/json/kustomization.json
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./externalsecret.yaml
  - ./helmrelease.yaml
  - ./prometheusrule.yaml
  - ./scrapeconfig.yaml
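
The ${SECRET_*} placeholders in these manifests are not plain Kustomize; in a Flux-managed cluster they are typically filled by post-build variable substitution on the Flux Kustomization that applies this directory. A minimal sketch, assuming a GitRepository named home-ops and a Secret named cluster-secrets carrying the values (names and path are assumptions, not from this commit):

apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: kube-prometheus-stack
  namespace: flux-system
spec:
  interval: 30m
  path: ./kube-prometheus-stack/app # assumed path to this directory
  prune: true
  sourceRef:
    kind: GitRepository
    name: home-ops # assumed repository name
  postBuild:
    substituteFrom:
      - kind: Secret
        name: cluster-secrets # assumed Secret providing the SECRET_* values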

prometheusrule.yaml

@@ -0,0 +1,34 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: miscellaneous-rules
spec:
  groups:
    - name: dockerhub
      rules:
        - alert: BootstrapRateLimitRisk
          annotations:
            summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
          expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
          for: 15m
          labels:
            severity: critical
    - name: oom
      rules:
        - alert: OOMKilled
          annotations:
            summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
          expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          labels:
            severity: critical
    - name: zfs
      rules:
        - alert: ZfsUnexpectedPoolState
          annotations:
            summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in an unexpected state {{$labels.state}}
          expr: node_zfs_zpool_state{state!="online"} > 0
          for: 15m
          labels:
            severity: critical
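
These rules can be checked offline before Flux applies them. A sketch of a promtool unit test for the ZFS rule, assuming spec.groups has been extracted into a plain rule file (groups: at top level) named miscellaneous-rules.yaml; the pool, instance, and file names are illustrative:

# zfs-rules.test.yaml; run with: promtool test rules zfs-rules.test.yaml
rule_files:
  - miscellaneous-rules.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # pool reports a degraded state for 20 minutes
      - series: 'node_zfs_zpool_state{zpool="tank",instance="storage",state="degraded"}'
        values: '1x20'
    alert_rule_test:
      - eval_time: 16m # past the 15m "for" clause
        alertname: ZfsUnexpectedPoolState
        exp_alerts:
          - exp_labels:
              severity: critical
              zpool: tank
              instance: storage
              state: degraded
            exp_annotations:
              summary: ZFS pool tank on storage is in an unexpected state degraded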

scrapeconfig.yaml

@@ -0,0 +1,86 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: &name node-exporter
spec:
  staticConfigs:
    - targets:
        - pikvm.${SECRET_INTERNAL_DOMAIN}:9100
        - opnsense.${SECRET_INTERNAL_DOMAIN}:9273
        - storage.${SECRET_INTERNAL_DOMAIN}:9100
  metricsPath: /metrics
  relabelings:
    - action: replace
      targetLabel: job
      replacement: *name
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: &name podman-exporter
spec:
  staticConfigs:
    - targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9882"]
  metricsPath: /metrics
  relabelings:
    - action: replace
      targetLabel: job
      replacement: *name
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: &name pikvm
spec:
  staticConfigs:
    - targets: ["pikvm.${SECRET_INTERNAL_DOMAIN}"]
  metricsPath: /api/export/prometheus/metrics
  basicAuth:
    username:
      name: kube-prometheus-stack-secret
      key: PIKVM_USERNAME
    password:
      name: kube-prometheus-stack-secret
      key: PIKVM_PASSWORD
  scheme: HTTPS
  relabelings:
    - action: replace
      targetLabel: job
      replacement: *name
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: &name zrepl
spec:
  staticConfigs:
    - targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9811"]
  metricsPath: /metrics
  relabelings:
    - action: replace
      targetLabel: job
      replacement: *name
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: &name minio
spec:
  staticConfigs:
    - targets: ["s3.${SECRET_INTERNAL_DOMAIN}"]
  metricsPath: /minio/v2/metrics/cluster
  authorization:
    credentials:
      name: kube-prometheus-stack-secret
      key: MINIO_PROM_TOKEN
  scheme: HTTPS
  relabelings:
    - action: replace
      targetLabel: job
      replacement: *name
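
Each ScrapeConfig reuses its metadata.name through the &name anchor so the relabeling pins the Prometheus job label to the plain exporter name rather than the operator-generated scrapeConfig/<namespace>/<name> value. Adding another statically scraped exporter follows the same shape; a hypothetical example (exporter, target, and port are illustrative):

---
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: &name smartctl-exporter # hypothetical exporter
spec:
  staticConfigs:
    - targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9633"] # illustrative port
  metricsPath: /metrics
  relabelings:
    - action: replace
      targetLabel: job
      replacement: *name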