# Mirror of https://github.com/auricom/home-cluster.git (synced 2025-09-17 18:24:14 +02:00)
# Commit: "feat: overhaul"
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json
|
||||
apiVersion: external-secrets.io/v1beta1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
name: kube-prometheus-stack
|
||||
spec:
|
||||
secretStoreRef:
|
||||
kind: ClusterSecretStore
|
||||
name: onepassword-connect
|
||||
target:
|
||||
name: kube-prometheus-stack-secret
|
||||
template:
|
||||
engineVersion: v2
|
||||
data:
|
||||
# App
|
||||
MINIO_PROM_TOKEN: "{{ .MINIO__PROMETHEUS_TOKEN }}"
|
||||
PIKVM_USERNAME: "{{ .username }}"
|
||||
PIKVM_PASSWORD: "{{ .password }}"
|
||||
dataFrom:
|
||||
- extract:
|
||||
key: minio
|
||||
- extract:
|
||||
key: PiKVM
|
@@ -0,0 +1,257 @@
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: kube-prometheus-stack
|
||||
spec:
|
||||
interval: 30m
|
||||
chart:
|
||||
spec:
|
||||
chart: kube-prometheus-stack
|
||||
version: 67.5.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: prometheus-community
|
||||
namespace: flux-system
|
||||
interval: 5m
|
||||
install:
|
||||
crds: Skip
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
cleanupOnFail: true
|
||||
crds: Skip
|
||||
remediation:
|
||||
strategy: rollback
|
||||
retries: 3
|
||||
dependsOn:
|
||||
- name: kube-prometheus-stack-crds
|
||||
namespace: observability
|
||||
- name: rook-ceph-cluster
|
||||
namespace: rook-ceph
|
||||
values:
|
||||
crds:
|
||||
enabled: false
|
||||
###
|
||||
### Component values
|
||||
###
|
||||
kubeApiServer:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
metricRelabelings:
|
||||
- action: replace
|
||||
sourceLabels:
|
||||
- node
|
||||
targetLabel: instance
|
||||
|
||||
kubeProxy:
|
||||
enabled: false
|
||||
kubeControllerManager:
|
||||
enabled: false
|
||||
kubeEtcd:
|
||||
enabled: false
|
||||
kubeScheduler:
|
||||
enabled: false
|
||||
|
||||
kubeStateMetrics:
|
||||
metricLabelsAllowlist:
|
||||
- persistentvolumeclaims=[*]
|
||||
prometheus:
|
||||
monitor:
|
||||
enabled: true
|
||||
relabelings:
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: kubernetes_node
|
||||
|
||||
grafana:
|
||||
enabled: false
|
||||
forceDeployDashboards: true
|
||||
|
||||
prometheus-node-exporter:
|
||||
resources:
|
||||
requests:
|
||||
cpu: 23m
|
||||
memory: 64M
|
||||
limits:
|
||||
memory: 64M
|
||||
prometheus:
|
||||
monitor:
|
||||
enabled: true
|
||||
relabelings:
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: kubernetes_node
|
||||
###
|
||||
### Prometheus operator values
|
||||
###
|
||||
prometheusOperator:
|
||||
prometheusConfigReloader:
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 50Mi
|
||||
limits:
|
||||
cpu: 300m
|
||||
memory: 100Mi
|
||||
|
||||
###
|
||||
### Prometheus instance values
|
||||
###
|
||||
prometheus:
|
||||
ingress:
|
||||
enabled: true
|
||||
pathType: Prefix
|
||||
ingressClassName: internal
|
||||
annotations:
|
||||
nginx.ingress.kubernetes.io/auth-method: GET
|
||||
nginx.ingress.kubernetes.io/auth-url: http://authelia.default.svc.cluster.local.:8888/api/verify
|
||||
nginx.ingress.kubernetes.io/auth-signin: https://auth.${SECRET_EXTERNAL_DOMAIN}?rm=$request_method
|
||||
nginx.ingress.kubernetes.io/auth-response-headers: Remote-User,Remote-Name,Remote-Groups,Remote-Email
|
||||
nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Method $request_method;
|
||||
hajimari.io/appName: Prometheus
|
||||
hajimari.io/icon: simple-icons:prometheus
|
||||
gethomepage.dev/enabled: "true"
|
||||
gethomepage.dev/name: Prometheus
|
||||
gethomepage.dev/description: Systems and service monitoring system.
|
||||
gethomepage.dev/group: Infrastructure
|
||||
gethomepage.dev/icon: prometheus.png
|
||||
gethomepage.dev/pod-selector: >-
|
||||
app in (
|
||||
prometheus-kube-prometheus-stack-prometheus
|
||||
)
|
||||
hosts: ["prometheus.${SECRET_EXTERNAL_DOMAIN}"]
|
||||
tls:
|
||||
- hosts:
|
||||
- "prometheus.${SECRET_EXTERNAL_DOMAIN}"
|
||||
prometheusSpec:
|
||||
replicas: 2
|
||||
replicaExternalLabelName: replica
|
||||
scrapeInterval: 1m # Must match interval in Grafana Helm chart
|
||||
podMonitorSelector: &selector
|
||||
matchLabels: null
|
||||
probeSelector: *selector
|
||||
ruleSelector: *selector
|
||||
scrapeConfigSelector: *selector
|
||||
serviceMonitorSelector: *selector
|
||||
retention: 14d
|
||||
retentionSize: 50GB
|
||||
enableAdminAPI: true
|
||||
walCompression: true
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
storageClassName: rook-ceph-block
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
alertmanager:
|
||||
config:
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
receivers:
|
||||
- name: "null"
|
||||
- name: pushover
|
||||
pushover_configs:
|
||||
- user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
|
||||
token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
|
||||
send_resolved: true
|
||||
html: true
|
||||
priority: |-
|
||||
{{ if eq .Status "firing" }}1{{ else }}0{{ end }}
|
||||
url_title: View in Alert Manager
|
||||
title: |-
|
||||
[{{ .Status | toUpper -}}
|
||||
{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
|
||||
] {{ .CommonLabels.alertname }}
|
||||
message: |-
|
||||
{{- range .Alerts }}
|
||||
{{- if ne .Labels.severity "" }}
|
||||
<b>Severity:</b> <i>{{ .Labels.severity }}</i>
|
||||
{{- else }}
|
||||
<b>Severity:</b> <i>N/A</i>
|
||||
{{- end }}
|
||||
{{- if ne .Annotations.description "" }}
|
||||
<b>Description:</b> <i>{{ .Annotations.description }}</i>
|
||||
{{- else if ne .Annotations.summary "" }}
|
||||
<b>Summary:</b> <i>{{ .Annotations.summary }}</i>
|
||||
{{- else if ne .Annotations.message "" }}
|
||||
<b>Message:</b> <i>{{ .Annotations.message }}</i>
|
||||
{{- else }}
|
||||
<b>Description:</b> <i>N/A</i>
|
||||
{{- end }}
|
||||
{{- if gt (len .Labels.SortedPairs) 0 }}
|
||||
<b>Details:</b>
|
||||
{{- range .Labels.SortedPairs }}
|
||||
• <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
route:
|
||||
receiver: pushover
|
||||
routes:
|
||||
- receiver: "null"
|
||||
matchers:
|
||||
- alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
|
||||
- receiver: pushover
|
||||
matchers:
|
||||
- severity = "critical"
|
||||
continue: true
|
||||
inhibit_rules:
|
||||
- source_matchers:
|
||||
- severity = "critical"
|
||||
target_matchers:
|
||||
- severity = "warning"
|
||||
equal: [alertname, namespace]
|
||||
alertmanagerSpec:
|
||||
replicas: 1
|
||||
podAntiAffinity: hard
|
||||
storage:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
storageClassName: rook-ceph-block
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
ingress:
|
||||
enabled: true
|
||||
pathType: Prefix
|
||||
ingressClassName: internal
|
||||
annotations:
|
||||
nginx.ingress.kubernetes.io/auth-method: GET
|
||||
nginx.ingress.kubernetes.io/auth-url: http://authelia.default.svc.cluster.local.:8888/api/verify
|
||||
nginx.ingress.kubernetes.io/auth-signin: https://auth.${SECRET_EXTERNAL_DOMAIN}?rm=$request_method
|
||||
nginx.ingress.kubernetes.io/auth-response-headers: Remote-User,Remote-Name,Remote-Groups,Remote-Email
|
||||
nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Method $request_method;
|
||||
hajimari.io/appName: Alert Manager
|
||||
hajimari.io/icon: mdi:alert-decagram-outline
|
||||
gethomepage.dev/enabled: "true"
|
||||
gethomepage.dev/name: Alert-Manager
|
||||
gethomepage.dev/description: Handles alerts sent by Prometheus.
|
||||
gethomepage.dev/group: Infrastructure
|
||||
gethomepage.dev/icon: alertmanager.png
|
||||
gethomepage.dev/pod-selector: >-
|
||||
app in (
|
||||
alertmanager-kube-prometheus-stack-alertmanager
|
||||
)
|
||||
hosts: ["alert-manager.${SECRET_EXTERNAL_DOMAIN}"]
|
||||
tls:
|
||||
- hosts:
|
||||
- "alert-manager.${SECRET_EXTERNAL_DOMAIN}"
|
||||
prometheus:
|
||||
monitor:
|
||||
enabled: true
|
||||
relabelings:
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: kubernetes_node
|
@@ -0,0 +1,9 @@
|
||||
---
|
||||
# yaml-language-server: $schema=https://raw.githubusercontent.com/SchemaStore/schemastore/master/src/schemas/json/kustomization.json
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ./externalsecret.yaml
|
||||
- ./helmrelease.yaml
|
||||
- ./prometheusrule.yaml
|
||||
- ./scrapeconfig.yaml
|
@@ -0,0 +1,34 @@
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: miscellaneous-rules
|
||||
spec:
|
||||
groups:
|
||||
- name: dockerhub
|
||||
rules:
|
||||
- alert: BootstrapRateLimitRisk
|
||||
annotations:
|
||||
summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
|
||||
expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: oom
|
||||
rules:
|
||||
- alert: OOMKilled
|
||||
annotations:
|
||||
summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
|
||||
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
|
||||
labels:
|
||||
severity: critical
|
||||
- name: zfs
|
||||
rules:
|
||||
- alert: ZfsUnexpectedPoolState
|
||||
annotations:
|
||||
summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in a unexpected state {{$labels.state}}
|
||||
expr: node_zfs_zpool_state{state!="online"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
@@ -0,0 +1,86 @@
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ScrapeConfig
|
||||
metadata:
|
||||
name: &name node-exporter
|
||||
spec:
|
||||
staticConfigs:
|
||||
- targets:
|
||||
- pikvm.${SECRET_INTERNAL_DOMAIN}:9100
|
||||
- opnsense.${SECRET_INTERNAL_DOMAIN}:9273
|
||||
- storage.${SECRET_INTERNAL_DOMAIN}:9100
|
||||
metricsPath: /metrics
|
||||
relabelings:
|
||||
- action: replace
|
||||
targetLabel: job
|
||||
replacement: *name
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ScrapeConfig
|
||||
metadata:
|
||||
name: &name podman-exporter
|
||||
spec:
|
||||
staticConfigs:
|
||||
- targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9882"]
|
||||
metricsPath: /metrics
|
||||
relabelings:
|
||||
- action: replace
|
||||
targetLabel: job
|
||||
replacement: *name
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ScrapeConfig
|
||||
metadata:
|
||||
name: &name pikvm
|
||||
spec:
|
||||
staticConfigs:
|
||||
- targets: ["pikvm.${SECRET_INTERNAL_DOMAIN}"]
|
||||
metricsPath: /api/export/prometheus/metrics
|
||||
basicAuth:
|
||||
username:
|
||||
name: kube-prometheus-stack-secret
|
||||
key: PIKVM_USERNAME
|
||||
password:
|
||||
name: kube-prometheus-stack-secret
|
||||
key: PIKVM_PASSWORD
|
||||
scheme: HTTPS
|
||||
relabelings:
|
||||
- action: replace
|
||||
targetLabel: job
|
||||
replacement: *name
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ScrapeConfig
|
||||
metadata:
|
||||
name: &name zrepl
|
||||
spec:
|
||||
staticConfigs:
|
||||
- targets: ["storage.${SECRET_INTERNAL_DOMAIN}:9811"]
|
||||
metricsPath: /metrics
|
||||
relabelings:
|
||||
- action: replace
|
||||
targetLabel: job
|
||||
replacement: *name
|
||||
---
|
||||
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: ScrapeConfig
|
||||
metadata:
|
||||
name: &name minio
|
||||
spec:
|
||||
staticConfigs:
|
||||
- targets: ["s3.${SECRET_INTERNAL_DOMAIN}"]
|
||||
metricsPath: /minio/v2/metrics/cluster
|
||||
authorization:
|
||||
credentials:
|
||||
name: kube-prometheus-stack-secret
|
||||
key: MINIO_PROM_TOKEN
|
||||
scheme: HTTPS
|
||||
relabelings:
|
||||
- action: replace
|
||||
targetLabel: job
|
||||
replacement: *name
|
# (end of mirrored content)