---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
      chart: kube-prometheus-stack
      version: 16.9.1
      sourceRef:
        kind: HelmRepository
        name: prometheus-community-charts
        namespace: flux-system
      interval: 5m
  timeout: 20m
  values:
    server:
      resources:
        requests:
          memory: 1500Mi
          cpu: 200m
        limits:
          memory: 2000Mi
    prometheusOperator:
      createCustomResource: true
      configReloaderCpu: 200m
    alertmanager:
      ingress:
        enabled: true
        pathType: Prefix
        annotations:
          kubernetes.io/ingress.class: "nginx"
          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}/"
        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
        tls:
          - hosts:
              - alert-manager.${SECRET_CLUSTER_DOMAIN}
      config:
        global:
          resolve_timeout: 5m
        receivers:
          - name: "null"
          - name: "pushover"
            pushover_configs:
              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
        route:
          receiver: "pushover"
          routes:
            - match:
                alertname: Watchdog
              receiver: "null"
            - receiver: "pushover"
        inhibit_rules:
          - source_match:
              severity: "critical"
            target_match:
              severity: "warning"
            # Apply inhibition only when both the alertname and namespace match.
            equal: ["alertname", "namespace"]
      alertmanagerSpec:
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 10Gi
    nodeExporter:
      serviceMonitor:
        relabelings:
          - action: replace
            regex: (.*)
            replacement: $1
            sourceLabels:
              - __meta_kubernetes_pod_node_name
            targetLabel: kubernetes_node
    kubelet:
      serviceMonitor:
        metricRelabelings:
          - action: replace
            sourceLabels:
              - node
            targetLabel: instance
    grafana:
      adminPassword: ${SECRET_KUBE_PROMETHEUS_STACK_GRAFANA_ADMIN_PASSWORD}
      dashboards:
        default:
          kubernetes-custom:
            url: https://raw.githubusercontent.com/auricom/home-cluster/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temparatures.json
            datasource: Prometheus
          # Ref: https://grafana.com/grafana/dashboards/12175
          calico-felix:
            gnetId: 12175
            revision: 5
            datasource: Prometheus
          # Ref: https://grafana.com/grafana/dashboards/2842
          ceph-cluster:
            gnetId: 2842
            revision: 14
            datasource: Prometheus
          # Ref: https://grafana.com/grafana/dashboards/5336
          ceph-osd:
            gnetId: 5336
            revision: 5
            datasource: Prometheus
          # Ref: https://grafana.com/grafana/dashboards/5342
          ceph-pools:
            gnetId: 5342
            revision: 5
            datasource: Prometheus
          # Ref: https://grafana.com/grafana/dashboards/11315
          flux-cluster:
            url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/cluster.json
            datasource: Prometheus
          flux-control-plane:
            url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/control-plane.json
            datasource: Prometheus
          vernemq:
            url: https://raw.githubusercontent.com/vernemq/vernemq/master/metrics_scripts/grafana/VerneMQ%20Node%20Metrics.json
            datasource: Prometheus
          home-assistant:
            url: https://raw.githubusercontent.com/auricom/home-cluster/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
            datasource: Prometheus
          truenas:
            url: https://raw.githubusercontent.com/auricom/home-cluster/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
            datasource: Prometheus
          lidarr:
            url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/lidarr.json
            datasource: Prometheus
          radarr:
            url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/radarr.json
            datasource: Prometheus
          sonarr:
            url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/sonarr.json
            datasource: Prometheus
      deploymentStrategy:
        type: Recreate
      persistence:
        enabled: false
      env:
        GF_EXPLORE_ENABLED: true
        GF_DISABLE_SANITIZE_HTML: true
        GF_PANELS_DISABLE_SANITIZE_HTML: true
      plugins:
        - natel-discrete-panel
        - pr0ps-trackmap-panel
        - grafana-piechart-panel
        - vonage-status-panel
        - grafana-worldmap-panel
        - grafana-clock-panel
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: "default"
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      sidecar:
        datasources:
          enabled: true
          defaultDatasourceEnabled: false
        dashboards:
          enabled: true
          searchNamespace: ALL
      additionalDataSources:
        - name: Prometheus
          type: prometheus
          access: proxy
          url: http://thanos-query-http:10902/
          isDefault: true
      grafana.ini:
        server:
          root_url: https://grafana.${SECRET_CLUSTER_DOMAIN}
        paths:
          data: /var/lib/grafana/data
          logs: /var/log/grafana
          plugins: /var/lib/grafana/plugins
          provisioning: /etc/grafana/provisioning
        analytics:
          check_for_updates: true
        log:
          mode: console
        grafana_net:
          url: https://grafana.net
        smtp:
          enabled: false
      ingress:
        enabled: true
        pathType: Prefix
        annotations:
          kubernetes.io/ingress.class: "nginx"
        hosts: ["grafana.${SECRET_CLUSTER_DOMAIN}"]
        tls:
          - hosts:
              - grafana.${SECRET_CLUSTER_DOMAIN}
    kubeEtcd:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeScheduler:
      enabled: false
    kubeProxy:
      enabled: false
    prometheus:
      ingress:
        enabled: true
        pathType: Prefix
        annotations:
          kubernetes.io/ingress.class: "nginx"
          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}/"
        hosts: ["prometheus.${SECRET_CLUSTER_DOMAIN}"]
        tls:
          - hosts:
              - prometheus.${SECRET_CLUSTER_DOMAIN}
      prometheusSpec:
        replicas: 2
        replicaExternalLabelName: "replica"
        ruleSelector: {}
        ruleNamespaceSelector: {}
        ruleSelectorNilUsesHelmValues: false
        serviceMonitorSelector: {}
        serviceMonitorNamespaceSelector: {}
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelector: {}
        podMonitorNamespaceSelector: {}
        podMonitorSelectorNilUsesHelmValues: false
        retention: 6h
        enableAdminAPI: true
        walCompression: true
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 10Gi
        additionalScrapeConfigs:
          - job_name: "opnsense"
            scrape_interval: 60s
            metrics_path: "/metrics"
            static_configs:
              - targets: ["opnsense.${SECRET_CLUSTER_DOMAIN_ROOT}:9273"]
                labels:
                  app: "opnsense"
          - job_name: "hass"
            scrape_interval: 60s
            metrics_path: "/api/prometheus"
            bearer_token: "${SECRET_HASS_TOKEN}"
            static_configs:
              - targets: ["home-assistant.home.svc.cluster.local:8123"]
                labels:
                  app: "hass"
          - job_name: "truenas"
            scrape_interval: 60s
            metrics_path: "/metrics"
            static_configs:
              - targets: ["truenas.${SECRET_CLUSTER_DOMAIN_ROOT}:9273"]
                labels:
                  app: "truenas"
          - job_name: "truenas-remote"
            scrape_interval: 60s
            metrics_path: "/metrics"
            static_configs:
              - targets: ["truenas-remote.${SECRET_CLUSTER_DOMAIN_ROOT}:9273"]
                labels:
                  app: "truenas-remote"
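          # The probe jobs below (see the comment block that follows) only
          # target Kubernetes objects that opt in through annotations.
          # Illustrative sketch of a Service that the "kubernetes-services-http"
          # job would pick up; the Service name is hypothetical and not part of
          # this release:
          #
          #   apiVersion: v1
          #   kind: Service
          #   metadata:
          #     name: my-app  # hypothetical name
          #     annotations:
          #       prometheus.io/probe: "true"
          #       prometheus.io/protocol: "http"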
          # Example scrape config for probing ingresses via the Blackbox Exporter.
          #
          # The relabeling allows the actual ingress scrape endpoint to be configured
          # via the following annotations:
          #
          # * `prometheus.io/probe`: Only probe ingresses that have a value of `true`
          - job_name: "kubernetes-ingresses"
            metrics_path: /probe
            scrape_interval: 60s
            params:
              module: [http_2xx]
            kubernetes_sd_configs:
              - role: ingress
            relabel_configs:
              - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
                action: keep
                regex: true
              - source_labels:
                  [
                    __meta_kubernetes_ingress_scheme,
                    __address__,
                    __meta_kubernetes_ingress_path,
                  ]
                regex: (.+);(.+);(.+)
                replacement: ${1}://${2}${3}
                target_label: __param_target
              - target_label: __address__
                replacement: blackbox-exporter-prometheus-blackbox-exporter:9115
              - source_labels: [__param_target]
                target_label: instance
              - action: labelmap
                regex: __meta_kubernetes_ingress_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_ingress_name]
                target_label: kubernetes_name
          - job_name: "kubernetes-services-http"
            metrics_path: /probe
            scrape_interval: 60s
            params:
              module: [http_2xx]
            kubernetes_sd_configs:
              - role: service
            relabel_configs:
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
                action: keep
                regex: true
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_protocol]
                action: keep
                regex: http
              - source_labels: [__address__]
                target_label: __param_target
              - target_label: __address__
                replacement: blackbox-exporter-prometheus-blackbox-exporter:9115
              - source_labels: [__param_target]
                target_label: instance
              - action: labelmap
                regex: __meta_kubernetes_service_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_service_name]
                target_label: kubernetes_name
          - job_name: "kubernetes-services-tcp"
            metrics_path: /probe
            scrape_interval: 60s
            params:
              module: [tcp_connect]
            kubernetes_sd_configs:
              - role: service
            relabel_configs:
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
                action: keep
                regex: true
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_protocol]
                action: keep
                regex: tcp
              - source_labels: [__address__]
                target_label: __param_target
              - target_label: __address__
                replacement: blackbox-exporter-prometheus-blackbox-exporter:9115
              - source_labels: [__param_target]
                target_label: instance
              - action: labelmap
                regex: __meta_kubernetes_service_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_service_name]
                target_label: kubernetes_name
        affinity:
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              - labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - prometheus
                topologyKey: "kubernetes.io/hostname"
        thanos:
          image: quay.io/thanos/thanos:v0.21.1
          objectStorageConfig:
            name: thanos-objstore-secret
            key: objstore.yml
      thanosService:
        enabled: true
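# The Thanos sidecar above reads its object storage settings from the
# "thanos-objstore-secret" Secret, key "objstore.yml". A minimal sketch of
# that file, assuming an S3-compatible backend; the bucket, endpoint, and
# credentials below are placeholders, not values from this repository:
#
#   type: S3
#   config:
#     bucket: thanos
#     endpoint: s3.example.com
#     access_key: <access-key>
#     secret_key: <secret-key>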