# Patch overview: adds kured to the cluster configuration.
# This copy of the unified diff is flattened; line breaks and indentation
# inside the hunks are not preserved here.
#
# Sections, in order:
# 1. cluster/base-custom/charts/kustomization.yaml
#    - removes fairwinds-charts.yaml from `resources`
#    - appends weaveworks-kured-charts.yaml to `resources`
# 2. fairwinds-charts.yaml is renamed to weaveworks-kured-charts.yaml
#    - HelmRepository name becomes weaveworks-kured-charts
#    - url becomes https://weaveworks.github.io/kured
# 3. cluster/core/infrastructure/kured/helm-release.yaml (new file)
#    - HelmRelease `kured` in namespace kube-system, chart `kured` 2.4.3,
#      sourced from the weaveworks-kured-charts HelmRepository in flux-system
#    - values set: updateStrategy RollingUpdate; rebootDays `we`;
#      startTime "2:00"; endTime "5:00"; timeZone "Europe/Paris";
#      a toleration with key node-role.kubernetes.io/master, operator Exists;
#      metrics.create true
# 4. cluster/core/infrastructure/kured/kustomization.yaml (new file)
#    - lists helm-release.yaml and prometheus-rule.yaml
# 5. cluster/core/infrastructure/kured/prometheus-rule.yaml (new file)
#    - PrometheusRule `kured-rules` in kube-system, group `kured.rules`
#    - alert RebootRequired: max(kured_reboot_required) != 0, for 24h, warning
#    - alert RebootScheduled: kured_reboot_required > 0, for 5m, warning
# 6. cluster/core/infrastructure/kustomization.yaml
#    - adds `kured` to `resources`
#
# NOTE(review): section 2 replaces the fairwinds-charts HelmRepository rather
# than adding a new one; confirm no HelmRelease outside this patch still
# references fairwinds-charts.
# NOTE(review): the nesting of the `values` keys (for example whether
# `tolerations` sits under `configuration` or beside it) cannot be determined
# from this flattened copy; verify against the chart's values layout.
diff --git a/cluster/base-custom/charts/kustomization.yaml b/cluster/base-custom/charts/kustomization.yaml index 86ec0e22d..bad039583 100644 --- a/cluster/base-custom/charts/kustomization.yaml +++ b/cluster/base-custom/charts/kustomization.yaml @@ -6,7 +6,6 @@ resources: - cert-manager-webhook-ovh.yaml - coredns-charts.yaml - drone-charts.yaml - - fairwinds-charts.yaml - falco-security-charts.yaml - gitea-charts.yaml - grafana-loki-charts.yaml @@ -25,3 +24,4 @@ resources: - stakater-charts.yaml - twuni-charts.yaml - vernemq-charts.yaml + - weaveworks-kured-charts.yaml diff --git a/cluster/base-custom/charts/fairwinds-charts.yaml b/cluster/base-custom/charts/weaveworks-kured-charts.yaml similarity index 65% rename from cluster/base-custom/charts/fairwinds-charts.yaml rename to cluster/base-custom/charts/weaveworks-kured-charts.yaml index 231172f2d..7c4c4f267 100644 --- a/cluster/base-custom/charts/fairwinds-charts.yaml +++ b/cluster/base-custom/charts/weaveworks-kured-charts.yaml @@ -2,9 +2,9 @@ apiVersion: source.toolkit.fluxcd.io/v1beta1 kind: HelmRepository metadata: - name: fairwinds-charts + name: weaveworks-kured-charts namespace: flux-system spec: interval: 10m - url: https://charts.fairwinds.com/stable + url: https://weaveworks.github.io/kured timeout: 3m diff --git a/cluster/core/infrastructure/kured/helm-release.yaml b/cluster/core/infrastructure/kured/helm-release.yaml new file mode 100644 index 000000000..75a6fb382 --- /dev/null +++ b/cluster/core/infrastructure/kured/helm-release.yaml @@ -0,0 +1,31 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: kured + namespace: kube-system +spec: + interval: 5m + chart: + spec: + # renovate: registryUrl=https://weaveworks.github.io/kured + chart: kured + version: 2.4.3 + sourceRef: + kind: HelmRepository + name: weaveworks-kured-charts + namespace: flux-system + interval: 5m + values: + updateStrategy: RollingUpdate + configuration: + rebootDays: + - we + startTime: "2:00" + 
endTime: "5:00" + timeZone: "Europe/Paris" + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Exists" + metrics: + create: true diff --git a/cluster/core/infrastructure/kured/kustomization.yaml b/cluster/core/infrastructure/kured/kustomization.yaml new file mode 100644 index 000000000..f7be7f203 --- /dev/null +++ b/cluster/core/infrastructure/kured/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml + - prometheus-rule.yaml diff --git a/cluster/core/infrastructure/kured/prometheus-rule.yaml b/cluster/core/infrastructure/kured/prometheus-rule.yaml new file mode 100644 index 000000000..80aae3cc4 --- /dev/null +++ b/cluster/core/infrastructure/kured/prometheus-rule.yaml @@ -0,0 +1,29 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: kured-rules + namespace: kube-system +spec: + groups: + - name: kured.rules + rules: + - alert: RebootRequired + annotations: + description: Node(s) require a manual reboot + summary: Reboot daemon has failed to do so for 24 hours + expr: max(kured_reboot_required) != 0 + for: 24h + labels: + severity: warning + - alert: RebootScheduled + annotations: + description: Node Reboot Scheduled + summary: Node {{$labels.node}} has been scheduled to reboot + expr: kured_reboot_required > 0 + for: 5m + labels: + severity: warning diff --git a/cluster/core/infrastructure/kustomization.yaml b/cluster/core/infrastructure/kustomization.yaml index fa18fc2e5..5302b8c5c 100644 --- a/cluster/core/infrastructure/kustomization.yaml +++ b/cluster/core/infrastructure/kustomization.yaml @@ -5,6 +5,7 @@ resources: - descheduler - flux - intel-gpu-plugin + - kured - node-feature-discovery - rook-ceph - system-upgrade