diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml
index f090b834f..59dabad7d 100644
--- a/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml
@@ -27,3 +27,28 @@ spec:
       APP: *app
       # renovate: datasource=docker depName=quay.io/thanos/thanos
       THANOS_VERSION: v0.35.0
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
+# Applies the custom PrometheusRule manifests under ./rules; it depends on
+# the kube-prometheus-stack Kustomization being reconciled first.
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: &app kube-prometheus-rules
+  namespace: flux-system
+spec:
+  targetNamespace: monitoring
+  commonMetadata:
+    labels:
+      app.kubernetes.io/name: *app
+  dependsOn:
+    - name: kube-prometheus-stack
+  path: ./kubernetes/apps/monitoring/kube-prometheus-stack/rules
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: home-ops-kubernetes
+  wait: false
+  interval: 30m
+  retryInterval: 1m
+  timeout: 5m
diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/rules/kustomization.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/rules/kustomization.yaml
new file mode 100644
index 000000000..5ebada61e
--- /dev/null
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/rules/kustomization.yaml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/SchemaStore/schemastore/master/src/schemas/json/kustomization.json
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: monitoring
+resources:
+  - ./zfs.yaml
diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/rules/zfs.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/rules/zfs.yaml
new file mode 100644
index 000000000..18ceedcc2
--- /dev/null
+++ b/kubernetes/apps/monitoring/kube-prometheus-stack/rules/zfs.yaml
@@ -0,0 +1,21 @@
+---
+# Alerting rule for zrepl replication failures.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: zrepl-replication-errors
+  namespace: monitoring
+spec:
+  groups:
+    - name: zrepl.rules
+      rules:
+        - alert: ZreplReplicationFilesystemErrors
+          # NOTE(review): zrepl's metric help text describes this gauge as -1
+          # when a job fails before enumerating filesystems, so match any
+          # non-zero value instead of only positive counts - confirm against
+          # the deployed zrepl version.
+          expr: zrepl_replication_filesystem_errors != 0
+          for: 15m  # Alert if the condition persists for 15 minutes
+          labels:
+            severity: warning
+          annotations:
+            summary: "Zrepl replication filesystem errors detected"
+            description: "Zrepl job {{ $labels.zrepl_job }} has encountered filesystem errors during replication."