diff --git a/kubernetes/apps/monitoring/kustomization.yaml b/kubernetes/apps/monitoring/kustomization.yaml
index c8cde334e..1a41727f7 100644
--- a/kubernetes/apps/monitoring/kustomization.yaml
+++ b/kubernetes/apps/monitoring/kustomization.yaml
@@ -9,4 +9,5 @@ resources:
   - ./gatus/ks.yaml
   - ./grafana/ks.yaml
   - ./kube-prometheus-stack/ks.yaml
+  - ./loki/ks.yaml
   - ./thanos/ks.yaml
diff --git a/kubernetes/apps/monitoring/loki/app/gatus.yaml b/kubernetes/apps/monitoring/loki/app/gatus.yaml
new file mode 100644
index 000000000..a25142766
--- /dev/null
+++ b/kubernetes/apps/monitoring/loki/app/gatus.yaml
@@ -0,0 +1,22 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-gatus-ep
+  namespace: monitoring
+  labels:
+    gatus.io/enabled: "true"
+data:
+  config.yaml: |
+    endpoints:
+      - name: loki
+        group: internal
+        url: https://loki.${SECRET_CLUSTER_DOMAIN}
+        interval: 1m
+        client:
+          dns-resolver: tcp://1.1.1.1:53
+          insecure: true
+        conditions:
+          - "[STATUS] == 200"
+        alerts:
+          - type: pushover
diff --git a/kubernetes/archive/loki/app/helmrelease.yaml b/kubernetes/apps/monitoring/loki/app/helmrelease.yaml
similarity index 58%
rename from kubernetes/archive/loki/app/helmrelease.yaml
rename to kubernetes/apps/monitoring/loki/app/helmrelease.yaml
index 8a494a4c8..eea63d371 100644
--- a/kubernetes/archive/loki/app/helmrelease.yaml
+++ b/kubernetes/apps/monitoring/loki/app/helmrelease.yaml
@@ -7,17 +7,17 @@ metadata:
   namespace: monitoring
 spec:
   interval: 30m
+  timeout: 15m
   chart:
     spec:
       chart: loki
-      version: 5.27.0
+      version: 5.36.3
      sourceRef:
         kind: HelmRepository
         name: grafana
         namespace: flux-system
   maxHistory: 2
   install:
-    createNamespace: true
     remediation:
       retries: 3
   upgrade:
@@ -26,6 +26,9 @@ spec:
       retries: 3
   uninstall:
     keepHistory: false
+  dependsOn:
+    - name: rook-ceph-cluster
+      namespace: rook-ceph
   values:
     loki:
       structuredConfig:
@@ -43,20 +46,24 @@ spec:
           reject_old_samples_max_age: 168h
           max_cache_freshness_per_query: 10m
           split_queries_by_interval: 15m
-          ingestion_rate_mb: 8
-          ingestion_burst_size_mb: 16
+          ingestion_rate_mb: 50
+          ingestion_burst_size_mb: 1000
+          per_stream_rate_limit: 5MB
+          per_stream_rate_limit_burst: 20MB
+          shard_streams:
+            enabled: true
         schema_config:
           configs:
-            - from: "2021-08-01"
+            - from: "2022-01-11" # quote
               store: boltdb-shipper
               object_store: s3
-              schema: v11
+              schema: v12
               index:
                 prefix: loki_index_
                 period: 24h
         common:
           path_prefix: /var/loki
-          replication_factor: 3
+          replication_factor: 2
           storage:
             s3:
               s3: null
@@ -68,7 +75,7 @@ spec:
         ruler:
           enable_api: true
           enable_alertmanager_v2: true
-          alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
+          alertmanager_url: http://alertmanager-operated.monitoring.svc.cluster.local:9093
           storage:
             type: local
             local:
@@ -97,73 +104,55 @@ spec:
       analytics:
         reporting_enabled: false
     gateway:
-      enabled: true
-      replicas: 3
-      affinity: |
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 1
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    {{- include "loki.gatewaySelectorLabels" . | nindent 12 }}
-                topologyKey: kubernetes.io/hostname
+      replicas: 2
+      image:
+        registry: ghcr.io
       ingress:
         enabled: true
         ingressClassName: "nginx"
         annotations:
           hajimari.io/enable: "false"
         hosts:
-          - host: &host "loki.${SECRET_CLUSTER_DOMAIN}"
+          - host: &host loki.devbu.io
             paths:
               - path: /
                 pathType: Prefix
         tls:
           - hosts:
               - *host
-    write:
-      replicas: 3
-      affinity: |
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 1
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    {{- include "loki.writeSelectorLabels" . | nindent 12 }}
-                topologyKey: kubernetes.io/hostname
-      persistence:
-        size: 10Gi
-        storageClass: rook-ceph-block
     read:
-      replicas: 3
-      affinity: |
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 1
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    {{- include "loki.readSelectorLabels" . | nindent 12 }}
-                topologyKey: kubernetes.io/hostname
-      extraVolumeMounts:
-        - name: loki-rules
-          mountPath: /rules/fake
-        - name: loki-rules-tmp
-          mountPath: /tmp/scratch
-        - name: loki-tmp
-          mountPath: /tmp/loki-tmp
-      extraVolumes:
-        - name: loki-rules
-          emptyDir: {}
-        - name: loki-rules-tmp
-          emptyDir: {}
-        - name: loki-tmp
-          emptyDir: {}
+      replicas: 2
       persistence:
-        size: 10Gi
-        storageClass: rook-ceph-block
+        storageClass: local-path
+      extraVolumeMounts:
+        - name: rules
+          mountPath: /rules
+      extraVolumes:
+        - name: rules
+          emptyDir: {}
+    write:
+      replicas: 2
+      persistence:
+        storageClass: local-path
+    backend:
+      replicas: 2
+      persistence:
+        storageClass: local-path
+      extraVolumeMounts:
+        - name: rules
+          mountPath: /rules/fake
+        - name: scratch
+          mountPath: /tmp/scratch
+      extraVolumes:
+        - name: rules
+          configMap:
+            name: loki-alerting-rules
+        - name: scratch
+          emptyDir: {}
     monitoring:
+      dashboards:
+        annotations:
+          grafana_folder: Loki
       serviceMonitor:
         enabled: false
       metricsInstance:
@@ -172,24 +161,24 @@ spec:
         enabled: false
         grafanaAgent:
           installOperator: false
-        lokiCanary:
-          enabled: false
+      lokiCanary:
+        enabled: false
     test:
       enabled: false
   valuesFrom:
-    - kind: ConfigMap
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.bucketnames
+      kind: ConfigMap
+      name: loki-bucket
       valuesKey: BUCKET_NAME
-      targetPath: loki.structuredConfig.common.storage.s3.bucketnames
-    - kind: ConfigMap
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.endpoint
+      kind: ConfigMap
+      name: loki-bucket
       valuesKey: BUCKET_HOST
-      targetPath: loki.structuredConfig.common.storage.s3.endpoint
-    - kind: Secret
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.access_key_id
+      kind: Secret
+      name: loki-bucket
       valuesKey: AWS_ACCESS_KEY_ID
-      targetPath: loki.structuredConfig.common.storage.s3.access_key_id
-    - kind: Secret
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
+      kind: Secret
+      name: loki-bucket
       valuesKey: AWS_SECRET_ACCESS_KEY
-      targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
diff --git a/kubernetes/archive/loki/app/kustomization.yaml b/kubernetes/apps/monitoring/loki/app/kustomization.yaml
similarity index 55%
rename from kubernetes/archive/loki/app/kustomization.yaml
rename to kubernetes/apps/monitoring/loki/app/kustomization.yaml
index 41c071142..5fd1ac4b3 100644
--- a/kubernetes/archive/loki/app/kustomization.yaml
+++ b/kubernetes/apps/monitoring/loki/app/kustomization.yaml
@@ -4,6 +4,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 namespace: monitoring
 resources:
-  - ./object-bucket-claim.yaml
-  - ./config-map.yaml
   - ./helmrelease.yaml
+  - ./objectbucketclaim.yaml
+configMapGenerator:
+  - name: loki-alerting-rules
+    files:
+      - loki-alerting-rules.yaml=./rules/loki-alerting-rules.yaml
+generatorOptions:
+  disableNameSuffixHash: true
diff --git a/kubernetes/archive/loki/app/object-bucket-claim.yaml b/kubernetes/apps/monitoring/loki/app/objectbucketclaim.yaml
similarity index 60%
rename from kubernetes/archive/loki/app/object-bucket-claim.yaml
rename to kubernetes/apps/monitoring/loki/app/objectbucketclaim.yaml
index 392f05fe3..541053be4 100644
--- a/kubernetes/archive/loki/app/object-bucket-claim.yaml
+++ b/kubernetes/apps/monitoring/loki/app/objectbucketclaim.yaml
@@ -2,10 +2,8 @@
 apiVersion: objectbucket.io/v1alpha1
 kind: ObjectBucketClaim
 metadata:
-  name: loki-chunks-bucket
+  name: loki-bucket
   namespace: monitoring
 spec:
-  bucketName: loki-chunks
+  bucketName: loki
   storageClassName: rook-ceph-bucket
-  additionalConfig:
-    maxSize: "50G"
diff --git a/kubernetes/apps/monitoring/loki/app/rules/loki-alerting-rules.yaml b/kubernetes/apps/monitoring/loki/app/rules/loki-alerting-rules.yaml
new file mode 100644
index 000000000..fb2dafd39
--- /dev/null
+++ b/kubernetes/apps/monitoring/loki/app/rules/loki-alerting-rules.yaml
@@ -0,0 +1,79 @@
+---
+groups:
+  - name: smart
+    rules:
+      - alert: SMARTFailure
+        expr: |
+          sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          hostname: "{{ $labels.hostname }}"
+          summary: "{{ $labels.hostname }} has reported SMART failures"
+
+  - name: zigbee2mqtt
+    rules:
+      - alert: ZigbeeMQTTUnreachable
+        expr: |
+          sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to reach MQTT"
+
+  - name: zwave-js-ui
+    rules:
+      - alert: ZwaveMQTTUnreachable
+        expr: |
+          sum(count_over_time({app="zwave-js-ui"} |~ "(?i)error while connecting mqtt"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to reach MQTT"
+
+  - name: frigate
+    rules:
+      - alert: FrigateMQTTUnreachable
+        expr: |
+          sum(count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to reach MQTT"
+
+  - name: home-assistant
+    rules:
+      - alert: HomeAssistantPostgresUnreachable
+        expr: |
+          sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to connect to postgres"
+
+  - name: bazarr
+    rules:
+      - alert: BazarrJobRaisedException
+        expr: |
+          sum by (app) (count_over_time({app="bazarr"} |~ "(?i)Job(.+)Update(.+)from(.+)raised an exception"[2m])) > 0
+        for: 2m
+        labels:
+          severity: warning
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is raising job exceptions"
diff --git a/kubernetes/archive/loki/ks.yaml b/kubernetes/apps/monitoring/loki/ks.yaml
similarity index 84%
rename from kubernetes/archive/loki/ks.yaml
rename to kubernetes/apps/monitoring/loki/ks.yaml
index 8d62df5d0..15a3c0ca5 100644
--- a/kubernetes/archive/loki/ks.yaml
+++ b/kubernetes/apps/monitoring/loki/ks.yaml
@@ -3,13 +3,13 @@
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: cluster-apps-loki-app
+  name: cluster-apps-loki
   namespace: flux-system
   labels:
     substitution.flux.home.arpa/enabled: "true"
 spec:
   dependsOn:
-    - name: cluster-apps-rook-ceph-cluster
+    - name: cluster-apps-external-secrets-stores
   path: ./kubernetes/apps/monitoring/loki/app
   prune: true
   sourceRef:
@@ -17,4 +17,4 @@ spec:
     name: home-ops-kubernetes
   interval: 30m
   retryInterval: 1m
-  timeout: 5m
+  timeout: 3m
diff --git a/kubernetes/archive/loki/app/config-map.yaml b/kubernetes/archive/loki/app/config-map.yaml
deleted file mode 100644
index fa92fc40b..000000000
--- a/kubernetes/archive/loki/app/config-map.yaml
+++ /dev/null
@@ -1,130 +0,0 @@
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: loki-alerting-rules
-  namespace: monitoring
-data:
-  loki-alerting-rules.yaml: |-
-    groups:
-      #
-      # SMART Failures
-      #
-      - name: smart-failure
-        rules:
-          - alert: SmartFailures
-            expr: |
-              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "SMART has reported failures on host {{ $labels.hostname }}"
-      #
-      # zigbee2mqtt
-      #
-      - name: zigbee2mqtt
-        rules:
-          - alert: ZigbeeUnableToReachMQTT
-            expr: |
-              sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Zigbee2mqtt is unable to reach MQTT"
-      #
-      # zwavejs2mqtt
-      #
-      - name: zwavejs2mqtt
-        rules:
-          - alert: ZwaveUnableToReachMQTT
-            expr: |
-              sum(count_over_time({app="zwavejs2mqtt"} |~ "(?i)error while connecting mqtt"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Zwavejs2mqtt is unable to reach MQTT"
-      #
-      # frigate
-      #
-      - name: frigate
-        rules:
-          - alert: FrigateUnableToReachMQTT
-            expr: |
-              sum(count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Frigate is unable to reach MQTT"
-      #
-      # *arr
-      #
-      - name: arr
-        rules:
-          - alert: ArrDatabaseIsLocked
-            expr: |
-              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "{{ $labels.app }} is experiencing locked database issues"
-          - alert: ArrDatabaseIsMalformed
-            expr: |
-              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "{{ $labels.app }} is experiencing malformed database disk image issues"
-      #
-      # home-assistant
-      #
-      - name: home-assistant
-        rules:
-          - alert: HomeAssistantUnableToReachPostgresql
-            expr: |
-              sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Home Assistant is unable to connect to postgresql"
-      #
-      # valetudo
-      #
-      - name: valetudo
-        rules:
-          - alert: ValetudoUnableToReachMQTT
-            expr: |
-              sum by (hostname) (count_over_time({hostname="valetudo"} |~ "(?i).*error.*mqtt.*"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Valetudo is unable to connect to mqtt"
-      #
-      # node-red
-      #
-      - name: node-red
-        rules:
-          - alert: NodeRedUnableToReachHomeAssistant
-            expr: |
-              sum by (app) (count_over_time({app="node-red"} |~ "(?i)home assistant.*connecting to undefined"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Node-Red is unable to connect to Home Assistant"