mirror of https://github.com/auricom/home-cluster.git (synced 2025-09-17 18:24:14 +02:00)

🚀 loki
@@ -9,4 +9,5 @@ resources:
   - ./gatus/ks.yaml
   - ./grafana/ks.yaml
   - ./kube-prometheus-stack/ks.yaml
+  - ./loki/ks.yaml
   - ./thanos/ks.yaml

kubernetes/apps/monitoring/loki/app/gatus.yaml (new file, 22 lines)
@@ -0,0 +1,22 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-gatus-ep
+  namespace: monitoring
+  labels:
+    gatus.io/enabled: "true"
+data:
+  config.yaml: |
+    endpoints:
+      - name: loki
+        group: internal
+        url: https://loki.${SECRET_CLUSTER_DOMAIN}
+        interval: 1m
+        client:
+          dns-resolver: tcp://1.1.1.1:53
+          insecure: true
+        conditions:
+          - "[STATUS] == 200"
+        alerts:
+          - type: pushover
@@ -7,17 +7,17 @@ metadata:
   namespace: monitoring
 spec:
   interval: 30m
   timeout: 15m
   chart:
     spec:
       chart: loki
-      version: 5.27.0
+      version: 5.36.3
       sourceRef:
         kind: HelmRepository
         name: grafana
         namespace: flux-system
   maxHistory: 2
   install:
     createNamespace: true
     remediation:
       retries: 3
   upgrade:
@@ -26,6 +26,9 @@ spec:
       retries: 3
   uninstall:
     keepHistory: false
+  dependsOn:
+    - name: rook-ceph-cluster
+      namespace: rook-ceph
   values:
     loki:
       structuredConfig:
@@ -43,20 +46,24 @@ spec:
           reject_old_samples_max_age: 168h
           max_cache_freshness_per_query: 10m
           split_queries_by_interval: 15m
-          ingestion_rate_mb: 8
-          ingestion_burst_size_mb: 16
+          ingestion_rate_mb: 50
+          ingestion_burst_size_mb: 1000
+          per_stream_rate_limit: 5MB
+          per_stream_rate_limit_burst: 20MB
+          shard_streams:
+            enabled: true
         schema_config:
           configs:
-            - from: "2021-08-01"
+            - from: "2022-01-11" # quote
               store: boltdb-shipper
               object_store: s3
-              schema: v11
+              schema: v12
               index:
                 prefix: loki_index_
                 period: 24h
         common:
           path_prefix: /var/loki
-          replication_factor: 3
+          replication_factor: 2
           storage:
             s3:
               s3: null
@@ -68,7 +75,7 @@ spec:
         ruler:
           enable_api: true
           enable_alertmanager_v2: true
-          alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
+          alertmanager_url: http://alertmanager-operated.monitoring.svc.cluster.local:9093
           storage:
             type: local
             local:
@@ -97,73 +104,55 @@ spec:
         analytics:
           reporting_enabled: false
     gateway:
       enabled: true
-      replicas: 3
-      affinity: |
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 1
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    {{- include "loki.gatewaySelectorLabels" . | nindent 12 }}
-                topologyKey: kubernetes.io/hostname
+      replicas: 2
       image:
         registry: ghcr.io
       ingress:
         enabled: true
         ingressClassName: "nginx"
         annotations:
           hajimari.io/enable: "false"
         hosts:
-          - host: &host "loki.${SECRET_CLUSTER_DOMAIN}"
+          - host: &host loki.devbu.io
             paths:
              - path: /
                pathType: Prefix
         tls:
           - hosts:
               - *host
-    write:
-      replicas: 3
-      affinity: |
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 1
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    {{- include "loki.writeSelectorLabels" . | nindent 12 }}
-                topologyKey: kubernetes.io/hostname
-      persistence:
-        size: 10Gi
-        storageClass: rook-ceph-block
     read:
-      replicas: 3
-      affinity: |
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 1
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    {{- include "loki.readSelectorLabels" . | nindent 12 }}
-                topologyKey: kubernetes.io/hostname
-      extraVolumeMounts:
-        - name: loki-rules
-          mountPath: /rules/fake
-        - name: loki-rules-tmp
-          mountPath: /tmp/scratch
-        - name: loki-tmp
-          mountPath: /tmp/loki-tmp
-      extraVolumes:
-        - name: loki-rules
-          emptyDir: {}
-        - name: loki-rules-tmp
-          emptyDir: {}
-        - name: loki-tmp
-          emptyDir: {}
+      replicas: 2
       persistence:
         size: 10Gi
-        storageClass: rook-ceph-block
+        storageClass: local-path
+      extraVolumeMounts:
+        - name: rules
+          mountPath: /rules
+      extraVolumes:
+        - name: rules
+          emptyDir: {}
+    write:
+      replicas: 2
+      persistence:
+        storageClass: local-path
+    backend:
+      replicas: 2
+      persistence:
+        storageClass: local-path
+      extraVolumeMounts:
+        - name: rules
+          mountPath: /rules/fake
+        - name: scratch
+          mountPath: /tmp/scratch
+      extraVolumes:
+        - name: rules
+          configMap:
+            name: loki-alerting-rules
+        - name: scratch
+          emptyDir: {}
     monitoring:
       dashboards:
         annotations:
           grafana_folder: Loki
       serviceMonitor:
         enabled: false
       metricsInstance:
@@ -172,24 +161,24 @@ spec:
         enabled: false
         grafanaAgent:
           installOperator: false
-      lokiCanary:
-        enabled: false
+      lokiCanary:
+        enabled: false
     test:
       enabled: false
   valuesFrom:
-    - kind: ConfigMap
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.bucketnames
+      kind: ConfigMap
+      name: loki-bucket
       valuesKey: BUCKET_NAME
-      targetPath: loki.structuredConfig.common.storage.s3.bucketnames
-    - kind: ConfigMap
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.endpoint
+      kind: ConfigMap
+      name: loki-bucket
       valuesKey: BUCKET_HOST
-      targetPath: loki.structuredConfig.common.storage.s3.endpoint
-    - kind: Secret
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.access_key_id
+      kind: Secret
+      name: loki-bucket
       valuesKey: AWS_ACCESS_KEY_ID
-      targetPath: loki.structuredConfig.common.storage.s3.access_key_id
-    - kind: Secret
-      name: loki-chunks-bucket
+    - targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
+      kind: Secret
+      name: loki-bucket
       valuesKey: AWS_SECRET_ACCESS_KEY
-      targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
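
For context: the four valuesFrom entries above pull their values from the ConfigMap and Secret that Rook's bucket provisioner generates for the loki-bucket ObjectBucketClaim shown further down. A minimal sketch of those generated objects, assuming the standard lib-bucket-provisioner key names and an assumed RGW service name (neither object is committed to the repo):

# Sketch only, not part of the commit: objects created by the Rook OBC controller.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-bucket              # same name as the ObjectBucketClaim
  namespace: monitoring
data:
  BUCKET_NAME: loki                                          # -> loki.structuredConfig.common.storage.s3.bucketnames
  BUCKET_HOST: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc  # assumed service name -> ...s3.endpoint
---
apiVersion: v1
kind: Secret
metadata:
  name: loki-bucket
  namespace: monitoring
data:
  AWS_ACCESS_KEY_ID: <base64>            # -> ...s3.access_key_id
  AWS_SECRET_ACCESS_KEY: <base64>        # -> ...s3.secret_access_key
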
@@ -4,6 +4,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 namespace: monitoring
 resources:
-  - ./object-bucket-claim.yaml
-  - ./config-map.yaml
   - ./helmrelease.yaml
+  - ./objectbucketclaim.yaml
+configMapGenerator:
+  - name: loki-alerting-rules
+    files:
+      - loki-alerting-rules.yaml=./rules/loki-alerting-rules.yaml
+generatorOptions:
+  disableNameSuffixHash: true
@@ -2,10 +2,8 @@
 apiVersion: objectbucket.io/v1alpha1
 kind: ObjectBucketClaim
 metadata:
-  name: loki-chunks-bucket
+  name: loki-bucket
   namespace: monitoring
 spec:
-  bucketName: loki-chunks
+  bucketName: loki
   storageClassName: rook-ceph-bucket
-  additionalConfig:
-    maxSize: "50G"
@@ -0,0 +1,79 @@
+---
+groups:
+  - name: smart
+    rules:
+      - alert: SMARTFailure
+        expr: |
+          sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          hostname: "{{ $labels.hostname }}"
+          summary: "{{ $labels.hostname }} has reported SMART failures"
+
+  - name: zigbee2mqtt
+    rules:
+      - alert: ZigbeeMQTTUnreachable
+        expr: |
+          sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to reach MQTT"
+
+  - name: zwave-js-ui
+    rules:
+      - alert: ZwaveMQTTUnreachable
+        expr: |
+          sum(count_over_time({app="zwave-js-ui"} |~ "(?i)error while connecting mqtt"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to reach MQTT"
+
+  - name: frigate
+    rules:
+      - alert: FrigateMQTTUnreachable
+        expr: |
+          sum(count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to reach MQTT"
+
+  - name: home-assistant
+    rules:
+      - alert: HomeAssistantPostgresUnreachable
+        expr: |
+          sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
+        for: 2m
+        labels:
+          severity: critical
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is unable to connect to postgres"
+
+  - name: bazarr
+    rules:
+      - alert: BazarrJobRaisedException
+        expr: |
+          sum by (app) (count_over_time({app="bazarr"} |~ "(?i)Job(.+)Update(.+)from(.+)raised an exception"[2m])) > 0
+        for: 2m
+        labels:
+          severity: warning
+          category: logs
+        annotations:
+          app: "{{ $labels.app }}"
+          summary: "{{ $labels.app }} is raising job exceptions"
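
These rules reach the Loki ruler through the configMapGenerator added in the kustomize Kustomization earlier in this commit: kustomize renders the file into a loki-alerting-rules ConfigMap (the name stays stable because of disableNameSuffixHash: true), and the HelmRelease mounts that ConfigMap into the backend pods at /rules/fake, which is where the ruler's local storage looks for the rules of the default "fake" tenant. A sketch of the generated ConfigMap, assuming standard kustomize output:

# Sketch only, not part of the commit: what the configMapGenerator renders.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-alerting-rules
  namespace: monitoring
data:
  loki-alerting-rules.yaml: |
    groups:
      - name: smart
        rules:
          - alert: SMARTFailure
            # ...remaining rules exactly as in the file above
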
@@ -3,13 +3,13 @@
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: cluster-apps-loki-app
+  name: cluster-apps-loki
   namespace: flux-system
   labels:
     substitution.flux.home.arpa/enabled: "true"
 spec:
   dependsOn:
-    - name: cluster-apps-rook-ceph-cluster
+    - name: cluster-apps-external-secrets-stores
   path: ./kubernetes/apps/monitoring/loki/app
   prune: true
   sourceRef:
@@ -17,4 +17,4 @@ spec:
     name: home-ops-kubernetes
   interval: 30m
   retryInterval: 1m
-  timeout: 5m
+  timeout: 3m
@@ -1,130 +0,0 @@
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: loki-alerting-rules
-  namespace: monitoring
-data:
-  loki-alerting-rules.yaml: |-
-    groups:
-      #
-      # SMART Failures
-      #
-      - name: smart-failure
-        rules:
-          - alert: SmartFailures
-            expr: |
-              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "SMART has reported failures on host {{ $labels.hostname }}"
-      #
-      # zigbee2mqtt
-      #
-      - name: zigbee2mqtt
-        rules:
-          - alert: ZigbeeUnableToReachMQTT
-            expr: |
-              sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Zigbee2mqtt is unable to reach MQTT"
-      #
-      # zwavejs2mqtt
-      #
-      - name: zwavejs2mqtt
-        rules:
-          - alert: ZwaveUnableToReachMQTT
-            expr: |
-              sum(count_over_time({app="zwavejs2mqtt"} |~ "(?i)error while connecting mqtt"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Zwavejs2mqtt is unable to reach MQTT"
-      #
-      # frigate
-      #
-      - name: frigate
-        rules:
-          - alert: FrigateUnableToReachMQTT
-            expr: |
-              sum(count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Frigate is unable to reach MQTT"
-      #
-      # *arr
-      #
-      - name: arr
-        rules:
-          - alert: ArrDatabaseIsLocked
-            expr: |
-              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "{{ $labels.app }} is experiencing locked database issues"
-          - alert: ArrDatabaseIsMalformed
-            expr: |
-              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "{{ $labels.app }} is experiencing malformed database disk image issues"
-      #
-      # home-assistant
-      #
-      - name: home-assistant
-        rules:
-          - alert: HomeAssistantUnableToReachPostgresql
-            expr: |
-              sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Home Assistant is unable to connect to postgresql"
-      #
-      # valetudo
-      #
-      - name: valetudo
-        rules:
-          - alert: ValetudoUnableToReachMQTT
-            expr: |
-              sum by (hostname) (count_over_time({hostname="valetudo"} |~ "(?i).*error.*mqtt.*"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Valetudo is unable to connect to mqtt"
-      #
-      # node-red
-      #
-      - name: node-red
-        rules:
-          - alert: NodeRedUnableToReachHomeAssistant
-            expr: |
-              sum by (app) (count_over_time({app="node-red"} |~ "(?i)home assistant.*connecting to undefined"[2m])) > 0
-            for: 2m
-            labels:
-              severity: critical
-              category: logs
-            annotations:
-              summary: "Node-Red is unable to connect to Home Assistant"