🚀 loki

This commit is contained in:
auricom
2023-11-15 19:38:24 +01:00
parent 6ff46ee7ae
commit b7865afffb
8 changed files with 177 additions and 213 deletions

View File

@@ -9,4 +9,5 @@ resources:
- ./gatus/ks.yaml
- ./grafana/ks.yaml
- ./kube-prometheus-stack/ks.yaml
- ./loki/ks.yaml
- ./thanos/ks.yaml

View File

@@ -0,0 +1,22 @@
---
# Gatus endpoint definition for Loki, shipped as a ConfigMap in the
# monitoring namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-gatus-ep
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label a Gatus config sidecar selects on;
    # confirm against the Gatus deployment.
    gatus.io/enabled: 'true'
data:
  # The value below is a literal block scalar: its text is handed over
  # verbatim as the content of config.yaml, so no comments are placed
  # inside it.
  # NOTE(review): ${SECRET_CLUSTER_DOMAIN} is presumably substituted before
  # apply (e.g. Flux post-build substitution) - confirm.
  # The endpoint is probed every minute, resolving DNS through
  # tcp://1.1.1.1:53, with `insecure: true` set on the client, and is
  # considered healthy on an HTTP 200 status; alerts go to pushover.
  config.yaml: |
    endpoints:
      - name: loki
        group: internal
        url: https://loki.${SECRET_CLUSTER_DOMAIN}
        interval: 1m
        client:
          dns-resolver: tcp://1.1.1.1:53
          insecure: true
        conditions:
          - "[STATUS] == 200"
        alerts:
          - type: pushover

View File

@@ -7,17 +7,17 @@ metadata:
namespace: monitoring
spec:
interval: 30m
timeout: 15m
chart:
spec:
chart: loki
version: 5.27.0
version: 5.36.3
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
maxHistory: 2
install:
createNamespace: true
remediation:
retries: 3
upgrade:
@@ -26,6 +26,9 @@ spec:
retries: 3
uninstall:
keepHistory: false
dependsOn:
- name: rook-ceph-cluster
namespace: rook-ceph
values:
loki:
structuredConfig:
@@ -43,20 +46,24 @@ spec:
reject_old_samples_max_age: 168h
max_cache_freshness_per_query: 10m
split_queries_by_interval: 15m
ingestion_rate_mb: 8
ingestion_burst_size_mb: 16
ingestion_rate_mb: 50
ingestion_burst_size_mb: 1000
per_stream_rate_limit: 5MB
per_stream_rate_limit_burst: 20MB
shard_streams:
enabled: true
schema_config:
configs:
- from: "2021-08-01"
- from: "2022-01-11" # quote
store: boltdb-shipper
object_store: s3
schema: v11
schema: v12
index:
prefix: loki_index_
period: 24h
common:
path_prefix: /var/loki
replication_factor: 3
replication_factor: 2
storage:
s3:
s3: null
@@ -68,7 +75,7 @@ spec:
ruler:
enable_api: true
enable_alertmanager_v2: true
alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
alertmanager_url: http://alertmanager-operated.monitoring.svc.cluster.local:9093
storage:
type: local
local:
@@ -97,73 +104,55 @@ spec:
analytics:
reporting_enabled: false
gateway:
enabled: true
replicas: 3
affinity: |
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
podAffinityTerm:
labelSelector:
matchLabels:
{{- include "loki.gatewaySelectorLabels" . | nindent 12 }}
topologyKey: kubernetes.io/hostname
replicas: 2
image:
registry: ghcr.io
ingress:
enabled: true
ingressClassName: "nginx"
annotations:
hajimari.io/enable: "false"
hosts:
- host: &host "loki.${SECRET_CLUSTER_DOMAIN}"
- host: &host loki.devbu.io
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- *host
write:
replicas: 3
affinity: |
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
podAffinityTerm:
labelSelector:
matchLabels:
{{- include "loki.writeSelectorLabels" . | nindent 12 }}
topologyKey: kubernetes.io/hostname
persistence:
size: 10Gi
storageClass: rook-ceph-block
read:
replicas: 3
affinity: |
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
podAffinityTerm:
labelSelector:
matchLabels:
{{- include "loki.readSelectorLabels" . | nindent 12 }}
topologyKey: kubernetes.io/hostname
extraVolumeMounts:
- name: loki-rules
mountPath: /rules/fake
- name: loki-rules-tmp
mountPath: /tmp/scratch
- name: loki-tmp
mountPath: /tmp/loki-tmp
extraVolumes:
- name: loki-rules
emptyDir: {}
- name: loki-rules-tmp
emptyDir: {}
- name: loki-tmp
emptyDir: {}
replicas: 2
persistence:
size: 10Gi
storageClass: rook-ceph-block
storageClass: local-path
extraVolumeMounts:
- name: rules
mountPath: /rules
extraVolumes:
- name: rules
emptyDir: {}
write:
replicas: 2
persistence:
storageClass: local-path
backend:
replicas: 2
persistence:
storageClass: local-path
extraVolumeMounts:
- name: rules
mountPath: /rules/fake
- name: scratch
mountPath: /tmp/scratch
extraVolumes:
- name: rules
configMap:
name: loki-alerting-rules
- name: scratch
emptyDir: {}
monitoring:
dashboards:
annotations:
grafana_folder: Loki
serviceMonitor:
enabled: false
metricsInstance:
@@ -172,24 +161,24 @@ spec:
enabled: false
grafanaAgent:
installOperator: false
lokiCanary:
enabled: false
lokiCanary:
enabled: false
test:
enabled: false
valuesFrom:
- kind: ConfigMap
name: loki-chunks-bucket
- targetPath: loki.structuredConfig.common.storage.s3.bucketnames
kind: ConfigMap
name: loki-bucket
valuesKey: BUCKET_NAME
targetPath: loki.structuredConfig.common.storage.s3.bucketnames
- kind: ConfigMap
name: loki-chunks-bucket
- targetPath: loki.structuredConfig.common.storage.s3.endpoint
kind: ConfigMap
name: loki-bucket
valuesKey: BUCKET_HOST
targetPath: loki.structuredConfig.common.storage.s3.endpoint
- kind: Secret
name: loki-chunks-bucket
- targetPath: loki.structuredConfig.common.storage.s3.access_key_id
kind: Secret
name: loki-bucket
valuesKey: AWS_ACCESS_KEY_ID
targetPath: loki.structuredConfig.common.storage.s3.access_key_id
- kind: Secret
name: loki-chunks-bucket
- targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
kind: Secret
name: loki-bucket
valuesKey: AWS_SECRET_ACCESS_KEY
targetPath: loki.structuredConfig.common.storage.s3.secret_access_key

View File

@@ -4,6 +4,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
- ./object-bucket-claim.yaml
- ./config-map.yaml
- ./helmrelease.yaml
- ./objectbucketclaim.yaml
configMapGenerator:
- name: loki-alerting-rules
files:
- loki-alerting-rules.yaml=./rules/loki-alerting-rules.yaml
generatorOptions:
disableNameSuffixHash: true

View File

@@ -2,10 +2,8 @@
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: loki-chunks-bucket
name: loki-bucket
namespace: monitoring
spec:
bucketName: loki-chunks
bucketName: loki
storageClassName: rook-ceph-bucket
additionalConfig:
maxSize: "50G"

View File

@@ -0,0 +1,79 @@
---
# Loki alerting rule groups.
#
# Every rule counts log lines matching a pattern over a 2m window and fires
# once the count has stayed above zero for 2m.
#
# Each aggregation groups by the label that its annotations template
# (`hostname` or `app`). A bare `sum(...)` drops all labels, which would make
# `{{ $labels.app }}` render as an empty string in the alert text.
groups:
  - name: smart
    rules:
      - alert: SMARTFailure
        # Lines from smartmontools.service that mention error/fail, excluding
        # the benign "self-test completed without error" and "Prefailure"
        # attribute lines.
        expr: |
          sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          hostname: "{{ $labels.hostname }}"
          summary: "{{ $labels.hostname }} has reported SMART failures"

  - name: zigbee2mqtt
    rules:
      - alert: ZigbeeMQTTUnreachable
        # Grouped by app so the annotations below can resolve $labels.app.
        expr: |
          sum by (app) (count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"

  - name: zwave-js-ui
    rules:
      - alert: ZwaveMQTTUnreachable
        # Grouped by app so the annotations below can resolve $labels.app.
        expr: |
          sum by (app) (count_over_time({app="zwave-js-ui"} |~ "(?i)error while connecting mqtt"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"

  - name: frigate
    rules:
      - alert: FrigateMQTTUnreachable
        # Grouped by app so the annotations below can resolve $labels.app.
        expr: |
          sum by (app) (count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"

  - name: home-assistant
    rules:
      - alert: HomeAssistantPostgresUnreachable
        expr: |
          sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to connect to postgres"

  - name: bazarr
    rules:
      - alert: BazarrJobRaisedException
        expr: |
          sum by (app) (count_over_time({app="bazarr"} |~ "(?i)Job(.+)Update(.+)from(.+)raised an exception"[2m])) > 0
        for: 2m
        labels:
          severity: warning
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is raising job exceptions"

View File

@@ -3,13 +3,13 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cluster-apps-loki-app
name: cluster-apps-loki
namespace: flux-system
labels:
substitution.flux.home.arpa/enabled: "true"
spec:
dependsOn:
- name: cluster-apps-rook-ceph-cluster
- name: cluster-apps-external-secrets-stores
path: ./kubernetes/apps/monitoring/loki/app
prune: true
sourceRef:
@@ -17,4 +17,4 @@ spec:
name: home-ops-kubernetes
interval: 30m
retryInterval: 1m
timeout: 5m
timeout: 3m

View File

@@ -1,130 +0,0 @@
---
# ConfigMap in the monitoring namespace that carries the Loki alerting rule
# groups as a single file under the key `loki-alerting-rules.yaml`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-alerting-rules
  namespace: monitoring
data:
  # Literal block scalar (`|-`: newlines kept, trailing newline stripped).
  # Everything below, including the `#` banner lines, is part of the string
  # value. Each rule counts matching log lines over a 2m window and fires
  # after the count has stayed above zero for 2m.
  loki-alerting-rules.yaml: |-
    groups:
      #
      # SMART Failures
      #
      - name: smart-failure
        rules:
          - alert: SmartFailures
            expr: |
              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "SMART has reported failures on host {{ $labels.hostname }}"
      #
      # zigbee2mqtt
      #
      - name: zigbee2mqtt
        rules:
          - alert: ZigbeeUnableToReachMQTT
            expr: |
              sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Zigbee2mqtt is unable to reach MQTT"
      #
      # zwavejs2mqtt
      #
      - name: zwavejs2mqtt
        rules:
          - alert: ZwaveUnableToReachMQTT
            expr: |
              sum(count_over_time({app="zwavejs2mqtt"} |~ "(?i)error while connecting mqtt"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Zwavejs2mqtt is unable to reach MQTT"
      #
      # frigate
      #
      - name: frigate
        rules:
          - alert: FrigateUnableToReachMQTT
            expr: |
              sum(count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Frigate is unable to reach MQTT"
      #
      # *arr
      #
      - name: arr
        rules:
          - alert: ArrDatabaseIsLocked
            expr: |
              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "{{ $labels.app }} is experiencing locked database issues"
          - alert: ArrDatabaseIsMalformed
            expr: |
              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "{{ $labels.app }} is experiencing malformed database disk image issues"
      #
      # home-assistant
      #
      - name: home-assistant
        rules:
          - alert: HomeAssistantUnableToReachPostgresql
            expr: |
              sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Home Assistant is unable to connect to postgresql"
      #
      # valetudo
      #
      - name: valetudo
        rules:
          - alert: ValetudoUnableToReachMQTT
            expr: |
              sum by (hostname) (count_over_time({hostname="valetudo"} |~ "(?i).*error.*mqtt.*"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Valetudo is unable to connect to mqtt"
      #
      # node-red
      #
      - name: node-red
        rules:
          - alert: NodeRedUnableToReachHomeAssistant
            expr: |
              sum by (app) (count_over_time({app="node-red"} |~ "(?i)home assistant.*connecting to undefined"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "Node-Red is unable to connect to Home Assistant"