feat: archive loki & vector

This commit is contained in:
auricom
2024-05-05 01:10:02 +02:00
parent 677e9cb4dd
commit 62b2e0330c
21 changed files with 5 additions and 7 deletions

View File

@@ -157,11 +157,11 @@ spec:
access: proxy
url: http://thanos-query-frontend.monitoring.svc.cluster.local.:9090
isDefault: true
- name: Loki
type: loki
uid: loki
access: proxy
url: http://loki-gateway.monitoring.svc.cluster.local.:80
# - name: Loki
# type: loki
# uid: loki
# access: proxy
# url: http://loki-gateway.monitoring.svc.cluster.local.:80
- name: Alertmanager
type: alertmanager
uid: alertmanager

View File

@@ -9,7 +9,5 @@ resources:
- ./gatus/ks.yaml
- ./grafana/ks.yaml
- ./kube-prometheus-stack/ks.yaml
- ./loki/ks.yaml
- ./scrutiny/ks.yaml
- ./thanos/ks.yaml
- ./vector/ks.yaml

View File

@@ -1,22 +0,0 @@
---
# ConfigMap carrying a Gatus endpoint definition for Loki.
# The gatus.io/enabled label is presumably what lets Gatus discover this
# ConfigMap — confirm against the Gatus deployment.
# The embedded config probes https://loki.${SECRET_CLUSTER_DOMAIN} every minute,
# resolves through 1.1.1.1 over TCP, skips TLS verification (insecure: true),
# expects HTTP 200 and raises a pushover alert otherwise.
apiVersion: v1
kind: ConfigMap
metadata:
name: loki-gatus-ep
namespace: monitoring
labels:
gatus.io/enabled: "true"
data:
config.yaml: |
endpoints:
- name: loki
group: internal
url: https://loki.${SECRET_CLUSTER_DOMAIN}
interval: 1m
client:
dns-resolver: tcp://1.1.1.1:53
insecure: true
conditions:
- "[STATUS] == 200"
alerts:
- type: pushover

View File

@@ -1,147 +0,0 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2beta2.json
# Flux HelmRelease deploying Grafana Loki (chart 6.5.0) in SimpleScalable mode,
# storing chunks and index in the Ceph object bucket claimed as "loki-bucket".
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: loki
  namespace: monitoring
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: loki
      version: 6.5.0
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  maxHistory: 2
  install:
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: Skip
    remediation:
      strategy: rollback
      retries: 3
  dependsOn:
    - name: rook-ceph-cluster
      namespace: rook-ceph
    - name: vector-agent
      namespace: monitoring
    - name: vector-aggregator
      namespace: monitoring
  # Bucket coordinates and credentials are injected from the ConfigMap and
  # Secret that share the ObjectBucketClaim's name.
  valuesFrom:
    - targetPath: loki.storage.bucketNames.chunks
      kind: ConfigMap
      name: &cephBucket loki-bucket
      valuesKey: BUCKET_NAME
    - targetPath: loki.storage.s3.endpoint
      kind: ConfigMap
      name: *cephBucket
      valuesKey: BUCKET_HOST
    - targetPath: loki.storage.s3.region
      kind: ConfigMap
      name: *cephBucket
      valuesKey: BUCKET_REGION
    - targetPath: loki.storage.s3.accessKeyId
      kind: Secret
      name: *cephBucket
      valuesKey: AWS_ACCESS_KEY_ID
    - targetPath: loki.storage.s3.secretAccessKey
      kind: Secret
      name: *cephBucket
      valuesKey: AWS_SECRET_ACCESS_KEY
  values:
    deploymentMode: SimpleScalable
    loki:
      # Restart the pods when the bucket ConfigMap or Secret changes.
      podAnnotations:
        configmap.reloader.stakater.com/reload: *cephBucket
        secret.reloader.stakater.com/reload: *cephBucket
      ingester:
        chunk_encoding: snappy
      storage:
        type: s3
        s3:
          s3ForcePathStyle: true
          insecure: true
      schemaConfig:
        configs:
          # Quoted so YAML keeps the value a string instead of parsing a date.
          - from: "2024-04-01"
            store: tsdb
            object_store: s3
            schema: v13
            index:
              prefix: loki_index_
              period: 24h
      structuredConfig:
        auth_enabled: false
        server:
          log_level: info
          http_listen_port: 3100
          grpc_listen_port: 9095
          grpc_server_max_recv_msg_size: 8388608
          grpc_server_max_send_msg_size: 8388608
        # limits_config.retention_period is only enforced when the compactor
        # has retention enabled; without this section logs are kept forever.
        # delete_request_store must name the object store used by the schema.
        compactor:
          retention_enabled: true
          delete_request_store: s3
        limits_config:
          ingestion_burst_size_mb: 128
          ingestion_rate_mb: 64
          max_query_parallelism: 100
          per_stream_rate_limit: 64M
          per_stream_rate_limit_burst: 128M
          reject_old_samples: true
          reject_old_samples_max_age: 168h
          retention_period: 30d
          shard_streams:
            enabled: true
          split_queries_by_interval: 1h
        query_scheduler:
          max_outstanding_requests_per_tenant: 4096
        frontend:
          max_outstanding_per_tenant: 4096
        ruler:
          enable_api: true
          enable_alertmanager_v2: true
          alertmanager_url: http://alertmanager-operated.monitoring.svc.cluster.local:9093
          storage:
            type: local
            local:
              directory: /rules
          # Same folder the rules sidecar writes to (sidecar.rules.folder).
          rule_path: /rules/fake
        analytics:
          reporting_enabled: false
    backend:
      replicas: 3
      persistence:
        size: 20Gi
        storageClass: openebs-hostpath
    gateway:
      replicas: 3
      image:
        registry: ghcr.io
      ingress:
        enabled: true
        ingressClassName: internal
        hosts:
          # NOTE(review): the Gatus check for Loki targets
          # loki.${SECRET_CLUSTER_DOMAIN}; confirm this hard-coded host is
          # the same domain.
          - host: loki.devbu.io
            paths:
              - path: /
                pathType: Prefix
    read:
      replicas: 3
    write:
      replicas: 3
      persistence:
        size: 20Gi
        storageClass: openebs-hostpath
    sidecar:
      image:
        repository: ghcr.io/kiwigrid/k8s-sidecar
      rules:
        searchNamespace: ALL
        folder: /rules/fake
    lokiCanary:
      enabled: false
    test:
      enabled: false

View File

@@ -1,14 +0,0 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/SchemaStore/schemastore/master/src/schemas/json/kustomization.json
# Kustomize entry point for the Loki app: the HelmRelease, its bucket claim and
# a ConfigMap generated from the alerting rules file.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - ./helmrelease.yaml
  - ./objectbucketclaim.yaml
configMapGenerator:
  - name: loki-alerting-rules
    files:
      - loki-alerting-rules.yaml=./rules/loki-alerting-rules.yaml
    options:
      # The Loki chart's rules sidecar selects ConfigMaps by the "loki_rule"
      # label; without it the generated rules are never loaded by the ruler.
      labels:
        loki_rule: "true"
generatorOptions:
  # Keep a stable ConfigMap name (no content-hash suffix).
  disableNameSuffixHash: true

View File

@@ -1,9 +0,0 @@
---
# Claims an object bucket named "loki" from the rook-ceph-bucket storage class.
# The Loki HelmRelease reads its bucket settings from a ConfigMap and Secret
# named after this claim (loki-bucket).
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: loki-bucket
namespace: monitoring
spec:
bucketName: loki
storageClassName: rook-ceph-bucket

View File

@@ -1,79 +0,0 @@
---
# Loki ruler alerting rules. Every rule counts matching log lines over a
# 2-minute window and fires once the count has stayed above zero for 2 minutes.
# Each aggregation keeps the label that its annotations template on
# ($labels.hostname or $labels.app); a bare sum() would drop all labels and
# render those annotations empty.
groups:
  - name: smart
    rules:
      - alert: SMARTFailure
        expr: |
          sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          hostname: "{{ $labels.hostname }}"
          summary: "{{ $labels.hostname }} has reported SMART failures"
  - name: zigbee2mqtt
    rules:
      - alert: ZigbeeMQTTUnreachable
        expr: |
          sum by (app) (count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"
  - name: zwave-js-ui
    rules:
      - alert: ZwaveMQTTUnreachable
        expr: |
          sum by (app) (count_over_time({app="zwave-js-ui"} |~ "(?i)error while connecting mqtt"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"
  - name: frigate
    rules:
      - alert: FrigateMQTTUnreachable
        expr: |
          sum by (app) (count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to reach MQTT"
  - name: home-assistant
    rules:
      - alert: HomeAssistantPostgresUnreachable
        expr: |
          sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is unable to connect to postgres"
  - name: bazarr
    rules:
      - alert: BazarrJobRaisedException
        expr: |
          sum by (app) (count_over_time({app="bazarr"} |~ "(?i)Job(.+)Update(.+)from(.+)raised an exception"[2m])) > 0
        for: 2m
        labels:
          severity: warning
          category: logs
        annotations:
          app: "{{ $labels.app }}"
          summary: "{{ $labels.app }} is raising job exceptions"

View File

@@ -1,26 +0,0 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
# Flux Kustomization that reconciles the manifests under
# ./kubernetes/apps/monitoring/loki/app into the monitoring namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: &app loki
namespace: flux-system
spec:
targetNamespace: monitoring
# Stamp every rendered object with app.kubernetes.io/name: loki.
commonMetadata:
labels:
app.kubernetes.io/name: *app
# Reconciled only after the external-secrets-stores Kustomization.
dependsOn:
- name: external-secrets-stores
path: ./kubernetes/apps/monitoring/loki/app
# Objects removed from Git are also removed from the cluster.
prune: true
sourceRef:
kind: GitRepository
name: home-ops-kubernetes
wait: false
interval: 30m
retryInterval: 1m
timeout: 5m
# Exposes ${APP} (= loki) for substitution in the rendered manifests.
postBuild:
substitute:
APP: *app

View File

@@ -1,36 +0,0 @@
---
# Vector agent configuration: collects Kubernetes pod logs plus two local UDP
# socket inputs (ports 12000 and 12001) and forwards each stream to the
# vector-aggregator service in the monitoring namespace.
data_dir: /vector-data-dir
api:
enabled: false
sources:
kubernetes_logs:
type: kubernetes_logs
# UDP listeners bound to loopback only.
talos_kernel_logs:
type: socket
mode: udp
address: 127.0.0.1:12000
talos_service_logs:
type: socket
mode: udp
address: 127.0.0.1:12001
sinks:
kubernetes_sink:
type: vector
inputs:
- kubernetes_logs
address: "vector-aggregator.monitoring:6000"
version: "2"
# NOTE(review): the two Talos sinks below use the `vector` sink type against
# ports 6050/6051, while the aggregator config declares 6050/6051 as
# `socket`/`udp` sources — confirm both ends speak the same protocol.
talos_kernel_sink:
type: vector
inputs:
- talos_kernel_logs
address: "vector-aggregator.monitoring:6050"
version: "2"
talos_service_sink:
type: vector
inputs:
- talos_service_logs
address: "vector-aggregator.monitoring:6051"
version: "2"

View File

@@ -1,83 +0,0 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2beta2.schema.json
# Flux HelmRelease running the Vector agent as a DaemonSet (bjw-s app-template
# 3.1.0). The agent configuration comes from the generated
# vector-agent-configmap.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: &app vector-agent
  namespace: monitoring
spec:
  interval: 30m
  chart:
    spec:
      chart: app-template
      version: 3.1.0
      sourceRef:
        kind: HelmRepository
        name: bjw-s
        namespace: flux-system
  maxHistory: 2
  install:
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    remediation:
      strategy: rollback
      retries: 3
  uninstall:
    keepHistory: false
  dependsOn:
    - name: vector-aggregator
  values:
    controllers:
      vector-agent:
        type: daemonset
        strategy: RollingUpdate
        annotations:
          reloader.stakater.com/auto: "true"
        containers:
          app:
            image:
              repository: docker.io/timberio/vector
              tag: 0.37.1-debian@sha256:e0d1bc9e61d99a139870f7276a18f36c3365ad76456a2e103fd5cb277a5a1fcb
            env:
              PROCFS_ROOT: /host/proc
              SYSFS_ROOT: /host/sys
              # Downward-API values for the node and pod the agent runs on.
              VECTOR_SELF_NODE_NAME:
                valueFrom:
                  fieldRef:
                    apiVersion: v1
                    fieldPath: spec.nodeName
              VECTOR_SELF_POD_NAME:
                valueFrom:
                  fieldRef:
                    apiVersion: v1
                    fieldPath: metadata.name
              VECTOR_SELF_POD_NAMESPACE:
                valueFrom:
                  fieldRef:
                    apiVersion: v1
                    fieldPath: metadata.namespace
            args: ["--config", "/etc/vector/vector.yaml"]
            securityContext:
              privileged: true
    service:
      app:
        controller: *app
        enabled: false
    serviceAccount:
      create: true
      name: vector-agent
    persistence:
      config:
        enabled: true
        type: configMap
        name: vector-agent-configmap # overridden by kustomizeconfig
        globalMounts:
          - path: /etc/vector/vector.yaml
            subPath: vector.yaml
            readOnly: true
      data:
        type: emptyDir
        globalMounts:
          - path: /vector-data-dir
      # The kubernetes_logs source tails pod log files from the node's
      # /var/log; without this hostPath the agent sees no logs at all.
      var-log:
        type: hostPath
        hostPath: /var/log
        hostPathType: Directory
        globalMounts:
          - path: /var/log
            readOnly: true

View File

@@ -1,14 +0,0 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for the Vector agent: HelmRelease, RBAC and a ConfigMap
# generated from ./config/vector.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
- ./helmrelease.yaml
- ./rbac.yaml
configMapGenerator:
- name: vector-agent-configmap
files:
- vector.yaml=./config/vector.yaml
# Extra name-reference rules so the generated ConfigMap name is propagated
# into the HelmRelease values.
configurations:
- ./patches/kustomizeconfig.yaml

View File

@@ -1,7 +0,0 @@
---
# Kustomize transformer configuration: treat
# spec.values.persistence.config.name in HelmRelease objects as a reference to
# a ConfigMap name, so a renamed (e.g. hash-suffixed) generated ConfigMap is
# rewritten there too.
nameReference:
- kind: ConfigMap
version: v1
fieldSpecs:
- path: spec/values/persistence/config/name
kind: HelmRelease

View File

@@ -1,34 +0,0 @@
---
# Cluster-wide read access (list/watch) on namespaces, nodes and pods for the
# Vector agent.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: vector-agent
labels:
app.kubernetes.io/instance: vector-agent
app.kubernetes.io/name: vector-agent
rules:
- apiGroups:
- ""
resources:
- namespaces
- nodes
- pods
verbs:
- list
- watch
---
# Grants the ClusterRole above to the vector-agent ServiceAccount in the
# monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: vector-agent
labels:
app.kubernetes.io/instance: vector-agent
app.kubernetes.io/name: vector-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: vector-agent
subjects:
- kind: ServiceAccount
name: vector-agent
namespace: monitoring

View File

@@ -1,159 +0,0 @@
---
# Vector aggregator configuration: receives log streams from the agents and
# other senders, normalises a few fields and ships everything to the Loki
# gateway.
data_dir: /vector-data-dir
api:
  enabled: true
  address: 0.0.0.0:8686
enrichment_tables:
  geoip_table:
    type: geoip
    path: /usr/share/GeoIP/GeoLite2-City.mmdb
sources:
  kubernetes_source:
    address: 0.0.0.0:6000
    type: vector
    version: "2"
  opnsense_logs:
    address: 0.0.0.0:6001
    type: vector
    version: "2"
  journald_source:
    type: vector
    address: 0.0.0.0:6002
    version: "2"
  vector_metrics:
    type: internal_metrics
  # JSON datagrams over UDP; the sender address is stored in __host.
  talos_kernel_logs:
    address: 0.0.0.0:6050
    type: socket
    mode: udp
    max_length: 102400
    decoding:
      codec: json
    host_key: __host
  talos_service_logs:
    address: 0.0.0.0:6051
    type: socket
    mode: udp
    max_length: 102400
    decoding:
      codec: json
    host_key: __host
transforms:
  # Replace the node IP addresses in __host with readable node names.
  talos_kernel_logs_xform:
    type: remap
    inputs:
      - talos_kernel_logs
    source: |-
      .__host = replace!(.__host, "192.168.8.101", "talos-node-1")
      .__host = replace(.__host, "192.168.8.102", "talos-node-2")
      .__host = replace(.__host, "192.168.8.103", "talos-node-3")
      .__host = replace(.__host, "192.168.8.104", "talos-node-4")
  talos_service_logs_xform:
    type: remap
    inputs:
      - talos_service_logs
    source: |-
      .__host = replace!(.__host, "192.168.8.101", "talos-node-1")
      .__host = replace(.__host, "192.168.8.102", "talos-node-2")
      .__host = replace(.__host, "192.168.8.103", "talos-node-3")
      .__host = replace(.__host, "192.168.8.104", "talos-node-4")
  # Derives custom_app_name from the pod labels. The labels are read from
  # .kubernetes.pod_labels, the same path the loki_kubernetes sink uses for
  # its other labels.
  kubernetes_remap:
    type: remap
    inputs:
      - kubernetes_source
    source: |
      # Standardize 'app' index
      .custom_app_name = .kubernetes.pod_labels."app.kubernetes.io/name" || .kubernetes.pod_labels.app || .kubernetes.pod_labels."k8s-app" || "unknown"
# Sinks
sinks:
  loki_kubernetes:
    type: loki
    # Must consume the remap output: custom_app_name (used for the k8s_app
    # label) only exists on events that passed through kubernetes_remap.
    inputs:
      - kubernetes_remap
    endpoint: http://loki-gateway.monitoring.svc.cluster.local:80
    encoding:
      codec: json
    batch:
      max_bytes: 2049000
    out_of_order_action: rewrite_timestamp
    remove_label_fields: true
    remove_timestamp: true
    labels:
      k8s_app: '{{ custom_app_name }}'
      k8s_container: '{{ kubernetes.container_name }}'
      k8s_filename: '{{ kubernetes.file }}'
      k8s_instance: '{{ kubernetes.pod_labels."app.kubernetes.io/instance" }}'
      k8s_namespace: '{{ kubernetes.pod_namespace }}'
      k8s_node: '{{ kubernetes.pod_node_name }}'
      k8s_pod: '{{ kubernetes.pod_name }}'
  loki_opnsense:
    type: loki
    inputs:
      - opnsense_logs
    endpoint: http://loki-gateway.monitoring.svc.cluster.local:80
    encoding:
      codec: json
    batch:
      max_bytes: 400000
    out_of_order_action: rewrite_timestamp
    labels:
      hostname: '{{ host }}'
      syslog_identifier: '{{SYSLOG_IDENTIFIER }}'
  loki_journal:
    type: loki
    inputs:
      - journald_source
    endpoint: http://loki-gateway.monitoring.svc.cluster.local:80
    encoding:
      codec: json
    batch:
      max_bytes: 2049000
    out_of_order_action: accept
    remove_label_fields: true
    remove_timestamp: true
    labels:
      hostname: '{{ host }}'
  talos_kernel:
    type: loki
    inputs:
      - talos_kernel_logs_xform
    endpoint: http://loki-gateway.monitoring.svc.cluster.local:80
    encoding:
      codec: json
      # __host is sent as the hostname label, not in the log body.
      except_fields:
        - __host
    batch:
      max_bytes: 1048576
    out_of_order_action: rewrite_timestamp
    labels:
      hostname: '{{ __host }}'
      service: '{{ facility }}'
  talos_service:
    type: loki
    inputs:
      - talos_service_logs_xform
    endpoint: http://loki-gateway.monitoring.svc.cluster.local:80
    encoding:
      codec: json
      except_fields:
        - __host
    batch:
      max_bytes: 524288
    out_of_order_action: rewrite_timestamp
    labels:
      hostname: '{{ __host }}'
      service: "talos-service"
      namespace: "talos:service"

View File

@@ -1,21 +0,0 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.devbu.io/external-secrets.io/externalsecret_v1beta1.json
# Builds the vector-aggregator-secret Secret from the "maxmind" item in the
# onepassword-connect ClusterSecretStore, exposing the account id and license
# key under the GEOIPUPDATE_* names.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
name: vector-aggregator
namespace: monitoring
spec:
secretStoreRef:
kind: ClusterSecretStore
name: onepassword-connect
target:
name: vector-aggregator-secret
template:
engineVersion: v2
data:
GEOIPUPDATE_ACCOUNT_ID: "{{ .ACCOUNT_ID }}"
GEOIPUPDATE_LICENSE_KEY: "{{ .LICENSE_KEY }}"
dataFrom:
- extract:
key: maxmind

View File

@@ -1,106 +0,0 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2beta2.schema.json
# Flux HelmRelease running two replicas of the Vector aggregator (bjw-s
# app-template 3.1.0) behind a LoadBalancer service. An init container
# (geoipupdate) runs before Vector with the GeoLite2-City edition configured.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
name: &app vector-aggregator
namespace: monitoring
spec:
interval: 30m
chart:
spec:
chart: app-template
version: 3.1.0
sourceRef:
kind: HelmRepository
name: bjw-s
namespace: flux-system
maxHistory: 2
install:
createNamespace: true
remediation:
retries: 3
upgrade:
cleanupOnFail: true
remediation:
strategy: rollback
retries: 3
uninstall:
keepHistory: false
values:
controllers:
vector-aggregator:
replicas: 2
strategy: RollingUpdate
annotations:
reloader.stakater.com/auto: "true"
initContainers:
init-geoip:
image:
repository: ghcr.io/maxmind/geoipupdate
tag: v7.0.1@sha256:80c57598a9ff552953e499cefc589cfe7b563d64262742ea42f2014251b557b0
pullPolicy: IfNotPresent
env:
GEOIPUPDATE_EDITION_IDS: GeoLite2-City
GEOIPUPDATE_FREQUENCY: "0"
GEOIPUPDATE_VERBOSE: "1"
# MaxMind credentials come from the ExternalSecret-managed Secret.
envFrom:
- secretRef:
name: vector-aggregator-secret
containers:
app:
image:
repository: docker.io/timberio/vector
tag: 0.37.1-debian@sha256:e0d1bc9e61d99a139870f7276a18f36c3365ad76456a2e103fd5cb277a5a1fcb
args: ["--config", "/etc/vector/vector.yaml"]
# Spread the replicas across nodes (at most one pod of skew per hostname).
pod:
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
app.kubernetes.io/name: *app
service:
app:
controller: *app
type: LoadBalancer
loadBalancerIP: 192.168.169.108
externalTrafficPolicy: Local
# One port per listener declared in the aggregator's vector.yaml
# (API 8686, vector sources 6000-6002, UDP sockets 6050/6051).
ports:
http:
port: 8686
kubernetes-logs:
enabled: true
port: 6000
opnsense-logs:
enabled: true
port: 6001
journald-logs:
enabled: true
port: 6002
talos-kernel:
enabled: true
port: 6050
protocol: UDP
talos-service:
enabled: true
port: 6051
protocol: UDP
persistence:
config:
enabled: true
type: configMap
name: vector-aggregator-configmap # overridden by kustomizeconfig
globalMounts:
- path: /etc/vector/vector.yaml
subPath: vector.yaml
readOnly: true
data:
type: emptyDir
globalMounts:
- path: /vector-data-dir
# Mounted at the directory the geoip enrichment table reads its database from.
geoip:
type: emptyDir
globalMounts:
- path: /usr/share/GeoIP

View File

@@ -1,14 +0,0 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for the Vector aggregator: ExternalSecret, HelmRelease
# and a ConfigMap generated from ./config/vector.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
- ./externalsecret.yaml
- ./helmrelease.yaml
configMapGenerator:
- name: vector-aggregator-configmap
files:
- vector.yaml=./config/vector.yaml
# Extra name-reference rules so the generated ConfigMap name is propagated
# into the HelmRelease values.
configurations:
- ./patches/kustomizeconfig.yaml

View File

@@ -1,24 +0,0 @@
---
# Partial HelmRelease (patch-style) that defines an init-geoip init container
# for vector-aggregator using a plain image string and list-style env entries.
# NOTE(review): the aggregator kustomization only lists externalsecret.yaml,
# helmrelease.yaml and kustomizeconfig.yaml, and the main HelmRelease already
# defines init-geoip itself — this file looks unused; confirm before relying
# on it.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
name: vector-aggregator
namespace: monitoring
spec:
values:
initContainers:
init-geoip:
image: ghcr.io/maxmind/geoipupdate:v7.0@sha256:80c57598a9ff552953e499cefc589cfe7b563d64262742ea42f2014251b557b0
env:
- name: GEOIPUPDATE_EDITION_IDS
value: GeoLite2-City
- name: GEOIPUPDATE_FREQUENCY
value: "0"
- name: GEOIPUPDATE_VERBOSE
value: "true"
envFrom:
- secretRef:
name: vector-aggregator-secret
volumeMounts:
- name: geoip
mountPath: /usr/share/GeoIP

View File

@@ -1,7 +0,0 @@
---
# Kustomize transformer configuration: treat
# spec.values.persistence.config.name in HelmRelease objects as a reference to
# a ConfigMap name, so a renamed (e.g. hash-suffixed) generated ConfigMap is
# rewritten there too.
nameReference:
- kind: ConfigMap
version: v1
fieldSpecs:
- path: spec/values/persistence/config/name
kind: HelmRelease

View File

@@ -1,8 +0,0 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/SchemaStore/schemastore/master/src/schemas/json/kustomization.json
# Top-level Kustomization for Vector: pulls in the agent and aggregator
# sub-directories and places everything in the monitoring namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
- ./agent
- ./aggregator

View File

@@ -1,25 +0,0 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
# Flux Kustomization that reconciles the manifests under
# ./kubernetes/apps/monitoring/vector. No targetNamespace is set here; the
# kustomization.yaml files under that path set namespace: monitoring.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: &app vector
namespace: flux-system
spec:
# Stamp every rendered object with app.kubernetes.io/name: vector.
commonMetadata:
labels:
app.kubernetes.io/name: *app
# Reconciled only after the external-secrets-stores Kustomization.
dependsOn:
- name: external-secrets-stores
path: ./kubernetes/apps/monitoring/vector
# Objects removed from Git are also removed from the cluster.
prune: true
sourceRef:
kind: GitRepository
name: home-ops-kubernetes
wait: false
interval: 30m
retryInterval: 1m
timeout: 5m
# Exposes ${APP} (= vector) for substitution in the rendered manifests.
postBuild:
substitute:
APP: *app