diff --git a/cluster/apps/monitoring/botkube/helm-release.yaml b/cluster/apps/monitoring/botkube/helm-release.yaml
deleted file mode 100644
index 4bee5381b..000000000
--- a/cluster/apps/monitoring/botkube/helm-release.yaml
+++ /dev/null
@@ -1,182 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
- name: botkube
- namespace: monitoring
-spec:
- interval: 5m
- chart:
- spec:
- # renovate: registryUrl=https://infracloudio.github.io/charts
- chart: botkube
- version: v0.12.4
- sourceRef:
- kind: HelmRepository
- name: infracloudio-charts
- namespace: flux-system
- interval: 5m
- values:
- image:
- repository: infracloudio/botkube
- tag: v0.12.1
- serviceMonitor:
- enabled: true
- config:
- settings:
- clustername: k3s
- resources:
- - name: v1/pods # Name of the resources e.g pod, deployment, ingress, etc. (Resource name must be in singular form)
- namespaces:
- include:
- - all
- ignore: # List of namespaces to be ignored (omitempty), used only with include: all
- - kasten-io # example : include [all], ignore [x,y,z]
- - kube-system
- events: # List of lifecycle events you want to receive, e.g create, update, delete, error OR all
- - create
- - delete
- - name: v1/services
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: apps/v1/deployments
- namespaces:
- include:
- - all
- events:
- - create
- - update
- - delete
- - error
- updateSetting:
- includeDiff: true
- fields:
- - spec.template.spec.containers[*].image
- - name: apps/v1/statefulsets
- namespaces:
- include:
- - all
- events:
- - create
- - update
- - delete
- - error
- updateSetting:
- includeDiff: true
- fields:
- - spec.template.spec.containers[*].image
- - name: networking.k8s.io/v1beta1/ingresses
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: v1/nodes
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: v1/namespaces
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: v1/persistentvolumes
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: v1/persistentvolumeclaims
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: v1/secrets
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: v1/configmaps
- namespaces:
- include:
- - all
- ignore:
- - rook-ceph
- events:
- - delete
- - error
- - name: apps/v1/daemonsets
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - update
- updateSetting:
- includeDiff: true
- fields:
- - spec.template.spec.containers[*].image
- - name: rbac.authorization.k8s.io/v1/roles
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: rbac.authorization.k8s.io/v1/rolebindings
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: rbac.authorization.k8s.io/v1/clusterroles
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- - name: rbac.authorization.k8s.io/v1/clusterrolebindings
- namespaces:
- include:
- - all
- events:
- - create
- - delete
- - error
- recommendations: true
- communications:
- discord:
- enabled: true
- notiftype: short
- channel: "778626068637679707"
- botid: ${SECRET_BOTKUBE_DISCORD_BOTID}
- token: ${SECRET_BOTKUBE_DISCORD_TOKEN}
diff --git a/cluster/apps/monitoring/botkube/kustomization.yaml b/cluster/apps/monitoring/botkube/kustomization.yaml
deleted file mode 100644
index 2fa2de20c..000000000
--- a/cluster/apps/monitoring/botkube/kustomization.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
- - helm-release.yaml
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json b/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
similarity index 99%
rename from cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
rename to cluster/apps/monitoring/grafana/dashboards/home-assistant.json
index a56ad2516..dc94ac238 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
+++ b/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
@@ -415,4 +415,4 @@
"uid": "sn-bOoWMk",
"version": 1,
"weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json b/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
similarity index 99%
rename from cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
rename to cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
index 381416892..efec37049 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
+++ b/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
@@ -368,4 +368,4 @@
"uid": "aEY0BVGnz",
"version": 1,
"weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json b/cluster/apps/monitoring/grafana/dashboards/truenas.json
similarity index 100%
rename from cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
rename to cluster/apps/monitoring/grafana/dashboards/truenas.json
diff --git a/cluster/apps/monitoring/grafana/helm-release.yaml b/cluster/apps/monitoring/grafana/helm-release.yaml
index 7887b0195..68ad965a8 100644
--- a/cluster/apps/monitoring/grafana/helm-release.yaml
+++ b/cluster/apps/monitoring/grafana/helm-release.yaml
@@ -33,6 +33,34 @@ spec:
admin:
existingSecret: grafana-admin-creds
grafana.ini:
+ auth:
+ signout_redirect_url: "https://login.${SECRET_CLUSTER_DOMAIN}/logout"
+ oauth_auto_login: false
+ auth.generic_oauth:
+ enabled: true
+ name: Authelia
+ client_id: grafana
+ client_secret: "${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}"
+ scopes: "openid profile email groups"
+ empty_scopes: false
+ auth_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/authorization"
+ token_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/token"
+ api_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/userinfo"
+ login_attribute_path: preferred_username
+ groups_attribute_path: groups
+ name_attribute_path: name
+ use_pkce: true
+ auth.generic_oauth.group_mapping:
+ role_attribute_path: |
+ contains(groups[*], 'admins') && 'Admin' || contains(groups[*], 'people') && 'Viewer'
+ org_id: 1
+ auth.basic:
+ disable_login_form: false
+ auth.anonymous:
+ enabled: true
+ org_name: HomeOps
+ org_id: 1
+ org_role: Viewer
server:
root_url: "https://grafana.${SECRET_CLUSTER_DOMAIN}"
paths:
@@ -46,8 +74,6 @@ spec:
mode: console
grafana_net:
url: https://grafana.net
- auth.basic:
- disable_login_form: false
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
@@ -71,25 +97,22 @@ spec:
- name: Prometheus
type: prometheus
access: proxy
- url: http://thanos-query:9090/
+ url: http://thanos-query.monitoring.svc.cluster.local:9090
isDefault: true
# - name: Loki
# type: loki
# access: proxy
- # url: http://loki-gateway:80
+ # url: http://loki-gateway.monitoring.svc.cluster.local:80
dashboards:
default:
- kubernetes-custom:
- url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temparatures.json
- datasource: Prometheus
home-assistant:
- url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
+ url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
datasource: Prometheus
homelab-temperatures:
- url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
+ url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
datasource: Prometheus
truenas:
- url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
+ url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/truenas.json
datasource: Prometheus
sidecar:
dashboards:
@@ -126,3 +149,14 @@ spec:
- *host
persistence:
enabled: false
+ affinity:
+ podAntiAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 100
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: app.kubernetes.io/name
+ operator: In
+ values: ["grafana"]
+ topologyKey: kubernetes.io/hostname
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml b/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
index cecd88fd4..a7f8ed2da 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
+++ b/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
@@ -5,10 +5,9 @@ metadata:
name: kube-prometheus-stack
namespace: monitoring
spec:
- interval: 5m
+ interval: 15m
chart:
spec:
- # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
chart: kube-prometheus-stack
version: 39.13.3
sourceRef:
@@ -24,85 +23,37 @@ spec:
remediation:
retries: 5
values:
- alertmanager:
- config:
- global:
- resolve_timeout: 5m
- receivers:
- - name: "null"
- - name: "pushover"
- pushover_configs:
- - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
- token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
- send_resolved: true
- html: true
- priority: |-
- {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
- url_title: View in Alert Manager
- title: |-
- [{{ .Status | toUpper -}}
- {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
- ] {{ .CommonLabels.alertname }}
- message: |-
- {{- range .Alerts }}
- {{- if ne .Labels.severity "" }}
- Severity: {{ .Labels.severity }}
- {{- else }}
- Severity: N/A
- {{- end }}
- {{- if ne .Annotations.description "" }}
- Description: {{ .Annotations.description }}
- {{- else if ne .Annotations.summary "" }}
- Summary: {{ .Annotations.summary }}
- {{- else if ne .Annotations.message "" }}
- Message: {{ .Annotations.message }}
- {{- else }}
- Description: N/A
- {{- end }}
- {{- if gt (len .Labels.SortedPairs) 0 }}
- Details:
- {{- range .Labels.SortedPairs }}
- • {{ .Name }}: {{ .Value }}
- {{- end }}
- {{- end }}
- {{- end }}
- route:
- receiver: "pushover"
- routes:
- - receiver: "null"
- matchers:
- - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
- - receiver: "pushover"
- matchers:
- - severity = "critical"
- continue: true
- inhibit_rules:
- - source_matchers:
- - severity = "critical"
- target_matchers:
- - severity = "warning"
- equal: ["alertname", "namespace"]
- alertmanagerSpec:
- replicas: 2
- podAntiAffinity: hard
- storage:
- volumeClaimTemplate:
- spec:
- storageClassName: rook-ceph-block
- resources:
- requests:
- storage: 10Gi
- ingress:
- enabled: true
- pathType: Prefix
- ingressClassName: "nginx"
- annotations:
- nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
- nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
- hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
- tls:
- - hosts:
- - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
+ ###
+ ### Component values
+ ###
+ kubeApiServer:
+ enabled: true
+ kubeControllerManager:
+ enabled: false
+
+ kubeEtcd:
+ enabled: false
+
+ kubelet:
+ enabled: true
+ serviceMonitor:
+ metricRelabelings:
+ - action: replace
+ sourceLabels:
+ - node
+ targetLabel: instance
+
+ kubeProxy:
+ enabled: false
+
+ kubeScheduler:
+ enabled: false
+
+ kubeStateMetrics:
+ enabled: true
+ kube-state-metrics:
+ metricLabelsAllowlist:
+ - "persistentvolumeclaims=[*]"
prometheus:
monitor:
enabled: true
@@ -113,8 +64,42 @@ spec:
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
+ resources:
+ requests:
+ cpu: 15m
+ memory: 127M
+ limits:
+ memory: 153M
+
+ grafana:
+ enabled: false
+ forceDeployDashboards: true
+
+ nodeExporter:
+ enabled: true
+
+ prometheus-node-exporter:
+ resources:
+ requests:
+ cpu: 23m
+ memory: 64M
+ limits:
+ memory: 64M
+ prometheus:
+ monitor:
+ enabled: true
+ relabelings:
+ - action: replace
+ regex: (.*)
+ replacement: $1
+ sourceLabels:
+ - __meta_kubernetes_pod_node_name
+ targetLabel: kubernetes_node
+
+ ###
+ ### Prometheus operator values
+ ###
prometheusOperator:
- createCustomResource: true
prometheusConfigReloader:
resources:
requests:
@@ -123,35 +108,10 @@ spec:
limits:
cpu: 300m
memory: 50Mi
- nodeExporter:
- enabled: true
- serviceMonitor:
- relabelings:
- - action: replace
- regex: (.*)
- replacement: $1
- sourceLabels:
- - __meta_kubernetes_pod_node_name
- targetLabel: kubernetes_node
- kubelet:
- enabled: true
- serviceMonitor:
- metricRelabelings:
- - action: replace
- sourceLabels:
- - node
- targetLabel: instance
- grafana:
- enabled: false
- forceDeployDashboards: true
- kubeEtcd:
- enabled: false
- kubeControllerManager:
- enabled: false
- kubeScheduler:
- enabled: false
- kubeProxy:
- enabled: false
+
+ ###
+ ### Prometheus instance values
+ ###
prometheus:
ingress:
enabled: true
@@ -171,7 +131,7 @@ spec:
cpu: 400m
limits:
memory: 6000Mi
- replicas: 2
+ replicas: 1
replicaExternalLabelName: "replica"
podAntiAffinity: hard
ruleSelector: {}
@@ -183,8 +143,9 @@ spec:
podMonitorSelector: {}
podMonitorNamespaceSelector: {}
podMonitorSelectorNilUsesHelmValues: false
- retention: 2d
- retentionSize: "6GB"
+ probeSelectorNilUsesHelmValues: false
+ retention: 14d
+ retentionSize: "45GB"
enableAdminAPI: true
walCompression: true
storageSpec:
@@ -193,13 +154,10 @@ spec:
storageClassName: rook-ceph-block
resources:
requests:
- storage: 10Gi
+ storage: 50Gi
thanos:
image: quay.io/thanos/thanos:v0.28.0
- version: v0.25.2
- objectStorageConfig:
- name: thanos-objstore-secret
- key: objstore.yml
+ version: v0.28.0
additionalScrapeConfigs:
- job_name: "opnsense"
scrape_interval: 60s
@@ -331,3 +289,93 @@ spec:
tls:
- hosts:
- *host
+
+ alertmanager:
+ config:
+ global:
+ resolve_timeout: 5m
+ receivers:
+ - name: "null"
+ - name: "pushover"
+ pushover_configs:
+ - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
+ token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
+ send_resolved: true
+ html: true
+ priority: |-
+ {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
+ url_title: View in Alert Manager
+ title: |-
+ [{{ .Status | toUpper -}}
+ {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
+ ] {{ .CommonLabels.alertname }}
+ message: |-
+ {{- range .Alerts }}
+ {{- if ne .Labels.severity "" }}
+ Severity: {{ .Labels.severity }}
+ {{- else }}
+ Severity: N/A
+ {{- end }}
+ {{- if ne .Annotations.description "" }}
+ Description: {{ .Annotations.description }}
+ {{- else if ne .Annotations.summary "" }}
+ Summary: {{ .Annotations.summary }}
+ {{- else if ne .Annotations.message "" }}
+ Message: {{ .Annotations.message }}
+ {{- else }}
+ Description: N/A
+ {{- end }}
+ {{- if gt (len .Labels.SortedPairs) 0 }}
+ Details:
+ {{- range .Labels.SortedPairs }}
+ • {{ .Name }}: {{ .Value }}
+ {{- end }}
+ {{- end }}
+ {{- end }}
+ route:
+ receiver: "pushover"
+ routes:
+ - receiver: "null"
+ matchers:
+ - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
+ - receiver: "pushover"
+ matchers:
+ - severity = "critical"
+ continue: true
+ inhibit_rules:
+ - source_matchers:
+ - severity = "critical"
+ target_matchers:
+ - severity = "warning"
+ equal: ["alertname", "namespace"]
+ alertmanagerSpec:
+ replicas: 1
+ podAntiAffinity: hard
+ storage:
+ volumeClaimTemplate:
+ spec:
+ storageClassName: rook-ceph-block
+ resources:
+ requests:
+ storage: 1Gi
+ ingress:
+ enabled: true
+ pathType: Prefix
+ ingressClassName: "nginx"
+ annotations:
+ nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
+ nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
+ hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
+ tls:
+ - hosts:
+ - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
+ prometheus:
+ monitor:
+ enabled: true
+ relabelings:
+ - action: replace
+ regex: (.*)
+ replacement: $1
+ sourceLabels:
+ - __meta_kubernetes_pod_node_name
+ targetLabel: kubernetes_node
diff --git a/cluster/apps/monitoring/kustomization.yaml b/cluster/apps/monitoring/kustomization.yaml
index 273c0074d..3dd92412f 100644
--- a/cluster/apps/monitoring/kustomization.yaml
+++ b/cluster/apps/monitoring/kustomization.yaml
@@ -3,10 +3,8 @@ kind: Kustomization
resources:
- namespace.yaml
- blackbox-exporter
- - botkube
- grafana
- healthchecks
- kube-prometheus-stack
- #- loki-stack
- thanos
- uptime-kuma
diff --git a/cluster/apps/monitoring/loki-stack/helm-release.yaml b/cluster/apps/monitoring/loki-stack/helm-release.yaml
deleted file mode 100644
index 0a5daf43a..000000000
--- a/cluster/apps/monitoring/loki-stack/helm-release.yaml
+++ /dev/null
@@ -1,186 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
- name: loki-stack
- namespace: monitoring
-spec:
- interval: 5m
- chart:
- spec:
- # renovate: registryUrl=https://grafana.github.io/helm-charts
- chart: loki-stack
- version: 2.6.5
- sourceRef:
- kind: HelmRepository
- name: grafana-loki-charts
- namespace: flux-system
- interval: 5m
- values:
- loki:
- image:
- repository: grafana/loki
- tag: 2.6.1
- pullPolicy: IfNotPresent
- replicas: 3
- persistence:
- enabled: false
- config:
- auth_enabled: false
- server:
- http_listen_port: 3100
- distributor:
- ring:
- kvstore:
- store: memberlist
- ingester:
- lifecycler:
- ring:
- kvstore:
- store: memberlist
- replication_factor: 1
- final_sleep: 0s
- chunk_idle_period: 5m
- chunk_retain_period: 30s
- memberlist:
- abort_if_cluster_join_fails: false
- # Expose this port on all distributor, ingester
- # and querier replicas.
- bind_port: 7946
- # You can use a headless k8s service for all distributor,
- # ingester and querier components.
- join_members:
- - loki-stack-headless.monitoring.svc.cluster.local:7946
- # max_join_backoff: 1m
- # max_join_retries: 10
- # min_join_backoff: 1s
- schema_config:
- configs:
- - from: "2020-10-24"
- store: boltdb-shipper
- object_store: s3
- schema: v11
- index:
- prefix: index_
- period: 24h
- storage_config:
- aws:
- insecure: false
- s3: https://${SECRET_MINIO_ACCESS_KEY}:${SECRET_MINIO_SECRET_KEY}@${SECRET_MINIO_ENDPOINT}/loki
- s3forcepathstyle: true
- boltdb_shipper:
- active_index_directory: /data/loki/index
- cache_location: /data/loki/index_cache
- resync_interval: 5s
- shared_store: s3
- limits_config:
- enforce_metric_name: false
- reject_old_samples: true
- reject_old_samples_max_age: 168h
- extraPorts:
- - port: 7956
- protocol: TCP
- name: loki-gossip-ring
- targetPort: 7946
- serviceMonitor:
- enabled: true
- podAnnotations:
- prometheus.io/scrape: "true"
- prometheus.io/port: "http-metrics"
- promtail:
- image:
- registry: docker.io
- repository: grafana/promtail
- tag: latest
- pullPolicy: Always
- serviceMonitor:
- enabled: true
- extraScrapeConfigs:
- - job_name: syslog
- syslog:
- listen_address: 0.0.0.0:1514
- label_structured_data: true
- labels:
- job: "syslog"
- relabel_configs:
- - source_labels: ['__syslog_connection_ip_address']
- target_label: 'ip_address'
- - source_labels: ['__syslog_message_severity']
- target_label: 'severity'
- - source_labels: ['__syslog_message_facility']
- target_label: 'facility'
- - source_labels: ['__syslog_message_hostname']
- target_label: 'host'
- - source_labels: ['__syslog_message_app_name']
- target_label: 'app'
- - source_labels: ['__syslog_message_SRC']
- target_label: 'source_ip'
- - source_labels: ['__syslog_message_SPT']
- target_label: 'source_port'
- - source_labels: ['__syslog_message_DPT']
- target_label: 'destination_port'
- - source_labels: ['__syslog_message_DST']
- target_label: 'destination_ip'
- pipeline_stages:
- # - job_name: pfsense
- # syslog:
- # listen_address: 0.0.0.0:1514
- # idle_timeout: 60s
- # label_structured_data: false
- # labels:
- # job: "syslog"
- # host: pfsense
- # relabel_configs:
- # - source_labels: ["__syslog_message_severity"]
- # target_label: "severity"
- # #- source_labels: ['__syslog_message_facility']
- # # target_label: 'facility'
- # - source_labels: ["__syslog_message_app_name"]
- # target_label: "app_name"
- # pipeline_stages:
- # - match:
- # selector: '{app_name="filterlog"}'
- # stages:
- # - regex:
-     #                   expression: '(?P<pfsense_fw_rule>\d*?),(?P<pfsense_fw_subrule>\d*?),(?P<pfsense_fw_anchor>\d*?),(?P<pfsense_fw_tracker>\d*?),(?P<pfsense_fw_interface>igb.{1,5}?),(?P<pfsense_fw_reason>\w*?),(?P<pfsense_fw_action>\w*?),(?P<pfsense_fw_direction>\w*?),(?P<pfsense_fw_ip_version>4{1}?),(?P<pfsense_fw_tos>\w*?),(?P<pfsense_fw_ecn>\w*?),(?P<pfsense_fw_ttl>\w*?),(?P<pfsense_fw_id>\w*?),(?P<pfsense_fw_offset>\w*?),(?P<pfsense_fw_flag>\w*?),(?P<pfsense_fw_protocol_id>\d*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_length>\d*?),(?P<pfsense_fw_source_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_destination_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_source_port>\d+?),(?P<pfsense_fw_destination_port>\d+?),(?P<pfsense_fw_data_length>\d+?)'
- # # ipv6 // ,(?P6{1}?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\d*?),
- # - labels:
- # pfsense_fw_rule: ""
- # #pfsense_fw_subrule: ''
- # #pfsense_fw_anchor: ''
- # pfsense_fw_tracker: ""
- # pfsense_fw_interface: ""
- # pfsense_fw_reason: ""
- # pfsense_fw_action: ""
- # pfsense_fw_direction: ""
- # #pfsense_fw_ip_version: ''
- # #pfsense_fw_tos: ''
- # #pfsense_fw_ecn: ''
- # #pfsense_fw_ttl: ''
- # #pfsense_fw_id: ''
- # #pfsense_fw_offset: ''
- # #pfsense_fw_flag: ''
- # pfsense_fw_protocol_id: ""
- # pfsense_fw_protocol_text: ""
- # #pfsense_fw_length: ''
- # pfsense_fw_source_address: ""
- # pfsense_fw_destination_address: ""
- # pfsense_fw_source_port: ""
- # pfsense_fw_destination_port: ""
- # #pfsense_fw_data_length: ''
- # # - metrics:
- # # lines_total:
- # # type: Counter
- # # description: "pfsense firewall : total number of log lines"
- # # prefix: pfsense_firewall_
- # # match_all: true
- # # count_entry_bytes: true
- # # config:
- # # action: add
- syslogService:
- enabled: true
- type: LoadBalancer
- port: 1514
- externalIPs:
- - ${CLUSTER_LB_LOKI_SYSLOG}
- externalTrafficPolicy: Local
diff --git a/cluster/apps/monitoring/loki-stack/kustomization.yaml b/cluster/apps/monitoring/loki-stack/kustomization.yaml
deleted file mode 100644
index d1c0a463d..000000000
--- a/cluster/apps/monitoring/loki-stack/kustomization.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
- - helm-release.yaml
- - prometheus-rule.yaml
diff --git a/cluster/apps/monitoring/loki-stack/prometheus-rule.yaml b/cluster/apps/monitoring/loki-stack/prometheus-rule.yaml
deleted file mode 100644
index 707c248cb..000000000
--- a/cluster/apps/monitoring/loki-stack/prometheus-rule.yaml
+++ /dev/null
@@ -1,109 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
- name: loki.rules
- namespace: monitoring
-spec:
- groups:
- - name: loki.rules
- rules:
- - alert: LokiRequestErrors
- annotations:
- message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
- expr: |
- 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
- /
- sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
- > 10
- for: 15m
- labels:
- severity: critical
- - alert: LokiRequestPanics
- annotations:
- message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
- expr: |
- sum(increase(loki_panic_total[10m])) by (namespace, job)
- > 0
- labels:
- severity: critical
- - alert: LokiRequestLatency
- annotations:
- message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
- expr: |
- namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
- > 1
- for: 15m
- labels:
- severity: critical
- - expr: |
- histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, job))
- record: job:loki_request_duration_seconds:99quantile
- - expr: |
- histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, job))
- record: job:loki_request_duration_seconds:50quantile
- - expr: |
- sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
- /
- sum(rate(loki_request_duration_seconds_count[1m])) by (job)
- record: job:loki_request_duration_seconds:avg
- - expr: |
- sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, job)
- record: job:loki_request_duration_seconds_bucket:sum_rate
- - expr: |
- sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
- record: job:loki_request_duration_seconds_sum:sum_rate
- - expr: |
- sum(rate(loki_request_duration_seconds_count[1m])) by (job)
- record: job:loki_request_duration_seconds_count:sum_rate
- - expr: |
- histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, job, route))
- record: job_route:loki_request_duration_seconds:99quantile
- - expr: |
- histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, job, route))
- record: job_route:loki_request_duration_seconds:50quantile
- - expr: |
- sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
- /
- sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
- record: job_route:loki_request_duration_seconds:avg
- - expr: |
- sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, job, route)
- record: job_route:loki_request_duration_seconds_bucket:sum_rate
- - expr: |
- sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
- record: job_route:loki_request_duration_seconds_sum:sum_rate
- - expr: |
- sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
- record: job_route:loki_request_duration_seconds_count:sum_rate
- - expr: |
- histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, namespace, job, route))
- record: namespace_job_route:loki_request_duration_seconds:99quantile
- - expr: |
- histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, namespace, job, route))
- record: namespace_job_route:loki_request_duration_seconds:50quantile
- - expr: |
- sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
- /
- sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
- record: namespace_job_route:loki_request_duration_seconds:avg
- - expr: |
- sum(rate(loki_request_duration_seconds_bucket[1m]))
- by (le, namespace, job, route)
- record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- - expr: |
- sum(rate(loki_request_duration_seconds_sum[1m]))
- by (namespace, job, route)
- record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- - expr: |
- sum(rate(loki_request_duration_seconds_count[1m]))
- by (namespace, job, route)
- record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
diff --git a/cluster/apps/monitoring/thanos/helm-release.yaml b/cluster/apps/monitoring/thanos/helm-release.yaml
index cf1c53ce0..4c50813eb 100644
--- a/cluster/apps/monitoring/thanos/helm-release.yaml
+++ b/cluster/apps/monitoring/thanos/helm-release.yaml
@@ -5,21 +5,30 @@ metadata:
name: thanos
namespace: monitoring
spec:
- interval: 5m
+ interval: 15m
chart:
spec:
- # renovate: registryUrl=https://charts.bitnami.com/bitnami
chart: thanos
version: 11.4.0
sourceRef:
kind: HelmRepository
name: bitnami-charts
namespace: flux-system
- interval: 5m
+ install:
+ createNamespace: true
+ remediation:
+ retries: 5
+ upgrade:
+ remediation:
+ retries: 5
+ dependsOn:
+ - name: kube-prometheus-stack
+ namespace: monitoring
values:
query:
enabled: true
replicaCount: 2
+ podAntiAffinityPreset: hard
replicaLabels:
- replica
dnsDiscovery:
@@ -27,46 +36,26 @@ spec:
sidecarsNamespace: monitoring
ingress:
enabled: true
- hostname: "thanos.${SECRET_CLUSTER_DOMAIN}"
+ hostname: &host "thanos-query.${SECRET_CLUSTER_DOMAIN}"
annotations:
nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
- # traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
- # traefik.ingress.kubernetes.io/router.middlewares: networking-forward-auth@kubernetescrd
+ ingressClassName: "nginx"
tls: true
+ extraTls:
+ - hosts:
+ - *host
queryFrontend:
enabled: false
bucketweb:
- enabled: true
+ enabled: false
compactor:
- enabled: true
- strategyType: Recreate
- persistence:
- size: 30Gi
+ enabled: false
storegateway:
- enabled: true
+ enabled: false
ruler:
enabled: false
metrics:
enabled: true
serviceMonitor:
enabled: true
- objstoreConfig: |-
- type: s3
- config:
- bucket: thanos
- endpoint: ${SECRET_MINIO_ENDPOINT}
- access_key: "${SECRET_MINIO_ACCESS_KEY}"
- secret_key: "${SECRET_MINIO_SECRET_KEY}"
- insecure: false
-
- postRenderers:
- - kustomize:
- patchesJson6902:
- - target:
- kind: Ingress
- name: thanos-query
- patch:
- - op: add
- path: /spec/ingressClassName
- value: nginx
diff --git a/cluster/apps/monitoring/thanos/kustomization.yaml b/cluster/apps/monitoring/thanos/kustomization.yaml
index 73ca882a9..2fa2de20c 100644
--- a/cluster/apps/monitoring/thanos/kustomization.yaml
+++ b/cluster/apps/monitoring/thanos/kustomization.yaml
@@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- # - prometheus-rule.yaml
diff --git a/cluster/apps/monitoring/thanos/prometheus-rule.yaml b/cluster/apps/monitoring/thanos/prometheus-rule.yaml
deleted file mode 100644
index 5b64cfaec..000000000
--- a/cluster/apps/monitoring/thanos/prometheus-rule.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
- name: thanos.rules
- namespace: monitoring
-spec:
- groups:
- - name: thanos.rules
- rules:
- - alert: ThanosCompactionHalted
- expr: |
- thanos_compactor_halted == 1
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: "Thanos compaction halted on {{ $labels.instance }}"
- description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: ThanosCompactBucketOperationFailure
- expr: |
- rate(thanos_objstore_bucket_operation_failures_total[1m])
- > 0
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
- description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: ThanosCompactNotRun
- expr: |
- (time() - thanos_objstore_bucket_last_successful_upload_time)
- > 24*60*60
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: "Thanos compact not run on {{ $labels.instance }}"
- description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/cluster/charts/infracloudio-charts.yaml b/cluster/charts/infracloudio-charts.yaml
deleted file mode 100644
index c4a009310..000000000
--- a/cluster/charts/infracloudio-charts.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
----
-apiVersion: source.toolkit.fluxcd.io/v1beta1
-kind: HelmRepository
-metadata:
- name: infracloudio-charts
- namespace: flux-system
-spec:
- interval: 1h
- url: https://infracloudio.github.io/charts
- timeout: 3m
diff --git a/cluster/charts/kustomization.yaml b/cluster/charts/kustomization.yaml
index 4e400644a..bb0dfe6ed 100644
--- a/cluster/charts/kustomization.yaml
+++ b/cluster/charts/kustomization.yaml
@@ -12,7 +12,6 @@ resources:
- gitea-charts.yaml
- grafana-charts.yaml
- influxdata-charts.yaml
- - infracloudio-charts.yaml
- ingress-nginx-charts.yaml
- jetstack-charts.yaml
- k8s-at-home.yaml