mirror of https://github.com/auricom/home-cluster.git (synced 2025-09-17 18:24:14 +02:00)

Commit: ♻️ monitoring
botkube HelmRelease (file deleted):

@@ -1,182 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: botkube
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://infracloudio.github.io/charts
      chart: botkube
      version: v0.12.4
      sourceRef:
        kind: HelmRepository
        name: infracloudio-charts
        namespace: flux-system
      interval: 5m
  values:
    image:
      repository: infracloudio/botkube
      tag: v0.12.1
    serviceMonitor:
      enabled: true
    config:
      settings:
        clustername: k3s
      resources:
        - name: v1/pods # Name of the resources e.g pod, deployment, ingress, etc. (Resource name must be in singular form)
          namespaces:
            include:
              - all
            ignore: # List of namespaces to be ignored (omitempty), used only with include: all
              - kasten-io # example : include [all], ignore [x,y,z]
              - kube-system
          events: # List of lifecycle events you want to receive, e.g create, update, delete, error OR all
            - create
            - delete
        - name: v1/services
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: apps/v1/deployments
          namespaces:
            include:
              - all
          events:
            - create
            - update
            - delete
            - error
          updateSetting:
            includeDiff: true
            fields:
              - spec.template.spec.containers[*].image
        - name: apps/v1/statefulsets
          namespaces:
            include:
              - all
          events:
            - create
            - update
            - delete
            - error
          updateSetting:
            includeDiff: true
            fields:
              - spec.template.spec.containers[*].image
        - name: networking.k8s.io/v1beta1/ingresses
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: v1/nodes
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: v1/namespaces
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: v1/persistentvolumes
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: v1/persistentvolumeclaims
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: v1/secrets
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: v1/configmaps
          namespaces:
            include:
              - all
            ignore:
              - rook-ceph
          events:
            - delete
            - error
        - name: apps/v1/daemonsets
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
            - update
          updateSetting:
            includeDiff: true
            fields:
              - spec.template.spec.containers[*].image
        - name: rbac.authorization.k8s.io/v1/roles
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: rbac.authorization.k8s.io/v1/rolebindings
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: rbac.authorization.k8s.io/v1/clusterroles
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
        - name: rbac.authorization.k8s.io/v1/clusterrolebindings
          namespaces:
            include:
              - all
          events:
            - create
            - delete
            - error
      recommendations: true
    communications:
      discord:
        enabled: true
        notiftype: short
        channel: "778626068637679707"
        botid: ${SECRET_BOTKUBE_DISCORD_BOTID}
        token: ${SECRET_BOTKUBE_DISCORD_TOKEN}
botkube kustomization.yaml (file deleted):

@@ -1,5 +0,0 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
grafana dashboard JSON (uid sn-bOoWMk), trailing context:

@@ -415,4 +415,4 @@
  "uid": "sn-bOoWMk",
  "version": 1,
  "weekStart": ""
  }
}

grafana dashboard JSON (uid aEY0BVGnz), trailing context:

@@ -368,4 +368,4 @@
  "uid": "aEY0BVGnz",
  "version": 1,
  "weekStart": ""
  }
}
grafana HelmRelease:

@@ -33,6 +33,34 @@ spec:
    admin:
      existingSecret: grafana-admin-creds
    grafana.ini:
      auth:
        signout_redirect_url: "https://login.${SECRET_CLUSTER_DOMAIN}/logout"
        oauth_auto_login: false
      auth.generic_oauth:
        enabled: true
        name: Authelia
        client_id: grafana
        client_secret: "${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}"
        scopes: "openid profile email groups"
        empty_scopes: false
        auth_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/authorization"
        token_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/token"
        api_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/userinfo"
        login_attribute_path: preferred_username
        groups_attribute_path: groups
        name_attribute_path: name
        use_pkce: true
      auth.generic_oauth.group_mapping:
        role_attribute_path: |
          contains(groups[*], 'admins') && 'Admin' || contains(groups[*], 'people') && 'Viewer'
        org_id: 1
      auth.basic:
        disable_login_form: false
      auth.anonymous:
        enabled: true
        org_name: HomeOps
        org_id: 1
        org_role: Viewer
      server:
        root_url: "https://grafana.${SECRET_CLUSTER_DOMAIN}"
      paths:
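For reference, the role_attribute_path above is a JMESPath expression that Grafana evaluates against the OIDC userinfo payload. A minimal sketch of how it maps (the sample user and claims are hypothetical):

# Hypothetical userinfo returned by Authelia's /api/oidc/userinfo endpoint:
#   { "preferred_username": "jane", "name": "Jane", "groups": ["admins", "people"] }
#
# Evaluating:
#   contains(groups[*], 'admins') && 'Admin' || contains(groups[*], 'people') && 'Viewer'
#
#   groups contains "admins"        -> 'Admin'
#   groups contains only "people"   -> 'Viewer'
#   neither group                   -> expression yields null; Grafana falls back to its default org role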
@@ -46,8 +74,6 @@ spec:
        mode: console
      grafana_net:
        url: https://grafana.net
      auth.basic:                  # removed (now set under grafana.ini above)
        disable_login_form: false
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
@@ -71,25 +97,22 @@ spec:
        - name: Prometheus
          type: prometheus
          access: proxy
          url: http://thanos-query:9090/                               # old
          url: http://thanos-query.monitoring.svc.cluster.local:9090   # new
          isDefault: true
        # - name: Loki
        #   type: loki
        #   access: proxy
        #   url: http://loki-gateway:80                                # old
        #   url: http://loki-gateway.monitoring.svc.cluster.local:80   # new
    dashboards:
      default:
        kubernetes-custom:
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temparatures.json
          datasource: Prometheus
        home-assistant:
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json   # old
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/home-assistant.json                         # new
          datasource: Prometheus
        homelab-temperatures:
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json   # old
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json                         # new
          datasource: Prometheus
        truenas:
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json   # old
          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/truenas.json                         # new
          datasource: Prometheus
    sidecar:
      dashboards:
@@ -126,3 +149,14 @@ spec:
          - *host
    persistence:
      enabled: false
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                  - key: app.kubernetes.io/name
                    operator: In
                    values: ["grafana"]
              topologyKey: kubernetes.io/hostname
kube-prometheus-stack HelmRelease:

@@ -5,10 +5,9 @@ metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 5m    # old
  interval: 15m   # new
  chart:
    spec:
      # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
      chart: kube-prometheus-stack
      version: 39.13.3
      sourceRef:
@@ -24,85 +23,37 @@ spec:
    remediation:
      retries: 5
  values:
    alertmanager:   # moved to the end of values in this commit (see the @@ -331,3 +289,93 @@ hunk below)
      config:
        global:
          resolve_timeout: 5m
        receivers:
          - name: "null"
          - name: "pushover"
            pushover_configs:
              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
                send_resolved: true
                html: true
                priority: |-
                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
                url_title: View in Alert Manager
                title: |-
                  [{{ .Status | toUpper -}}
                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
                  ] {{ .CommonLabels.alertname }}
                message: |-
                  {{- range .Alerts }}
                  {{- if ne .Labels.severity "" }}
                  <b>Severity:</b> <i>{{ .Labels.severity }}</i>
                  {{- else }}
                  <b>Severity:</b> <i>N/A</i>
                  {{- end }}
                  {{- if ne .Annotations.description "" }}
                  <b>Description:</b> <i>{{ .Annotations.description }}</i>
                  {{- else if ne .Annotations.summary "" }}
                  <b>Summary:</b> <i>{{ .Annotations.summary }}</i>
                  {{- else if ne .Annotations.message "" }}
                  <b>Message:</b> <i>{{ .Annotations.message }}</i>
                  {{- else }}
                  <b>Description:</b> <i>N/A</i>
                  {{- end }}
                  {{- if gt (len .Labels.SortedPairs) 0 }}
                  <b>Details:</b>
                  {{- range .Labels.SortedPairs }}
                  • <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
                  {{- end }}
                  {{- end }}
                  {{- end }}
        route:
          receiver: "pushover"
          routes:
            - receiver: "null"
              matchers:
                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
            - receiver: "pushover"
              matchers:
                - severity = "critical"
              continue: true
        inhibit_rules:
          - source_matchers:
              - severity = "critical"
            target_matchers:
              - severity = "warning"
            equal: ["alertname", "namespace"]
      alertmanagerSpec:
        replicas: 2
        podAntiAffinity: hard
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 10Gi
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: "nginx"
        annotations:
          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
        tls:
          - hosts:
              - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
    ###
    ### Component values
    ###
    kubeApiServer:
      enabled: true
    kubeControllerManager:
      enabled: false

    kubeEtcd:
      enabled: false

    kubelet:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          - action: replace
            sourceLabels:
              - node
            targetLabel: instance

    kubeProxy:
      enabled: false

    kubeScheduler:
      enabled: false

    kubeStateMetrics:
      enabled: true
    kube-state-metrics:
      metricLabelsAllowlist:
        - "persistentvolumeclaims=[*]"
      prometheus:
        monitor:
          enabled: true
@@ -113,8 +64,42 @@ spec:
            sourceLabels:
              - __meta_kubernetes_pod_node_name
            targetLabel: kubernetes_node
      resources:
        requests:
          cpu: 15m
          memory: 127M
        limits:
          memory: 153M

    grafana:
      enabled: false
      forceDeployDashboards: true

    nodeExporter:
      enabled: true

    prometheus-node-exporter:
      resources:
        requests:
          cpu: 23m
          memory: 64M
        limits:
          memory: 64M
      prometheus:
        monitor:
          enabled: true
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node

    ###
    ### Prometheus operator values
    ###
    prometheusOperator:
      createCustomResource: true
      prometheusConfigReloader:
        resources:
          requests:
@@ -123,35 +108,10 @@ spec:
          limits:
            cpu: 300m
            memory: 50Mi
    nodeExporter:
      enabled: true
      serviceMonitor:
        relabelings:
          - action: replace
            regex: (.*)
            replacement: $1
            sourceLabels:
              - __meta_kubernetes_pod_node_name
            targetLabel: kubernetes_node
    kubelet:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          - action: replace
            sourceLabels:
              - node
            targetLabel: instance
    grafana:
      enabled: false
      forceDeployDashboards: true
    kubeEtcd:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeScheduler:
      enabled: false
    kubeProxy:
      enabled: false

    ###
    ### Prometheus instance values
    ###
    prometheus:
      ingress:
        enabled: true
@@ -171,7 +131,7 @@ spec:
            cpu: 400m
          limits:
            memory: 6000Mi
        replicas: 2   # old
        replicas: 1   # new
        replicaExternalLabelName: "replica"
        podAntiAffinity: hard
        ruleSelector: {}
@@ -183,8 +143,9 @@ spec:
        podMonitorSelector: {}
        podMonitorNamespaceSelector: {}
        podMonitorSelectorNilUsesHelmValues: false
        retention: 2d           # old
        retentionSize: "6GB"    # old
        probeSelectorNilUsesHelmValues: false
        retention: 14d          # new
        retentionSize: "45GB"   # new
        enableAdminAPI: true
        walCompression: true
        storageSpec:
@@ -193,13 +154,10 @@ spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 10Gi   # old
                  storage: 50Gi   # new
        thanos:
          image: quay.io/thanos/thanos:v0.28.0
          version: v0.25.2   # old
          objectStorageConfig:
            name: thanos-objstore-secret
            key: objstore.yml
          version: v0.28.0   # new
        additionalScrapeConfigs:
          - job_name: "opnsense"
            scrape_interval: 60s
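For context, the Thanos sidecar above reads its object-store credentials from the objstore.yml key of the thanos-objstore-secret Secret. A minimal sketch of that file's contents, assuming it mirrors the objstoreConfig the Thanos HelmRelease further down templates out (same substituted secrets):

# objstore.yml (sketch; same shape as the Thanos chart's objstoreConfig below)
type: s3
config:
  bucket: thanos
  endpoint: ${SECRET_MINIO_ENDPOINT}
  access_key: "${SECRET_MINIO_ACCESS_KEY}"
  secret_key: "${SECRET_MINIO_SECRET_KEY}"
  insecure: false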
@@ -331,3 +289,93 @@ spec:
        tls:
          - hosts:
              - *host

    alertmanager:
      config:
        global:
          resolve_timeout: 5m
        receivers:
          - name: "null"
          - name: "pushover"
            pushover_configs:
              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
                send_resolved: true
                html: true
                priority: |-
                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
                url_title: View in Alert Manager
                title: |-
                  [{{ .Status | toUpper -}}
                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
                  ] {{ .CommonLabels.alertname }}
                message: |-
                  {{- range .Alerts }}
                  {{- if ne .Labels.severity "" }}
                  <b>Severity:</b> <i>{{ .Labels.severity }}</i>
                  {{- else }}
                  <b>Severity:</b> <i>N/A</i>
                  {{- end }}
                  {{- if ne .Annotations.description "" }}
                  <b>Description:</b> <i>{{ .Annotations.description }}</i>
                  {{- else if ne .Annotations.summary "" }}
                  <b>Summary:</b> <i>{{ .Annotations.summary }}</i>
                  {{- else if ne .Annotations.message "" }}
                  <b>Message:</b> <i>{{ .Annotations.message }}</i>
                  {{- else }}
                  <b>Description:</b> <i>N/A</i>
                  {{- end }}
                  {{- if gt (len .Labels.SortedPairs) 0 }}
                  <b>Details:</b>
                  {{- range .Labels.SortedPairs }}
                  • <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
                  {{- end }}
                  {{- end }}
                  {{- end }}
        route:
          receiver: "pushover"
          routes:
            - receiver: "null"
              matchers:
                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
            - receiver: "pushover"
              matchers:
                - severity = "critical"
              continue: true
        inhibit_rules:
          - source_matchers:
              - severity = "critical"
            target_matchers:
              - severity = "warning"
            equal: ["alertname", "namespace"]
      alertmanagerSpec:
        replicas: 1
        podAntiAffinity: hard
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: rook-ceph-block
              resources:
                requests:
                  storage: 1Gi
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: "nginx"
        annotations:
          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
        tls:
          - hosts:
              - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
    prometheus:
      monitor:
        enabled: true
        relabelings:
          - action: replace
            regex: (.*)
            replacement: $1
            sourceLabels:
              - __meta_kubernetes_pod_node_name
            targetLabel: kubernetes_node
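As a worked example of the priority and title templates above (the alert name is hypothetical), a group of two firing alerts with CommonLabels.alertname = "CPUThrottlingHigh" renders as:

# While firing:
#   priority -> 1
#   title    -> [FIRING:2] CPUThrottlingHigh
# Once resolved (send_resolved: true):
#   priority -> 0
#   title    -> [RESOLVED] CPUThrottlingHigh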
monitoring kustomization.yaml:

@@ -3,10 +3,8 @@ kind: Kustomization
resources:
  - namespace.yaml
  - blackbox-exporter
  - botkube   # removed (directory deleted in this commit)
  - grafana
  - healthchecks
  - kube-prometheus-stack
  #- loki-stack
  - thanos
  - uptime-kuma
loki-stack HelmRelease (file deleted):

@@ -1,186 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: loki-stack
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://grafana.github.io/helm-charts
      chart: loki-stack
      version: 2.6.5
      sourceRef:
        kind: HelmRepository
        name: grafana-loki-charts
        namespace: flux-system
      interval: 5m
  values:
    loki:
      image:
        repository: grafana/loki
        tag: 2.6.1
        pullPolicy: IfNotPresent
      replicas: 3
      persistence:
        enabled: false
      config:
        auth_enabled: false
        server:
          http_listen_port: 3100
        distributor:
          ring:
            kvstore:
              store: memberlist
        ingester:
          lifecycler:
            ring:
              kvstore:
                store: memberlist
              replication_factor: 1
            final_sleep: 0s
          chunk_idle_period: 5m
          chunk_retain_period: 30s
        memberlist:
          abort_if_cluster_join_fails: false
          # Expose this port on all distributor, ingester
          # and querier replicas.
          bind_port: 7946
          # You can use a headless k8s service for all distributor,
          # ingester and querier components.
          join_members:
            - loki-stack-headless.monitoring.svc.cluster.local:7946
          # max_join_backoff: 1m
          # max_join_retries: 10
          # min_join_backoff: 1s
        schema_config:
          configs:
            - from: "2020-10-24"
              store: boltdb-shipper
              object_store: s3
              schema: v11
              index:
                prefix: index_
                period: 24h
        storage_config:
          aws:
            insecure: false
            s3: https://${SECRET_MINIO_ACCESS_KEY}:${SECRET_MINIO_SECRET_KEY}@${SECRET_MINIO_ENDPOINT}/loki
            s3forcepathstyle: true
          boltdb_shipper:
            active_index_directory: /data/loki/index
            cache_location: /data/loki/index_cache
            resync_interval: 5s
            shared_store: s3
        limits_config:
          enforce_metric_name: false
          reject_old_samples: true
          reject_old_samples_max_age: 168h
      extraPorts:
        - port: 7956
          protocol: TCP
          name: loki-gossip-ring
          targetPort: 7946
      serviceMonitor:
        enabled: true
      podAnnotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "http-metrics"
    promtail:
      image:
        registry: docker.io
        repository: grafana/promtail
        tag: latest
        pullPolicy: Always
      serviceMonitor:
        enabled: true
      extraScrapeConfigs:
        - job_name: syslog
          syslog:
            listen_address: 0.0.0.0:1514
            label_structured_data: true
            labels:
              job: "syslog"
          relabel_configs:
            - source_labels: ['__syslog_connection_ip_address']
              target_label: 'ip_address'
            - source_labels: ['__syslog_message_severity']
              target_label: 'severity'
            - source_labels: ['__syslog_message_facility']
              target_label: 'facility'
            - source_labels: ['__syslog_message_hostname']
              target_label: 'host'
            - source_labels: ['__syslog_message_app_name']
              target_label: 'app'
            - source_labels: ['__syslog_message_SRC']
              target_label: 'source_ip'
            - source_labels: ['__syslog_message_SPT']
              target_label: 'source_port'
            - source_labels: ['__syslog_message_DPT']
              target_label: 'destination_port'
            - source_labels: ['__syslog_message_DST']
              target_label: 'destination_ip'
          pipeline_stages:
        # - job_name: pfsense
        #   syslog:
        #     listen_address: 0.0.0.0:1514
        #     idle_timeout: 60s
        #     label_structured_data: false
        #     labels:
        #       job: "syslog"
        #       host: pfsense
        #   relabel_configs:
        #     - source_labels: ["__syslog_message_severity"]
        #       target_label: "severity"
        #     #- source_labels: ['__syslog_message_facility']
        #     #  target_label: 'facility'
        #     - source_labels: ["__syslog_message_app_name"]
        #       target_label: "app_name"
        #   pipeline_stages:
        #     - match:
        #         selector: '{app_name="filterlog"}'
        #         stages:
        #           - regex:
        #               expression: '(?P<pfsense_fw_rule>\d*?),(?P<pfsense_fw_subrule>\d*?),(?P<pfsense_fw_anchor>\d*?),(?P<pfsense_fw_tracker>\d*?),(?P<pfsense_fw_interface>igb.{1,5}?),(?P<pfsense_fw_reason>\w*?),(?P<pfsense_fw_action>\w*?),(?P<pfsense_fw_direction>\w*?),(?P<pfsense_fw_ip_version>4{1}?),(?P<pfsense_fw_tos>\w*?),(?P<pfsense_fw_ecn>\w*?),(?P<pfsense_fw_ttl>\w*?),(?P<pfsense_fw_id>\w*?),(?P<pfsense_fw_offset>\w*?),(?P<pfsense_fw_flag>\w*?),(?P<pfsense_fw_protocol_id>\d*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_length>\d*?),(?P<pfsense_fw_source_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_destination_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_source_port>\d+?),(?P<pfsense_fw_destination_port>\d+?),(?P<pfsense_fw_data_length>\d+?)'
        #           # ipv6 // ,(?P<pfsense_fw_ip_version>6{1}?),(?P<pfsense_fw_lass>\w*?),(?P<pfsense_fw_flow_label>\w*?),(?P<pfsense_fw_hop_limit>\w*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_protocol_id>\d*?),
        #           - labels:
        #               pfsense_fw_rule: ""
        #               #pfsense_fw_subrule: ''
        #               #pfsense_fw_anchor: ''
        #               pfsense_fw_tracker: ""
        #               pfsense_fw_interface: ""
        #               pfsense_fw_reason: ""
        #               pfsense_fw_action: ""
        #               pfsense_fw_direction: ""
        #               #pfsense_fw_ip_version: ''
        #               #pfsense_fw_tos: ''
        #               #pfsense_fw_ecn: ''
        #               #pfsense_fw_ttl: ''
        #               #pfsense_fw_id: ''
        #               #pfsense_fw_offset: ''
        #               #pfsense_fw_flag: ''
        #               pfsense_fw_protocol_id: ""
        #               pfsense_fw_protocol_text: ""
        #               #pfsense_fw_length: ''
        #               pfsense_fw_source_address: ""
        #               pfsense_fw_destination_address: ""
        #               pfsense_fw_source_port: ""
        #               pfsense_fw_destination_port: ""
        #               #pfsense_fw_data_length: ''
        #           # - metrics:
        #           #     lines_total:
        #           #       type: Counter
        #           #       description: "pfsense firewall : total number of log lines"
        #           #       prefix: pfsense_firewall_
        #           #       match_all: true
        #           #       count_entry_bytes: true
        #           #       config:
        #           #         action: add
      syslogService:
        enabled: true
        type: LoadBalancer
        port: 1514
        externalIPs:
          - ${CLUSTER_LB_LOKI_SYSLOG}
        externalTrafficPolicy: Local
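For context on the relabel_configs in the deleted promtail job: promtail exposes syslog metadata as internal __syslog_* labels, and each rule copies one into a queryable stream label. A sketch of the mapping for one hypothetical message:

# Hypothetical RFC5424 message arriving from 192.168.1.1, app-name "filterlog":
#   __syslog_connection_ip_address = "192.168.1.1"   -> ip_address
#   __syslog_message_severity      = "info"          -> severity
#   __syslog_message_hostname      = "opnsense"      -> host
#   __syslog_message_app_name      = "filterlog"     -> app
# With label_structured_data: true, structured-data fields such as SRC/DST/SPT/DPT
# surface as __syslog_message_SRC etc. and map to source_ip, destination_ip,
# source_port, and destination_port.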
loki-stack kustomization.yaml (file deleted):

@@ -1,6 +0,0 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - prometheus-rule.yaml
loki.rules PrometheusRule (file deleted):

@@ -1,109 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      rules:
        - alert: LokiRequestErrors
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical
        - alert: LokiRequestPanics
          annotations:
            message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical
        - alert: LokiRequestLatency
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, job))
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, job))
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, job, route))
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, job, route))
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m]))
            by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m]))
            by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
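Note the internal dependency in this deleted rule file: the LokiRequestLatency alert consumes a series produced by one of the recording rules below it, so deleting the file removes both sides and leaves no dangling alert:

# Alert expression:
#   namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
# Fed by the recording rule:
#   record: namespace_job_route:loki_request_duration_seconds:99quantile
#   expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
#         by (le, namespace, job, route))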
thanos HelmRelease:

@@ -5,21 +5,30 @@ metadata:
  name: thanos
  namespace: monitoring
spec:
  interval: 5m    # old
  interval: 15m   # new
  chart:
    spec:
      # renovate: registryUrl=https://charts.bitnami.com/bitnami
      chart: thanos
      version: 11.4.0
      sourceRef:
        kind: HelmRepository
        name: bitnami-charts
        namespace: flux-system
      interval: 5m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  dependsOn:
    - name: kube-prometheus-stack
      namespace: monitoring
  values:
    query:
      enabled: true
      replicaCount: 2
      podAntiAffinityPreset: hard
      replicaLabels:
        - replica
      dnsDiscovery:
@@ -27,46 +36,26 @@ spec:
        sidecarsNamespace: monitoring
      ingress:
        enabled: true
        hostname: "thanos.${SECRET_CLUSTER_DOMAIN}"               # old
        hostname: &host "thanos-query.${SECRET_CLUSTER_DOMAIN}"   # new
        annotations:
          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
          # traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
          # traefik.ingress.kubernetes.io/router.middlewares: networking-forward-auth@kubernetescrd
        ingressClassName: "nginx"
        tls: true
        extraTls:
          - hosts:
              - *host
    queryFrontend:
      enabled: false
    bucketweb:
      enabled: true    # old
      enabled: false   # new
    compactor:
      enabled: true    # old
      strategyType: Recreate
      persistence:
        size: 30Gi
      enabled: false   # new
    storegateway:
      enabled: true    # old
      enabled: false   # new
    ruler:
      enabled: false
    metrics:
      enabled: true
      serviceMonitor:
        enabled: true
    objstoreConfig: |-
      type: s3
      config:
        bucket: thanos
        endpoint: ${SECRET_MINIO_ENDPOINT}
        access_key: "${SECRET_MINIO_ACCESS_KEY}"
        secret_key: "${SECRET_MINIO_SECRET_KEY}"
        insecure: false

  postRenderers:
    - kustomize:
        patchesJson6902:
          - target:
              kind: Ingress
              name: thanos-query
            patch:
              - op: add
                path: /spec/ingressClassName
                value: nginx
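For reference, the patchesJson6902 post-renderer above rewrites the chart-rendered Ingress after Helm templating. A sketch of the resulting object, with non-essential fields omitted and the host taken from the hostname anchor above:

# thanos-query Ingress after the JSON6902 "add" op (sketch)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: thanos-query
spec:
  ingressClassName: nginx   # injected by the post-renderer
  rules:
    - host: thanos-query.${SECRET_CLUSTER_DOMAIN}
      # chart-rendered paths/backends unchanged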
thanos kustomization.yaml:

@@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
# - prometheus-rule.yaml
thanos.rules PrometheusRule (file deleted):

@@ -1,39 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: thanos.rules
  namespace: monitoring
spec:
  groups:
    - name: thanos.rules
      rules:
        - alert: ThanosCompactionHalted
          expr: |
            thanos_compactor_halted == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compaction halted on {{ $labels.instance }}"
            description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: ThanosCompactBucketOperationFailure
          expr: |
            rate(thanos_objstore_bucket_operation_failures_total[1m])
              > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
            description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: ThanosCompactNotRun
          expr: |
            (time() - thanos_objstore_bucket_last_successful_upload_time)
              > 24*60*60
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Thanos compact not run on {{ $labels.instance }}"
            description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
infracloudio HelmRepository (file deleted):

@@ -1,10 +0,0 @@
---
apiVersion: source.toolkit.fluxcd.io/v1beta1
kind: HelmRepository
metadata:
  name: infracloudio-charts
  namespace: flux-system
spec:
  interval: 1h
  url: https://infracloudio.github.io/charts
  timeout: 3m
flux-system charts kustomization:

@@ -12,7 +12,6 @@ resources:
  - gitea-charts.yaml
  - grafana-charts.yaml
  - influxdata-charts.yaml
  - infracloudio-charts.yaml   # removed (HelmRepository deleted above)
  - ingress-nginx-charts.yaml
  - jetstack-charts.yaml
  - k8s-at-home.yaml