Mirror of https://github.com/auricom/home-cluster.git

Commit: ♻️ monitoring
@@ -1,182 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: botkube
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://infracloudio.github.io/charts
-      chart: botkube
-      version: v0.12.4
-      sourceRef:
-        kind: HelmRepository
-        name: infracloudio-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    image:
-      repository: infracloudio/botkube
-      tag: v0.12.1
-    serviceMonitor:
-      enabled: true
-    config:
-      settings:
-        clustername: k3s
-      resources:
-        - name: v1/pods # Name of the resources e.g pod, deployment, ingress, etc. (Resource name must be in singular form)
-          namespaces:
-            include:
-              - all
-            ignore: # List of namespaces to be ignored (omitempty), used only with include: all
-              - kasten-io # example : include [all], ignore [x,y,z]
-              - kube-system
-          events: # List of lifecycle events you want to receive, e.g create, update, delete, error OR all
-            - create
-            - delete
-        - name: v1/services
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: apps/v1/deployments
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - update
-            - delete
-            - error
-          updateSetting:
-            includeDiff: true
-            fields:
-              - spec.template.spec.containers[*].image
-        - name: apps/v1/statefulsets
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - update
-            - delete
-            - error
-          updateSetting:
-            includeDiff: true
-            fields:
-              - spec.template.spec.containers[*].image
-        - name: networking.k8s.io/v1beta1/ingresses
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/nodes
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/namespaces
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/persistentvolumes
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/persistentvolumeclaims
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/secrets
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/configmaps
-          namespaces:
-            include:
-              - all
-            ignore:
-              - rook-ceph
-          events:
-            - delete
-            - error
-        - name: apps/v1/daemonsets
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-            - update
-          updateSetting:
-            includeDiff: true
-            fields:
-              - spec.template.spec.containers[*].image
-        - name: rbac.authorization.k8s.io/v1/roles
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/rolebindings
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/clusterroles
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/clusterrolebindings
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-      recommendations: true
-    communications:
-      discord:
-        enabled: true
-        notiftype: short
-        channel: "778626068637679707"
-        botid: ${SECRET_BOTKUBE_DISCORD_BOTID}
-        token: ${SECRET_BOTKUBE_DISCORD_TOKEN}
@@ -1,5 +0,0 @@
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - helm-release.yaml
@@ -415,4 +415,4 @@
   "uid": "sn-bOoWMk",
   "version": 1,
   "weekStart": ""
 }
@@ -368,4 +368,4 @@
   "uid": "aEY0BVGnz",
   "version": 1,
   "weekStart": ""
 }
@@ -33,6 +33,34 @@ spec:
     admin:
       existingSecret: grafana-admin-creds
     grafana.ini:
+      auth:
+        signout_redirect_url: "https://login.${SECRET_CLUSTER_DOMAIN}/logout"
+        oauth_auto_login: false
+      auth.generic_oauth:
+        enabled: true
+        name: Authelia
+        client_id: grafana
+        client_secret: "${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}"
+        scopes: "openid profile email groups"
+        empty_scopes: false
+        auth_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/authorization"
+        token_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/token"
+        api_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/userinfo"
+        login_attribute_path: preferred_username
+        groups_attribute_path: groups
+        name_attribute_path: name
+        use_pkce: true
+      auth.generic_oauth.group_mapping:
+        role_attribute_path: |
+          contains(groups[*], 'admins') && 'Admin' || contains(groups[*], 'people') && 'Viewer'
+        org_id: 1
+      auth.basic:
+        disable_login_form: false
+      auth.anonymous:
+        enabled: true
+        org_name: HomeOps
+        org_id: 1
+        org_role: Viewer
       server:
         root_url: "https://grafana.${SECRET_CLUSTER_DOMAIN}"
       paths:
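Note: the auth.generic_oauth block added above points Grafana at Authelia's OIDC endpoints. The matching client registration lives on the Authelia side and is not part of this commit; a minimal sketch, assuming Authelia's identity_providers.oidc schema, with the redirect URI derived from the root_url above:

    # Hypothetical Authelia-side client registration (not in this diff)
    identity_providers:
      oidc:
        clients:
          - id: grafana
            secret: ${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}
            scopes: [openid, profile, email, groups]
            redirect_uris:
              # Grafana's generic_oauth callback path
              - https://grafana.${SECRET_CLUSTER_DOMAIN}/login/generic_oauth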
@@ -46,8 +74,6 @@ spec:
         mode: console
       grafana_net:
         url: https://grafana.net
-      auth.basic:
-        disable_login_form: false
     dashboardProviders:
       dashboardproviders.yaml:
         apiVersion: 1
@@ -71,25 +97,22 @@ spec:
         - name: Prometheus
           type: prometheus
           access: proxy
-          url: http://thanos-query:9090/
+          url: http://thanos-query.monitoring.svc.cluster.local:9090
           isDefault: true
        # - name: Loki
        #   type: loki
        #   access: proxy
-       #   url: http://loki-gateway:80
+       #   url: http://loki-gateway.monitoring.svc.cluster.local:80
     dashboards:
       default:
-        kubernetes-custom:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temparatures.json
-          datasource: Prometheus
         home-assistant:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
+          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
           datasource: Prometheus
         homelab-temperatures:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
+          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
           datasource: Prometheus
         truenas:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
+          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/truenas.json
           datasource: Prometheus
     sidecar:
       dashboards:
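Note: the datasource URL change swaps short service names for fully qualified in-cluster DNS names. An illustrative gloss of the naming scheme (hostnames follow the standard service.namespace.svc.cluster.local convention):

    # Kubernetes service DNS, illustrative:
    #   thanos-query                                     resolves only from pods
    #                                                    in the same namespace
    #   thanos-query.monitoring.svc.cluster.local:9090   resolves from any
    #                                                    namespace in the cluster
    # The fully qualified form keeps the datasource working even if Grafana
    # is later moved out of the monitoring namespace.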
@@ -126,3 +149,14 @@ spec:
         - *host
     persistence:
       enabled: false
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                  - key: app.kubernetes.io/name
+                    operator: In
+                    values: ["grafana"]
+              topologyKey: kubernetes.io/hostname
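A note on the affinity added above (my gloss, not from the commit): it is the "preferred", not the "required", flavor of anti-affinity, so scheduling never hard-fails on a small cluster.

    # Anti-affinity semantics, illustrative:
    #   preferredDuringSchedulingIgnoredDuringExecution - the scheduler tries to
    #     spread grafana pods across nodes (topologyKey kubernetes.io/hostname)
    #     but will still place a pod if only one node is available
    #   requiredDuringSchedulingIgnoredDuringExecution  - would instead refuse to
    #     schedule a second pod onto the same node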
@@ -5,10 +5,9 @@ metadata:
   name: kube-prometheus-stack
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
       chart: kube-prometheus-stack
       version: 39.13.3
       sourceRef:
@@ -24,85 +23,37 @@ spec:
     remediation:
       retries: 5
   values:
-    alertmanager:
-      config:
-        global:
-          resolve_timeout: 5m
-        receivers:
-          - name: "null"
-          - name: "pushover"
-            pushover_configs:
-              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
-                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
-                send_resolved: true
-                html: true
-                priority: |-
-                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
-                url_title: View in Alert Manager
-                title: |-
-                  [{{ .Status | toUpper -}}
-                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
-                  ] {{ .CommonLabels.alertname }}
-                message: |-
-                  {{- range .Alerts }}
-                  {{- if ne .Labels.severity "" }}
-                  <b>Severity:</b> <i>{{ .Labels.severity }}</i>
-                  {{- else }}
-                  <b>Severity:</b> <i>N/A</i>
-                  {{- end }}
-                  {{- if ne .Annotations.description "" }}
-                  <b>Description:</b> <i>{{ .Annotations.description }}</i>
-                  {{- else if ne .Annotations.summary "" }}
-                  <b>Summary:</b> <i>{{ .Annotations.summary }}</i>
-                  {{- else if ne .Annotations.message "" }}
-                  <b>Message:</b> <i>{{ .Annotations.message }}</i>
-                  {{- else }}
-                  <b>Description:</b> <i>N/A</i>
-                  {{- end }}
-                  {{- if gt (len .Labels.SortedPairs) 0 }}
-                  <b>Details:</b>
-                  {{- range .Labels.SortedPairs }}
-                  • <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
-                  {{- end }}
-                  {{- end }}
-                  {{- end }}
-        route:
-          receiver: "pushover"
-          routes:
-            - receiver: "null"
-              matchers:
-                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
-            - receiver: "pushover"
-              matchers:
-                - severity = "critical"
-              continue: true
-        inhibit_rules:
-          - source_matchers:
-              - severity = "critical"
-            target_matchers:
-              - severity = "warning"
-            equal: ["alertname", "namespace"]
-      alertmanagerSpec:
-        replicas: 2
-        podAntiAffinity: hard
-        storage:
-          volumeClaimTemplate:
-            spec:
-              storageClassName: rook-ceph-block
-              resources:
-                requests:
-                  storage: 10Gi
-      ingress:
-        enabled: true
-        pathType: Prefix
-        ingressClassName: "nginx"
-        annotations:
-          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
-          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
-        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
-        tls:
-          - hosts:
-              - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
+    ###
+    ### Component values
+    ###
+    kubeApiServer:
+      enabled: true
+    kubeControllerManager:
+      enabled: false
+
+    kubeEtcd:
+      enabled: false
+
+    kubelet:
+      enabled: true
+      serviceMonitor:
+        metricRelabelings:
+          - action: replace
+            sourceLabels:
+              - node
+            targetLabel: instance
+
+    kubeProxy:
+      enabled: false
+
+    kubeScheduler:
+      enabled: false
+
+    kubeStateMetrics:
+      enabled: true
+    kube-state-metrics:
+      metricLabelsAllowlist:
+        - "persistentvolumeclaims=[*]"
       prometheus:
         monitor:
           enabled: true
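Note: the kubelet metricRelabelings above copy the node label over instance, so kubelet series are keyed by node name rather than scrape address. An illustrative before/after (label values are hypothetical):

    # Before relabeling:
    #   kubelet_running_pods{node="k3s-worker-1", instance="10.0.0.5:10250"}
    # After action: replace with sourceLabels [node] -> targetLabel instance:
    #   kubelet_running_pods{node="k3s-worker-1", instance="k3s-worker-1"}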
@@ -113,8 +64,42 @@ spec:
             sourceLabels:
               - __meta_kubernetes_pod_node_name
             targetLabel: kubernetes_node
+      resources:
+        requests:
+          cpu: 15m
+          memory: 127M
+        limits:
+          memory: 153M
+
+    grafana:
+      enabled: false
+      forceDeployDashboards: true
+
+    nodeExporter:
+      enabled: true
+
+    prometheus-node-exporter:
+      resources:
+        requests:
+          cpu: 23m
+          memory: 64M
+        limits:
+          memory: 64M
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels:
+                - __meta_kubernetes_pod_node_name
+              targetLabel: kubernetes_node
+
+    ###
+    ### Prometheus operator values
+    ###
     prometheusOperator:
-      createCustomResource: true
       prometheusConfigReloader:
         resources:
           requests:
@@ -123,35 +108,10 @@ spec:
         limits:
           cpu: 300m
           memory: 50Mi
-    nodeExporter:
-      enabled: true
-      serviceMonitor:
-        relabelings:
-          - action: replace
-            regex: (.*)
-            replacement: $1
-            sourceLabels:
-              - __meta_kubernetes_pod_node_name
-            targetLabel: kubernetes_node
-    kubelet:
-      enabled: true
-      serviceMonitor:
-        metricRelabelings:
-          - action: replace
-            sourceLabels:
-              - node
-            targetLabel: instance
-    grafana:
-      enabled: false
-      forceDeployDashboards: true
-    kubeEtcd:
-      enabled: false
-    kubeControllerManager:
-      enabled: false
-    kubeScheduler:
-      enabled: false
-    kubeProxy:
-      enabled: false
+
+    ###
+    ### Prometheus instance values
+    ###
     prometheus:
       ingress:
         enabled: true
@@ -171,7 +131,7 @@ spec:
             cpu: 400m
           limits:
             memory: 6000Mi
-        replicas: 2
+        replicas: 1
         replicaExternalLabelName: "replica"
         podAntiAffinity: hard
         ruleSelector: {}
@@ -183,8 +143,9 @@ spec:
         podMonitorSelector: {}
         podMonitorNamespaceSelector: {}
        podMonitorSelectorNilUsesHelmValues: false
-        retention: 2d
-        retentionSize: "6GB"
+        probeSelectorNilUsesHelmValues: false
+        retention: 14d
+        retentionSize: "45GB"
         enableAdminAPI: true
         walCompression: true
         storageSpec:
@@ -193,13 +154,10 @@ spec:
             storageClassName: rook-ceph-block
             resources:
               requests:
-                storage: 10Gi
+                storage: 50Gi
         thanos:
           image: quay.io/thanos/thanos:v0.28.0
-          version: v0.25.2
-          objectStorageConfig:
-            name: thanos-objstore-secret
-            key: objstore.yml
+          version: v0.28.0
         additionalScrapeConfigs:
           - job_name: "opnsense"
             scrape_interval: 60s
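For reference, the retention knobs changed above interact as sketched below; the headroom reasoning is my assumption, not stated in the commit.

    # Sizing relationship, illustrative:
    #   retention: 14d          # time-based cap on local TSDB blocks
    #   retentionSize: "45GB"   # size-based cap; whichever limit is hit
    #                           # first triggers block deletion
    #   storage: 50Gi           # PVC should exceed retentionSize so the WAL
    #                           # and compaction scratch space still fit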
@@ -331,3 +289,93 @@ spec:
         tls:
           - hosts:
               - *host
+
+    alertmanager:
+      config:
+        global:
+          resolve_timeout: 5m
+        receivers:
+          - name: "null"
+          - name: "pushover"
+            pushover_configs:
+              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
+                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
+                send_resolved: true
+                html: true
+                priority: |-
+                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
+                url_title: View in Alert Manager
+                title: |-
+                  [{{ .Status | toUpper -}}
+                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
+                  ] {{ .CommonLabels.alertname }}
+                message: |-
+                  {{- range .Alerts }}
+                  {{- if ne .Labels.severity "" }}
+                  <b>Severity:</b> <i>{{ .Labels.severity }}</i>
+                  {{- else }}
+                  <b>Severity:</b> <i>N/A</i>
+                  {{- end }}
+                  {{- if ne .Annotations.description "" }}
+                  <b>Description:</b> <i>{{ .Annotations.description }}</i>
+                  {{- else if ne .Annotations.summary "" }}
+                  <b>Summary:</b> <i>{{ .Annotations.summary }}</i>
+                  {{- else if ne .Annotations.message "" }}
+                  <b>Message:</b> <i>{{ .Annotations.message }}</i>
+                  {{- else }}
+                  <b>Description:</b> <i>N/A</i>
+                  {{- end }}
+                  {{- if gt (len .Labels.SortedPairs) 0 }}
+                  <b>Details:</b>
+                  {{- range .Labels.SortedPairs }}
+                  • <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
+                  {{- end }}
+                  {{- end }}
+                  {{- end }}
+        route:
+          receiver: "pushover"
+          routes:
+            - receiver: "null"
+              matchers:
+                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
+            - receiver: "pushover"
+              matchers:
+                - severity = "critical"
+              continue: true
+        inhibit_rules:
+          - source_matchers:
+              - severity = "critical"
+            target_matchers:
+              - severity = "warning"
+            equal: ["alertname", "namespace"]
+      alertmanagerSpec:
+        replicas: 1
+        podAntiAffinity: hard
+        storage:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: rook-ceph-block
+              resources:
+                requests:
+                  storage: 1Gi
+      ingress:
+        enabled: true
+        pathType: Prefix
+        ingressClassName: "nginx"
+        annotations:
+          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
+          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
+        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
+        tls:
+          - hosts:
+              - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels:
+                - __meta_kubernetes_pod_node_name
+              targetLabel: kubernetes_node
@@ -3,10 +3,8 @@ kind: Kustomization
 resources:
   - namespace.yaml
   - blackbox-exporter
-  - botkube
   - grafana
   - healthchecks
   - kube-prometheus-stack
-  #- loki-stack
   - thanos
   - uptime-kuma
@@ -1,186 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: loki-stack
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://grafana.github.io/helm-charts
-      chart: loki-stack
-      version: 2.6.5
-      sourceRef:
-        kind: HelmRepository
-        name: grafana-loki-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    loki:
-      image:
-        repository: grafana/loki
-        tag: 2.6.1
-        pullPolicy: IfNotPresent
-      replicas: 3
-      persistence:
-        enabled: false
-      config:
-        auth_enabled: false
-        server:
-          http_listen_port: 3100
-        distributor:
-          ring:
-            kvstore:
-              store: memberlist
-        ingester:
-          lifecycler:
-            ring:
-              kvstore:
-                store: memberlist
-              replication_factor: 1
-            final_sleep: 0s
-          chunk_idle_period: 5m
-          chunk_retain_period: 30s
-        memberlist:
-          abort_if_cluster_join_fails: false
-          # Expose this port on all distributor, ingester
-          # and querier replicas.
-          bind_port: 7946
-          # You can use a headless k8s service for all distributor,
-          # ingester and querier components.
-          join_members:
-            - loki-stack-headless.monitoring.svc.cluster.local:7946
-          # max_join_backoff: 1m
-          # max_join_retries: 10
-          # min_join_backoff: 1s
-        schema_config:
-          configs:
-            - from: "2020-10-24"
-              store: boltdb-shipper
-              object_store: s3
-              schema: v11
-              index:
-                prefix: index_
-                period: 24h
-        storage_config:
-          aws:
-            insecure: false
-            s3: https://${SECRET_MINIO_ACCESS_KEY}:${SECRET_MINIO_SECRET_KEY}@${SECRET_MINIO_ENDPOINT}/loki
-            s3forcepathstyle: true
-          boltdb_shipper:
-            active_index_directory: /data/loki/index
-            cache_location: /data/loki/index_cache
-            resync_interval: 5s
-            shared_store: s3
-        limits_config:
-          enforce_metric_name: false
-          reject_old_samples: true
-          reject_old_samples_max_age: 168h
-      extraPorts:
-        - port: 7956
-          protocol: TCP
-          name: loki-gossip-ring
-          targetPort: 7946
-      serviceMonitor:
-        enabled: true
-      podAnnotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "http-metrics"
-    promtail:
-      image:
-        registry: docker.io
-        repository: grafana/promtail
-        tag: latest
-        pullPolicy: Always
-      serviceMonitor:
-        enabled: true
-      extraScrapeConfigs:
-        - job_name: syslog
-          syslog:
-            listen_address: 0.0.0.0:1514
-            label_structured_data: true
-            labels:
-              job: "syslog"
-          relabel_configs:
-            - source_labels: ['__syslog_connection_ip_address']
-              target_label: 'ip_address'
-            - source_labels: ['__syslog_message_severity']
-              target_label: 'severity'
-            - source_labels: ['__syslog_message_facility']
-              target_label: 'facility'
-            - source_labels: ['__syslog_message_hostname']
-              target_label: 'host'
-            - source_labels: ['__syslog_message_app_name']
-              target_label: 'app'
-            - source_labels: ['__syslog_message_SRC']
-              target_label: 'source_ip'
-            - source_labels: ['__syslog_message_SPT']
-              target_label: 'source_port'
-            - source_labels: ['__syslog_message_DPT']
-              target_label: 'destination_port'
-            - source_labels: ['__syslog_message_DST']
-              target_label: 'destination_ip'
-          pipeline_stages:
-        # - job_name: pfsense
-        #   syslog:
-        #     listen_address: 0.0.0.0:1514
-        #     idle_timeout: 60s
-        #     label_structured_data: false
-        #     labels:
-        #       job: "syslog"
-        #       host: pfsense
-        #   relabel_configs:
-        #     - source_labels: ["__syslog_message_severity"]
-        #       target_label: "severity"
-        #     #- source_labels: ['__syslog_message_facility']
-        #     #  target_label: 'facility'
-        #     - source_labels: ["__syslog_message_app_name"]
-        #       target_label: "app_name"
-        #   pipeline_stages:
-        #     - match:
-        #         selector: '{app_name="filterlog"}'
-        #         stages:
-        #           - regex:
-        #               expression: '(?P<pfsense_fw_rule>\d*?),(?P<pfsense_fw_subrule>\d*?),(?P<pfsense_fw_anchor>\d*?),(?P<pfsense_fw_tracker>\d*?),(?P<pfsense_fw_interface>igb.{1,5}?),(?P<pfsense_fw_reason>\w*?),(?P<pfsense_fw_action>\w*?),(?P<pfsense_fw_direction>\w*?),(?P<pfsense_fw_ip_version>4{1}?),(?P<pfsense_fw_tos>\w*?),(?P<pfsense_fw_ecn>\w*?),(?P<pfsense_fw_ttl>\w*?),(?P<pfsense_fw_id>\w*?),(?P<pfsense_fw_offset>\w*?),(?P<pfsense_fw_flag>\w*?),(?P<pfsense_fw_protocol_id>\d*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_length>\d*?),(?P<pfsense_fw_source_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_destination_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_source_port>\d+?),(?P<pfsense_fw_destination_port>\d+?),(?P<pfsense_fw_data_length>\d+?)'
-        #               # ipv6 // ,(?P<pfsense_fw_ip_version>6{1}?),(?P<pfsense_fw_lass>\w*?),(?P<pfsense_fw_flow_label>\w*?),(?P<pfsense_fw_hop_limit>\w*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_protocol_id>\d*?),
-        #           - labels:
-        #               pfsense_fw_rule: ""
-        #               #pfsense_fw_subrule: ''
-        #               #pfsense_fw_anchor: ''
-        #               pfsense_fw_tracker: ""
-        #               pfsense_fw_interface: ""
-        #               pfsense_fw_reason: ""
-        #               pfsense_fw_action: ""
-        #               pfsense_fw_direction: ""
-        #               #pfsense_fw_ip_version: ''
-        #               #pfsense_fw_tos: ''
-        #               #pfsense_fw_ecn: ''
-        #               #pfsense_fw_ttl: ''
-        #               #pfsense_fw_id: ''
-        #               #pfsense_fw_offset: ''
-        #               #pfsense_fw_flag: ''
-        #               pfsense_fw_protocol_id: ""
-        #               pfsense_fw_protocol_text: ""
-        #               #pfsense_fw_length: ''
-        #               pfsense_fw_source_address: ""
-        #               pfsense_fw_destination_address: ""
-        #               pfsense_fw_source_port: ""
-        #               pfsense_fw_destination_port: ""
-        #               #pfsense_fw_data_length: ''
-        #               # - metrics:
-        #               #     lines_total:
-        #               #       type: Counter
-        #               #       description: "pfsense firewall : total number of log lines"
-        #               #       prefix: pfsense_firewall_
-        #               #       match_all: true
-        #               #       count_entry_bytes: true
-        #               #       config:
-        #               #         action: add
-      syslogService:
-        enabled: true
-        type: LoadBalancer
-        port: 1514
-        externalIPs:
-          - ${CLUSTER_LB_LOKI_SYSLOG}
-        externalTrafficPolicy: Local
@@ -1,6 +0,0 @@
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - helm-release.yaml
-  - prometheus-rule.yaml
@@ -1,109 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: loki.rules
-  namespace: monitoring
-spec:
-  groups:
-    - name: loki.rules
-      rules:
-        - alert: LokiRequestErrors
-          annotations:
-            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
-          expr: |
-            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
-              > 10
-          for: 15m
-          labels:
-            severity: critical
-        - alert: LokiRequestPanics
-          annotations:
-            message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
-          expr: |
-            sum(increase(loki_panic_total[10m])) by (namespace, job)
-              > 0
-          labels:
-            severity: critical
-        - alert: LokiRequestLatency
-          annotations:
-            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
-          expr: |
-            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
-              > 1
-          for: 15m
-          labels:
-            severity: critical
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, job))
-          record: job:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, job))
-          record: job:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
-          record: job:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, job)
-          record: job:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
-          record: job:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
-          record: job:loki_request_duration_seconds_count:sum_rate
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, job, route))
-          record: job_route:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, job, route))
-          record: job_route:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, job, route)
-          record: job_route:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds_count:sum_rate
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, namespace, job, route))
-          record: namespace_job_route:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, namespace, job, route))
-          record: namespace_job_route:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-              by (le, namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m]))
-              by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m]))
-              by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
@@ -5,21 +5,30 @@ metadata:
   name: thanos
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://charts.bitnami.com/bitnami
       chart: thanos
       version: 11.4.0
       sourceRef:
         kind: HelmRepository
         name: bitnami-charts
         namespace: flux-system
-      interval: 5m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
+  dependsOn:
+    - name: kube-prometheus-stack
+      namespace: monitoring
   values:
     query:
       enabled: true
       replicaCount: 2
+      podAntiAffinityPreset: hard
       replicaLabels:
         - replica
       dnsDiscovery:
@@ -27,46 +36,26 @@ spec:
       sidecarsNamespace: monitoring
     ingress:
       enabled: true
-      hostname: "thanos.${SECRET_CLUSTER_DOMAIN}"
+      hostname: &host "thanos-query.${SECRET_CLUSTER_DOMAIN}"
       annotations:
         nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
         nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
-        # traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
-        # traefik.ingress.kubernetes.io/router.middlewares: networking-forward-auth@kubernetescrd
+      ingressClassName: "nginx"
       tls: true
+      extraTls:
+        - hosts:
+            - *host
     queryFrontend:
       enabled: false
     bucketweb:
-      enabled: true
+      enabled: false
     compactor:
-      enabled: true
-      strategyType: Recreate
-      persistence:
-        size: 30Gi
+      enabled: false
     storegateway:
-      enabled: true
+      enabled: false
     ruler:
       enabled: false
     metrics:
       enabled: true
       serviceMonitor:
         enabled: true
-    objstoreConfig: |-
-      type: s3
-      config:
-        bucket: thanos
-        endpoint: ${SECRET_MINIO_ENDPOINT}
-        access_key: "${SECRET_MINIO_ACCESS_KEY}"
-        secret_key: "${SECRET_MINIO_SECRET_KEY}"
-        insecure: false
-
-  postRenderers:
-    - kustomize:
-        patchesJson6902:
-          - target:
-              kind: Ingress
-              name: thanos-query
-            patch:
-              - op: add
-                path: /spec/ingressClassName
-                value: nginx
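Note: the &host/*host pair above is a plain YAML anchor and alias, so the ingress hostname is written once and reused in extraTls; this replaces the deleted postRenderers patch. A minimal standalone illustration (hostname is an example value):

    # YAML anchor/alias, illustrative:
    ingress:
      hostname: &host "thanos-query.example.com"   # &host defines the anchor
      extraTls:
        - hosts:
            - *host                                # *host expands to the same string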
@@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - helm-release.yaml
-# - prometheus-rule.yaml
@@ -1,39 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: thanos.rules
-  namespace: monitoring
-spec:
-  groups:
-    - name: thanos.rules
-      rules:
-        - alert: ThanosCompactionHalted
-          expr: |
-            thanos_compactor_halted == 1
-          for: 0m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Thanos compaction halted on {{ $labels.instance }}"
-            description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-        - alert: ThanosCompactBucketOperationFailure
-          expr: |
-            rate(thanos_objstore_bucket_operation_failures_total[1m])
-              > 0
-          for: 0m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
-            description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-        - alert: ThanosCompactNotRun
-          expr: |
-            (time() - thanos_objstore_bucket_last_successful_upload_time)
-              > 24*60*60
-          for: 0m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Thanos compact not run on {{ $labels.instance }}"
-            description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -1,10 +0,0 @@
----
-apiVersion: source.toolkit.fluxcd.io/v1beta1
-kind: HelmRepository
-metadata:
-  name: infracloudio-charts
-  namespace: flux-system
-spec:
-  interval: 1h
-  url: https://infracloudio.github.io/charts
-  timeout: 3m
@@ -12,7 +12,6 @@ resources:
   - gitea-charts.yaml
   - grafana-charts.yaml
   - influxdata-charts.yaml
-  - infracloudio-charts.yaml
   - ingress-nginx-charts.yaml
   - jetstack-charts.yaml
   - k8s-at-home.yaml