♻️ monitoring

auricom
2022-09-13 19:55:24 +02:00
parent 18841845a9
commit 9d2e17f9c6
16 changed files with 233 additions and 703 deletions

View File

@@ -1,182 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: botkube
namespace: monitoring
spec:
interval: 5m
chart:
spec:
# renovate: registryUrl=https://infracloudio.github.io/charts
chart: botkube
version: v0.12.4
sourceRef:
kind: HelmRepository
name: infracloudio-charts
namespace: flux-system
interval: 5m
values:
image:
repository: infracloudio/botkube
tag: v0.12.1
serviceMonitor:
enabled: true
config:
settings:
clustername: k3s
resources:
- name: v1/pods # Name of the resources e.g pod, deployment, ingress, etc. (Resource name must be in singular form)
namespaces:
include:
- all
ignore: # List of namespaces to be ignored (omitempty), used only with include: all
- kasten-io # example : include [all], ignore [x,y,z]
- kube-system
events: # List of lifecycle events you want to receive, e.g create, update, delete, error OR all
- create
- delete
- name: v1/services
namespaces:
include:
- all
events:
- create
- delete
- error
- name: apps/v1/deployments
namespaces:
include:
- all
events:
- create
- update
- delete
- error
updateSetting:
includeDiff: true
fields:
- spec.template.spec.containers[*].image
- name: apps/v1/statefulsets
namespaces:
include:
- all
events:
- create
- update
- delete
- error
updateSetting:
includeDiff: true
fields:
- spec.template.spec.containers[*].image
- name: networking.k8s.io/v1beta1/ingresses
namespaces:
include:
- all
events:
- create
- delete
- error
- name: v1/nodes
namespaces:
include:
- all
events:
- create
- delete
- error
- name: v1/namespaces
namespaces:
include:
- all
events:
- create
- delete
- error
- name: v1/persistentvolumes
namespaces:
include:
- all
events:
- create
- delete
- error
- name: v1/persistentvolumeclaims
namespaces:
include:
- all
events:
- create
- delete
- error
- name: v1/secrets
namespaces:
include:
- all
events:
- create
- delete
- error
- name: v1/configmaps
namespaces:
include:
- all
ignore:
- rook-ceph
events:
- delete
- error
- name: apps/v1/daemonsets
namespaces:
include:
- all
events:
- create
- delete
- error
- update
updateSetting:
includeDiff: true
fields:
- spec.template.spec.containers[*].image
- name: rbac.authorization.k8s.io/v1/roles
namespaces:
include:
- all
events:
- create
- delete
- error
- name: rbac.authorization.k8s.io/v1/rolebindings
namespaces:
include:
- all
events:
- create
- delete
- error
- name: rbac.authorization.k8s.io/v1/clusterroles
namespaces:
include:
- all
events:
- create
- delete
- error
- name: rbac.authorization.k8s.io/v1/clusterrolebindings
namespaces:
include:
- all
events:
- create
- delete
- error
recommendations: true
communications:
discord:
enabled: true
notiftype: short
channel: "778626068637679707"
botid: ${SECRET_BOTKUBE_DISCORD_BOTID}
token: ${SECRET_BOTKUBE_DISCORD_TOKEN}

View File

@@ -1,5 +0,0 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml

View File

@@ -33,6 +33,34 @@ spec:
admin:
existingSecret: grafana-admin-creds
grafana.ini:
auth:
signout_redirect_url: "https://login.${SECRET_CLUSTER_DOMAIN}/logout"
oauth_auto_login: false
auth.generic_oauth:
enabled: true
name: Authelia
client_id: grafana
client_secret: "${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}"
scopes: "openid profile email groups"
empty_scopes: false
auth_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/authorization"
token_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/token"
api_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/userinfo"
login_attribute_path: preferred_username
groups_attribute_path: groups
name_attribute_path: name
use_pkce: true
auth.generic_oauth.group_mapping:
role_attribute_path: |
contains(groups[*], 'admins') && 'Admin' || contains(groups[*], 'people') && 'Viewer'
org_id: 1
auth.basic:
disable_login_form: false
auth.anonymous:
enabled: true
org_name: HomeOps
org_id: 1
org_role: Viewer
server:
root_url: "https://grafana.${SECRET_CLUSTER_DOMAIN}"
paths:
@@ -46,8 +74,6 @@ spec:
mode: console
grafana_net:
url: https://grafana.net
auth.basic:
disable_login_form: false
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
@@ -71,25 +97,22 @@ spec:
- name: Prometheus
type: prometheus
access: proxy
url: http://thanos-query:9090/
url: http://thanos-query.monitoring.svc.cluster.local:9090
isDefault: true
# - name: Loki
# type: loki
# access: proxy
# url: http://loki-gateway:80
# url: http://loki-gateway.monitoring.svc.cluster.local:80
dashboards:
default:
kubernetes-custom:
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temparatures.json
datasource: Prometheus
home-assistant:
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
datasource: Prometheus
homelab-temperatures:
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
datasource: Prometheus
truenas:
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/truenas.json
datasource: Prometheus
sidecar:
dashboards:
@@ -126,3 +149,14 @@ spec:
- *host
persistence:
enabled: false
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app.kubernetes.io/name
operator: In
values: ["grafana"]
topologyKey: kubernetes.io/hostname
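
The auth.generic_oauth block above hands Grafana logins to Authelia over OIDC, and the role_attribute_path JMESPath maps members of the admins group to the Admin role and members of people to Viewer. For reference, a minimal sketch of the Authelia client definition such a setup assumes on the other side (redirect URI and exact keys are illustrative, not taken from this commit; newer Authelia releases may expect the secret to be hashed):

identity_providers:
  oidc:
    clients:
      - id: grafana                  # must match client_id in grafana.ini above
        description: Grafana
        secret: "${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}"
        scopes: [openid, profile, email, groups]
        redirect_uris:
          - https://grafana.${SECRET_CLUSTER_DOMAIN}/login/generic_oauth
        userinfo_signing_algorithm: none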

View File

@@ -5,10 +5,9 @@ metadata:
name: kube-prometheus-stack
namespace: monitoring
spec:
interval: 5m
interval: 15m
chart:
spec:
# renovate: registryUrl=https://prometheus-community.github.io/helm-charts
chart: kube-prometheus-stack
version: 39.13.3
sourceRef:
@@ -24,85 +23,37 @@ spec:
remediation:
retries: 5
values:
alertmanager:
config:
global:
resolve_timeout: 5m
receivers:
- name: "null"
- name: "pushover"
pushover_configs:
- user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
send_resolved: true
html: true
priority: |-
{{ if eq .Status "firing" }}1{{ else }}0{{ end }}
url_title: View in Alert Manager
title: |-
[{{ .Status | toUpper -}}
{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
] {{ .CommonLabels.alertname }}
message: |-
{{- range .Alerts }}
{{- if ne .Labels.severity "" }}
<b>Severity:</b> <i>{{ .Labels.severity }}</i>
{{- else }}
<b>Severity:</b> <i>N/A</i>
{{- end }}
{{- if ne .Annotations.description "" }}
<b>Description:</b> <i>{{ .Annotations.description }}</i>
{{- else if ne .Annotations.summary "" }}
<b>Summary:</b> <i>{{ .Annotations.summary }}</i>
{{- else if ne .Annotations.message "" }}
<b>Message:</b> <i>{{ .Annotations.message }}</i>
{{- else }}
<b>Description:</b> <i>N/A</i>
{{- end }}
{{- if gt (len .Labels.SortedPairs) 0 }}
<b>Details:</b>
{{- range .Labels.SortedPairs }}
• <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
{{- end }}
{{- end }}
{{- end }}
route:
receiver: "pushover"
routes:
- receiver: "null"
matchers:
- alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
- receiver: "pushover"
matchers:
- severity = "critical"
continue: true
inhibit_rules:
- source_matchers:
- severity = "critical"
target_matchers:
- severity = "warning"
equal: ["alertname", "namespace"]
alertmanagerSpec:
replicas: 2
podAntiAffinity: hard
storage:
volumeClaimTemplate:
spec:
storageClassName: rook-ceph-block
resources:
requests:
storage: 10Gi
ingress:
enabled: true
pathType: Prefix
ingressClassName: "nginx"
annotations:
nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
tls:
- hosts:
- "alert-manager.${SECRET_CLUSTER_DOMAIN}"
###
### Component values
###
kubeApiServer:
enabled: true
kubeControllerManager:
enabled: false
kubeEtcd:
enabled: false
kubelet:
enabled: true
serviceMonitor:
metricRelabelings:
- action: replace
sourceLabels:
- node
targetLabel: instance
kubeProxy:
enabled: false
kubeScheduler:
enabled: false
kubeStateMetrics:
enabled: true
kube-state-metrics:
metricLabelsAllowlist:
- "persistentvolumeclaims=[*]"
prometheus:
monitor:
enabled: true
@@ -113,8 +64,42 @@ spec:
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
resources:
requests:
cpu: 15m
memory: 127M
limits:
memory: 153M
grafana:
enabled: false
forceDeployDashboards: true
nodeExporter:
enabled: true
prometheus-node-exporter:
resources:
requests:
cpu: 23m
memory: 64M
limits:
memory: 64M
prometheus:
monitor:
enabled: true
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
###
### Prometheus operator values
###
prometheusOperator:
createCustomResource: true
prometheusConfigReloader:
resources:
requests:
@@ -123,35 +108,10 @@ spec:
limits:
cpu: 300m
memory: 50Mi
nodeExporter:
enabled: true
serviceMonitor:
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
kubelet:
enabled: true
serviceMonitor:
metricRelabelings:
- action: replace
sourceLabels:
- node
targetLabel: instance
grafana:
enabled: false
forceDeployDashboards: true
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
kubeProxy:
enabled: false
###
### Prometheus instance values
###
prometheus:
ingress:
enabled: true
@@ -171,7 +131,7 @@ spec:
cpu: 400m
limits:
memory: 6000Mi
replicas: 2
replicas: 1
replicaExternalLabelName: "replica"
podAntiAffinity: hard
ruleSelector: {}
@@ -183,8 +143,9 @@ spec:
podMonitorSelector: {}
podMonitorNamespaceSelector: {}
podMonitorSelectorNilUsesHelmValues: false
retention: 2d
retentionSize: "6GB"
probeSelectorNilUsesHelmValues: false
retention: 14d
retentionSize: "45GB"
enableAdminAPI: true
walCompression: true
storageSpec:
@@ -193,13 +154,10 @@ spec:
storageClassName: rook-ceph-block
resources:
requests:
storage: 10Gi
storage: 50Gi
thanos:
image: quay.io/thanos/thanos:v0.28.0
version: v0.25.2
objectStorageConfig:
name: thanos-objstore-secret
key: objstore.yml
version: v0.28.0
additionalScrapeConfigs:
- job_name: "opnsense"
scrape_interval: 60s
@@ -331,3 +289,93 @@ spec:
tls:
- hosts:
- *host
alertmanager:
config:
global:
resolve_timeout: 5m
receivers:
- name: "null"
- name: "pushover"
pushover_configs:
- user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
send_resolved: true
html: true
priority: |-
{{ if eq .Status "firing" }}1{{ else }}0{{ end }}
url_title: View in Alert Manager
title: |-
[{{ .Status | toUpper -}}
{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
] {{ .CommonLabels.alertname }}
message: |-
{{- range .Alerts }}
{{- if ne .Labels.severity "" }}
<b>Severity:</b> <i>{{ .Labels.severity }}</i>
{{- else }}
<b>Severity:</b> <i>N/A</i>
{{- end }}
{{- if ne .Annotations.description "" }}
<b>Description:</b> <i>{{ .Annotations.description }}</i>
{{- else if ne .Annotations.summary "" }}
<b>Summary:</b> <i>{{ .Annotations.summary }}</i>
{{- else if ne .Annotations.message "" }}
<b>Message:</b> <i>{{ .Annotations.message }}</i>
{{- else }}
<b>Description:</b> <i>N/A</i>
{{- end }}
{{- if gt (len .Labels.SortedPairs) 0 }}
<b>Details:</b>
{{- range .Labels.SortedPairs }}
• <b>{{ .Name }}:</b> <i>{{ .Value }}</i>
{{- end }}
{{- end }}
{{- end }}
route:
receiver: "pushover"
routes:
- receiver: "null"
matchers:
- alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
- receiver: "pushover"
matchers:
- severity = "critical"
continue: true
inhibit_rules:
- source_matchers:
- severity = "critical"
target_matchers:
- severity = "warning"
equal: ["alertname", "namespace"]
alertmanagerSpec:
replicas: 1
podAntiAffinity: hard
storage:
volumeClaimTemplate:
spec:
storageClassName: rook-ceph-block
resources:
requests:
storage: 1Gi
ingress:
enabled: true
pathType: Prefix
ingressClassName: "nginx"
annotations:
nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
tls:
- hosts:
- "alert-manager.${SECRET_CLUSTER_DOMAIN}"
prometheus:
monitor:
enabled: true
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
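
None of the ${SECRET_*} and ${CLUSTER_*} placeholders in these values are resolved by Helm; in a Flux setup like this they are normally filled in by post-build variable substitution on the Kustomization that applies the manifests. A minimal sketch of that wiring, assuming a cluster-settings ConfigMap and cluster-secrets Secret in flux-system (names, path, and repository name here are illustrative, not taken from this commit):

apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
kind: Kustomization
metadata:
  name: apps
  namespace: flux-system
spec:
  interval: 10m
  path: ./cluster/apps
  prune: true
  sourceRef:
    kind: GitRepository
    name: home-ops                 # illustrative repository name
  postBuild:
    substituteFrom:
      - kind: ConfigMap
        name: cluster-settings     # non-secret values such as CLUSTER_LB_* addresses
      - kind: Secret
        name: cluster-secrets      # SECRET_CLUSTER_DOMAIN, Pushover keys, MinIO credentials, ...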

View File

@@ -3,10 +3,8 @@ kind: Kustomization
resources:
- namespace.yaml
- blackbox-exporter
- botkube
- grafana
- healthchecks
- kube-prometheus-stack
#- loki-stack
- thanos
- uptime-kuma

View File

@@ -1,186 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: loki-stack
namespace: monitoring
spec:
interval: 5m
chart:
spec:
# renovate: registryUrl=https://grafana.github.io/helm-charts
chart: loki-stack
version: 2.6.5
sourceRef:
kind: HelmRepository
name: grafana-loki-charts
namespace: flux-system
interval: 5m
values:
loki:
image:
repository: grafana/loki
tag: 2.6.1
pullPolicy: IfNotPresent
replicas: 3
persistence:
enabled: false
config:
auth_enabled: false
server:
http_listen_port: 3100
distributor:
ring:
kvstore:
store: memberlist
ingester:
lifecycler:
ring:
kvstore:
store: memberlist
replication_factor: 1
final_sleep: 0s
chunk_idle_period: 5m
chunk_retain_period: 30s
memberlist:
abort_if_cluster_join_fails: false
# Expose this port on all distributor, ingester
# and querier replicas.
bind_port: 7946
# You can use a headless k8s service for all distributor,
# ingester and querier components.
join_members:
- loki-stack-headless.monitoring.svc.cluster.local:7946
# max_join_backoff: 1m
# max_join_retries: 10
# min_join_backoff: 1s
schema_config:
configs:
- from: "2020-10-24"
store: boltdb-shipper
object_store: s3
schema: v11
index:
prefix: index_
period: 24h
storage_config:
aws:
insecure: false
s3: https://${SECRET_MINIO_ACCESS_KEY}:${SECRET_MINIO_SECRET_KEY}@${SECRET_MINIO_ENDPOINT}/loki
s3forcepathstyle: true
boltdb_shipper:
active_index_directory: /data/loki/index
cache_location: /data/loki/index_cache
resync_interval: 5s
shared_store: s3
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
extraPorts:
- port: 7956
protocol: TCP
name: loki-gossip-ring
targetPort: 7946
serviceMonitor:
enabled: true
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "http-metrics"
promtail:
image:
registry: docker.io
repository: grafana/promtail
tag: latest
pullPolicy: Always
serviceMonitor:
enabled: true
extraScrapeConfigs:
- job_name: syslog
syslog:
listen_address: 0.0.0.0:1514
label_structured_data: true
labels:
job: "syslog"
relabel_configs:
- source_labels: ['__syslog_connection_ip_address']
target_label: 'ip_address'
- source_labels: ['__syslog_message_severity']
target_label: 'severity'
- source_labels: ['__syslog_message_facility']
target_label: 'facility'
- source_labels: ['__syslog_message_hostname']
target_label: 'host'
- source_labels: ['__syslog_message_app_name']
target_label: 'app'
- source_labels: ['__syslog_message_SRC']
target_label: 'source_ip'
- source_labels: ['__syslog_message_SPT']
target_label: 'source_port'
- source_labels: ['__syslog_message_DPT']
target_label: 'destination_port'
- source_labels: ['__syslog_message_DST']
target_label: 'destination_ip'
pipeline_stages:
# - job_name: pfsense
# syslog:
# listen_address: 0.0.0.0:1514
# idle_timeout: 60s
# label_structured_data: false
# labels:
# job: "syslog"
# host: pfsense
# relabel_configs:
# - source_labels: ["__syslog_message_severity"]
# target_label: "severity"
# #- source_labels: ['__syslog_message_facility']
# # target_label: 'facility'
# - source_labels: ["__syslog_message_app_name"]
# target_label: "app_name"
# pipeline_stages:
# - match:
# selector: '{app_name="filterlog"}'
# stages:
# - regex:
# expression: '(?P<pfsense_fw_rule>\d*?),(?P<pfsense_fw_subrule>\d*?),(?P<pfsense_fw_anchor>\d*?),(?P<pfsense_fw_tracker>\d*?),(?P<pfsense_fw_interface>igb.{1,5}?),(?P<pfsense_fw_reason>\w*?),(?P<pfsense_fw_action>\w*?),(?P<pfsense_fw_direction>\w*?),(?P<pfsense_fw_ip_version>4{1}?),(?P<pfsense_fw_tos>\w*?),(?P<pfsense_fw_ecn>\w*?),(?P<pfsense_fw_ttl>\w*?),(?P<pfsense_fw_id>\w*?),(?P<pfsense_fw_offset>\w*?),(?P<pfsense_fw_flag>\w*?),(?P<pfsense_fw_protocol_id>\d*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_length>\d*?),(?P<pfsense_fw_source_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_destination_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P<pfsense_fw_source_port>\d+?),(?P<pfsense_fw_destination_port>\d+?),(?P<pfsense_fw_data_length>\d+?)'
# # ipv6 // ,(?P<pfsense_fw_ip_version>6{1}?),(?P<pfsense_fw_lass>\w*?),(?P<pfsense_fw_flow_label>\w*?),(?P<pfsense_fw_hop_limit>\w*?),(?P<pfsense_fw_protocol_text>\w*?),(?P<pfsense_fw_protocol_id>\d*?),
# - labels:
# pfsense_fw_rule: ""
# #pfsense_fw_subrule: ''
# #pfsense_fw_anchor: ''
# pfsense_fw_tracker: ""
# pfsense_fw_interface: ""
# pfsense_fw_reason: ""
# pfsense_fw_action: ""
# pfsense_fw_direction: ""
# #pfsense_fw_ip_version: ''
# #pfsense_fw_tos: ''
# #pfsense_fw_ecn: ''
# #pfsense_fw_ttl: ''
# #pfsense_fw_id: ''
# #pfsense_fw_offset: ''
# #pfsense_fw_flag: ''
# pfsense_fw_protocol_id: ""
# pfsense_fw_protocol_text: ""
# #pfsense_fw_length: ''
# pfsense_fw_source_address: ""
# pfsense_fw_destination_address: ""
# pfsense_fw_source_port: ""
# pfsense_fw_destination_port: ""
# #pfsense_fw_data_length: ''
# # - metrics:
# # lines_total:
# # type: Counter
# # description: "pfsense firewall : total number of log lines"
# # prefix: pfsense_firewall_
# # match_all: true
# # count_entry_bytes: true
# # config:
# # action: add
syslogService:
enabled: true
type: LoadBalancer
port: 1514
externalIPs:
- ${CLUSTER_LB_LOKI_SYSLOG}
externalTrafficPolicy: Local

View File

@@ -1,6 +0,0 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- prometheus-rule.yaml

View File

@@ -1,109 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: loki.rules
namespace: monitoring
spec:
groups:
- name: loki.rules
rules:
- alert: LokiRequestErrors
annotations:
message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job)
> 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
> 1
for: 15m
labels:
severity: critical
- expr: |
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:99quantile
- expr: |
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:50quantile
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds:avg
- expr: |
sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job)
record: job:loki_request_duration_seconds_bucket:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
record: job:loki_request_duration_seconds_sum:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds_count:sum_rate
- expr: |
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:99quantile
- expr: |
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:50quantile
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds:avg
- expr: |
sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route)
record: job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_sum:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_count:sum_rate
- expr: |
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:99quantile
- expr: |
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:50quantile
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds:avg
- expr: |
sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m]))
by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_count[1m]))
by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate

View File

@@ -5,21 +5,30 @@ metadata:
name: thanos
namespace: monitoring
spec:
interval: 5m
interval: 15m
chart:
spec:
# renovate: registryUrl=https://charts.bitnami.com/bitnami
chart: thanos
version: 11.4.0
sourceRef:
kind: HelmRepository
name: bitnami-charts
namespace: flux-system
interval: 5m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
dependsOn:
- name: kube-prometheus-stack
namespace: monitoring
values:
query:
enabled: true
replicaCount: 2
podAntiAffinityPreset: hard
replicaLabels:
- replica
dnsDiscovery:
@@ -27,46 +36,26 @@ spec:
sidecarsNamespace: monitoring
ingress:
enabled: true
hostname: "thanos.${SECRET_CLUSTER_DOMAIN}"
hostname: &host "thanos-query.${SECRET_CLUSTER_DOMAIN}"
annotations:
nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
# traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
# traefik.ingress.kubernetes.io/router.middlewares: networking-forward-auth@kubernetescrd
ingressClassName: "nginx"
tls: true
extraTls:
- hosts:
- *host
queryFrontend:
enabled: false
bucketweb:
enabled: true
enabled: false
compactor:
enabled: true
strategyType: Recreate
persistence:
size: 30Gi
enabled: false
storegateway:
enabled: true
enabled: false
ruler:
enabled: false
metrics:
enabled: true
serviceMonitor:
enabled: true
objstoreConfig: |-
type: s3
config:
bucket: thanos
endpoint: ${SECRET_MINIO_ENDPOINT}
access_key: "${SECRET_MINIO_ACCESS_KEY}"
secret_key: "${SECRET_MINIO_SECRET_KEY}"
insecure: false
postRenderers:
- kustomize:
patchesJson6902:
- target:
kind: Ingress
name: thanos-query
patch:
- op: add
path: /spec/ingressClassName
value: nginx
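
With bucketweb, compactor and storegateway disabled here, and the sidecar objectStorageConfig dropped from the kube-prometheus-stack values above, this Thanos release is reduced to a stateless query layer in front of the Prometheus sidecars. The query pods locate those sidecars through the dnsDiscovery settings, which the Bitnami chart turns into an SRV-based store flag roughly like the sketch below (the thanos-discovery Service name is an assumption about the kube-prometheus-stack release, not something shown in this diff):

query:
  dnsDiscovery:
    enabled: true
    sidecarsService: kube-prometheus-stack-thanos-discovery   # assumed Service exposing the sidecars' gRPC port
    sidecarsNamespace: monitoring
# rendered (approximately) as:
#   --store=dnssrv+_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local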

View File

@@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
# - prometheus-rule.yaml

View File

@@ -1,39 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: thanos.rules
namespace: monitoring
spec:
groups:
- name: thanos.rules
rules:
- alert: ThanosCompactionHalted
expr: |
thanos_compactor_halted == 1
for: 0m
labels:
severity: critical
annotations:
summary: "Thanos compaction halted on {{ $labels.instance }}"
description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosCompactBucketOperationFailure
expr: |
rate(thanos_objstore_bucket_operation_failures_total[1m])
> 0
for: 0m
labels:
severity: critical
annotations:
summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosCompactNotRun
expr: |
(time() - thanos_objstore_bucket_last_successful_upload_time)
> 24*60*60
for: 0m
labels:
severity: critical
annotations:
summary: "Thanos compact not run on {{ $labels.instance }}"
description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -1,10 +0,0 @@
---
apiVersion: source.toolkit.fluxcd.io/v1beta1
kind: HelmRepository
metadata:
name: infracloudio-charts
namespace: flux-system
spec:
interval: 1h
url: https://infracloudio.github.io/charts
timeout: 3m

View File

@@ -12,7 +12,6 @@ resources:
- gitea-charts.yaml
- grafana-charts.yaml
- influxdata-charts.yaml
- infracloudio-charts.yaml
- ingress-nginx-charts.yaml
- jetstack-charts.yaml
- k8s-at-home.yaml