diff --git a/cluster/apps/monitoring/botkube/helm-release.yaml b/cluster/apps/monitoring/botkube/helm-release.yaml
deleted file mode 100644
index 4bee5381b..000000000
--- a/cluster/apps/monitoring/botkube/helm-release.yaml
+++ /dev/null
@@ -1,182 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: botkube
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://infracloudio.github.io/charts
-      chart: botkube
-      version: v0.12.4
-      sourceRef:
-        kind: HelmRepository
-        name: infracloudio-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    image:
-      repository: infracloudio/botkube
-      tag: v0.12.1
-    serviceMonitor:
-      enabled: true
-    config:
-      settings:
-        clustername: k3s
-      resources:
-        - name: v1/pods # Name of the resources e.g pod, deployment, ingress, etc. (Resource name must be in singular form)
-          namespaces:
-            include:
-              - all
-            ignore: # List of namespaces to be ignored (omitempty), used only with include: all
-              - kasten-io # example : include [all], ignore [x,y,z]
-              - kube-system
-          events: # List of lifecycle events you want to receive, e.g create, update, delete, error OR all
-            - create
-            - delete
-        - name: v1/services
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: apps/v1/deployments
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - update
-            - delete
-            - error
-          updateSetting:
-            includeDiff: true
-            fields:
-              - spec.template.spec.containers[*].image
-        - name: apps/v1/statefulsets
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - update
-            - delete
-            - error
-          updateSetting:
-            includeDiff: true
-            fields:
-              - spec.template.spec.containers[*].image
-        - name: networking.k8s.io/v1beta1/ingresses
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/nodes
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/namespaces
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/persistentvolumes
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/persistentvolumeclaims
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/secrets
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/configmaps
-          namespaces:
-            include:
-              - all
-            ignore:
-              - rook-ceph
-          events:
-            - delete
-            - error
-        - name: apps/v1/daemonsets
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-            - update
-          updateSetting:
-            includeDiff: true
-            fields:
-              - spec.template.spec.containers[*].image
-        - name: rbac.authorization.k8s.io/v1/roles
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/rolebindings
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/clusterroles
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/clusterrolebindings
-          namespaces:
-            include:
-              - all
-          events:
-            - create
-            - delete
-            - error
-      recommendations: true
-      communications:
-        discord:
-          enabled: true
-          notiftype: short
-          channel: "778626068637679707"
-          botid: ${SECRET_BOTKUBE_DISCORD_BOTID}
-          token: ${SECRET_BOTKUBE_DISCORD_TOKEN}
diff --git a/cluster/apps/monitoring/botkube/kustomization.yaml b/cluster/apps/monitoring/botkube/kustomization.yaml
deleted file mode 100644
index 2fa2de20c..000000000
--- a/cluster/apps/monitoring/botkube/kustomization.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - helm-release.yaml
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json b/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
similarity index 99%
rename from cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
rename to cluster/apps/monitoring/grafana/dashboards/home-assistant.json
index a56ad2516..dc94ac238 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
+++ b/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
@@ -415,4 +415,4 @@
   "uid": "sn-bOoWMk",
   "version": 1,
   "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json b/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
similarity index 99%
rename from cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
rename to cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
index 381416892..efec37049 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
+++ b/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
@@ -368,4 +368,4 @@
   "uid": "aEY0BVGnz",
   "version": 1,
   "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json b/cluster/apps/monitoring/grafana/dashboards/truenas.json
similarity index 100%
rename from cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
rename to cluster/apps/monitoring/grafana/dashboards/truenas.json
diff --git a/cluster/apps/monitoring/grafana/helm-release.yaml b/cluster/apps/monitoring/grafana/helm-release.yaml
index 7887b0195..68ad965a8 100644
--- a/cluster/apps/monitoring/grafana/helm-release.yaml
+++ b/cluster/apps/monitoring/grafana/helm-release.yaml
@@ -33,6 +33,34 @@ spec:
     admin:
       existingSecret: grafana-admin-creds
     grafana.ini:
+      auth:
+        signout_redirect_url: "https://login.${SECRET_CLUSTER_DOMAIN}/logout"
+        oauth_auto_login: false
+      auth.generic_oauth:
+        enabled: true
+        name: Authelia
+        client_id: grafana
+        client_secret: "${SECRET_GRAFANA_OAUTH_CLIENT_SECRET}"
+        scopes: "openid profile email groups"
+        empty_scopes: false
+        auth_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/authorization"
+        token_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/token"
+        api_url: "https://login.${SECRET_CLUSTER_DOMAIN}/api/oidc/userinfo"
+        login_attribute_path: preferred_username
+        groups_attribute_path: groups
+        name_attribute_path: name
+        use_pkce: true
+      auth.generic_oauth.group_mapping:
+        role_attribute_path: |
+          contains(groups[*], 'admins') && 'Admin' || contains(groups[*], 'people') && 'Viewer'
+        org_id: 1
+      auth.basic:
+        disable_login_form: false
+      auth.anonymous:
+        enabled: true
+        org_name: HomeOps
+        org_id: 1
+        org_role: Viewer
       server:
         root_url: "https://grafana.${SECRET_CLUSTER_DOMAIN}"
       paths:
@@ -46,8 +74,6 @@ spec:
         mode: console
       grafana_net:
         url: https://grafana.net
-      auth.basic:
-        disable_login_form: false
     dashboardProviders:
       dashboardproviders.yaml:
         apiVersion: 1
@@ -71,25 +97,22 @@ spec:
           - name: Prometheus
            type: prometheus
            access: proxy
-            url: http://thanos-query:9090/
+            url: http://thanos-query.monitoring.svc.cluster.local:9090
            isDefault: true
          # - name: Loki
          #   type: loki
          #   access: proxy
-          #   url: http://loki-gateway:80
+          #   url: http://loki-gateway.monitoring.svc.cluster.local:80
     dashboards:
       default:
-        kubernetes-custom:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temparatures.json
-          datasource: Prometheus
         home-assistant:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/home-assistant.json
+          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/home-assistant.json
          datasource: Prometheus
        homelab-temperatures:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/homelab-temperatures.json
+          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/homelab-temperatures.json
          datasource: Prometheus
        truenas:
-          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/kube-prometheus-stack/grafana-dashboards/truenas.json
+          url: https://raw.githubusercontent.com/auricom/home-ops/main/cluster/apps/monitoring/grafana/dashboards/truenas.json
          datasource: Prometheus
     sidecar:
       dashboards:
@@ -126,3 +149,14 @@ spec:
           - *host
     persistence:
       enabled: false
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                  - key: app.kubernetes.io/name
+                    operator: In
+                    values: ["grafana"]
+              topologyKey: kubernetes.io/hostname
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml b/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
index cecd88fd4..a7f8ed2da 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
+++ b/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
@@ -5,10 +5,9 @@ metadata:
   name: kube-prometheus-stack
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
       chart: kube-prometheus-stack
       version: 39.13.3
       sourceRef:
@@ -24,85 +23,37 @@ spec:
     remediation:
       retries: 5
   values:
-    alertmanager:
-      config:
-        global:
-          resolve_timeout: 5m
-        receivers:
-          - name: "null"
-          - name: "pushover"
-            pushover_configs:
-              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
-                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
-                send_resolved: true
-                html: true
-                priority: |-
-                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
-                url_title: View in Alert Manager
-                title: |-
-                  [{{ .Status | toUpper -}}
-                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
-                  ] {{ .CommonLabels.alertname }}
-                message: |-
-                  {{- range .Alerts }}
-                  {{- if ne .Labels.severity "" }}
-                  Severity: {{ .Labels.severity }}
-                  {{- else }}
-                  Severity: N/A
-                  {{- end }}
-                  {{- if ne .Annotations.description "" }}
-                  Description: {{ .Annotations.description }}
-                  {{- else if ne .Annotations.summary "" }}
-                  Summary: {{ .Annotations.summary }}
-                  {{- else if ne .Annotations.message "" }}
-                  Message: {{ .Annotations.message }}
-                  {{- else }}
-                  Description: N/A
-                  {{- end }}
-                  {{- if gt (len .Labels.SortedPairs) 0 }}
-                  Details:
-                  {{- range .Labels.SortedPairs }}
-                  • {{ .Name }}: {{ .Value }}
-                  {{- end }}
-                  {{- end }}
-                  {{- end }}
-        route:
-          receiver: "pushover"
-          routes:
-            - receiver: "null"
-              matchers:
-                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
-            - receiver: "pushover"
-              matchers:
-                - severity = "critical"
-              continue: true
-        inhibit_rules:
-          - source_matchers:
-              - severity = "critical"
-            target_matchers:
-              - severity = "warning"
-            equal: ["alertname", "namespace"]
-      alertmanagerSpec:
-        replicas: 2
-        podAntiAffinity: hard
-        storage:
-          volumeClaimTemplate:
-            spec:
-              storageClassName: rook-ceph-block
-              resources:
-                requests:
-                  storage: 10Gi
-      ingress:
-        enabled: true
-        pathType: Prefix
-        ingressClassName: "nginx"
-        annotations:
-          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
-          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
-        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
-        tls:
-          - hosts:
-              - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
+    ###
+    ### Component values
+    ###
+    kubeApiServer:
+      enabled: true
+    kubeControllerManager:
+      enabled: false
+
+    kubeEtcd:
+      enabled: false
+
+    kubelet:
+      enabled: true
+      serviceMonitor:
+        metricRelabelings:
+          - action: replace
+            sourceLabels:
+              - node
+            targetLabel: instance
+
+    kubeProxy:
+      enabled: false
+
+    kubeScheduler:
+      enabled: false
+
+    kubeStateMetrics:
+      enabled: true
+    kube-state-metrics:
+      metricLabelsAllowlist:
+        - "persistentvolumeclaims=[*]"
       prometheus:
         monitor:
           enabled: true
@@ -113,8 +64,42 @@ spec:
               sourceLabels:
                 - __meta_kubernetes_pod_node_name
               targetLabel: kubernetes_node
+      resources:
+        requests:
+          cpu: 15m
+          memory: 127M
+        limits:
+          memory: 153M
+
+    grafana:
+      enabled: false
+      forceDeployDashboards: true
+
+    nodeExporter:
+      enabled: true
+
+    prometheus-node-exporter:
+      resources:
+        requests:
+          cpu: 23m
+          memory: 64M
+        limits:
+          memory: 64M
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels:
+                - __meta_kubernetes_pod_node_name
+              targetLabel: kubernetes_node
+
+    ###
+    ### Prometheus operator values
+    ###
     prometheusOperator:
-      createCustomResource: true
       prometheusConfigReloader:
         resources:
           requests:
@@ -123,35 +108,10 @@ spec:
          limits:
            cpu: 300m
            memory: 50Mi
-    nodeExporter:
-      enabled: true
-      serviceMonitor:
-        relabelings:
-          - action: replace
-            regex: (.*)
-            replacement: $1
-            sourceLabels:
-              - __meta_kubernetes_pod_node_name
-            targetLabel: kubernetes_node
-    kubelet:
-      enabled: true
-      serviceMonitor:
-        metricRelabelings:
-          - action: replace
-            sourceLabels:
-              - node
-            targetLabel: instance
-    grafana:
-      enabled: false
-      forceDeployDashboards: true
-    kubeEtcd:
-      enabled: false
-    kubeControllerManager:
-      enabled: false
-    kubeScheduler:
-      enabled: false
-    kubeProxy:
-      enabled: false
+
+    ###
+    ### Prometheus instance values
+    ###
     prometheus:
       ingress:
         enabled: true
@@ -171,7 +131,7 @@ spec:
            cpu: 400m
          limits:
            memory: 6000Mi
-        replicas: 2
+        replicas: 1
         replicaExternalLabelName: "replica"
         podAntiAffinity: hard
         ruleSelector: {}
@@ -183,8 +143,9 @@ spec:
         podMonitorSelector: {}
         podMonitorNamespaceSelector: {}
         podMonitorSelectorNilUsesHelmValues: false
-        retention: 2d
-        retentionSize: "6GB"
+        probeSelectorNilUsesHelmValues: false
+        retention: 14d
+        retentionSize: "45GB"
         enableAdminAPI: true
         walCompression: true
         storageSpec:
@@ -193,13 +154,10 @@ spec:
             storageClassName: rook-ceph-block
              resources:
                requests:
-                  storage: 10Gi
+                  storage: 50Gi
         thanos:
           image: quay.io/thanos/thanos:v0.28.0
-          version: v0.25.2
-          objectStorageConfig:
-            name: thanos-objstore-secret
-            key: objstore.yml
+          version: v0.28.0
         additionalScrapeConfigs:
           - job_name: "opnsense"
             scrape_interval: 60s
@@ -331,3 +289,93 @@ spec:
         tls:
           - hosts:
               - *host
+
+    alertmanager:
+      config:
+        global:
+          resolve_timeout: 5m
+        receivers:
+          - name: "null"
+          - name: "pushover"
+            pushover_configs:
+              - user_key: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_USER_KEY}
+                token: ${SECRET_KUBE_PROMETHEUS_STACK_ALERTMANAGER_PUSHOVER_TOKEN}
+                send_resolved: true
+                html: true
+                priority: |-
+                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
+                url_title: View in Alert Manager
+                title: |-
+                  [{{ .Status | toUpper -}}
+                  {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
+                  ] {{ .CommonLabels.alertname }}
+                message: |-
+                  {{- range .Alerts }}
+                  {{- if ne .Labels.severity "" }}
+                  Severity: {{ .Labels.severity }}
+                  {{- else }}
+                  Severity: N/A
+                  {{- end }}
+                  {{- if ne .Annotations.description "" }}
+                  Description: {{ .Annotations.description }}
+                  {{- else if ne .Annotations.summary "" }}
+                  Summary: {{ .Annotations.summary }}
+                  {{- else if ne .Annotations.message "" }}
+                  Message: {{ .Annotations.message }}
+                  {{- else }}
+                  Description: N/A
+                  {{- end }}
+                  {{- if gt (len .Labels.SortedPairs) 0 }}
+                  Details:
+                  {{- range .Labels.SortedPairs }}
+                  • {{ .Name }}: {{ .Value }}
+                  {{- end }}
+                  {{- end }}
+                  {{- end }}
+        route:
+          receiver: "pushover"
+          routes:
+            - receiver: "null"
+              matchers:
+                - alertname =~ "InfoInhibitor|Watchdog|RebootScheduled"
+            - receiver: "pushover"
+              matchers:
+                - severity = "critical"
+              continue: true
+        inhibit_rules:
+          - source_matchers:
+              - severity = "critical"
+            target_matchers:
+              - severity = "warning"
+            equal: ["alertname", "namespace"]
+      alertmanagerSpec:
+        replicas: 1
+        podAntiAffinity: hard
+        storage:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: rook-ceph-block
+              resources:
+                requests:
+                  storage: 1Gi
+      ingress:
+        enabled: true
+        pathType: Prefix
+        ingressClassName: "nginx"
+        annotations:
+          nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
+          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
+        hosts: ["alert-manager.${SECRET_CLUSTER_DOMAIN}"]
+        tls:
+          - hosts:
+              - "alert-manager.${SECRET_CLUSTER_DOMAIN}"
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels:
+                - __meta_kubernetes_pod_node_name
+              targetLabel: kubernetes_node
diff --git a/cluster/apps/monitoring/kustomization.yaml b/cluster/apps/monitoring/kustomization.yaml
index 273c0074d..3dd92412f 100644
--- a/cluster/apps/monitoring/kustomization.yaml
+++ b/cluster/apps/monitoring/kustomization.yaml
@@ -3,10 +3,8 @@ kind: Kustomization
 resources:
   - namespace.yaml
   - blackbox-exporter
-  - botkube
   - grafana
   - healthchecks
   - kube-prometheus-stack
-  #- loki-stack
   - thanos
   - uptime-kuma
diff --git a/cluster/apps/monitoring/loki-stack/helm-release.yaml b/cluster/apps/monitoring/loki-stack/helm-release.yaml
deleted file mode 100644
index 0a5daf43a..000000000
--- a/cluster/apps/monitoring/loki-stack/helm-release.yaml
+++ /dev/null
@@ -1,186 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: loki-stack
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://grafana.github.io/helm-charts
-      chart: loki-stack
-      version: 2.6.5
-      sourceRef:
-        kind: HelmRepository
-        name: grafana-loki-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    loki:
-      image:
-        repository: grafana/loki
-        tag: 2.6.1
-        pullPolicy: IfNotPresent
-      replicas: 3
-      persistence:
-        enabled: false
-      config:
-        auth_enabled: false
-        server:
-          http_listen_port: 3100
-        distributor:
-          ring:
-            kvstore:
-              store: memberlist
-        ingester:
-          lifecycler:
-            ring:
-              kvstore:
-                store: memberlist
-              replication_factor: 1
-            final_sleep: 0s
-          chunk_idle_period: 5m
-          chunk_retain_period: 30s
-        memberlist:
-          abort_if_cluster_join_fails: false
-          # Expose this port on all distributor, ingester
-          # and querier replicas.
-          bind_port: 7946
-          # You can use a headless k8s service for all distributor,
-          # ingester and querier components.
-          join_members:
-            - loki-stack-headless.monitoring.svc.cluster.local:7946
-          # max_join_backoff: 1m
-          # max_join_retries: 10
-          # min_join_backoff: 1s
-        schema_config:
-          configs:
-            - from: "2020-10-24"
-              store: boltdb-shipper
-              object_store: s3
-              schema: v11
-              index:
-                prefix: index_
-                period: 24h
-        storage_config:
-          aws:
-            insecure: false
-            s3: https://${SECRET_MINIO_ACCESS_KEY}:${SECRET_MINIO_SECRET_KEY}@${SECRET_MINIO_ENDPOINT}/loki
-            s3forcepathstyle: true
-          boltdb_shipper:
-            active_index_directory: /data/loki/index
-            cache_location: /data/loki/index_cache
-            resync_interval: 5s
-            shared_store: s3
-        limits_config:
-          enforce_metric_name: false
-          reject_old_samples: true
-          reject_old_samples_max_age: 168h
-      extraPorts:
-        - port: 7956
-          protocol: TCP
-          name: loki-gossip-ring
-          targetPort: 7946
-      serviceMonitor:
-        enabled: true
-      podAnnotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "http-metrics"
-    promtail:
-      image:
-        registry: docker.io
-        repository: grafana/promtail
-        tag: latest
-        pullPolicy: Always
-      serviceMonitor:
-        enabled: true
-      extraScrapeConfigs:
-        - job_name: syslog
-          syslog:
-            listen_address: 0.0.0.0:1514
-            label_structured_data: true
-            labels:
-              job: "syslog"
-          relabel_configs:
-            - source_labels: ['__syslog_connection_ip_address']
-              target_label: 'ip_address'
-            - source_labels: ['__syslog_message_severity']
-              target_label: 'severity'
-            - source_labels: ['__syslog_message_facility']
-              target_label: 'facility'
-            - source_labels: ['__syslog_message_hostname']
-              target_label: 'host'
-            - source_labels: ['__syslog_message_app_name']
-              target_label: 'app'
-            - source_labels: ['__syslog_message_SRC']
-              target_label: 'source_ip'
-            - source_labels: ['__syslog_message_SPT']
-              target_label: 'source_port'
-            - source_labels: ['__syslog_message_DPT']
-              target_label: 'destination_port'
-            - source_labels: ['__syslog_message_DST']
-              target_label: 'destination_ip'
-          pipeline_stages:
-        # - job_name: pfsense
-        #   syslog:
-        #     listen_address: 0.0.0.0:1514
-        #     idle_timeout: 60s
-        #     label_structured_data: false
-        #     labels:
-        #       job: "syslog"
-        #       host: pfsense
-        #   relabel_configs:
-        #     - source_labels: ["__syslog_message_severity"]
-        #       target_label: "severity"
-        #     #- source_labels: ['__syslog_message_facility']
-        #     #  target_label: 'facility'
-        #     - source_labels: ["__syslog_message_app_name"]
-        #       target_label: "app_name"
-        #   pipeline_stages:
-        #     - match:
-        #         selector: '{app_name="filterlog"}'
-        #         stages:
-        #           - regex:
-        #               expression: '(?P\d*?),(?P\d*?),(?P\d*?),(?P\d*?),(?Pigb.{1,5}?),(?P\w*?),(?P\w*?),(?P\w*?),(?P4{1}?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\d*?),(?P\w*?),(?P\d*?),(?P\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?),(?P\d+?),(?P\d+?),(?P\d+?)'
-        #               # ipv6 // ,(?P6{1}?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\w*?),(?P\d*?),
-        #           - labels:
-        #               pfsense_fw_rule: ""
-        #               #pfsense_fw_subrule: ''
-        #               #pfsense_fw_anchor: ''
-        #               pfsense_fw_tracker: ""
-        #               pfsense_fw_interface: ""
-        #               pfsense_fw_reason: ""
-        #               pfsense_fw_action: ""
-        #               pfsense_fw_direction: ""
-        #               #pfsense_fw_ip_version: ''
-        #               #pfsense_fw_tos: ''
-        #               #pfsense_fw_ecn: ''
-        #               #pfsense_fw_ttl: ''
-        #               #pfsense_fw_id: ''
-        #               #pfsense_fw_offset: ''
-        #               #pfsense_fw_flag: ''
-        #               pfsense_fw_protocol_id: ""
-        #               pfsense_fw_protocol_text: ""
-        #               #pfsense_fw_length: ''
-        #               pfsense_fw_source_address: ""
-        #               pfsense_fw_destination_address: ""
-        #               pfsense_fw_source_port: ""
-        #               pfsense_fw_destination_port: ""
-        #               #pfsense_fw_data_length: ''
-        #     # - metrics:
-        #     #     lines_total:
-        #     #       type: Counter
-        #     #       description: "pfsense firewall : total number of log lines"
-        #     #       prefix: pfsense_firewall_
-        #     #       match_all: true
-        #     #       count_entry_bytes: true
-        #     #       config:
-        #     #         action: add
-      syslogService:
-        enabled: true
-        type: LoadBalancer
-        port: 1514
-        externalIPs:
-          - ${CLUSTER_LB_LOKI_SYSLOG}
-        externalTrafficPolicy: Local
diff --git a/cluster/apps/monitoring/loki-stack/kustomization.yaml b/cluster/apps/monitoring/loki-stack/kustomization.yaml
deleted file mode 100644
index d1c0a463d..000000000
--- a/cluster/apps/monitoring/loki-stack/kustomization.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - helm-release.yaml
-  - prometheus-rule.yaml
diff --git a/cluster/apps/monitoring/loki-stack/prometheus-rule.yaml b/cluster/apps/monitoring/loki-stack/prometheus-rule.yaml
deleted file mode 100644
index 707c248cb..000000000
--- a/cluster/apps/monitoring/loki-stack/prometheus-rule.yaml
+++ /dev/null
@@ -1,109 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: loki.rules
-  namespace: monitoring
-spec:
-  groups:
-    - name: loki.rules
-      rules:
-        - alert: LokiRequestErrors
-          annotations:
-            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
-          expr: |
-            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
-            /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
-            > 10
-          for: 15m
-          labels:
-            severity: critical
-        - alert: LokiRequestPanics
-          annotations:
-            message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
-          expr: |
-            sum(increase(loki_panic_total[10m])) by (namespace, job)
-            > 0
-          labels:
-            severity: critical
-        - alert: LokiRequestLatency
-          annotations:
-            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
-          expr: |
-            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
-            > 1
-          for: 15m
-          labels:
-            severity: critical
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job))
-          record: job:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job))
-          record: job:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
-            /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
-          record: job:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job)
-          record: job:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
-          record: job:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
-          record: job:loki_request_duration_seconds_count:sum_rate
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job, route))
-          record: job_route:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job, route))
-          record: job_route:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
-            /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job, route)
-          record: job_route:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds_count:sum_rate
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, namespace, job, route))
-          record: namespace_job_route:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, namespace, job, route))
-          record: namespace_job_route:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
-            /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m]))
-            by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m]))
-            by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
diff --git a/cluster/apps/monitoring/thanos/helm-release.yaml b/cluster/apps/monitoring/thanos/helm-release.yaml
index cf1c53ce0..4c50813eb 100644
--- a/cluster/apps/monitoring/thanos/helm-release.yaml
+++ b/cluster/apps/monitoring/thanos/helm-release.yaml
@@ -5,21 +5,30 @@ metadata:
   name: thanos
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://charts.bitnami.com/bitnami
       chart: thanos
       version: 11.4.0
       sourceRef:
         kind: HelmRepository
         name: bitnami-charts
         namespace: flux-system
-      interval: 5m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
+  dependsOn:
+    - name: kube-prometheus-stack
+      namespace: monitoring
   values:
     query:
       enabled: true
       replicaCount: 2
+      podAntiAffinityPreset: hard
       replicaLabels:
         - replica
       dnsDiscovery:
@@ -27,46 +36,26 @@ spec:
         sidecarsNamespace: monitoring
       ingress:
         enabled: true
-        hostname: "thanos.${SECRET_CLUSTER_DOMAIN}"
+        hostname: &host "thanos-query.${SECRET_CLUSTER_DOMAIN}"
         annotations:
           nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
          nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN}"
-          # traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
-          # traefik.ingress.kubernetes.io/router.middlewares: networking-forward-auth@kubernetescrd
+        ingressClassName: "nginx"
         tls: true
+        extraTls:
+          - hosts:
+              - *host
     queryFrontend:
       enabled: false
     bucketweb:
-      enabled: true
+      enabled: false
     compactor:
-      enabled: true
-      strategyType: Recreate
-      persistence:
-        size: 30Gi
+      enabled: false
     storegateway:
-      enabled: true
+      enabled: false
     ruler:
       enabled: false
     metrics:
       enabled: true
       serviceMonitor:
         enabled: true
-    objstoreConfig: |-
-      type: s3
-      config:
-        bucket: thanos
-        endpoint: ${SECRET_MINIO_ENDPOINT}
-        access_key: "${SECRET_MINIO_ACCESS_KEY}"
-        secret_key: "${SECRET_MINIO_SECRET_KEY}"
-        insecure: false
-
-  postRenderers:
-    - kustomize:
-        patchesJson6902:
-          - target:
-              kind: Ingress
-              name: thanos-query
-            patch:
-              - op: add
-                path: /spec/ingressClassName
-                value: nginx
diff --git a/cluster/apps/monitoring/thanos/kustomization.yaml b/cluster/apps/monitoring/thanos/kustomization.yaml
index 73ca882a9..2fa2de20c 100644
--- a/cluster/apps/monitoring/thanos/kustomization.yaml
+++ b/cluster/apps/monitoring/thanos/kustomization.yaml
@@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - helm-release.yaml
-  # - prometheus-rule.yaml
diff --git a/cluster/apps/monitoring/thanos/prometheus-rule.yaml b/cluster/apps/monitoring/thanos/prometheus-rule.yaml
deleted file mode 100644
index 5b64cfaec..000000000
--- a/cluster/apps/monitoring/thanos/prometheus-rule.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: thanos.rules
-  namespace: monitoring
-spec:
-  groups:
-    - name: thanos.rules
-      rules:
-        - alert: ThanosCompactionHalted
-          expr: |
-            thanos_compactor_halted == 1
-          for: 0m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Thanos compaction halted on {{ $labels.instance }}"
-            description: "Thanos compaction has failed to run and is now halted.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-        - alert: ThanosCompactBucketOperationFailure
-          expr: |
-            rate(thanos_objstore_bucket_operation_failures_total[1m])
-            > 0
-          for: 0m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Thanos compact bucket operation failure on {{ $labels.instance }}"
-            description: "Thanos compaction has failing storage operations\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-        - alert: ThanosCompactNotRun
-          expr: |
-            (time() - thanos_objstore_bucket_last_successful_upload_time)
-            > 24*60*60
-          for: 0m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Thanos compact not run on {{ $labels.instance }}"
-            description: "Thanos compaction has not run in 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/cluster/charts/infracloudio-charts.yaml b/cluster/charts/infracloudio-charts.yaml
deleted file mode 100644
index c4a009310..000000000
--- a/cluster/charts/infracloudio-charts.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
----
-apiVersion: source.toolkit.fluxcd.io/v1beta1
-kind: HelmRepository
-metadata:
-  name: infracloudio-charts
-  namespace: flux-system
-spec:
-  interval: 1h
-  url: https://infracloudio.github.io/charts
-  timeout: 3m
diff --git a/cluster/charts/kustomization.yaml b/cluster/charts/kustomization.yaml
index 4e400644a..bb0dfe6ed 100644
--- a/cluster/charts/kustomization.yaml
+++ b/cluster/charts/kustomization.yaml
@@ -12,7 +12,6 @@ resources:
   - gitea-charts.yaml
   - grafana-charts.yaml
   - influxdata-charts.yaml
-  - infracloudio-charts.yaml
   - ingress-nginx-charts.yaml
   - jetstack-charts.yaml
   - k8s-at-home.yaml