new gitops template

This commit is contained in:
auricom
2021-04-13 10:34:08 +02:00
parent 67c4d6a855
commit a95f32b44d
335 changed files with 3131 additions and 3650 deletions

View File

@@ -0,0 +1,41 @@
---
# Flux HelmRelease for Longhorn: installs chart `longhorn` at version 1.1.0 from
# the `longhorn-charts` HelmRepository (namespace `flux-system`) as release
# `longhorn` in namespace `longhorn-system`.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: longhorn
namespace: longhorn-system
spec:
interval: 5m
chart:
spec:
# renovate: registryUrl=https://charts.longhorn.io
chart: longhorn
version: 1.1.0
sourceRef:
kind: HelmRepository
name: longhorn-charts
namespace: flux-system
interval: 5m
# Chart values overriding the Longhorn defaults.
values:
defaultSettings:
# Backup target and the Secret holding its credentials; a SOPS-encrypted
# Secret named `minio-truenas-credentials` exists in `longhorn-system`.
backupTarget: s3://longhorn@us-east-1/
backupTargetCredentialSecret: minio-truenas-credentials
createDefaultDiskLabeledNodes: true
defaultDataPath: /var/lib/longhorn/
replicaSoftAntiAffinity: false
storageOverProvisioningPercentage: 300
storageMinimalAvailablePercentage: 25
upgradeChecker: true
defaultReplicaCount: 3
guaranteedEngineCPU: 0.25
# Same name as the `longhorn-backups` StorageClass shipped with this app.
defaultLonghornStaticStorageClass: longhorn-backups
# presumably seconds (10800 = 3h) — TODO confirm against the chart docs
backupstorePollInterval: 10800
autoSalvage: true
disableSchedulingOnCordonedNode: true
replicaZoneSoftAntiAffinity: true
volumeAttachmentRecoveryPolicy: wait
csi:
kubeletRootDir: /var/lib/kubelet
# NOTE(review): verify `tls` is nested under the key the chart expects — it
# sits between `csi` and `ingress` here.
tls: true
# The chart-managed ingress is disabled; a separate `longhorn-ui` Ingress
# manifest is provided for `longhorn-system`.
ingress:
enabled: false

View File

@@ -0,0 +1,26 @@
---
# Ingress exposing the Longhorn UI (`longhorn-frontend` service, port 80)
# through the nginx ingress class, protected by Authelia forward-auth.
#
# `${...}` placeholders are left for the deployment pipeline to substitute
# (presumably Flux post-build variable substitution — confirm).
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-ui
  namespace: longhorn-system
  annotations:
    kubernetes.io/ingress.class: "nginx"
    # The legacy `ingress.kubernetes.io/secure-backends` annotation was dropped:
    # ingress-nginx no longer reads it (its successor is
    # `nginx.ingress.kubernetes.io/backend-protocol`), and the backend below is
    # addressed on port 80, so no backend-protocol override is set.
    #
    # Each request is verified against Authelia; unauthenticated users are
    # redirected to the sign-in URL.
    nginx.ingress.kubernetes.io/auth-url: "http://authelia.networking.svc.cluster.local/api/verify"
    # NOTE(review): this uses SECRET_CLUSTER_DOMAIN_CERT while the hosts below
    # use SECRET_CLUSTER_DOMAIN — confirm the difference is intentional.
    nginx.ingress.kubernetes.io/auth-signin: "https://login.${SECRET_CLUSTER_DOMAIN_CERT}/"
spec:
  # No secretName is given for this TLS entry.
  tls:
    - hosts:
        - longhorn.${SECRET_CLUSTER_DOMAIN}
  rules:
    - host: longhorn.${SECRET_CLUSTER_DOMAIN}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: longhorn-frontend
                port:
                  number: 80

View File

@@ -0,0 +1,8 @@
---
# Kustomization listing every manifest that makes up the Longhorn deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # Helm release for the Longhorn chart.
  - helm-release.yaml
  # Ingress for the Longhorn UI.
  - ingress.yaml
  # ServiceMonitor and PrometheusRule.
  - monitoring.yaml
  # Longhorn StorageClass.
  - storageclass.yaml
  # Encrypted Secret (`.enc.yaml`).
  - secret.enc.yaml

View File

@@ -0,0 +1,109 @@
---
# ServiceMonitor for Longhorn: targets Services labelled
# `app: longhorn-manager` in `longhorn-system` and scrapes their port named
# `manager`.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: longhorn-prometheus-servicemonitor
  namespace: longhorn-system
  labels:
    name: longhorn-prometheus-servicemonitor
spec:
  # Scrape endpoint, referenced by Service port name.
  endpoints:
    - port: manager
  # Only look for matching Services in the Longhorn namespace.
  namespaceSelector:
    matchNames:
      - longhorn-system
  # Service label selector.
  selector:
    matchLabels:
      app: longhorn-manager
---
# PrometheusRule with the Longhorn alerting rules (group `longhorn.rules`),
# created in the `monitoring` namespace. Every rule must hold for 5 minutes
# (`for: 5m`) before it fires.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: longhorn
    role: alert-rules
  name: prometheus-longhorn-rules
  namespace: monitoring
spec:
  groups:
    - name: longhorn.rules
      rules:
        # Disabled rule, kept for reference.
        # - alert: LonghornVolumeActualSpaceUsedWarning
        #   annotations:
        #     description: The actual space used by Longhorn volume {{$labels.volume}} on {{$labels.node}} is at {{$value}}% capacity for
        #       more than 5 minutes.
        #     summary: The actual used space of Longhorn volume is over 90% of the capacity.
        #   expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) * 100 > 90
        #   for: 5m
        #   labels:
        #     issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
        #     severity: warning

        # Volume robustness value 3 is reported as "Fault" by these rules.
        - alert: LonghornVolumeStatusCritical
          annotations:
            # Duration text corrected to agree with `for: 5m` below.
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
              more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Fault
          expr: longhorn_volume_robustness == 3
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Fault.
            severity: critical

        # Volume robustness value 2 is reported as "Degraded" by these rules.
        - alert: LonghornVolumeStatusWarning
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
              more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Degraded
          expr: longhorn_volume_robustness == 2
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Degraded.
            severity: warning

        # Node-level storage usage above 70% of capacity.
        - alert: LonghornNodeStorageWarning
          annotations:
            description: >-
              The used storage of node {{$labels.node}} is at {{$value}}% capacity for
              more than 5 minutes.
            summary: The used storage of node is over 70% of the capacity.
          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: The used storage of node {{$labels.node}} is high.
            severity: warning

        # Disk-level storage usage above 70% of capacity.
        - alert: LonghornDiskStorageWarning
          annotations:
            description: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
              more than 5 minutes.
            summary: The used storage of disk is over 70% of the capacity.
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
            severity: warning

        # Fires when fewer nodes report condition "ready" than the total node
        # count. Both sides are aggregated so their (empty) label sets match,
        # and `> 0` ensures the rule yields no sample while all nodes are ready.
        # NOTE(review): the total-count metric name is taken from the original
        # rule (`longhorn_node_total`); `longhorn_node_count_total` is accepted
        # as an alternative spelling — confirm which one the deployed
        # longhorn-manager actually exports and drop the other.
        - alert: LonghornNodeDown
          annotations:
            description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
            summary: Longhorn nodes are offline
          expr: >-
            (
              max(longhorn_node_count_total or longhorn_node_total)
              - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0))
            ) > 0
          for: 5m
          labels:
            # `{{$value}}` is deliberately not used in labels: a changing label
            # value changes the alert's identity and restarts the `for` timer.
            issue: Longhorn nodes are offline.
            severity: critical

        # Instance manager using more than 3x its CPU request.
        # NOTE(review): the alert name is misspelled ("Intance"); it is kept
        # unchanged because routes or silences may reference it.
        - alert: LonghornIntanceManagerCPUUsageWarning
          annotations:
            description: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
              more than 5 minutes.
            summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
          expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
          for: 5m
          labels:
            issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
            severity: warning

        # Node CPU usage above 90% of capacity.
        - alert: LonghornNodeCPUUsageWarning
          annotations:
            description: >-
              Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
              more than 5 minutes.
            summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
          for: 5m
          labels:
            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
            severity: warning

View File

@@ -0,0 +1,39 @@
# SOPS-encrypted Secret `minio-truenas-credentials` in `longhorn-system`; the
# Longhorn HelmRelease names it as `backupTargetCredentialSecret`.
# Per `encrypted_regex` in the `sops` metadata, only values under `data` /
# `stringData` are encrypted (AES256_GCM, key wrapped for the PGP fingerprint
# listed below); all other fields stay in clear text.
# NOTE(review): the ENC[...] values and the `sops` block are presumably
# maintained with the `sops` CLI — avoid hand-editing them.
kind: Secret
apiVersion: v1
metadata:
name: minio-truenas-credentials
namespace: longhorn-system
data:
AWS_ACCESS_KEY_ID: ENC[AES256_GCM,data:uE5CV9wcWg8=,iv:l41hwC+43JWRbcsqpRwukwkpHcWjMmGf9eNtR8kV0VM=,tag:TrHP2GlnSbqWE7TS9neGfw==,type:str]
AWS_SECRET_ACCESS_KEY: ENC[AES256_GCM,data:Jhg/KgZzOmU8jB3K0pMuke8BuUIWRVoQ1US3cw==,iv:lRidTSpintFfwd4/W32FGHEMy/v06ILrN62nPoMB3ew=,tag:NYT3ST+lsp6QkvjTEeXHBw==,type:str]
AWS_ENDPOINTS: ENC[AES256_GCM,data:SdIM5UQmzsibf6lD0UN/2ztF03WeM5GqoEi71HtaNKeDRNqCXAssFhUd0l0=,iv:Ep5Xdpu48QriwOA1qmBPaNpcbiudNkpH+I2YiFpYCFY=,tag:4oJYhEMyUsIG8OJ+73wf1g==,type:str]
#ENC[AES256_GCM,data:/pUAj7tHPkqci0vh/I5x5M6LebjodkftjOsXFCpQyW2D,iv:qTDtrQVblNVeUfAtBoUgO0rbqGzf4jQbjna0OQZdUf0=,tag:XZc9TX3zGZGbNz3CyYmKLw==,type:comment]
type: Opaque
sops:
kms: []
gcp_kms: []
azure_kv: []
hc_vault: []
lastmodified: "2021-04-14T14:49:06Z"
mac: ENC[AES256_GCM,data:CdXYSx72+JQMw4ZuCma8u0VTM5wNYNC0L2iBSBuLA0nr8YzMh59CAjc2S3ITpnusFQ3onisurrDoKj25GRJu0Dns4d1oluKGdsiIc8nfwSsRxxfRKb+iPa0B0lGsI2XvuvqBYcWLZ0S988NXfi8VCyaXIdoFMFjOPel9+KqPSio=,iv:8aq1YspzEiXqOIPHzZhAs930uwomdtKQtdKxSHjb90Y=,tag:sHikRF8/3+VDnVKtWEtcSA==,type:str]
pgp:
- created_at: "2021-04-14T14:49:06Z"
enc: |
-----BEGIN PGP MESSAGE-----
hQGMA/JorPHm1g9XAQv/RCNYZMMGchIhqCt7S0jCFaGTqWvtydckIGQLZN3CCwmo
xfMoaGf43yMKER21ilP3CY/EXQNzwz2di5M0/biofkaH5yiohcufECS6+rB9J/wI
Ub5RsMuNdnZSNzsNTd/T3PgUbhuqNOiOBv3BM59SfbMa3z1w3StFdWk0h4zXfezc
Vj/wtpV+1SonfCZ0QWqRB/crnAYSASoINS8kqU3I53VkoDM6pWoX4mjA7V+5x3aL
5ZdqvUte42ANqNG9SLnnLQzhjKxEnb1K3R1VB2qmvCmWB3aY8hq9zKuK/x6WH9B4
rtBiIB3BCtJeUC0rGRvBNlfxPDdegDWqae7y6JdQWRB4QaoYxVzKPNS0Msz7zjlH
Rf75ZWWUJnKmHKzAQBHrgegUiR4GipEe5v63m0kInM3J8MHtolkJ22kCXeancYWl
XnnZwWmyVz46BTR71EvdbApSmlDQjRCK3x/5FodtCZeWP1QEfC0lwRAlk2lyrPx7
/L8KnFLK+NF9uR2Xylzf0l4BD+mNEAfIq7hvy4Gh8Ek50gpAmNGLq6zRNj0Sh6dz
zbVyYHYIwEXCnvaN8UNumSqvTQ9e322bRXsYwVLLQXT58ZX/jbzvSwUkNalTJamx
X6t5Qj8/5XOjupH0IoR0
=8fGE
-----END PGP MESSAGE-----
fp: C8F8A49D04A1AB639F8EA21CDBA4B1DCB1FA5BDD
encrypted_regex: ^(data|stringData)$
version: 3.6.1

View File

@@ -0,0 +1,17 @@
---
# StorageClass `longhorn-backups`, provisioned by `driver.longhorn.io`.
# The Longhorn HelmRelease names it as `defaultLonghornStaticStorageClass`.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: longhorn-backups
provisioner: driver.longhorn.io
reclaimPolicy: Retain
allowVolumeExpansion: true
parameters:
  numberOfReplicas: "3"
  staleReplicaTimeout: "2880"
  fromBackup: ""
  # NOTE(review): presumably volumes are only scheduled onto disks/nodes that
  # carry these tags — confirm the tags exist in the cluster.
  diskSelector: "ssd,fast"
  nodeSelector: "storage,fast"
  # JSON list passed as one string (the two lines fold into a single line):
  # a single job named "backup", task "backup", cron "30 23 * * *", retain 1,
  # labelled interval=daily.
  recurringJobs: >-
    [{"name":"backup", "task":"backup", "cron":"30 23 * * *", "retain":1,
    "labels": {"interval":"daily"}}]