From 5b82fd774287b7b457baf9e639b24d1637da421e Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Mon, 18 Aug 2025 21:57:45 +0200 Subject: [PATCH] feat: cilium-gateway --- .taskfiles/bootstrap/Taskfile.yaml | 49 +--- .../apps/default/atuin/app/helmrelease.yaml | 48 ++-- kubernetes/apps/kube-system/cilium/README.md | 22 ++ .../kube-system/cilium/app/helm/values.yaml | 9 +- .../kube-system/cilium/gateway/external.yaml | 35 +++ .../cilium/gateway/gatewayclass.yaml | 7 + .../kube-system/cilium/gateway/internal.yaml | 35 +++ .../cilium/gateway/kustomization.yaml | 9 + .../kube-system/cilium/gateway/redirect.yaml | 22 ++ kubernetes/apps/kube-system/cilium/ks.yaml | 23 +- .../gateway-api-crds/app/helmrelease.yaml | 32 +++ .../gateway-api-crds/app/kustomization.yaml | 6 + .../apps/kube-system/gateway-api-crds/ks.yaml | 22 ++ .../apps/kube-system/kustomization.yaml | 1 + kubernetes/bootstrap/apps/crds/helmfile.yaml | 34 +++ kubernetes/bootstrap/apps/helmfile.yaml | 49 +--- kubernetes/bootstrap/apps/resources.yaml | 24 ++ scripts/bootstrap-talos.sh | 211 ++++++++++++++++++ scripts/render-machine-config.sh | 60 +++++ 19 files changed, 591 insertions(+), 107 deletions(-) create mode 100644 kubernetes/apps/kube-system/cilium/README.md create mode 100644 kubernetes/apps/kube-system/cilium/gateway/external.yaml create mode 100644 kubernetes/apps/kube-system/cilium/gateway/gatewayclass.yaml create mode 100644 kubernetes/apps/kube-system/cilium/gateway/internal.yaml create mode 100644 kubernetes/apps/kube-system/cilium/gateway/kustomization.yaml create mode 100644 kubernetes/apps/kube-system/cilium/gateway/redirect.yaml create mode 100644 kubernetes/apps/kube-system/gateway-api-crds/app/helmrelease.yaml create mode 100644 kubernetes/apps/kube-system/gateway-api-crds/app/kustomization.yaml create mode 100644 kubernetes/apps/kube-system/gateway-api-crds/ks.yaml create mode 100644 kubernetes/bootstrap/apps/crds/helmfile.yaml create mode 100644 kubernetes/bootstrap/apps/resources.yaml create mode 100644 scripts/bootstrap-talos.sh create mode 100644 scripts/render-machine-config.sh diff --git a/.taskfiles/bootstrap/Taskfile.yaml b/.taskfiles/bootstrap/Taskfile.yaml index 215f7bde7..57fe89f10 100644 --- a/.taskfiles/bootstrap/Taskfile.yaml +++ b/.taskfiles/bootstrap/Taskfile.yaml @@ -3,49 +3,14 @@ version: '3' vars: - BOOTSTRAP_RESOURCES_DIR: '{{.ROOT_DIR}}/.taskfiles/bootstrap/resources' - CLUSTER_DIR: '{{.ROOT_DIR}}/kubernetes' + BOOTSTRAP_SCRIPT: '{{.SCRIPTS_DIR}}/bootstrap-talos.sh' tasks: - base: - desc: Bootstrap Base Apps - cmds: - - until kubectl wait nodes --for=condition=Ready=False --all --timeout=10m; do sleep 5; done - - helmfile --quiet --file {{.CLUSTER_DIR}}/bootstrap/apps/helmfile.yaml apply --skip-diff-on-install --suppress-diff - - until kubectl wait nodes --for=condition=Ready --all --timeout=10m; do sleep 5; done + default: + desc: Bootstrap Talos and Kubernetes cluster + cmd: bash {{.BOOTSTRAP_SCRIPT}} preconditions: - - talosctl config info - # - test -f {{.CLUSTER_DIR}}/talos/cluster-0/talosconfig - - test -f {{.CLUSTER_DIR}}/bootstrap/apps/helmfile.yaml - - which helmfile kubectl - - # NOTE: Nodes must all be part of the Ceph cluster and Ceph disks must share the same disk model - rook: - desc: Bootstrap Rook-Ceph - cmds: - - minijinja-cli {{.BOOTSTRAP_RESOURCES_DIR}}/wipe-rook.yaml.j2 | kubectl apply --server-side --filename - - - until kubectl --namespace default get job/wipe-rook &>/dev/null; do sleep 5; done - - kubectl --namespace default wait 
job/wipe-rook --for=condition=complete --timeout=5m - - stern --namespace default job/wipe-rook --no-follow - - kubectl --namespace default delete job wipe-rook - env: - NODE_COUNT: - sh: talosctl config info --output json | jq --raw-output '.nodes | length' - preconditions: - - test -f {{.BOOTSTRAP_RESOURCES_DIR}}/wipe-rook.yaml.j2 - - which jq kubectl minijinja-cli stern talosctl - - flux: - desc: Bootstrap Flux - cmds: - - kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply --filename - - - cat {{.SOPS_AGE_KEY}} | kubectl --namespace flux-system create secret generic sops-age --from-file=age.agekey=/dev/stdin - - kubectl apply --server-side --kustomize {{.CLUSTER_DIR}}/bootstrap/flux - - SOPS_AGE_KEY_FILE={{.SOPS_AGE_KEY}} sops exec-file {{.CLUSTER_DIR}}/bootstrap/flux/github-deploy-key.sops.yaml "kubectl apply --server-side --filename {}" - - SOPS_AGE_KEY_FILE={{.SOPS_AGE_KEY}} sops exec-file {{.CLUSTER_DIR}}/flux/vars/cluster-secrets.sops.yaml "kubectl apply --server-side --filename {}" - - kubectl apply --server-side --filename ./flux/vars/cluster-settings.yaml - - apps: - desc: Bootstrap Flux - - kubectl apply --server-side --kustomize {{.CLUSTER_DIR}}/flux/config + - find --version | grep -q GNU + - test -f {{.BOOTSTRAP_SCRIPT}} + - which gum helm helmfile kubectl op talosctl yq diff --git a/kubernetes/apps/default/atuin/app/helmrelease.yaml b/kubernetes/apps/default/atuin/app/helmrelease.yaml index d715febbd..9c110d92f 100644 --- a/kubernetes/apps/default/atuin/app/helmrelease.yaml +++ b/kubernetes/apps/default/atuin/app/helmrelease.yaml @@ -68,31 +68,31 @@ spec: ports: http: port: *port - # route: - # app: - # hostnames: ["sh.${SECRET_EXTERNAL_DOMAIN}"] - # parentRefs: - # - name: internal - # namespace: network - # sectionName: https - # rules: - # - backendRefs: - # - name: app - # port: *port - ingress: + route: app: - enabled: true - className: internal - hosts: - - host: &host "sh.${SECRET_EXTERNAL_DOMAIN}" - paths: - - path: / - service: - identifier: app - port: http - tls: - - hosts: - - *host + hostnames: ["sh.${SECRET_EXTERNAL_DOMAIN}"] + parentRefs: + - name: internal + namespace: kube-system + sectionName: https + rules: + - backendRefs: + - name: app + port: *port + # ingress: + # app: + # enabled: true + # className: internal + # hosts: + # - host: &host "sh.${SECRET_EXTERNAL_DOMAIN}" + # paths: + # - path: / + # service: + # identifier: app + # port: http + # tls: + # - hosts: + # - *host persistence: config: existingClaim: atuin diff --git a/kubernetes/apps/kube-system/cilium/README.md b/kubernetes/apps/kube-system/cilium/README.md new file mode 100644 index 000000000..28b8f1ed4 --- /dev/null +++ b/kubernetes/apps/kube-system/cilium/README.md @@ -0,0 +1,22 @@ +# Cilium + +## UniFi BGP + +```sh +router bgp 64513 + bgp router-id 192.168.1.1 + no bgp ebgp-requires-policy + + neighbor k8s peer-group + neighbor k8s remote-as 64514 + + neighbor 192.168.42.10 peer-group k8s + neighbor 192.168.42.11 peer-group k8s + neighbor 192.168.42.12 peer-group k8s + + address-family ipv4 unicast + neighbor k8s next-hop-self + neighbor k8s soft-reconfiguration inbound + exit-address-family +exit +``` diff --git a/kubernetes/apps/kube-system/cilium/app/helm/values.yaml b/kubernetes/apps/kube-system/cilium/app/helm/values.yaml index af522e857..a0683723f 100644 --- a/kubernetes/apps/kube-system/cilium/app/helm/values.yaml +++ b/kubernetes/apps/kube-system/cilium/app/helm/values.yaml @@ -18,10 +18,13 @@ enableIPv4BIGTCP: true endpointRoutes: enabled: true envoy: 
- enabled: false + rollOutPods: true + prometheus: + serviceMonitor: + enabled: true gatewayAPI: - enabled: false - enableAlpn: false + enabled: true + enableAlpn: true xffNumTrustedHops: 1 hubble: enabled: false diff --git a/kubernetes/apps/kube-system/cilium/gateway/external.yaml b/kubernetes/apps/kube-system/cilium/gateway/external.yaml new file mode 100644 index 000000000..f19b4a3c6 --- /dev/null +++ b/kubernetes/apps/kube-system/cilium/gateway/external.yaml @@ -0,0 +1,35 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/gateway.networking.k8s.io/gateway_v1.json +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: external + annotations: + external-dns.alpha.kubernetes.io/target: &hostname external.${SECRET_EXTERNAL_DOMAIN} +spec: + gatewayClassName: cilium + addresses: + - type: IPAddress + value: 192.168.169.122 + infrastructure: + annotations: + external-dns.alpha.kubernetes.io/hostname: *hostname + listeners: + - name: http + protocol: HTTP + port: 80 + hostname: "*.${SECRET_EXTERNAL_DOMAIN}" + allowedRoutes: + namespaces: + from: Same + - name: https + protocol: HTTPS + port: 443 + hostname: "*.${SECRET_EXTERNAL_DOMAIN}" + allowedRoutes: + namespaces: + from: All + tls: + certificateRefs: + - kind: Secret + name: ${SECRET_EXTERNAL_DOMAIN//./-}-tls diff --git a/kubernetes/apps/kube-system/cilium/gateway/gatewayclass.yaml b/kubernetes/apps/kube-system/cilium/gateway/gatewayclass.yaml new file mode 100644 index 000000000..cb911ef4f --- /dev/null +++ b/kubernetes/apps/kube-system/cilium/gateway/gatewayclass.yaml @@ -0,0 +1,7 @@ +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/gateway.networking.k8s.io/gatewayclass_v1.json +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: cilium +spec: + controllerName: io.cilium/gateway-controller diff --git a/kubernetes/apps/kube-system/cilium/gateway/internal.yaml b/kubernetes/apps/kube-system/cilium/gateway/internal.yaml new file mode 100644 index 000000000..eeb8d3c96 --- /dev/null +++ b/kubernetes/apps/kube-system/cilium/gateway/internal.yaml @@ -0,0 +1,35 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/gateway.networking.k8s.io/gateway_v1.json +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: internal + annotations: + external-dns.alpha.kubernetes.io/target: &hostname internal.${SECRET_EXTERNAL_DOMAIN} +spec: + gatewayClassName: cilium + addresses: + - type: IPAddress + value: 192.168.169.121 + infrastructure: + annotations: + external-dns.alpha.kubernetes.io/hostname: *hostname + listeners: + - name: http + protocol: HTTP + port: 80 + hostname: "*.${SECRET_EXTERNAL_DOMAIN}" + allowedRoutes: + namespaces: + from: Same + - name: https + protocol: HTTPS + port: 443 + hostname: "*.${SECRET_EXTERNAL_DOMAIN}" + allowedRoutes: + namespaces: + from: All + tls: + certificateRefs: + - kind: Secret + name: ${SECRET_EXTERNAL_DOMAIN//./-}-tls diff --git a/kubernetes/apps/kube-system/cilium/gateway/kustomization.yaml b/kubernetes/apps/kube-system/cilium/gateway/kustomization.yaml new file mode 100644 index 000000000..465ca1f21 --- /dev/null +++ b/kubernetes/apps/kube-system/cilium/gateway/kustomization.yaml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./external.yaml + - ./internal.yaml + - ./gatewayclass.yaml + - ./redirect.yaml diff --git 
a/kubernetes/apps/kube-system/cilium/gateway/redirect.yaml b/kubernetes/apps/kube-system/cilium/gateway/redirect.yaml new file mode 100644 index 000000000..4a0d847af --- /dev/null +++ b/kubernetes/apps/kube-system/cilium/gateway/redirect.yaml @@ -0,0 +1,22 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/gateway.networking.k8s.io/httproute_v1.json +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: httpsredirect + annotations: + external-dns.alpha.kubernetes.io/controller: none +spec: + parentRefs: + - name: internal + namespace: kube-system + sectionName: http + - name: external + namespace: kube-system + sectionName: http + rules: + - filters: + - requestRedirect: + scheme: https + statusCode: 301 + type: RequestRedirect diff --git a/kubernetes/apps/kube-system/cilium/ks.yaml b/kubernetes/apps/kube-system/cilium/ks.yaml index 0aaf9c5ec..28a95b00f 100644 --- a/kubernetes/apps/kube-system/cilium/ks.yaml +++ b/kubernetes/apps/kube-system/cilium/ks.yaml @@ -11,7 +11,7 @@ spec: app.kubernetes.io/name: *app interval: 1h path: ./kubernetes/apps/kube-system/cilium/app - prune: false + prune: true retryInterval: 2m sourceRef: kind: GitRepository @@ -20,3 +20,24 @@ spec: targetNamespace: *namespace timeout: 5m wait: false +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app cilium-gateway + namespace: &namespace kube-system +spec: + commonMetadata: + labels: + app.kubernetes.io/name: *app + interval: 1h + path: ./kubernetes/apps/kube-system/cilium/gateway + prune: true + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + targetNamespace: *namespace + timeout: 15m + wait: false diff --git a/kubernetes/apps/kube-system/gateway-api-crds/app/helmrelease.yaml b/kubernetes/apps/kube-system/gateway-api-crds/app/helmrelease.yaml new file mode 100644 index 000000000..ef134d460 --- /dev/null +++ b/kubernetes/apps/kube-system/gateway-api-crds/app/helmrelease.yaml @@ -0,0 +1,32 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/source.toolkit.fluxcd.io/ocirepository_v1.json +apiVersion: source.toolkit.fluxcd.io/v1 +kind: OCIRepository +metadata: + name: gateway-api-crds +spec: + interval: 5m + layerSelector: + mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip + operation: copy + ref: + tag: 1.3.0 + url: oci://ghcr.io/wiremind/wiremind-helm-charts/gateway-api-crds +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: gateway-api-crds +spec: + interval: 1h + chartRef: + kind: OCIRepository + name: gateway-api-crds + install: + remediation: + retries: -1 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 diff --git a/kubernetes/apps/kube-system/gateway-api-crds/app/kustomization.yaml b/kubernetes/apps/kube-system/gateway-api-crds/app/kustomization.yaml new file mode 100644 index 000000000..17cbc72b2 --- /dev/null +++ b/kubernetes/apps/kube-system/gateway-api-crds/app/kustomization.yaml @@ -0,0 +1,6 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./helmrelease.yaml diff --git a/kubernetes/apps/kube-system/gateway-api-crds/ks.yaml 
b/kubernetes/apps/kube-system/gateway-api-crds/ks.yaml new file mode 100644 index 000000000..b8d819df4 --- /dev/null +++ b/kubernetes/apps/kube-system/gateway-api-crds/ks.yaml @@ -0,0 +1,22 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app gateway-api-crds + namespace: &namespace kube-system +spec: + commonMetadata: + labels: + app.kubernetes.io/name: *app + interval: 1h + path: ./kubernetes/apps/kube-system/gateway-api-crds/app + prune: true + retryInterval: 2m + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + targetNamespace: *namespace + timeout: 5m + wait: false diff --git a/kubernetes/apps/kube-system/kustomization.yaml b/kubernetes/apps/kube-system/kustomization.yaml index f28c60b31..65056fdc3 100644 --- a/kubernetes/apps/kube-system/kustomization.yaml +++ b/kubernetes/apps/kube-system/kustomization.yaml @@ -10,6 +10,7 @@ resources: - ./coredns/ks.yaml - ./descheduler/ks.yaml - ./intel-device-plugin/ks.yaml + - ./gateway-api-crds/ks.yaml - ./kubelet-csr-approver/ks.yaml - ./metrics-server/ks.yaml - ./node-feature-discovery/ks.yaml diff --git a/kubernetes/bootstrap/apps/crds/helmfile.yaml b/kubernetes/bootstrap/apps/crds/helmfile.yaml new file mode 100644 index 000000000..27e08dbf7 --- /dev/null +++ b/kubernetes/bootstrap/apps/crds/helmfile.yaml @@ -0,0 +1,34 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/helmfile + +# This helmfile is for installing Custom Resource Definitions (CRDs) from Helm charts. +# It is not intended to be used with helmfile apply or sync. + +helmDefaults: + args: ['--include-crds', '--no-hooks'] # Prevent helmfile apply or sync + +releases: + - name: cloudflare-dns + namespace: network + chart: oci://ghcr.io/home-operations/charts-mirror/external-dns + version: 1.18.0 + + - name: external-secrets + namespace: external-secrets + chart: oci://ghcr.io/external-secrets/charts/external-secrets + version: 0.19.2 + + - name: gateway-api-crds + namespace: kube-system + chart: oci://ghcr.io/wiremind/wiremind-helm-charts/gateway-api-crds + version: 1.3.0 + + - name: keda + namespace: observability + chart: oci://ghcr.io/home-operations/charts-mirror/keda + version: 2.17.2 + + - name: kube-prometheus-stack + namespace: observability + chart: oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack + version: 76.4.0 diff --git a/kubernetes/bootstrap/apps/helmfile.yaml b/kubernetes/bootstrap/apps/helmfile.yaml index 7d70b8e9f..190cf2f6b 100644 --- a/kubernetes/bootstrap/apps/helmfile.yaml +++ b/kubernetes/bootstrap/apps/helmfile.yaml @@ -1,13 +1,8 @@ --- # yaml-language-server: $schema=https://json.schemastore.org/helmfile -# renovate: datasource=docker depName=ghcr.io/siderolabs/kubelet -kubeVersion: v1.33.2 - helmDefaults: - force: true - recreatePods: true - timeout: 600 + cleanupOnFail: true wait: true waitForJobs: true @@ -16,62 +11,42 @@ repositories: url: https://postfinance.github.io/kubelet-csr-approver releases: - - name: kube-prometheus-stack-crds - namespace: observability - chart: oci://ghcr.io/prometheus-community/charts/prometheus-operator-crds - version: 22.0.2 - - name: cilium namespace: kube-system - atomic: true chart: oci://ghcr.io/home-operations/charts-mirror/cilium - version: 1.17.6 - values: ["../../apps/kube-system/cilium/app/helm-values.yaml"] - hooks: - - # Wait for cilium CRDs to be available - events: ['postsync'] - 
command: bash - args: - - -c - - until kubectl get crd ciliumbgppeeringpolicies.cilium.io ciliuml2announcementpolicies.cilium.io ciliumloadbalancerippools.cilium.io &>/dev/null; do sleep 10; done - showlogs: true - needs: ["observability/kube-prometheus-stack-crds"] + version: 1.18.1 + values: ['../kubernetes/apps/kube-system/cilium/app/helm/values.yaml'] - name: coredns namespace: kube-system - atomic: true chart: oci://ghcr.io/coredns/charts/coredns version: 1.43.2 - values: ["../../apps/kube-system/coredns/app/helm-values.yaml"] - needs: ["kube-system/cilium"] + values: ['../kubernetes/apps/kube-system/coredns/app/helm/values.yaml'] + needs: ['kube-system/cilium'] - name: kubelet-csr-approver namespace: kube-system - atomic: true chart: postfinance/kubelet-csr-approver version: 1.2.10 - values: ["../../apps/kube-system/kubelet-csr-approver/app/helm-values.yaml"] - needs: ["kube-system/coredns"] + values: ['../../apps/kube-system/kubelet-csr-approver/app/helm-values.yaml'] + needs: ['kube-system/coredns'] - name: spegel namespace: kube-system - atomic: true chart: oci://ghcr.io/spegel-org/helm-charts/spegel version: 0.3.0 - values: ["../../apps/kube-system/spegel/app/helm-values.yaml"] - needs: ["kube-system/kubelet-csr-approver"] + values: ['../kubernetes/apps/kube-system/spegel/app/helm/values.yaml'] + needs: ['kube-system/coredns'] - name: cert-manager namespace: cert-manager - atomic: true - chart: oci://ghcr.io/home-operations/charts-mirror/cert-manager - version: v1.17.1 - values: ['../../apps/cert-manager/cert-manager/app/helm/values.yaml'] + chart: oci://quay.io/jetstack/charts/cert-manager + version: v1.18.2 + values: ['../kubernetes/apps/cert-manager/cert-manager/app/helm/values.yaml'] needs: ['kube-system/spegel'] - name: external-secrets namespace: external-secrets - atomic: true chart: oci://ghcr.io/external-secrets/charts/external-secrets version: 0.19.1 values: ['../../apps/external-secrets/external-secrets/app/helm/values.yaml'] diff --git a/kubernetes/bootstrap/apps/resources.yaml b/kubernetes/bootstrap/apps/resources.yaml new file mode 100644 index 000000000..ed3537b42 --- /dev/null +++ b/kubernetes/bootstrap/apps/resources.yaml @@ -0,0 +1,24 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + name: onepassword-secret + namespace: external-secrets +stringData: + token: op://kubernetes/1password/OP_CONNECT_TOKEN +--- +apiVersion: v1 +kind: Secret +metadata: + name: sops-age + namespace: flux-system +stringData: + age.agekey: op://kubernetes/sops/SOPS_PRIVATE_KEY +--- +apiVersion: v1 +kind: Secret +metadata: + name: cloudflare-tunnel-id-secret + namespace: network +stringData: + CLOUDFLARE_TUNNEL_ID: op://kubernetes/cloudflare/CLOUDFLARE_TUNNEL_ID diff --git a/scripts/bootstrap-talos.sh b/scripts/bootstrap-talos.sh new file mode 100644 index 000000000..02fc66ed6 --- /dev/null +++ b/scripts/bootstrap-talos.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +export ROOT_DIR="$(git rev-parse --show-toplevel)" + +# Log messages with structured output +function log() { + local lvl="${1:?}" msg="${2:?}" + shift 2 + gum log --time=rfc3339 --structured --level "${lvl}" "[${FUNCNAME[1]}] ${msg}" "$@" +} + +# Apply the Talos configuration to all the nodes +function install_talos() { + log info "Installing Talos configuration" + + local machineconfig_file="${ROOT_DIR}/talos/machineconfig.yaml.j2" + + if [[ ! 
-f ${machineconfig_file} ]]; then + log fatal "No Talos machine files found for machineconfig" "file" "${machineconfig_file}" + fi + + # Check if Talos nodes are present + if ! nodes=$(talosctl config info --output yaml | yq --exit-status '.nodes | join (" ")') || [[ -z "${nodes}" ]]; then + log fatal "No Talos nodes found" + fi + + # Check that all nodes have a Talos configuration file + for node in ${nodes}; do + local node_file="${ROOT_DIR}/talos/nodes/${node}.yaml.j2" + + if [[ ! -f "${node_file}" ]]; then + log fatal "No Talos machine files found for node" "node" "${node}" "file" "${node_file}" + fi + done + + # Apply the Talos configuration to the nodes + for node in ${nodes}; do + local node_file="${ROOT_DIR}/talos/nodes/${node}.yaml.j2" + + log info "Applying Talos node configuration" "node" "${node}" + + if ! machine_config=$(bash "${ROOT_DIR}/scripts/render-machine-config.sh" "${machineconfig_file}" "${node_file}" 2>/dev/null) || [[ -z "${machine_config}" ]]; then + log fatal "Failed to render Talos node configuration" "node" "${node}" "file" "${node_file}" + fi + + log debug "Talos node configuration rendered successfully" "node" "${node}" + + if ! output=$(echo "${machine_config}" | talosctl --nodes "${node}" apply-config --insecure --file /dev/stdin 2>&1); then + if [[ "${output}" == *"certificate required"* ]]; then + log warn "Talos node is already configured, skipping apply of config" "node" "${node}" + continue + fi + log fatal "Failed to apply Talos node configuration" "node" "${node}" "output" "${output}" + fi + + log info "Talos node configuration applied successfully" "node" "${node}" + done +} + +# Bootstrap Talos on a controller node +function install_kubernetes() { + log info "Installing Kubernetes" + + if ! controller=$(talosctl config info --output yaml | yq --exit-status '.endpoints[0]') || [[ -z "${controller}" ]]; then + log fatal "No Talos controller found" + fi + + log debug "Talos controller discovered" "controller" "${controller}" + + until output=$(talosctl --nodes "${controller}" bootstrap 2>&1 || true) && [[ "${output}" == *"AlreadyExists"* ]]; do + log info "Talos bootstrap in progress, waiting 5 seconds..." "controller" "${controller}" + sleep 5 + done + + log info "Kubernetes installed successfully" "controller" "${controller}" +} + +# Fetch the kubeconfig to local machine +function fetch_kubeconfig() { + log info "Fetching kubeconfig" + + if ! controller=$(talosctl config info --output yaml | yq --exit-status '.endpoints[0]') || [[ -z "${controller}" ]]; then + log fatal "No Talos controller found" + fi + + if ! talosctl kubeconfig --nodes "${controller}" --force --force-context-name main "$(basename "${KUBECONFIG}")" &>/dev/null; then + log fatal "Failed to fetch kubeconfig" + fi + + log info "Kubeconfig fetched successfully" +} + +# Talos requires the nodes to be 'Ready=False' before applying resources +function wait_for_nodes() { + log info "Waiting for nodes to be available" + + # Skip waiting if all nodes are 'Ready=True' + if kubectl wait nodes --for=condition=Ready=True --all --timeout=10s &>/dev/null; then + log info "Nodes are available and ready, skipping wait for nodes" + return + fi + + # Wait for all nodes to be 'Ready=False' + until kubectl wait nodes --for=condition=Ready=False --all --timeout=10s &>/dev/null; do + log info "Nodes are not available, waiting for nodes to be available. Retrying in 5 seconds..." 
+ sleep 5 + done +} + +# Apply namespaces to the cluster +function apply_namespaces() { + log info "Applying namespaces" + + local -r apps_dir="${ROOT_DIR}/kubernetes/apps" + + if [[ ! -d "${apps_dir}" ]]; then + log error "Directory does not exist" "directory" "${apps_dir}" + fi + + find "${apps_dir}" -mindepth 1 -maxdepth 1 -type d -printf "%f\n" | while IFS= read -r namespace; do + if kubectl get namespace "${namespace}" &>/dev/null; then + log info "Namespace is up-to-date" "namespace" "${namespace}" + continue + fi + + if ! kubectl create namespace "${namespace}" --dry-run=client --output=yaml | kubectl apply --server-side --filename - &>/dev/null; then + log error "Failed to apply namespace" "namespace" "${namespace}" + fi + + log info "Namespace applied successfully" "namespace" "${namespace}" + done +} + +# Apply resources before the helmfile charts are installed +function apply_resources() { + log info "Applying resources" + + local -r resources_file="${ROOT_DIR}/bootstrap/resources.yaml" + + if [[ ! -f "${resources_file}" ]]; then + log fatal "File does not exist" "file" "${resources_file}" + fi + + if op inject --in-file "${resources_file}" | kubectl diff --filename - &>/dev/null; then + log info "Resources are up-to-date" + return + fi + + if ! op inject --in-file "${resources_file}" | kubectl apply --server-side --filename - &>/dev/null; then + log fatal "Failed to apply resources" + fi + + log info "Resources applied successfully" +} + +# Apply Custom Resource Definitions (CRDs) +function apply_crds() { + log info "Applying CRDs" + + local -r helmfile_file="${ROOT_DIR}/bootstrap/crds/helmfile.yaml" + + if [[ ! -f "${helmfile_file}" ]]; then + log fatal "File does not exist" "file" "${helmfile_file}" + fi + + if ! crds=$(helmfile --file "${helmfile_file}" template --include-crds --no-hooks --quiet | yq ea --exit-status 'select(.kind == "CustomResourceDefinition")' -) || [[ -z "${crds}" ]]; then + log fatal "Failed to render CRDs from Helmfile" "file" "${helmfile_file}" + fi + + if echo "${crds}" | kubectl diff --filename - &>/dev/null; then + log info "CRDs are up-to-date" + return + fi + + if ! echo "${crds}" | kubectl apply --server-side --filename - &>/dev/null; then + log fatal "Failed to apply crds from Helmfile" "file" "${helmfile_file}" + fi + + log info "CRDs applied successfully" +} + +# Apply applications using Helmfile +function apply_apps() { + log info "Applying apps" + + local -r helmfile_file="${ROOT_DIR}/bootstrap/helmfile.yaml" + + if [[ ! -f "${helmfile_file}" ]]; then + log fatal "File does not exist" "file" "${helmfile_file}" + fi + + if ! helmfile --file "${helmfile_file}" sync --hide-notes; then + log fatal "Failed to apply apps from Helmfile" "file" "${helmfile_file}" + fi + + log info "Apps applied successfully" +} + +function main() { + install_talos + install_kubernetes + fetch_kubeconfig + wait_for_nodes + apply_namespaces + apply_resources + apply_crds + apply_apps +} + +main "$@" diff --git a/scripts/render-machine-config.sh b/scripts/render-machine-config.sh new file mode 100644 index 000000000..3a7764aad --- /dev/null +++ b/scripts/render-machine-config.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +# Description: +# This script renders and merges Talos machine configurations using minijinja-cli, op and talosctl. +# It uses templates and patches to generate a final configuration for Talos nodes. +# +# Arguments: +# 1. Path to the Talos machineconfig file. +# 2. Path to the patch file for the machineconfig. 
+#
+# Example Usage:
+#   ./render-machine-config.sh machineconfig.yaml.j2 nodes/k8s-0.yaml.j2
+#
+# Output:
+#   The merged Talos configuration is printed to standard output.
+
+readonly MACHINEBASE="${1:?}" MACHINEPATCH="${2:?}"
+
+# Log messages with structured output
+function log() {
+    local lvl="${1:?}" msg="${2:?}"
+    shift 2
+    gum log --time=rfc3339 --structured --level "${lvl}" "[${FUNCNAME[1]}] ${msg}" "$@"
+}
+
+function main() {
+
+    local base patch type result
+
+    # Determine the machine type from the patch file
+    if ! type=$(yq --exit-status 'select(documentIndex == 0) | .machine.type' "${MACHINEPATCH}") || [[ -z "${type}" ]]; then
+        log fatal "Failed to determine machine type from patch file" "file" "${MACHINEPATCH}"
+    fi
+
+    # Render the base machine configuration
+    if ! base=$(minijinja-cli --define "machinetype=${type}" "${MACHINEBASE}" | op inject) || [[ -z "${base}" ]]; then
+        log fatal "Failed to render base machine configuration" "file" "${MACHINEBASE}"
+    fi
+
+    BASE_TMPFILE=$(mktemp)
+    echo "${base}" >"${BASE_TMPFILE}"
+
+    # Render the patch machine configuration
+    if ! patch=$(minijinja-cli --define "machinetype=${type}" "${MACHINEPATCH}" | op inject) || [[ -z "${patch}" ]]; then
+        log fatal "Failed to render patch machine configuration" "file" "${MACHINEPATCH}"
+    fi
+
+    PATCH_TMPFILE=$(mktemp)
+    echo "${patch}" >"${PATCH_TMPFILE}"
+
+    # Apply the patch to the base machine configuration
+    if ! result=$(talosctl machineconfig patch "${BASE_TMPFILE}" --patch "@${PATCH_TMPFILE}") || [[ -z "${result}" ]]; then
+        log fatal "Failed to apply patch to machine configuration" "base_file" "${BASE_TMPFILE}" "patch_file" "${PATCH_TMPFILE}"
+    fi
+
+    echo "${result}"
+}
+
+main "$@"
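
With the `internal` and `external` Gateways now living in `kube-system`, the `https` listeners accept HTTPRoutes from any namespace. The atuin change uses the app-template `route:` values; for workloads that do not template their own route, a standalone HTTPRoute along the lines of the sketch below attaches a Service to the internal Gateway. The `echo` name, port, and hostname are placeholders, not part of this change.

```yaml
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: echo                # hypothetical app, not part of this change
  namespace: default
spec:
  hostnames: ["echo.${SECRET_EXTERNAL_DOMAIN}"]
  parentRefs:
    - name: internal        # or "external" for internet-facing apps
      namespace: kube-system
      sectionName: https
  rules:
    - backendRefs:
        - name: echo        # Service name
          port: 8080        # Service port
```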
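
Both Gateways terminate TLS with the Secret `${SECRET_EXTERNAL_DOMAIN//./-}-tls` in `kube-system`, which this diff does not create; it presumably already exists or is managed elsewhere in the repo. If it ever has to be issued in `kube-system`, a cert-manager Certificate roughly like the sketch below would produce it. The resource name and issuer are placeholders.

```yaml
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-tls                # illustrative name
  namespace: kube-system
spec:
  secretName: "${SECRET_EXTERNAL_DOMAIN//./-}-tls"
  dnsNames:
    - "${SECRET_EXTERNAL_DOMAIN}"
    - "*.${SECRET_EXTERNAL_DOMAIN}"
  issuerRef:
    name: letsencrypt-production    # placeholder issuer
    kind: ClusterIssuer
```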
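
The Gateways pin the addresses 192.168.169.121 and 192.168.169.122, which Cilium only assigns to the generated LoadBalancer Services when they fall inside a CiliumLoadBalancerIPPool (the CRD the old bootstrap hook waited for). Neither the pool nor the BGP advertisement resources are part of this diff, and the README only covers the UniFi side, so the sketch below is purely illustrative: the pool name and range are assumptions, and newer Cilium releases may serve this resource as `cilium.io/v2` instead of `v2alpha1`.

```yaml
---
apiVersion: cilium.io/v2alpha1      # may be cilium.io/v2 on newer Cilium releases
kind: CiliumLoadBalancerIPPool
metadata:
  name: pool                        # illustrative name
spec:
  blocks:
    - start: 192.168.169.121
      stop: 192.168.169.126         # assumed range; adjust to the actual allocation
```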