From 6a063c062e7a4d3204d12dffc7df5fa6e1b7f65b Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Sun, 27 Aug 2023 19:26:06 +0200 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20intel-device-plugin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../intel-device-plugin/app/helmrelease.yaml | 32 +++++++ .../app}/kustomization.yaml | 0 .../exporter/helmrelease.yaml | 21 ++--- .../exporter}/kustomization.yaml | 0 .../intel-device-plugin/gpu/helmrelease.yaml | 34 ++++++++ .../gpu/kustomization.yaml | 7 ++ .../ks.yaml | 44 +++++----- .../intel-gpu/plugin/helmrelease.yaml | 83 ------------------- .../apps/kube-system/kustomization.yaml | 2 +- kubernetes/flux/repositories/helm/intel.yaml | 10 +++ .../flux/repositories/helm/kustomization.yaml | 1 + 11 files changed, 116 insertions(+), 118 deletions(-) create mode 100644 kubernetes/apps/kube-system/intel-device-plugin/app/helmrelease.yaml rename kubernetes/apps/kube-system/{intel-gpu/exporter => intel-device-plugin/app}/kustomization.yaml (100%) rename kubernetes/apps/kube-system/{intel-gpu => intel-device-plugin}/exporter/helmrelease.yaml (75%) rename kubernetes/apps/kube-system/{intel-gpu/plugin => intel-device-plugin/exporter}/kustomization.yaml (100%) create mode 100644 kubernetes/apps/kube-system/intel-device-plugin/gpu/helmrelease.yaml create mode 100644 kubernetes/apps/kube-system/intel-device-plugin/gpu/kustomization.yaml rename kubernetes/apps/kube-system/{intel-gpu => intel-device-plugin}/ks.yaml (51%) delete mode 100644 kubernetes/apps/kube-system/intel-gpu/plugin/helmrelease.yaml create mode 100644 kubernetes/flux/repositories/helm/intel.yaml diff --git a/kubernetes/apps/kube-system/intel-device-plugin/app/helmrelease.yaml b/kubernetes/apps/kube-system/intel-device-plugin/app/helmrelease.yaml new file mode 100644 index 000000000..0e4d1ce71 --- /dev/null +++ b/kubernetes/apps/kube-system/intel-device-plugin/app/helmrelease.yaml @@ -0,0 +1,32 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: intel-device-plugin-operator + namespace: kube-system +spec: + interval: 30m + chart: + spec: + chart: intel-device-plugins-operator + version: 0.27.1 + sourceRef: + kind: HelmRepository + name: intel + namespace: flux-system + maxHistory: 2 + install: + crds: CreateReplace + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + crds: CreateReplace + remediation: + retries: 3 + uninstall: + keepHistory: false + dependsOn: + - name: node-feature-discovery + namespace: kube-system diff --git a/kubernetes/apps/kube-system/intel-gpu/exporter/kustomization.yaml b/kubernetes/apps/kube-system/intel-device-plugin/app/kustomization.yaml similarity index 100% rename from kubernetes/apps/kube-system/intel-gpu/exporter/kustomization.yaml rename to kubernetes/apps/kube-system/intel-device-plugin/app/kustomization.yaml diff --git a/kubernetes/apps/kube-system/intel-gpu/exporter/helmrelease.yaml b/kubernetes/apps/kube-system/intel-device-plugin/exporter/helmrelease.yaml similarity index 75% rename from kubernetes/apps/kube-system/intel-gpu/exporter/helmrelease.yaml rename to kubernetes/apps/kube-system/intel-device-plugin/exporter/helmrelease.yaml index 576d6f7da..68770d4eb 100644 --- a/kubernetes/apps/kube-system/intel-gpu/exporter/helmrelease.yaml +++ b/kubernetes/apps/kube-system/intel-device-plugin/exporter/helmrelease.yaml @@ -51,20 +51,13 @@ spec: targetLabel: node securityContext: privileged: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: feature.node.kubernetes.io/custom-intel-gpu - operator: In - values: - - "true" + nodeSelector: + intel.feature.node.kubernetes.io/gpu: "true" resources: requests: - gpu.intel.com/i915: 1 - cpu: 15m - memory: 105Mi + gpu.intel.com/i915_monitoring: 1 + cpu: 100m + memory: 100Mi limits: - gpu.intel.com/i915: 1 - memory: 105Mi + gpu.intel.com/i915_monitoring: 1 + memory: 500Mi diff --git a/kubernetes/apps/kube-system/intel-gpu/plugin/kustomization.yaml b/kubernetes/apps/kube-system/intel-device-plugin/exporter/kustomization.yaml similarity index 100% rename from kubernetes/apps/kube-system/intel-gpu/plugin/kustomization.yaml rename to kubernetes/apps/kube-system/intel-device-plugin/exporter/kustomization.yaml diff --git a/kubernetes/apps/kube-system/intel-device-plugin/gpu/helmrelease.yaml b/kubernetes/apps/kube-system/intel-device-plugin/gpu/helmrelease.yaml new file mode 100644 index 000000000..d52011c1f --- /dev/null +++ b/kubernetes/apps/kube-system/intel-device-plugin/gpu/helmrelease.yaml @@ -0,0 +1,34 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: intel-device-plugin-gpu + namespace: kube-system +spec: + interval: 30m + chart: + spec: + chart: intel-device-plugins-gpu + version: 0.27.1 + sourceRef: + kind: HelmRepository + name: intel + namespace: flux-system + maxHistory: 2 + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 + uninstall: + keepHistory: false + dependsOn: + - name: intel-device-plugin-operator + namespace: kube-system + values: + name: intel-device-plugin-gpu + sharedDevNum: 4 + nodeFeatureRule: true diff --git a/kubernetes/apps/kube-system/intel-device-plugin/gpu/kustomization.yaml b/kubernetes/apps/kube-system/intel-device-plugin/gpu/kustomization.yaml new file mode 100644 index 000000000..1af0c2237 --- /dev/null +++ b/kubernetes/apps/kube-system/intel-device-plugin/gpu/kustomization.yaml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/SchemaStore/schemastore/master/src/schemas/json/kustomization.json +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kube-system +resources: + - ./helmrelease.yaml diff --git a/kubernetes/apps/kube-system/intel-gpu/ks.yaml b/kubernetes/apps/kube-system/intel-device-plugin/ks.yaml similarity index 51% rename from kubernetes/apps/kube-system/intel-gpu/ks.yaml rename to kubernetes/apps/kube-system/intel-device-plugin/ks.yaml index 27e587a3a..18ea64391 100644 --- a/kubernetes/apps/kube-system/intel-gpu/ks.yaml +++ b/kubernetes/apps/kube-system/intel-device-plugin/ks.yaml @@ -3,48 +3,52 @@ apiVersion: kustomize.toolkit.fluxcd.io/v1 kind: Kustomization metadata: - name: cluster-apps-intel-gpu-plugin + name: cluster-apps-intel-device-plugin namespace: flux-system labels: substitution.flux.home.arpa/enabled: "true" spec: - dependsOn: - - name: cluster-apps-node-feature-discovery - path: ./kubernetes/apps/kube-system/intel-gpu/plugin + path: ./kubernetes/apps/kube-system/intel-device-plugin/app prune: true sourceRef: kind: GitRepository name: home-ops-kubernetes - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2beta1 - kind: HelmRelease - name: intel-gpu-plugin - namespace: kube-system interval: 30m retryInterval: 1m - timeout: 3m + timeout: 5m --- # yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json apiVersion: kustomize.toolkit.fluxcd.io/v1 kind: Kustomization metadata: - name: cluster-apps-intel-gpu-exporter + name: cluster-apps-intel-device-plugin-gpu namespace: flux-system labels: substitution.flux.home.arpa/enabled: "true" spec: - dependsOn: - - name: cluster-apps-intel-gpu-plugin - path: ./kubernetes/apps/kube-system/intel-gpu/exporter + path: ./kubernetes/apps/kube-system/intel-device-plugin/gpu prune: true sourceRef: kind: GitRepository name: home-ops-kubernetes - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2beta1 - kind: HelmRelease - name: intel-gpu-exporter - namespace: kube-system interval: 30m retryInterval: 1m - timeout: 3m + timeout: 5m +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: cluster-apps-intel-device-plugin-exporter + namespace: flux-system + labels: + substitution.flux.home.arpa/enabled: "true" +spec: + path: ./kubernetes/apps/kube-system/intel-device-plugin/exporter + prune: true + sourceRef: + kind: GitRepository + name: home-ops-kubernetes + interval: 30m + retryInterval: 1m + timeout: 5m diff --git a/kubernetes/apps/kube-system/intel-gpu/plugin/helmrelease.yaml b/kubernetes/apps/kube-system/intel-gpu/plugin/helmrelease.yaml deleted file mode 100644 index 16b566ec7..000000000 --- a/kubernetes/apps/kube-system/intel-gpu/plugin/helmrelease.yaml +++ /dev/null @@ -1,83 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json -apiVersion: helm.toolkit.fluxcd.io/v2beta1 -kind: HelmRelease -metadata: - name: &app intel-gpu-plugin - namespace: kube-system -spec: - interval: 30m - chart: - spec: - chart: app-template - version: 1.5.1 - sourceRef: - kind: HelmRepository - name: bjw-s - namespace: flux-system - maxHistory: 2 - install: - createNamespace: true - remediation: - retries: 3 - upgrade: - cleanupOnFail: true - remediation: - retries: 3 - uninstall: - keepHistory: false - values: - controller: - type: daemonset - strategy: RollingUpdate - image: - repository: docker.io/intel/intel-gpu-plugin - tag: 0.27.1 - pullPolicy: IfNotPresent - args: - - -shared-dev-num - - "4" - service: - main: - enabled: false - # TODO(intel-gpu-plugin): Write probes to check for something to tell if it's working - probes: - liveness: - enabled: false - readiness: - enabled: false - startup: - enabled: false - persistence: - devfs: - enabled: true - type: hostPath - hostPath: /dev/dri - hostPathType: Directory - readOnly: true - sysfs: - enabled: true - type: hostPath - hostPath: /sys/class/drm - hostPathType: Directory - readOnly: true - kubeletsockets: - enabled: true - type: hostPath - hostPathType: Directory - hostPath: /var/lib/kubelet/device-plugins - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: feature.node.kubernetes.io/custom-intel-gpu - operator: In - values: - - "true" - resources: - requests: - cpu: 15m - memory: 105Mi - limits: - memory: 105Mi diff --git a/kubernetes/apps/kube-system/kustomization.yaml b/kubernetes/apps/kube-system/kustomization.yaml index ddb20da73..85425bf14 100644 --- a/kubernetes/apps/kube-system/kustomization.yaml +++ b/kubernetes/apps/kube-system/kustomization.yaml @@ -9,7 +9,7 @@ resources: - ./cilium/ks.yaml - ./descheduler/ks.yaml - ./external-secrets/ks.yaml - - ./intel-gpu/ks.yaml + - ./intel-device-plugin/ks.yaml - ./kubelet-csr-approver/ks.yaml - ./metrics-server/ks.yaml - ./node-feature-discovery/ks.yaml diff --git a/kubernetes/flux/repositories/helm/intel.yaml b/kubernetes/flux/repositories/helm/intel.yaml new file mode 100644 index 000000000..79b6bd5ba --- /dev/null +++ b/kubernetes/flux/repositories/helm/intel.yaml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.devbu.io/helmrepository_v1beta2.json +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: HelmRepository +metadata: + name: intel + namespace: flux-system +spec: + interval: 2h + url: https://intel.github.io/helm-charts diff --git a/kubernetes/flux/repositories/helm/kustomization.yaml b/kubernetes/flux/repositories/helm/kustomization.yaml index 34202a903..0f7b3f970 100644 --- a/kubernetes/flux/repositories/helm/kustomization.yaml +++ b/kubernetes/flux/repositories/helm/kustomization.yaml @@ -21,6 +21,7 @@ resources: - ./grafana.yaml - ./hajimari.yaml - ./ingress-nginx.yaml + - ./intel.yaml - ./jetstack.yaml - ./kyverno.yaml - ./metrics-server.yaml