Merge pull request #764 from red-hat-storage/sync_us--master
Syncing latest changes from upstream master for rook
subhamkrai authored Oct 28, 2024
2 parents 8782652 + 6788572 commit c0b23b3
Showing 60 changed files with 3,281 additions and 3,792 deletions.
111 changes: 33 additions & 78 deletions .github/workflows/canary-integration-test.yml
@@ -1085,6 +1085,7 @@ jobs:
yq write -i tests/manifests/test-cluster-on-pvc-encrypted.yaml "spec.storage.storageClassDeviceSets[0].count" 2
yq write -i tests/manifests/test-cluster-on-pvc-encrypted.yaml "spec.storage.storageClassDeviceSets[0].volumeClaimTemplates[0].spec.resources.requests.storage" 6Gi
kubectl create -f tests/manifests/test-cluster-on-pvc-encrypted.yaml
kubectl patch -n rook-ceph cephcluster rook-ceph --type merge -p '{"spec":{"security":{"keyRotation":{"enabled": true, "schedule":"*/1 * * * *"}}}}'
yq merge --inplace --arrays append tests/manifests/test-object.yaml tests/manifests/test-kms-vault-spec-token-auth.yaml
yq write -i tests/manifests/test-object.yaml "spec.security.kms.connectionDetails.VAULT_BACKEND_PATH" rook/ver2
kubectl create -f tests/manifests/test-object.yaml
@@ -1105,6 +1106,9 @@ jobs:
tests/scripts/deploy-validate-vault.sh validate_osd
sudo lsblk
- name: wait and verify key rotation
run: tests/scripts/deploy-validate-vault.sh validate_key_rotation rook/ver1

- name: validate rgw vault kv
run: |
tests/scripts/deploy-validate-vault.sh validate_rgw
@@ -1170,6 +1174,7 @@ jobs:
yq write -i tests/manifests/test-cluster-on-pvc-encrypted.yaml "spec.storage.storageClassDeviceSets[0].count" 2
yq write -i tests/manifests/test-cluster-on-pvc-encrypted.yaml "spec.storage.storageClassDeviceSets[0].volumeClaimTemplates[0].spec.resources.requests.storage" 6Gi
kubectl create -f tests/manifests/test-cluster-on-pvc-encrypted.yaml
kubectl patch -n rook-ceph cephcluster rook-ceph --type merge -p '{"spec":{"security":{"keyRotation":{"enabled": true, "schedule":"*/1 * * * *"}}}}'
tests/scripts/github-action-helper.sh deploy_manifest_with_local_build deploy/examples/toolbox.yaml
- name: wait for prepare pod
@@ -1186,6 +1191,9 @@ jobs:
tests/scripts/deploy-validate-vault.sh validate_osd
sudo lsblk
- name: wait and verify key rotation
run: KUBERNETES_AUTH=true tests/scripts/deploy-validate-vault.sh validate_key_rotation rook/ver1

- name: collect common logs
if: always()
uses: ./.github/workflows/collect-logs
@@ -1632,7 +1640,7 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
artifact-name: ${{ github.job }}-${{ matrix.ceph-image }}

multus-cluster-network:
multus-public-and-cluster:
runs-on: ubuntu-22.04
if: "!contains(github.event.pull_request.labels.*.name, 'skip-ci')"
strategy:
@@ -1649,45 +1657,50 @@ jobs:
with:
use-tmate: ${{ secrets.USE_TMATE }}

- name: setup golang
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
with:
go-version: "1.22"

- name: setup cluster resources
uses: ./.github/workflows/canary-test-config

- name: set Ceph version in CephCluster manifest
run: tests/scripts/github-action-helper.sh replace_ceph_image "deploy/examples/cluster-test.yaml" "${{ matrix.ceph-image }}"
run: tests/scripts/github-action-helper.sh replace_ceph_image "deploy/examples/cluster-multus-test.yaml" "${{ matrix.ceph-image }}"

- name: allow holder pod deployment
run: sed -i "s|CSI_DISABLE_HOLDER_PODS|# CSI_DISABLE_HOLDER_PODS|g" "deploy/examples/operator.yaml"
- name: Setup multus
run: ./tests/scripts/multus/setup-multus.sh

- name: validate-yaml
run: tests/scripts/github-action-helper.sh validate_yaml
- name: Set up multus prerequisite host routing
run: kubectl create -f tests/scripts/multus/host-cfg-ds.yaml

- name: create cluster prerequisites
run: tests/scripts/github-action-helper.sh create_cluster_prerequisites

- name: Install public and cluster NADs in rook-ceph namespace
run: kubectl create -f tests/scripts/multus/rook-ceph-public-cluster-nads.yaml

- name: use local disk and create partitions for osds
run: |
tests/scripts/github-action-helper.sh use_local_disk
tests/scripts/github-action-helper.sh create_partitions_for_osds
- name: deploy multus
run: tests/scripts/github-action-helper.sh deploy_multus
- name: deploy cluster
run: |
tests/scripts/github-action-helper.sh deploy_manifest_with_local_build deploy/examples/operator.yaml
export BLOCK="$(tests/scripts/github-action-helper.sh find_extra_block_dev)"
yq write -i deploy/examples/cluster-multus-test.yaml "spec.storage.deviceFilter" "${BLOCK}"
kubectl create -f deploy/examples/cluster-multus-test.yaml
tests/scripts/github-action-helper.sh deploy_toolbox
- name: deploy multus cluster
run: tests/scripts/github-action-helper.sh deploy_multus_cluster
kubectl create -f deploy/examples/filesystem-test.yaml
kubectl create -f deploy/examples/nfs-test.yaml
- name: wait for prepare pod
run: tests/scripts/github-action-helper.sh wait_for_prepare_pod 2

- name: wait for ceph to be ready
run: IS_POD_NETWORK=true IS_MULTUS=true tests/scripts/github-action-helper.sh wait_for_ceph_to_be_ready osd 2

- name: wait for ceph-csi configmap to be updated with network namespace
run: tests/scripts/github-action-helper.sh wait_for_ceph_csi_configmap_to_be_updated
run: tests/scripts/github-action-helper.sh wait_for_ceph_to_be_ready osd 2

- name: wait for cephnfs to be ready
run: IS_POD_NETWORK=true IS_MULTUS=true tests/scripts/github-action-helper.sh wait_for_ceph_to_be_ready nfs 1
run: tests/scripts/github-action-helper.sh wait_for_ceph_to_be_ready nfs 1

- name: check multus connections
run: tests/scripts/github-action-helper.sh test_multus_connections
@@ -1708,64 +1721,6 @@ jobs:
name: ${{ github.job }}-${{ matrix.ceph-image }}
additional-namespace: kube-system

csi-hostnetwork-disabled:
runs-on: ubuntu-22.04
if: "!contains(github.event.pull_request.labels.*.name, 'skip-ci')"
strategy:
matrix:
ceph-image: ${{ fromJson(inputs.ceph_images) }}
steps:
- name: checkout
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 0

- name: consider debugging
uses: ./.github/workflows/tmate_debug
with:
use-tmate: ${{ secrets.USE_TMATE }}

- name: setup cluster resources
uses: ./.github/workflows/canary-test-config

- name: allow holder pod deployment
run: sed -i "s|CSI_DISABLE_HOLDER_PODS|# CSI_DISABLE_HOLDER_PODS|g" "deploy/examples/operator.yaml"

- name: set Ceph version in CephCluster manifest
run: tests/scripts/github-action-helper.sh replace_ceph_image "deploy/examples/cluster-test.yaml" "${{ matrix.ceph-image }}"

- name: use local disk and create partitions for osds
run: |
tests/scripts/github-action-helper.sh use_local_disk
tests/scripts/github-action-helper.sh create_partitions_for_osds
- name: deploy CSI hostNetworking disabled cluster
run: tests/scripts/github-action-helper.sh deploy_csi_hostnetwork_disabled_cluster

- name: wait for prepare pod
run: tests/scripts/github-action-helper.sh wait_for_prepare_pod 2

- name: wait for ceph to be ready
run: IS_POD_NETWORK=true tests/scripts/github-action-helper.sh wait_for_ceph_to_be_ready osd 2

- name: wait for ceph-csi configmap to be updated with network namespace
run: tests/scripts/github-action-helper.sh wait_for_ceph_csi_configmap_to_be_updated

- name: test ceph-csi-rbd plugin restart
run: tests/scripts/github-action-helper.sh test_csi_rbd_workload

- name: test ceph-csi-cephfs plugin restart
run: tests/scripts/github-action-helper.sh test_csi_cephfs_workload

- name: test ceph-csi-nfs plugin restart
run: tests/scripts/github-action-helper.sh test_csi_nfs_workload

- name: collect common logs
if: always()
uses: ./.github/workflows/collect-logs
with:
name: ${{ github.job }}-${{ matrix.ceph-image }}

two-object-one-zone:
runs-on: ubuntu-22.04
if: "!contains(github.event.pull_request.labels.*.name, 'skip-ci')"
2 changes: 1 addition & 1 deletion .mergify.yml
@@ -272,7 +272,7 @@ pull_request_rules:
- "check-success=canary-tests / multi-cluster-mirroring (quay.io/ceph/ceph:v18)"
- "check-success=canary-tests / rgw-multisite-testing (quay.io/ceph/ceph:v18)"
- "check-success=canary-tests / encryption-pvc-kms-ibm-kp (quay.io/ceph/ceph:v18)"
- "check-success=canary-tests / multus-cluster-network (quay.io/ceph/ceph:v18)"
- "check-success=canary-tests / multus-cluster-network (quay.io/ceph/ceph:v18)" # note: changed name for 1.16 (multus-public-and-cluster)
- "check-success=canary-tests / csi-hostnetwork-disabled (quay.io/ceph/ceph:v18)"
- "check-success=TestCephSmokeSuite (v1.26.15)"
- "check-success=TestCephSmokeSuite (v1.31.0)"
@@ -16,7 +16,7 @@ python3 create-external-cluster-resources.py --rbd-data-pool-name <pool_name> --
* `--alias-rbd-data-pool-name`: Provides an alias for the RBD data pool name, necessary if a special character is present in the pool name such as a period or underscore
* `--rgw-endpoint`: (optional) The RADOS Gateway endpoint in the format `<IP>:<PORT>` or `<FQDN>:<PORT>`.
* `--rgw-pool-prefix`: (optional) The prefix of the RGW pools. If not specified, the default prefix is `default`
* `--rgw-tls-cert-path`: (optional) RADOS Gateway endpoint TLS certificate file path
* `--rgw-tls-cert-path`: (optional) RADOS Gateway endpoint TLS certificate (or intermediate signing certificate) file path
* `--rgw-skip-tls`: (optional) Ignore TLS certification validation when a self-signed certificate is provided (NOT RECOMMENDED)
* `--rbd-metadata-ec-pool-name`: (optional) Provides the name of erasure coded RBD metadata pool, used for creating ECRBDStorageClass.
* `--monitoring-endpoint`: (optional) Ceph Manager prometheus exporter endpoints (comma separated list of IP entries of active and standby mgrs)
179 changes: 0 additions & 179 deletions Documentation/CRDs/Cluster/network-providers.md
@@ -414,182 +414,3 @@ spec:
}
}'
```

## Holder Pod Deprecation

Rook plans to remove CSI "holder" pods in Rook v1.16. CephClusters with `csi-*plugin-holder-*` pods
present in the Rook operator namespace must plan to set `CSI_DISABLE_HOLDER_PODS` to `"true"` after
Rook v1.14 is installed and before v1.16 is installed, by following the migration sections below.
CephClusters with no holder pods do not need to follow the migration steps.

Helm users will set `csi.disableHolderPods: true` in values.yaml instead of `CSI_DISABLE_HOLDER_PODS`.
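
For example, a minimal sketch of the Helm-based change (assuming the operator chart was installed
as a `rook-ceph` release from a repo added as `rook-release`, in the `rook-ceph` namespace):

```console
# Hypothetical release and repo names; adjust them to match the actual installation.
$ helm upgrade --namespace rook-ceph rook-ceph rook-release/rook-ceph \
    --reuse-values \
    --set csi.disableHolderPods=true
```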

CephClusters that do not use `network.provider: multus` can follow the
[Disabling Holder Pods](#disabling-holder-pods) section.

CephClusters that use `network.provider: multus` will need to plan the migration more carefully.
Read the [Disabling Holder Pods with Multus](#disabling-holder-pods-with-multus) section in full
before beginning.

!!! hint
To determine if holder pods are deployed, use
`kubectl --namespace $ROOK_OPERATOR get pods | grep plugin-holder`

### Disabling Holder Pods with Multus

This migration section applies when any CephCluster `network.provider` is `"multus"`. If the
scenario does not apply, skip ahead to the [Disabling Holder Pods](#disabling-holder-pods) section.

**Step 1**

Before setting `CSI_ENABLE_HOST_NETWORK: "true"` and `CSI_DISABLE_HOLDER_PODS: "true"`, thoroughly
read through the [Multus Prerequisites section](#multus-prerequisites). Use the prerequisites
section to develop a plan for modifying host configurations as well as the public
NetworkAttachmentDefinition.

Once the plan is developed, execute the plan by following the steps below.

**Step 2**

First, modify the public NetworkAttachmentDefinition as needed. For example, it may be necessary to
add the `routes` directive to the Whereabouts IPAM configuration as in
[this example](#macvlan-whereabouts-node-static-ips).

**Step 3**

Next, modify the host configurations in the host configuration system. The host configuration system
may be something like PXE, ignition config, cloud-init, Ansible, or any other such system. A node
reboot is likely necessary to apply configuration updates, but wait until the next step to reboot
nodes.

If desired, check that the NetworkAttachmentDefinition modification and host configurations are
compatible using the [Multus validation tool](#validating-multus-configuration). For the upgrade
case, use the `hostCheckOnly: true` config option or `--host-check-only` CLI flag.
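
A hedged example of such a host-only check with the kubectl plugin (the subcommand path below is an
assumption; `--host-check-only` is the flag referenced above, and `--help` lists the options
available in the installed plugin version):

```console
# Hypothetical invocation; confirm the exact subcommand and flags with --help before running.
$ kubectl rook-ceph multus validation run --help
$ kubectl rook-ceph multus validation run --host-check-only
```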

**Step 4**

After the NetworkAttachmentDefinition is modified, OSD pods must be restarted. It is easiest to
complete this requirement at the same time nodes are being rebooted to apply configuration updates.

For each node in the Kubernetes cluster:

1. `cordon` and `drain` the node
2. Wait for all pods to drain
3. Reboot the node, ensuring the new host configuration will be applied
4. `uncordon` the node (uncordoning reverses the drain and allows pods to schedule again)
5. Wait for the node to be rehydrated and stable
6. Proceed to the next node

By following this process, host configurations will be updated, and OSDs are also automatically
restarted as part of draining and uncordoning each node.

OSDs can be restarted manually if node configuration updates do not require reboot.
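
A minimal sketch of that per-node loop (assuming `kubectl` access, SSH-driven reboots, and a
`nodes.txt` file listing node names; adjust the drain flags to the cluster's policies):

```console
# Hypothetical helper loop; nodes.txt, the ssh user, and the reboot step are assumptions.
$ while read -r node; do
    kubectl cordon "${node}"
    kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data
    ssh "user@${node}" sudo reboot
    sleep 60  # give the node time to report NotReady after the reboot
    kubectl wait node "${node}" --for=condition=Ready --timeout=15m
    kubectl uncordon "${node}"
  done < nodes.txt
```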

**Step 5**

Once all nodes are running the new configuration and all OSDs have been restarted, check that the
new node and NetworkAttachmentDefinition configurations are compatible. To do so, verify that each
node can `ping` OSD pods via the public network.

Use the [toolbox](../../Troubleshooting/ceph-toolbox.md) or the
[kubectl plugin](../../Troubleshooting/kubectl-plugin.md) to list OSD IPs.

The example below uses
the kubectl plugin, and the OSD public network is 192.168.20.0/24.

```console
$ kubectl rook-ceph ceph osd dump | grep 'osd\.'
osd.0 up in weight 1 up_from 7 up_thru 0 down_at 0 last_clean_interval [0,0) [v2:192.168.20.19:6800/213587265,v1:192.168.20.19:6801/213587265] [v2:192.168.30.1:6800/213587265,v1:192.168.30.1:6801/213587265] exists,up 7ebbc19a-d45a-4b12-8fef-0f9423a59e78
osd.1 up in weight 1 up_from 24 up_thru 24 down_at 20 last_clean_interval [8,23) [v2:192.168.20.20:6800/3144257456,v1:192.168.20.20:6801/3144257456] [v2:192.168.30.2:6804/3145257456,v1:192.168.30.2:6805/3145257456] exists,up 146b27da-d605-4138-9748-65603ed0dfa5
osd.2 up in weight 1 up_from 21 up_thru 0 down_at 20 last_clean_interval [18,20) [v2:192.168.20.21:6800/1809748134,v1:192.168.20.21:6801/1809748134] [v2:192.168.30.3:6804/1810748134,v1:192.168.30.3:6805/1810748134] exists,up ff3d6592-634e-46fd-a0e4-4fe9fafc0386
```

Now check that each node (NODE) can reach OSDs over the public network:

```console
$ ssh user@NODE
$ user@NODE $> ping -c3 192.168.20.19
# [truncated, successful output]
$ user@NODE $> ping -c3 192.168.20.20
# [truncated, successful output]
$ user@NODE $> ping -c3 192.168.20.21
# [truncated, successful output]
```

If any node does not get a successful `ping` to a running OSD, it is not safe to proceed. A problem
may arise here for many reasons. Some reasons include: the host may not be properly attached to the
Multus public network, the public NetworkAttachmentDefinition may not be properly configured to
route back to the host, the host may have a firewall rule blocking the connection in either
direction, or the network switch may have a firewall rule blocking the connection. Diagnose and fix
the issue, then return to **Step 1**.

**Step 6**

If the above check succeeds for all nodes, proceed with the
[Disabling Holder Pods](#disabling-holder-pods) steps below.

### Disabling Holder Pods

**Step 1**

If any CephClusters have Multus enabled (`network.provider: "multus"`), follow the
[Disabling Holder Pods with Multus](#disabling-holder-pods-with-multus)
steps above before continuing.

**Step 2**

Begin by setting `CSI_DISABLE_HOLDER_PODS: "true"`. If `CSI_ENABLE_HOST_NETWORK` is set to
`"false"`, also set this value to `"true"` at the same time.

After this, `csi-*plugin-*` pods will restart, and `csi-*plugin-holder-*` pods will remain running.

**Step 3**

Check that CSI pods are using the correct host networking configuration using the example below as
guidance (in the example, `CSI_ENABLE_HOST_NETWORK` is `"true"`):

```console
$ kubectl -n rook-ceph get -o yaml daemonsets.apps csi-rbdplugin | grep -i hostnetwork
hostNetwork: true
$ kubectl -n rook-ceph get -o yaml daemonsets.apps csi-cephfsplugin | grep -i hostnetwork
hostNetwork: true
$ kubectl -n rook-ceph get -o yaml daemonsets.apps csi-nfsplugin | grep -i hostnetwork
hostNetwork: true
```

**Step 4**

At this stage, PVCs for running applications are still using the holder pods. These PVCs must be
migrated from the holder pods to the new network. Follow the process below to do so.

For each node in the Kubernetes cluster:

1. `cordon` and `drain` the node
2. Wait for all pods to drain
3. Delete all `csi-*plugin-holder*` pods on the node (a new holder will take its place)
4. `uncordon` the node (uncordoning reverses the drain and allows pods to schedule again)
5. Wait for the node to be rehydrated and stable
6. Proceed to the next node

**Step 5**

After this process is done for all Kubernetes nodes, it is safe to delete the `csi-*plugin-holder*`
daemonsets.

Delete the holder daemonsets using the example below as guidance:

```console
$ kubectl -n rook-ceph get daemonset -o name | grep plugin-holder
daemonset.apps/csi-cephfsplugin-holder-my-cluster
daemonset.apps/csi-rbdplugin-holder-my-cluster
$ kubectl -n rook-ceph delete daemonset.apps/csi-cephfsplugin-holder-my-cluster
daemonset.apps "csi-cephfsplugin-holder-my-cluster" deleted
$ kubectl -n rook-ceph delete daemonset.apps/csi-rbdplugin-holder-my-cluster
daemonset.apps "csi-rbdplugin-holder-my-cluster" deleted
```

**Step 6**

The migration is now complete! Congratulations!
11 changes: 11 additions & 0 deletions Documentation/CRDs/specification.md
@@ -10959,6 +10959,17 @@ string
</tr>
<tr>
<td>
<code>default</code><br/>
<em>
bool
</em>
</td>
<td>
<p>Sets the given placement as the default. Only one placement in the list can be marked as the default.</p>
</td>
</tr>
<tr>
<td>
<code>metadataPoolName</code><br/>
<em>
string