From fc94e11155569d4aa4f721320c8cfb965fe7a9ad Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Thu, 10 Oct 2024 16:39:59 +0200
Subject: [PATCH] csi: disable fencing in Rook

Disabling the RBD and CephFS fencing in Rook for now, as it has bugs
where Rook is blocklisting the wrong IP address due to timing issues.

Signed-off-by: Madhu Rajanna
(cherry picked from commit 1d1ed5e9629b3c7acccfd63627aaa839b1b2fefa)
(cherry picked from commit 34f81ec820e1b6794d664bddf628754e1c99ccce)
---
 .../Block-Storage-RBD/block-storage.md                  | 7 +++++--
 Documentation/Troubleshooting/ceph-csi-common-issues.md | 3 ---
 pkg/operator/ceph/cluster/watcher.go                    | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md b/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md
index 7b311479b88b..4e1ecfc5b3d8 100644
--- a/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md
+++ b/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md
@@ -196,8 +196,6 @@ The erasure coded pool must be set as the `dataPool` parameter in
 
 If a node goes down where a pod is running where a RBD RWO volume is mounted, the volume cannot automatically be mounted on another node. The node must be guaranteed to be offline before the volume can be mounted on another node.
 
-!!! Note
-    These instructions are for clusters with Kubernetes version 1.26 or greater. For K8s 1.25 or older, see the [manual steps in the CSI troubleshooting guide](../../Troubleshooting/ceph-csi-common-issues.md#node-loss) to recover from the node loss.
 
 ### Configure CSI-Addons
 
@@ -206,6 +204,11 @@ Deploy csi-addons controller and enable `csi-addons` sidecar as mentioned in the
 
 ### Handling Node Loss
 
+!!! warning
+    Automated node loss handling is currently disabled, please refer to the [manual steps](../../Troubleshooting/ceph-csi-common-issues.md#node-loss) to recover from the node loss.
+    We are actively working on a new design for this feature.
+    For more details see the [tracking issue](https://github.com/rook/rook/issues/14832).
+
 When a node is confirmed to be down, add the following taints to the node:
 
 ```console
diff --git a/Documentation/Troubleshooting/ceph-csi-common-issues.md b/Documentation/Troubleshooting/ceph-csi-common-issues.md
index d02a8ab9a0fc..c78930076853 100644
--- a/Documentation/Troubleshooting/ceph-csi-common-issues.md
+++ b/Documentation/Troubleshooting/ceph-csi-common-issues.md
@@ -413,9 +413,6 @@ Where `-m` is one of the mon endpoints and the `--key` is the key used by the CS
 
 When a node is lost, you will see application pods on the node stuck in the `Terminating` state while another pod is rescheduled and is in the `ContainerCreating` state.
 
-!!! important
-    For clusters with Kubernetes version 1.26 or greater, see the [improved automation](../Storage-Configuration/Block-Storage-RBD/block-storage.md#recover-rbd-rwo-volume-in-case-of-node-loss) to recover from the node loss. If using K8s 1.25 or older, continue with these instructions.
-
 ### Force deleting the pod
 
 To force delete the pod stuck in the `Terminating` state:
diff --git a/pkg/operator/ceph/cluster/watcher.go b/pkg/operator/ceph/cluster/watcher.go
index f8f767091338..41cd44c1b533 100644
--- a/pkg/operator/ceph/cluster/watcher.go
+++ b/pkg/operator/ceph/cluster/watcher.go
@@ -91,9 +91,9 @@ func (c *clientCluster) onK8sNode(ctx context.Context, object runtime.Object, op
 	cluster := c.getCephCluster()
 
 	// Continue reconcile in case of failure too since we don't want to block other node reconcile
-	if err := c.handleNodeFailure(ctx, cluster, node, opNamespace); err != nil {
-		logger.Errorf("failed to handle node failure. %v", err)
-	}
+	// if err := c.handleNodeFailure(ctx, cluster, node, opNamespace); err != nil {
+	// 	logger.Errorf("failed to handle node failure. %v", err)
+	// }
 
 	// skip reconcile if node is already checked in a previous reconcile
 	if nodesCheckedForReconcile.Has(node.Name) {
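
Since the bug this patch works around can leave a healthy node's IP blocklisted in the Ceph cluster, an administrator may want to inspect and clear stale entries by hand while the automated fencing stays disabled. The following is a minimal sketch using the standard Ceph CLI from the Rook toolbox; the `rook-ceph-tools` deployment name and `rook-ceph` namespace are the Rook defaults and may differ in your cluster:

```console
# Open a shell in the Rook toolbox pod (default deployment name and namespace assumed)
kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- bash

# List the current OSD blocklist entries and check whether a healthy node's IP appears
ceph osd blocklist ls

# Remove a stale or incorrect entry, using the address exactly as printed by `blocklist ls`
ceph osd blocklist rm <address>
```

This only clears a wrongly applied blocklist entry; recovering the workload itself still follows the manual node-loss steps referenced in the warning added by this patch.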