Skip to content

Commit 18ae24f

Browse files
committed
Add support for machines
External remediation can be triggered by either MHC or NHC, and the PPR name will be the machine name or the node name respectively. This adds support for a machine name as the PPR name. Signed-off-by: Nir <[email protected]>
1 parent 0929037 commit 18ae24f

File tree

5 files changed

+480
-1
lines changed

5 files changed

+480
-1
lines changed

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ rules:
1818
- patch
1919
- update
2020
- watch
21+
- apiGroups:
22+
- machine.openshift.io
23+
resources:
24+
- machines
25+
verbs:
26+
- get
27+
- list
28+
- watch
2129
- apiGroups:
2230
- poison-pill.medik8s.io
2331
resources:

controllers/poisonpillremediation_controller.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
"sigs.k8s.io/controller-runtime/pkg/client"
4141

4242
"github.com/medik8s/poison-pill/api/v1alpha1"
43+
machinev1beta1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
4344
)
4445

4546
const (
@@ -89,6 +90,7 @@ type PoisonPillRemediationReconciler struct {
8990
//+kubebuilder:rbac:groups=poison-pill.medik8s.io,resources=poisonpillremediations/status,verbs=get;update;patch
9091
//+kubebuilder:rbac:groups=poison-pill.medik8s.io,resources=poisonpillremediations/finalizers,verbs=update
9192
//+kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete
93+
//+kubebuilder:rbac:groups=machine.openshift.io,resources=machines,verbs=get;list;watch
9294

9395
func (r *PoisonPillRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
9496
r.logger = r.Log.WithValues("poisonpillremediation", req.NamespacedName)
@@ -280,8 +282,21 @@ func (r *PoisonPillRemediationReconciler) restoreNode(nodeToRestore *v1.Node) (c
280282
return ctrl.Result{RequeueAfter: reconcileInterval}, nil
281283
}
282284

283-
// getNodeByMachine returns the node object referenced by machine
285+
// getNodeFromPpr returns the unhealthy node reported in the given ppr
284286
func (r *PoisonPillRemediationReconciler) getNodeFromPpr(ppr *v1alpha1.PoisonPillRemediation) (*v1.Node, error) {
287+
//PPR could be created by either machine based controller (e.g. MHC) or
288+
//by a node based controller (e.g. NHC). This assumes that machine based controller
289+
//will create the ppr with machine owner reference
290+
291+
for _, ownerRef := range ppr.OwnerReferences {
292+
if ownerRef.Kind == "Machine" {
293+
r.logger.Info("assuming the unhealthy resource is a machine")
294+
return r.getNodeFromMachine(ownerRef, ppr.Namespace)
295+
}
296+
}
297+
298+
r.logger.Info("assuming the unhealthy resource is a node")
299+
//since we didn't find a machine owner ref, we assume that ppr name is the unhealthy node name
285300
node := &v1.Node{}
286301
key := client.ObjectKey{
287302
Name: ppr.Name,
@@ -295,6 +310,40 @@ func (r *PoisonPillRemediationReconciler) getNodeFromPpr(ppr *v1alpha1.PoisonPil
295310
return node, nil
296311
}
297312

313+
func (r *PoisonPillRemediationReconciler) getNodeFromMachine(ref metav1.OwnerReference, ns string) (*v1.Node, error) {
314+
machine := &machinev1beta1.Machine{}
315+
machineKey := client.ObjectKey{
316+
Name: ref.Name,
317+
Namespace: ns,
318+
}
319+
320+
if err := r.Client.Get(context.Background(), machineKey, machine); err != nil {
321+
r.logger.Error(err, "failed to get machine from PoisonPillRemediation CR owner ref",
322+
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
323+
return nil, err
324+
}
325+
326+
if machine.Status.NodeRef == nil {
327+
err := errors.New("nodeRef is nil")
328+
r.logger.Error(err, "failed to retrieve node from the unhealthy machine")
329+
return nil, err
330+
}
331+
332+
node := &v1.Node{}
333+
key := client.ObjectKey{
334+
Name: machine.Status.NodeRef.Name,
335+
Namespace: machine.Status.NodeRef.Namespace,
336+
}
337+
338+
if err := r.ApiReader.Get(context.Background(), key, node); err != nil {
339+
r.logger.Error(err, "failed to retrieve node from the unhealthy machine",
340+
"node name", node.Name, "machine name", machine.Name)
341+
return nil, err
342+
}
343+
344+
return node, nil
345+
}
346+
298347
func (r *PoisonPillRemediationReconciler) markNodeAsUnschedulable(node *v1.Node) (ctrl.Result, error) {
299348
node.Spec.Unschedulable = true
300349
r.logger.Info("Marking node as unschedulable", "node name", node.Name)

go.mod

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,11 @@ require (
1212
k8s.io/apimachinery v0.20.0
1313
k8s.io/client-go v0.20.0
1414
sigs.k8s.io/controller-runtime v0.7.2
15+
16+
github.com/openshift/machine-api-operator v0.2.1-0.20210104142355-8e6ae0acdfcf
17+
)
18+
19+
replace (
20+
sigs.k8s.io/cluster-api-provider-aws => github.com/openshift/cluster-api-provider-aws v0.2.1-0.20201216171336-0b00fb8d96ac
21+
sigs.k8s.io/cluster-api-provider-azure => github.com/openshift/cluster-api-provider-azure v0.1.0-alpha.3.0.20201209184807-075372e2ed03
1522
)

0 commit comments

Comments
 (0)