Skip to content

Commit 65c87d0

Browse files
authored
Merge pull request #1475 from hanizang77/250224-1
Enhance K8s UpgradeCluster to upgrade both control plane and node poo…
2 parents 20ab7df + f1b9516 commit 65c87d0

File tree

1 file changed

+131
-12
lines changed

1 file changed

+131
-12
lines changed

cloud-control-manager/cloud-driver/drivers/gcp/resources/ClusterHandler.go

Lines changed: 131 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -798,24 +798,121 @@ func (ClusterHandler *GCPClusterHandler) UpgradeCluster(clusterIID irs.IID, newV
798798
hiscallInfo := GetCallLogScheme(ClusterHandler.Region, call.CLUSTER, clusterIID.NameId, "UpgradeCluster()")
799799

800800
parent := getParentClusterAtContainer(projectID, zone, clusterIID.NameId)
801-
rb := &container.UpdateMasterRequest{
802-
MasterVersion: newVersion,
801+
802+
// 현재 클러스터 버전 확인
803+
currentCluster, err := ClusterHandler.ContainerClient.Projects.Locations.Clusters.Get(parent).Do()
804+
if err != nil {
805+
return clusterInfo, fmt.Errorf("Failed to get current cluster version: %v", err)
803806
}
804807

805-
start := call.Start()
806-
op, err := ClusterHandler.ContainerClient.Projects.Locations.Clusters.UpdateMaster(parent, rb).Do()
807-
hiscallInfo.ElapsedTime = call.Elapsed(start)
808+
// 이미 원하는 버전으로 업그레이드되었다면 노드 풀만 업그레이드
809+
if currentCluster.CurrentMasterVersion == newVersion {
810+
cblogger.Info(fmt.Sprintf("Control plane is already at version %s, skipping control plane upgrade", newVersion))
811+
} else {
812+
// 기존 컨트롤 플레인 업그레이드 로직
813+
rb := &container.UpdateMasterRequest{
814+
MasterVersion: newVersion,
815+
}
816+
817+
start := call.Start()
818+
op, err := ClusterHandler.ContainerClient.Projects.Locations.Clusters.UpdateMaster(parent, rb).Do()
819+
hiscallInfo.ElapsedTime = call.Elapsed(start)
820+
if err != nil {
821+
err := fmt.Errorf("Failed to UpgradeCluster: %v", err)
822+
cblogger.Error(err)
823+
return clusterInfo, err
824+
}
825+
cblogger.Debug(op)
826+
827+
// WaitContainerOperationDone 함수 사용 (20분 타임아웃)
828+
operationErr := WaitContainerOperationDone(ClusterHandler.ContainerClient, projectID, region, zone, op.Name, GCP_CONTAINER_OPERATION_UPDATE_CLUSTER, 1200)
829+
if operationErr != nil {
830+
cblogger.Error(operationErr)
831+
return clusterInfo, operationErr
832+
}
833+
834+
// 컨트롤 플레인 업그레이드 후 클러스터 상태 확인
835+
updatedCluster, err := ClusterHandler.ContainerClient.Projects.Locations.Clusters.Get(parent).Do()
836+
if err != nil {
837+
return clusterInfo, fmt.Errorf("Failed to get cluster after master upgrade: %v", err)
838+
}
839+
840+
// 컨트롤 플레인 버전이 실제로 업그레이드 되었는지 확인
841+
if updatedCluster.CurrentMasterVersion != newVersion {
842+
return clusterInfo, fmt.Errorf("Control plane upgrade not complete. Current version: %s, Expected: %s",
843+
updatedCluster.CurrentMasterVersion, newVersion)
844+
}
845+
cblogger.Info(fmt.Sprintf("Control plane upgraded successfully to version %s", newVersion))
846+
}
847+
848+
// 업그레이드 재시도 로직
849+
maxRetries := 10
850+
retryInterval := 120
851+
backoffFactor := 1.5
852+
853+
currentInterval := retryInterval
854+
for i := 0; i < maxRetries; i++ {
855+
hasActive, err := ClusterHandler.hasActiveOperations(projectID, zone, clusterIID.NameId)
856+
if err != nil {
857+
return clusterInfo, err
858+
}
859+
860+
if !hasActive {
861+
break // 진행 중인 작업이 없으면 계속 진행
862+
}
863+
864+
if i == maxRetries-1 {
865+
return clusterInfo, fmt.Errorf("Cluster has active operations after %d retries", maxRetries)
866+
}
867+
868+
cblogger.Info(fmt.Sprintf("Cluster has active operations, waiting %d seconds before retry (%d/%d)",
869+
currentInterval, i+1, maxRetries))
870+
time.Sleep(time.Duration(currentInterval) * time.Second)
871+
872+
// 지수 백오프 적용
873+
currentInterval = int(float64(currentInterval) * backoffFactor)
874+
}
875+
876+
// 노드풀 리스트 조회
877+
cblogger.Info(fmt.Sprintf("Fetching node pools for cluster: %s", clusterIID.NameId))
878+
879+
nodePools, err := ClusterHandler.ContainerClient.Projects.Locations.Clusters.NodePools.List(parent).Do()
808880
if err != nil {
809-
err := fmt.Errorf("Failed to UpgradeCluster : %v", err)
881+
err := fmt.Errorf("Failed to list Node Pools: %v", err)
810882
cblogger.Error(err)
811-
return clusterInfo, err
883+
return clusterInfo, err // 노드풀 리스트 조회 오류 시 clusterInfo 반환
812884
}
813-
cblogger.Debug(op)
814885

815-
operationErr := WaitContainerOperationFail(ClusterHandler.ContainerClient, projectID, region, zone, op.Name, GCP_CONTAINER_OPERATION_UPDATE_CLUSTER)
816-
if operationErr != nil {
817-
cblogger.Error(err)
818-
return clusterInfo, err
886+
// Worker Node(노드풀) 업그레이드 기능
887+
for _, nodePool := range nodePools.NodePools { // 각 노드풀을 순회하며 업그레이드
888+
// cblogger.Info(fmt.Sprintf("Upgrading Node Pool: %s", nodePool.Name))
889+
cblogger.Info(fmt.Sprintf("Upgrading Node Pool: %s to version %s", nodePool.Name, newVersion))
890+
891+
nodePoolParent := fmt.Sprintf("projects/%s/locations/%s/clusters/%s/nodePools/%s", projectID, zone, clusterIID.NameId, nodePool.Name)
892+
nodePoolRequest := &container.UpdateNodePoolRequest{
893+
NodeVersion: newVersion, // 각 노드풀의 버전 변경 요청
894+
}
895+
896+
// 업그레이드 요청 전에 로그 추가
897+
cblogger.Info(fmt.Sprintf("Sending upgrade request for Node Pool: %s with path: %s", nodePool.Name, nodePoolParent))
898+
899+
nodeOp, err := ClusterHandler.ContainerClient.Projects.Locations.Clusters.NodePools.Update(nodePoolParent, nodePoolRequest).Do()
900+
if err != nil {
901+
err := fmt.Errorf("Failed to Upgrade Node Pool: %v", err)
902+
cblogger.Error(err)
903+
return clusterInfo, err // 노드풀 업그레이드 실패 시 clusterInfo 반환
904+
}
905+
906+
// GCP 업그레이드 요청 완료 로그
907+
cblogger.Info(fmt.Sprintf("Upgrade request sent for Node Pool: %s, Operation Name: %s", nodePool.Name, nodeOp.Name))
908+
909+
// WaitContainerOperationDone 함수 사용 (20분 타임아웃)
910+
operationErr := WaitContainerOperationDone(ClusterHandler.ContainerClient, projectID, region, zone, nodeOp.Name, GCP_CONTAINER_OPERATION_UPGRADE_NODES, 1200)
911+
if operationErr != nil {
912+
return clusterInfo, operationErr
913+
}
914+
915+
cblogger.Info(fmt.Sprintf("Node Pool %s upgrade completed", nodePool.Name))
819916
}
820917

821918
return ClusterHandler.GetCluster(clusterIID)
@@ -1393,3 +1490,25 @@ func (ClusterHandler *GCPClusterHandler) ListIID() ([]*irs.IID, error) {
13931490
}
13941491
return iidList, nil
13951492
}
1493+
1494+
// 클러스터에 진행 중인 작업이 있는지 확인하는 함수
1495+
func (ClusterHandler *GCPClusterHandler) hasActiveOperations(projectID, zone, clusterName string) (bool, error) {
1496+
listOperationsParent := fmt.Sprintf("projects/%s/locations/%s", projectID, zone)
1497+
1498+
// 진행 중인 작업 목록 조회
1499+
operations, err := ClusterHandler.ContainerClient.Projects.Locations.Operations.List(listOperationsParent).Do()
1500+
if err != nil {
1501+
return false, err
1502+
}
1503+
1504+
// 클러스터와 관련된 진행 중인 작업 검색
1505+
clusterPattern := fmt.Sprintf("/clusters/%s/", clusterName)
1506+
for _, op := range operations.Operations {
1507+
if strings.Contains(op.TargetLink, clusterPattern) && op.Status != "DONE" {
1508+
cblogger.Info(fmt.Sprintf("Found active operation: %s, status: %s", op.Name, op.Status))
1509+
return true, nil
1510+
}
1511+
}
1512+
1513+
return false, nil
1514+
}

0 commit comments

Comments
 (0)