diff --git a/vclusterops/add_node.go b/vclusterops/add_node.go index 75b5e06..e4d383b 100644 --- a/vclusterops/add_node.go +++ b/vclusterops/add_node.go @@ -207,7 +207,7 @@ func (vcc *VClusterCommands) VAddNode(options *VAddNodeOptions) (VCoordinationDa // to add already exists in db. func checkAddNodeRequirements(vdb *VCoordinationDatabase, hostsToAdd []string) error { // we don't want any of the new host to be part of the db. - if nodes := vdb.containNodes(hostsToAdd); len(nodes) != 0 { + if nodes, _ := vdb.containNodes(hostsToAdd); len(nodes) != 0 { return fmt.Errorf("%s already exist in the database", strings.Join(nodes, ",")) } diff --git a/vclusterops/coordinator_database.go b/vclusterops/coordinator_database.go index 0e0acaf..3eaded9 100644 --- a/vclusterops/coordinator_database.go +++ b/vclusterops/coordinator_database.go @@ -21,6 +21,7 @@ import ( "path/filepath" "strings" + mapset "github.com/deckarep/golang-set/v2" "github.com/vertica/vcluster/vclusterops/util" "github.com/vertica/vcluster/vclusterops/vlog" "golang.org/x/exp/maps" @@ -264,21 +265,22 @@ func (vdb *VCoordinationDatabase) getSCNames() []string { return scNames } -// containNodes returns the number of input nodes contained in the vdb. -func (vdb *VCoordinationDatabase) containNodes(nodes []string) []string { - hostSet := make(map[string]struct{}) - for _, n := range nodes { - hostSet[n] = struct{}{} - } - dupHosts := []string{} +// containNodes determines which nodes are in the vdb and which ones are not. +// The node is determined by looking up the host address. +func (vdb *VCoordinationDatabase) containNodes(nodes []string) (nodesInDB, nodesNotInDB []string) { + hostSet := mapset.NewSet(nodes...) 
+ nodesInDB = []string{} for _, vnode := range vdb.HostNodeMap { address := vnode.Address - if _, exist := hostSet[address]; exist { - dupHosts = append(dupHosts, address) + if exist := hostSet.Contains(address); exist { + nodesInDB = append(nodesInDB, address) } } - return dupHosts + if len(nodesInDB) == len(nodes) { + return nodesInDB, nil + } + return nodesInDB, util.SliceDiff(nodes, nodesInDB) } // hasAtLeastOneDownNode returns true if the current VCoordinationDatabase instance diff --git a/vclusterops/https_drop_node_op.go b/vclusterops/https_drop_node_op.go index ea733af..59c9f6d 100644 --- a/vclusterops/https_drop_node_op.go +++ b/vclusterops/https_drop_node_op.go @@ -17,6 +17,7 @@ package vclusterops import ( "errors" + "strconv" "github.com/vertica/vcluster/vclusterops/util" "github.com/vertica/vcluster/vclusterops/vlog" @@ -29,12 +30,14 @@ type httpsDropNodeOp struct { RequestParams map[string]string } +// makeHTTPSDropNodeOp is a constructor for httpsDropNodeOp. The cascade option +// should be true if an EON deployment and the node we are dropping is down. 
func makeHTTPSDropNodeOp(logger vlog.Printer, vnode string, initiatorHost []string, useHTTPPassword bool, userName string, httpsPassword *string, - isEon bool) (httpsDropNodeOp, error) { + cascade bool) (httpsDropNodeOp, error) { op := httpsDropNodeOp{} op.name = "HTTPSDropNodeOp" op.logger = logger.WithName(op.name) @@ -48,11 +51,7 @@ func makeHTTPSDropNodeOp(logger vlog.Printer, vnode string, op.userName = userName op.httpsPassword = httpsPassword op.RequestParams = make(map[string]string) - if isEon { - op.RequestParams["cascade"] = "true" - return op, nil - } - op.RequestParams["cascade"] = "false" + op.RequestParams["cascade"] = strconv.FormatBool(cascade) return op, nil } diff --git a/vclusterops/remove_node.go b/vclusterops/remove_node.go index a4d6b68..e7ae13a 100644 --- a/vclusterops/remove_node.go +++ b/vclusterops/remove_node.go @@ -18,7 +18,6 @@ package vclusterops import ( "errors" "fmt" - "strings" "github.com/vertica/vcluster/vclusterops/util" "github.com/vertica/vcluster/vclusterops/vlog" @@ -135,11 +134,15 @@ func (vcc *VClusterCommands) VRemoveNode(options *VRemoveNodeOptions) (VCoordina options.DBName = &dbName options.Hosts = hosts - // get depot and data prefix from config file or options + // get depot, data and catalog prefix from config file or options *options.DepotPrefix, *options.DataPrefix, err = options.getDepotAndDataPrefix(options.Config) if err != nil { return vdb, err } + options.CatalogPrefix, err = options.getCatalogPrefix(options.Config) + if err != nil { + return vdb, err + } err = vcc.getVDBFromRunningDB(&vdb, &options.DatabaseOptions) if err != nil { @@ -151,20 +154,46 @@ func (vcc *VClusterCommands) VRemoveNode(options *VRemoveNodeOptions) (VCoordina return vdb, err } - // remove_node is aborted if requirements are not met - err = checkRemoveNodeRequirements(&vdb, options.HostsToRemove) + // remove_node is aborted if requirements are not met. 
+ err = checkRemoveNodeRequirements(&vdb) if err != nil { return vdb, err } + // Figure out if the nodes to remove exist in the catalog. We follow + // *normal* remove node logic if it still exists in the catalog. We tolerate + // requests for nodes that aren't in the catalog because the caller may not + // know (e.g. previous attempt to remove node didn't come back successful). + // We have a simplified remove process for those requests to remove state + // that the caller may be checking. + var hostsNotInCatalog []string + options.HostsToRemove, hostsNotInCatalog = vdb.containNodes(options.HostsToRemove) + + vdb, err = vcc.removeNodesInCatalog(options, &vdb) + if err != nil || len(hostsNotInCatalog) == 0 { + return vdb, err + } + + return vcc.handleRemoveNodeForHostsNotInCatalog(&vdb, options, hostsNotInCatalog) +} - err = options.setInitiator(vdb.PrimaryUpNodes) +// removeNodesInCatalog will perform the steps to remove nodes. The node list in +// options.HostsToRemove has already been verified that each node is in the +// catalog. 
+func (vcc *VClusterCommands) removeNodesInCatalog(options *VRemoveNodeOptions, vdb *VCoordinationDatabase) (VCoordinationDatabase, error) { + if len(options.HostsToRemove) == 0 { + vcc.Log.Info("Exit early because there are no hosts to remove") + return *vdb, nil + } + vcc.Log.V(1).Info("validated input hosts", "HostsToRemove", options.HostsToRemove) + + err := options.setInitiator(vdb.PrimaryUpNodes) if err != nil { - return vdb, err + return *vdb, err } - instructions, err := vcc.produceRemoveNodeInstructions(&vdb, options) + instructions, err := vcc.produceRemoveNodeInstructions(vdb, options) if err != nil { - return vdb, fmt.Errorf("fail to produce remove node instructions, %w", err) + return *vdb, fmt.Errorf("fail to produce remove node instructions, %w", err) } remainingHosts := util.SliceDiff(vdb.HostList, options.HostsToRemove) @@ -177,7 +206,7 @@ func (vcc *VClusterCommands) VRemoveNode(options *VRemoveNodeOptions) (VCoordina // Here we check whether the to-be-removed nodes are still in the catalog. // If they have been removed from catalog, we let remove_node succeed. 
if vcc.findRemovedNodesInCatalog(options, remainingHosts) { - return vdb, fmt.Errorf("fail to complete remove node operation, %w", runError) + return *vdb, fmt.Errorf("fail to complete remove node operation, %w", runError) } // If the target nodes have already been removed from catalog, // show a warning about the run error for users to trouble shoot their machines @@ -189,20 +218,57 @@ func (vcc *VClusterCommands) VRemoveNode(options *VRemoveNodeOptions) (VCoordina return vdb.copy(remainingHosts), nil } -// checkRemoveNodeRequirements validates the following remove_node requirements: -// - Check the existence of the nodes to remove -// - Check if all nodes are up or standby (enterprise only) -func checkRemoveNodeRequirements(vdb *VCoordinationDatabase, hostsToRemove []string) error { - if nodes := vdb.containNodes(hostsToRemove); len(nodes) != len(hostsToRemove) { - notFoundHosts := util.SliceDiff(hostsToRemove, nodes) - return fmt.Errorf("%s do not exist in the database", strings.Join(notFoundHosts, ",")) +// handleRemoveNodeForHostsNotInCatalog will build and execute a list of +// instructions to do remove of hosts that aren't present in the catalog. We +// will do basic cleanup logic for this needed by the operator. +func (vcc *VClusterCommands) handleRemoveNodeForHostsNotInCatalog(vdb *VCoordinationDatabase, options *VRemoveNodeOptions, + missingHosts []string) (VCoordinationDatabase, error) { + vcc.Log.Info("Doing cleanup of hosts missing from database", "hostsNotInCatalog", missingHosts) + + // We need to find the paths for the hosts we are removing. 
+ nmaGetNodesInfoOp := makeNMAGetNodesInfoOp(vcc.Log, missingHosts, *options.DBName, *options.CatalogPrefix, + false /* report all errors */, vdb) + instructions := []clusterOp{&nmaGetNodesInfoOp} + certs := httpsCerts{key: options.Key, cert: options.Cert, caCert: options.CaCert} + opEng := makeClusterOpEngine(instructions, &certs) + err := opEng.run(vcc.Log) + if err != nil { + return *vdb, fmt.Errorf("failed to get node info for missing hosts: %w", err) } + + // Make a vdb of just the missing hosts. The host list for + // nmaDeleteDirectoriesOp uses the host list from the vdb. + vdbForDeleteDir := vdb.copy(missingHosts) + err = options.completeVDBSetting(&vdbForDeleteDir) + if err != nil { + return *vdb, err + } + + // Using the paths fetched earlier, we can now build the list of directories + // that the NMA should remove. + nmaDeleteDirectoriesOp, err := makeNMADeleteDirectoriesOp(vcc.Log, &vdbForDeleteDir, *options.ForceDelete) + if err != nil { + return *vdb, err + } + instructions = []clusterOp{&nmaDeleteDirectoriesOp} + opEng = makeClusterOpEngine(instructions, &certs) + err = opEng.run(vcc.Log) + if err != nil { + return *vdb, fmt.Errorf("failed to delete directories for missing hosts: %w", err) + } + + remainingHosts := util.SliceDiff(vdb.HostList, missingHosts) + return vdb.copy(remainingHosts), nil +} + +// checkRemoveNodeRequirements validates any remove_node requirements. It will +// return an error if a requirement isn't met. 
+func checkRemoveNodeRequirements(vdb *VCoordinationDatabase) error { if !vdb.IsEon { if vdb.hasAtLeastOneDownNode() { return errors.New("all nodes must be up or standby") } } - return nil } @@ -379,7 +445,8 @@ func (vcc *VClusterCommands) produceDropNodeOps(instructions *[]clusterOp, targe hostNodeMap vHostNodeMap, isEon bool) error { for _, host := range targetHosts { httpsDropNodeOp, err := makeHTTPSDropNodeOp(vcc.Log, hostNodeMap[host].Name, hosts, - useHTTPPassword, userName, httpsPassword, isEon) + useHTTPPassword, userName, httpsPassword, + isEon && hostNodeMap[host].State == util.NodeDownState) if err != nil { return err } diff --git a/vclusterops/start_db.go b/vclusterops/start_db.go index 5dbbe25..4e429ff 100644 --- a/vclusterops/start_db.go +++ b/vclusterops/start_db.go @@ -189,7 +189,7 @@ func (vcc *VClusterCommands) VStartDatabase(options *VStartDatabaseOptions) erro func (vcc *VClusterCommands) runStartDBPrecheck(options *VStartDatabaseOptions, vdb *VCoordinationDatabase) error { // pre-instruction to perform basic checks and get basic information - preInstructions, err := vcc.produceStartDBPreCheck(options, vdb) + preInstructions, err := vcc.produceStartDBPreCheck(options, vdb, *options.TrimHostList) if err != nil { return fmt.Errorf("fail to production instructions: %w", err) } @@ -202,28 +202,35 @@ func (vcc *VClusterCommands) runStartDBPrecheck(options *VStartDatabaseOptions, return fmt.Errorf("fail to start database pre-checks: %w", runError) } - // if TrimHostList is true, - // update the host list as some provided hosts may not exist in the catalog + // If requested, remove any provided hosts that are not in the catalog. Use + // the vdb that we just fetched by the catalog editor. It will be from + // the latest catalog. 
if *options.TrimHostList { - var trimmedHostList []string - var extraHosts []string - - for _, h := range options.Hosts { - if _, exist := vdb.HostNodeMap[h]; exist { - trimmedHostList = append(trimmedHostList, h) - } else { - extraHosts = append(extraHosts, h) - } - } + options.Hosts = vcc.removeHostsNotInCatalog(&clusterOpEngine.execContext.nmaVDatabase, options.Hosts) + } + + return nil +} - if len(extraHosts) > 0 { - vcc.Log.PrintInfo("The following hosts will be trimmed as they are not found in catalog: %+v", - extraHosts) - options.Hosts = trimmedHostList +func (vcc *VClusterCommands) removeHostsNotInCatalog(vdb *nmaVDatabase, hosts []string) []string { + var trimmedHostList []string + var extraHosts []string + + vcc.Log.Info("checking if any input hosts can be removed", + "hosts", hosts, "hostNodeMap", vdb.HostNodeMap) + for _, h := range hosts { + if _, exist := vdb.HostNodeMap[h]; exist { + trimmedHostList = append(trimmedHostList, h) + } else { + extraHosts = append(extraHosts, h) } } - return nil + if len(extraHosts) > 0 { + vcc.Log.PrintInfo("The following hosts will be trimmed as they are not found in catalog: %+v", + extraHosts) + } + return trimmedHostList } // produceStartDBPreCheck will build a list of pre-check instructions to execute for @@ -234,7 +241,9 @@ func (vcc *VClusterCommands) runStartDBPrecheck(options *VStartDatabaseOptions, // - Check NMA connectivity // - Check to see if any dbs run // - Get nodes' information by calling the NMA /nodes endpoint -func (vcc *VClusterCommands) produceStartDBPreCheck(options *VStartDatabaseOptions, vdb *VCoordinationDatabase) ([]clusterOp, error) { +// - Find latest catalog to use for removal of nodes not in the catalog +func (vcc *VClusterCommands) produceStartDBPreCheck(options *VStartDatabaseOptions, vdb *VCoordinationDatabase, + findLatestCatalog bool) ([]clusterOp, error) { var instructions []clusterOp nmaHealthOp := makeNMAHealthOp(vcc.Log, options.Hosts) @@ -261,6 +270,14 @@ func (vcc 
*VClusterCommands) produceStartDBPreCheck(options *VStartDatabaseOptio instructions = append(instructions, &nmaGetNodesInfoOp) } + if findLatestCatalog { + nmaReadCatalogEditorOp, err := makeNMAReadCatalogEditorOp(vcc.Log, vdb) + if err != nil { + return instructions, err + } + instructions = append(instructions, &nmaReadCatalogEditorOp) + } + return instructions, nil } diff --git a/vclusterops/start_node.go b/vclusterops/start_node.go index a335ba7..b8bda91 100644 --- a/vclusterops/start_node.go +++ b/vclusterops/start_node.go @@ -117,9 +117,10 @@ func (options *VStartNodesOptions) validateAnalyzeOptions(logger vlog.Printer) e } // VStartNodes starts the given nodes for a cluster that has not yet lost -// cluster quorum and returns any error encountered. -// If necessary, it updates the node's IP in the Vertica catalog. -// If cluster quorum is already lost, use VStartDatabase. +// cluster quorum. Returns any error encountered. If necessary, it updates the +// node's IP in the Vertica catalog. If cluster quorum is already lost, use +// VStartDatabase. It will skip any nodes given that no longer exist in the +// catalog. func (vcc *VClusterCommands) VStartNodes(options *VStartNodesOptions) error { /* * - Produce Instructions @@ -163,8 +164,12 @@ func (vcc *VClusterCommands) VStartNodes(options *VStartNodesOptions) error { for nodename, newIP := range options.Nodes { oldIP, ok := hostNodeNameMap[nodename] if !ok { - vcc.Log.PrintError("fail to provide a non-existent node name %s", nodename) - return fmt.Errorf("the node with the provided name %s does not exist", nodename) + // We can get here if the caller requests a node that we were in the + // middle of removing. Log a warning and continue without starting + // that node. 
+ vcc.Log.Info("skipping start of node that doesn't exist in the catalog", "nodename", nodename, "newIP", newIP) + continue } // if the IP that is given is different than the IP in the catalog, a re-ip is necessary if oldIP != newIP { @@ -181,6 +186,13 @@ func (vcc *VClusterCommands) VStartNodes(options *VStartNodesOptions) error { restartNodeInfo.HostsToStart = append(restartNodeInfo.HostsToStart, restartNodeInfo.ReIPList...) restartNodeInfo.HostsToStart = append(restartNodeInfo.HostsToStart, hostsNoNeedToReIP...) + // If no nodes were found to start, we can simply exit here. This can happen if + // given a list of nodes that aren't in the catalog any longer. + if len(restartNodeInfo.HostsToStart) == 0 { + vcc.Log.Info("None of the nodes provided are in the catalog. There is nothing to start.") + return nil + } + // produce restart_node instructions instructions, err := vcc.produceStartNodesInstructions(restartNodeInfo, options, &vdb) if err != nil {