Skip to content

Commit e3cf7fd

Browse files
api,agent,server,engine-schema: scalability improvements
Following changes and improvements have been added: - Improvements in handling of PingRoutingCommand 1. Added global config - `vm.sync.power.state.transitioning`, default value: true, to control syncing of power states for transitioning VMs. This can be set to false to prevent computation of transitioning state VMs. 2. Improved VirtualMachinePowerStateSync to allow power state sync for host VMs in a batch 3. Optimized scanning stalled VMs - Added option to set worker threads for capacity calculation using config - `capacity.calculate.workers` - Added caching framework based on Caffeine in-memory caching library, https://github.com/ben-manes/caffeine - Added caching for account/user role API access with expiration after write that can be configured using config - `dynamic.apichecker.cache.period`. If set to zero then there will be no caching. Default is 0. - Added caching for account/user role API access with expiration after write set to 60 seconds. - Added caching for some recurring DB retrievals 1. CapacityManager - listing service offerings - beneficial in host capacity calculation 2. LibvirtServerDiscoverer existing host for the cluster - beneficial for host joins 3. DownloadListener - hypervisors for zone - beneficial for host joins 4. VirtualMachineManagerImpl - VMs in progress - beneficial for processing stalled VMs during PingRoutingCommands - Optimized MS list retrieval for agent connect - Optimized finding ready systemvm template for zone - Database retrieval optimisations - fix and refactor for cases where only IDs or counts are used mainly for hosts and other infra entities. Also similar cases for VMs and other entities related to host concerning background tasks - Changes in agent-agentmanager connection with NIO client-server classes 1. Optimized the use of the executor service 2. Refactored Agent class to better handle connections. 3. Do SSL handshakes within worker threads 4. Added global configs to control the behaviour depending on the infra. 
SSL handshake could be a bottleneck during agent connections. Configs - `agent.ssl.handshake.min.workers` and `agent.ssl.handshake.max.workers` can be used to control number of new connections management server handles at a time. `agent.ssl.handshake.timeout` can be used to set number of seconds after which SSL handshake times out at MS end. 5. On agent side backoff and SSL handshake timeout can be controlled by agent properties. `backoff.seconds` and `ssl.handshake.timeout` properties can be used. - Improvements in StatsCollection - minimize DB retrievals. - Improvements in DeploymentPlanner allow for the retrieval of only desired host fields and fewer retrievals. - Improvements in hosts connection for a storage pool. Added config - `storage.pool.host.connect.workers` to control the number of worker threads that can be used to connect hosts to a storage pool. Worker thread approach is followed currently only for NFS and ScaleIO pools. - Minor improvements in resource limit calculations wrt DB retrievals Signed-off-by: Abhishek Kumar <[email protected]> Co-authored-by: Abhishek Kumar <[email protected]> Co-authored-by: Rohit Yadav <[email protected]>
1 parent 019f2c6 commit e3cf7fd

File tree

128 files changed

+3072
-2041
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

128 files changed

+3072
-2041
lines changed

.python-version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.6
1+
3.10

agent/conf/agent.properties

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,9 @@ iscsi.session.cleanup.enabled=false
433433

434434
# Implicit host tags managed by agent.properties
435435
# host.tags=
436+
437+
# Timeout (in seconds) for SSL handshake when agent connects to server
438+
#ssl.handshake.timeout=
439+
440+
# Wait (in seconds) during agent reconnections
441+
#backoff.seconds=

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 401 additions & 371 deletions
Large diffs are not rendered by default.

agent/src/main/java/com/cloud/agent/AgentShell.java

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,6 @@
1616
// under the License.
1717
package com.cloud.agent;
1818

19-
import com.cloud.agent.Agent.ExitStatus;
20-
import com.cloud.agent.dao.StorageComponent;
21-
import com.cloud.agent.dao.impl.PropertiesStorage;
22-
import com.cloud.agent.properties.AgentProperties;
23-
import com.cloud.agent.properties.AgentPropertiesFileHandler;
24-
import com.cloud.resource.ServerResource;
25-
import com.cloud.utils.LogUtils;
26-
import com.cloud.utils.ProcessUtil;
27-
import com.cloud.utils.PropertiesUtil;
28-
import com.cloud.utils.backoff.BackoffAlgorithm;
29-
import com.cloud.utils.backoff.impl.ConstantTimeBackoff;
30-
import com.cloud.utils.exception.CloudRuntimeException;
31-
import org.apache.commons.daemon.Daemon;
32-
import org.apache.commons.daemon.DaemonContext;
33-
import org.apache.commons.daemon.DaemonInitException;
34-
import org.apache.commons.lang.math.NumberUtils;
35-
import org.apache.commons.lang3.BooleanUtils;
36-
import org.apache.commons.lang3.StringUtils;
37-
import org.apache.logging.log4j.Logger;
38-
import org.apache.logging.log4j.LogManager;
39-
import org.apache.logging.log4j.core.config.Configurator;
40-
41-
import javax.naming.ConfigurationException;
4219
import java.io.File;
4320
import java.io.FileNotFoundException;
4421
import java.io.IOException;
@@ -53,6 +30,31 @@
5330
import java.util.Properties;
5431
import java.util.UUID;
5532

33+
import javax.naming.ConfigurationException;
34+
35+
import org.apache.commons.daemon.Daemon;
36+
import org.apache.commons.daemon.DaemonContext;
37+
import org.apache.commons.daemon.DaemonInitException;
38+
import org.apache.commons.lang.math.NumberUtils;
39+
import org.apache.commons.lang3.BooleanUtils;
40+
import org.apache.commons.lang3.StringUtils;
41+
import org.apache.logging.log4j.LogManager;
42+
import org.apache.logging.log4j.Logger;
43+
import org.apache.logging.log4j.core.config.Configurator;
44+
45+
import com.cloud.agent.Agent.ExitStatus;
46+
import com.cloud.agent.dao.StorageComponent;
47+
import com.cloud.agent.dao.impl.PropertiesStorage;
48+
import com.cloud.agent.properties.AgentProperties;
49+
import com.cloud.agent.properties.AgentPropertiesFileHandler;
50+
import com.cloud.resource.ServerResource;
51+
import com.cloud.utils.LogUtils;
52+
import com.cloud.utils.ProcessUtil;
53+
import com.cloud.utils.PropertiesUtil;
54+
import com.cloud.utils.backoff.BackoffAlgorithm;
55+
import com.cloud.utils.backoff.impl.ConstantTimeBackoff;
56+
import com.cloud.utils.exception.CloudRuntimeException;
57+
5658
public class AgentShell implements IAgentShell, Daemon {
5759
protected static Logger LOGGER = LogManager.getLogger(AgentShell.class);
5860

@@ -406,7 +408,9 @@ public void init(String[] args) throws ConfigurationException {
406408

407409
LOGGER.info("Defaulting to the constant time backoff algorithm");
408410
_backoff = new ConstantTimeBackoff();
409-
_backoff.configure("ConstantTimeBackoff", new HashMap<String, Object>());
411+
Map<String, Object> map = new HashMap<>();
412+
map.put("seconds", _properties.getProperty("backoff.seconds"));
413+
_backoff.configure("ConstantTimeBackoff", map);
410414
}
411415

412416
private void launchAgent() throws ConfigurationException {
@@ -455,6 +459,11 @@ public void launchNewAgent(ServerResource resource) throws ConfigurationExceptio
455459
agent.start();
456460
}
457461

462+
@Override
463+
public Integer getSslHandshakeTimeout() {
464+
return AgentPropertiesFileHandler.getPropertyValue(AgentProperties.SSL_HANDSHAKE_TIMEOUT);
465+
}
466+
458467
public synchronized int getNextAgentId() {
459468
return _nextAgentId++;
460469
}

agent/src/main/java/com/cloud/agent/IAgentShell.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,6 @@ public interface IAgentShell {
7070
String getConnectedHost();
7171

7272
void launchNewAgent(ServerResource resource) throws ConfigurationException;
73+
74+
Integer getSslHandshakeTimeout();
7375
}

agent/src/main/java/com/cloud/agent/properties/AgentProperties.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,13 @@ public Property<Integer> getWorkers() {
810810
*/
811811
public static final Property<String> HOST_TAGS = new Property<>("host.tags", null, String.class);
812812

813+
/**
814+
* Timeout for SSL handshake in seconds.<br>
815+
* Data type: Integer.<br>
816+
* Default value: <code>null</code>
817+
*/
818+
public static final Property<Integer> SSL_HANDSHAKE_TIMEOUT = new Property<>("ssl.handshake.timeout", null, Integer.class);
819+
813820
public static class Property <T>{
814821
private String name;
815822
private T defaultValue;

api/src/main/java/org/apache/cloudstack/acl/RoleService.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ public interface RoleService {
3030
ConfigKey<Boolean> EnableDynamicApiChecker = new ConfigKey<>("Advanced", Boolean.class, "dynamic.apichecker.enabled", "false",
3131
"If set to true, this enables the dynamic role-based api access checker and disables the default static role-based api access checker.", true);
3232

33+
ConfigKey<Integer> DynamicApiCheckerCachePeriod = new ConfigKey<>("Advanced", Integer.class,
34+
"dynamic.apichecker.cache.period", "0",
35+
"Defines the expiration time in seconds for the Dynamic API Checker cache, determining how long cached data is retained before being refreshed. If set to zero then caching will be disabled",
36+
false);
37+
3338
boolean isEnabled();
3439

3540
/**

api/src/main/java/org/apache/cloudstack/api/command/admin/domain/ListDomainsCmd.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ public EnumSet<DomainDetails> getDetails() throws InvalidParameterValueException
100100
dv = EnumSet.of(DomainDetails.all);
101101
} else {
102102
try {
103-
ArrayList<DomainDetails> dc = new ArrayList<DomainDetails>();
103+
ArrayList<DomainDetails> dc = new ArrayList<>();
104104
for (String detail : viewDetails) {
105105
dc.add(DomainDetails.valueOf(detail));
106106
}
@@ -142,7 +142,10 @@ protected void updateDomainResponse(List<DomainResponse> response) {
142142
if (CollectionUtils.isEmpty(response)) {
143143
return;
144144
}
145-
_resourceLimitService.updateTaggedResourceLimitsAndCountsForDomains(response, getTag());
145+
EnumSet<DomainDetails> details = getDetails();
146+
if (details.contains(DomainDetails.all) || details.contains(DomainDetails.resource)) {
147+
_resourceLimitService.updateTaggedResourceLimitsAndCountsForDomains(response, getTag());
148+
}
146149
if (!getShowIcon()) {
147150
return;
148151
}

api/src/main/java/org/apache/cloudstack/api/command/user/account/ListAccountsCmd.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,10 @@ protected void updateAccountResponse(List<AccountResponse> response) {
149149
if (CollectionUtils.isEmpty(response)) {
150150
return;
151151
}
152-
_resourceLimitService.updateTaggedResourceLimitsAndCountsForAccounts(response, getTag());
152+
EnumSet<DomainDetails> details = getDetails();
153+
if (details.contains(DomainDetails.all) || details.contains(DomainDetails.resource)) {
154+
_resourceLimitService.updateTaggedResourceLimitsAndCountsForAccounts(response, getTag());
155+
}
153156
if (!getShowIcon()) {
154157
return;
155158
}

api/src/main/java/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public interface OutOfBandManagementService {
3939
long getId();
4040
boolean isOutOfBandManagementEnabled(Host host);
4141
void submitBackgroundPowerSyncTask(Host host);
42-
boolean transitionPowerStateToDisabled(List<? extends Host> hosts);
42+
boolean transitionPowerStateToDisabled(List<Long> hostIds);
4343

4444
OutOfBandManagementResponse enableOutOfBandManagement(DataCenter zone);
4545
OutOfBandManagementResponse enableOutOfBandManagement(Cluster cluster);

core/src/main/java/com/cloud/agent/api/CheckNetworkCommand.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public class CheckNetworkCommand extends Command {
2929

3030
public CheckNetworkCommand(List<PhysicalNetworkSetupInfo> networkInfoList) {
3131
this.networkInfoList = networkInfoList;
32+
setWait(120);
3233
}
3334

3435
public List<PhysicalNetworkSetupInfo> getPhysicalNetworkInfoList() {

core/src/main/java/com/cloud/resource/ServerResource.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,12 @@ public interface ServerResource extends Manager {
7878

7979
void setAgentControl(IAgentControl agentControl);
8080

81+
default boolean isExitOnFailures() {
82+
return true;
83+
}
84+
85+
default boolean isAppendAgentNameToLogs() {
86+
return false;
87+
}
88+
8189
}

engine/api/src/main/java/com/cloud/vm/VirtualMachineManager.java

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import java.util.List;
2323
import java.util.Map;
2424

25-
import com.cloud.exception.ResourceAllocationException;
2625
import org.apache.cloudstack.context.CallContext;
2726
import org.apache.cloudstack.framework.config.ConfigKey;
2827

@@ -38,6 +37,7 @@
3837
import com.cloud.exception.InsufficientCapacityException;
3938
import com.cloud.exception.InsufficientServerCapacityException;
4039
import com.cloud.exception.OperationTimedoutException;
40+
import com.cloud.exception.ResourceAllocationException;
4141
import com.cloud.exception.ResourceUnavailableException;
4242
import com.cloud.host.Host;
4343
import com.cloud.hypervisor.Hypervisor.HypervisorType;
@@ -101,6 +101,10 @@ public interface VirtualMachineManager extends Manager {
101101
"refer documentation",
102102
true, ConfigKey.Scope.Zone);
103103

104+
ConfigKey<Boolean> VmSyncPowerStateTransitioning = new ConfigKey<>("Advanced", Boolean.class, "vm.sync.power.state.transitioning", "true",
105+
"Whether to sync power states of the transitioning and stalled VMs while processing VM power reports.", false);
106+
107+
104108
interface Topics {
105109
String VM_POWER_STATE = "vm.powerstate";
106110
}
@@ -286,24 +290,22 @@ static String getHypervisorHostname(String name) {
286290

287291
/**
288292
* Obtains statistics for a list of VMs; CPU and network utilization
289-
* @param hostId ID of the host
290-
* @param hostName name of the host
293+
* @param host host
291294
* @param vmIds list of VM IDs
292295
* @return map of VM ID and stats entry for the VM
293296
*/
294-
HashMap<Long, ? extends VmStats> getVirtualMachineStatistics(long hostId, String hostName, List<Long> vmIds);
297+
HashMap<Long, ? extends VmStats> getVirtualMachineStatistics(Host host, List<Long> vmIds);
295298
/**
296299
* Obtains statistics for a list of VMs; CPU and network utilization
297-
* @param hostId ID of the host
298-
* @param hostName name of the host
299-
* @param vmMap map of VM IDs and the corresponding VirtualMachine object
300+
* @param host host
301+
* @param vmMap map of VM instanceName and its ID
300302
* @return map of VM ID and stats entry for the VM
301303
*/
302-
HashMap<Long, ? extends VmStats> getVirtualMachineStatistics(long hostId, String hostName, Map<Long, ? extends VirtualMachine> vmMap);
304+
HashMap<Long, ? extends VmStats> getVirtualMachineStatistics(Host host, Map<String, Long> vmMap);
303305

304-
HashMap<Long, List<? extends VmDiskStats>> getVmDiskStatistics(long hostId, String hostName, Map<Long, ? extends VirtualMachine> vmMap);
306+
HashMap<Long, List<? extends VmDiskStats>> getVmDiskStatistics(Host host, Map<String, Long> vmInstanceNameIdMap);
305307

306-
HashMap<Long, List<? extends VmNetworkStats>> getVmNetworkStatistics(long hostId, String hostName, Map<Long, ? extends VirtualMachine> vmMap);
308+
HashMap<Long, List<? extends VmNetworkStats>> getVmNetworkStatistics(Host host, Map<String, Long> vmInstanceNameIdMap);
307309

308310
Map<Long, Boolean> getDiskOfferingSuitabilityForVm(long vmId, List<Long> diskOfferingIds);
309311

engine/components-api/src/main/java/com/cloud/capacity/CapacityManager.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,11 @@
1616
// under the License.
1717
package com.cloud.capacity;
1818

19-
import java.util.Map;
20-
2119
import org.apache.cloudstack.framework.config.ConfigKey;
2220
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
2321

2422
import com.cloud.host.Host;
2523
import com.cloud.offering.ServiceOffering;
26-
import com.cloud.service.ServiceOfferingVO;
2724
import com.cloud.storage.VMTemplateVO;
2825
import com.cloud.utils.Pair;
2926
import com.cloud.vm.VirtualMachine;
@@ -118,6 +115,10 @@ public interface CapacityManager {
118115
"Percentage (as a value between 0 and 1) of secondary storage capacity threshold.",
119116
true);
120117

118+
ConfigKey<Integer> CapacityCalculateWorkers = new ConfigKey<>(ConfigKey.CATEGORY_ADVANCED, Integer.class,
119+
"capacity.calculate.workers", "1",
120+
"Number of worker threads to be used for capacities calculation", true);
121+
121122
public boolean releaseVmCapacity(VirtualMachine vm, boolean moveFromReserved, boolean moveToReservered, Long hostId);
122123

123124
void allocateVmCapacity(VirtualMachine vm, boolean fromLastHost);
@@ -133,8 +134,6 @@ boolean checkIfHostHasCapacity(long hostId, Integer cpu, long ram, boolean check
133134

134135
void updateCapacityForHost(Host host);
135136

136-
void updateCapacityForHost(Host host, Map<Long, ServiceOfferingVO> offeringsMap);
137-
138137
/**
139138
* @param pool storage pool
140139
* @param templateForVmCreation template that will be used for vm creation
@@ -151,12 +150,12 @@ boolean checkIfHostHasCapacity(long hostId, Integer cpu, long ram, boolean check
151150

152151
/**
153152
* Check if specified host has capability to support cpu cores and speed freq
154-
* @param hostId the host to be checked
153+
* @param host the host to be checked
155154
* @param cpuNum cpu number to check
156155
* @param cpuSpeed cpu Speed to check
157156
* @return true if the host has the capability to support the given cpu cores and speed
158157
*/
159-
boolean checkIfHostHasCpuCapability(long hostId, Integer cpuNum, Integer cpuSpeed);
158+
boolean checkIfHostHasCpuCapability(Host host, Integer cpuNum, Integer cpuSpeed);
160159

161160
/**
162161
* Check if cluster will cross threshold if the cpu/memory requested are accommodated

engine/components-api/src/main/java/com/cloud/resource/ResourceManager.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,13 @@ public interface ResourceManager extends ResourceService, Configurable {
138138

139139
public List<HostVO> listAllHostsInOneZoneNotInClusterByHypervisors(List<HypervisorType> types, long dcId, long clusterId);
140140

141-
public List<HypervisorType> listAvailHypervisorInZone(Long hostId, Long zoneId);
141+
public List<HypervisorType> listAvailHypervisorInZone(Long zoneId);
142142

143143
public HostVO findHostByGuid(String guid);
144144

145145
public HostVO findHostByName(String name);
146146

147-
HostStats getHostStatistics(long hostId);
147+
HostStats getHostStatistics(Host host);
148148

149149
Long getGuestOSCategoryId(long hostId);
150150

engine/components-api/src/main/java/com/cloud/storage/StorageManager.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import org.apache.cloudstack.engine.subsystem.api.storage.DataStore;
2424
import org.apache.cloudstack.engine.subsystem.api.storage.HypervisorHostListener;
25+
import org.apache.cloudstack.engine.subsystem.api.storage.Scope;
2526
import org.apache.cloudstack.framework.config.ConfigKey;
2627
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
2728

@@ -42,6 +43,7 @@
4243
import com.cloud.offering.ServiceOffering;
4344
import com.cloud.storage.Storage.ImageFormat;
4445
import com.cloud.utils.Pair;
46+
import com.cloud.utils.exception.CloudRuntimeException;
4547
import com.cloud.vm.DiskProfile;
4648
import com.cloud.vm.VMInstanceVO;
4749

@@ -209,6 +211,10 @@ public interface StorageManager extends StorageService {
209211
ConfigKey<Long> HEURISTICS_SCRIPT_TIMEOUT = new ConfigKey<>("Advanced", Long.class, "heuristics.script.timeout", "3000",
210212
"The maximum runtime, in milliseconds, to execute the heuristic rule; if it is reached, a timeout will happen.", true);
211213

214+
ConfigKey<Integer> StoragePoolHostConnectWorkers = new ConfigKey<>("Storage", Integer.class,
215+
"storage.pool.host.connect.workers", "1",
216+
"Number of worker threads to be used to connect hosts to a primary storage", true);
217+
212218
/**
213219
* should we execute in sequence not involving any storages?
214220
* @return true if commands should execute in sequence
@@ -360,6 +366,9 @@ static Boolean getFullCloneConfiguration(Long storeId) {
360366

361367
String getStoragePoolMountFailureReason(String error);
362368

369+
void connectHostsToPool(DataStore primaryStore, List<Long> hostIds, Scope scope,
370+
boolean handleStorageConflictException, boolean errorOnNoUpHost) throws CloudRuntimeException;
371+
363372
boolean connectHostToSharedPool(long hostId, long poolId) throws StorageUnavailableException, StorageConflictException;
364373

365374
void disconnectHostFromSharedPool(long hostId, long poolId) throws StorageUnavailableException, StorageConflictException;

0 commit comments

Comments
 (0)