Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ hibernate_version=6.5.3.Final
opensaml_version=5.1.6
jwt_version=0.13.0
jaxb_runtime_version=4.0.6
hazelcast_version=5.5.0
hazelcast_version=5.6.0
fasterxml_version=2.20.1
netty_version=4.2.7.Final
jgit_version=7.5.0.202512021534-r
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/de/tum/cit/aet/artemis/ArtemisApp.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import org.springframework.core.env.Environment;
import org.springframework.core.env.Profiles;

import com.hazelcast.spring.HazelcastObjectExtractionConfiguration;

import de.tum.cit.aet.artemis.core.PrintStartupBeansEvent;
import de.tum.cit.aet.artemis.core.config.ArtemisCompatibleVersionsConfiguration;
import de.tum.cit.aet.artemis.core.config.ArtemisConfigHelper;
Expand All @@ -39,7 +41,9 @@
import tech.jhipster.config.DefaultProfileUtil;
import tech.jhipster.config.JHipsterConstants;

@SpringBootApplication
// Exclude HazelcastObjectExtractionConfiguration due to incompatibility with custom Hazelcast configuration
// See: https://github.com/hazelcast/hazelcast/issues/26553
@SpringBootApplication(exclude = HazelcastObjectExtractionConfiguration.class)
@EnableConfigurationProperties({ LiquibaseProperties.class, ProgrammingLanguageConfiguration.class, TheiaConfiguration.class, LicenseConfiguration.class,
ArtemisCompatibleVersionsConfiguration.class })
public class ArtemisApp {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
Expand All @@ -38,6 +39,10 @@
import org.springframework.core.env.Environment;
import org.springframework.core.env.Profiles;

import com.hazelcast.client.HazelcastClient;
import com.hazelcast.client.config.ClientConfig;
import com.hazelcast.client.config.ClientConnectionStrategyConfig;
import com.hazelcast.client.config.RoutingMode;
import com.hazelcast.config.Config;
import com.hazelcast.config.EvictionConfig;
import com.hazelcast.config.EvictionPolicy;
Expand Down Expand Up @@ -97,6 +102,16 @@ public class CacheConfiguration {

private static final Logger log = LoggerFactory.getLogger(CacheConfiguration.class);

/**
* Metadata key used to identify the Hazelcast member type in the service registry.
* Core nodes are marked as "member", build agent clients are marked as "client".
*/
public static final String HAZELCAST_MEMBER_TYPE_KEY = "hazelcast.member-type";

public static final String HAZELCAST_MEMBER_TYPE_MEMBER = "member";

public static final String HAZELCAST_MEMBER_TYPE_CLIENT = "client";

private final ServerProperties serverProperties;

// the service registry, in our current deployment this is the jhipster registry which offers a Eureka Server under the hood
Expand All @@ -106,6 +121,9 @@ public class CacheConfiguration {

private final Environment env;

// Lazy-injected to avoid circular dependency (HazelcastConnection depends on the HazelcastInstance we create)
private final HazelcastConnection hazelcastConnection;

@Value("${spring.jpa.properties.hibernate.cache.hazelcast.instance_name}")
private String instanceName;

Expand All @@ -118,10 +136,12 @@ public class CacheConfiguration {
@Value("${spring.hazelcast.localInstances:true}")
private boolean hazelcastLocalInstances;

public CacheConfiguration(ApplicationContext applicationContext, ServerProperties serverProperties, Optional<Registration> registration, Environment env) {
public CacheConfiguration(ApplicationContext applicationContext, ServerProperties serverProperties, Optional<Registration> registration,
@Lazy HazelcastConnection hazelcastConnection, Environment env) {
this.applicationContext = applicationContext;
this.serverProperties = serverProperties;
this.registration = registration;
this.hazelcastConnection = hazelcastConnection;
this.env = env;

// Do not send telemetry to Hazelcast.
Expand All @@ -132,7 +152,9 @@ public CacheConfiguration(ApplicationContext applicationContext, ServerPropertie
/**
 * Shuts down all Hazelcast instances of this JVM when this configuration is destroyed.
 * <p>
 * Both cluster members and clients are shut down. The client shutdown runs in a {@code finally}
 * block so that it is still attempted when shutting down the cluster members throws; the
 * exception of the member shutdown (if any) is propagated to the caller afterwards.
 */
@PreDestroy
public void destroy() {
    log.info("Closing Cache Manager");
    try {
        // Cluster members started in this JVM
        Hazelcast.shutdownAll();
    }
    finally {
        // Clients started in this JVM; must not be skipped if the member shutdown fails
        HazelcastClient.shutdownAll();
    }
}

@Bean
Expand Down Expand Up @@ -183,6 +205,21 @@ public HazelcastInstance hazelcastInstance(JHipsterProperties jHipsterProperties
}
// ========================= TESTING ONLY =========================

Collection<String> activeProfiles = Arrays.asList(env.getActiveProfiles());

// ========================= BUILD AGENT CLIENT MODE =========================
// Build agents connect as Hazelcast clients instead of cluster members.
// This isolates the core cluster from build agent failures and eliminates
// heartbeat overhead between build agents and core nodes.
// Note: Single-node deployments (with both core and buildagent profiles) run as
// cluster members since client mode provides no benefit when there's no cluster separation.
// Note: Test profiles are excluded - build agent tests create a local Hazelcast instance.
if (activeProfiles.contains(PROFILE_BUILDAGENT) && !activeProfiles.contains(PROFILE_CORE) && !activeProfiles.contains(PROFILE_TEST_BUILDAGENT)) {
log.info("Build agent connecting to core cluster as Hazelcast client");
return createHazelcastClient();
}
// ========================= END BUILD AGENT CLIENT MODE =========================

log.debug("Configuring Hazelcast");
HazelcastInstance hazelCastInstance = Hazelcast.getHazelcastInstanceByName(instanceName);
if (hazelCastInstance != null) {
Expand Down Expand Up @@ -250,6 +287,8 @@ public HazelcastInstance hazelcastInstance(JHipsterProperties jHipsterProperties
config.getNetworkConfig().setPort(hazelcastPort); // Own port
registration.get().getMetadata().put("hazelcast.port", String.valueOf(hazelcastPort));
}
// Mark this instance as a cluster member (not a client) for service discovery
registration.get().getMetadata().put(HAZELCAST_MEMBER_TYPE_KEY, HAZELCAST_MEMBER_TYPE_MEMBER);
}

config.getMapConfigs().put("default", initializeDefaultMapConfig(jHipsterProperties));
Expand All @@ -271,8 +310,52 @@ public HazelcastInstance hazelcastInstance(JHipsterProperties jHipsterProperties
ClusterProperty.MERGE_FIRST_RUN_DELAY_SECONDS.setSystemProperty("30");
ClusterProperty.MERGE_NEXT_RUN_DELAY_SECONDS.setSystemProperty("30");

// ===================== Cluster Stability Configuration =====================
// These settings prevent cascading failures when individual members become unresponsive.
// See: https://docs.hazelcast.com/hazelcast/5.5/clusters/failure-detector-configuration

// Use Phi Accrual failure detector instead of deadline-based detection.
// Phi Accrual is adaptive and calculates suspicion probability based on historical
// heartbeat patterns, making it more resilient to temporary GC pauses or network hiccups.
config.setProperty("hazelcast.heartbeat.failuredetector.type", "phi-accrual");
// Suspicion threshold: lower = more aggressive (default 10, range 1-16)
// Value of 8 provides faster detection while still tolerating brief delays
config.setProperty("hazelcast.heartbeat.phiaccrual.failuredetector.threshold", "8");
// Number of heartbeat samples to keep for calculating variance (default 200)
config.setProperty("hazelcast.heartbeat.phiaccrual.failuredetector.sample.size", "100");
// Minimum standard deviation in milliseconds (default 100)
config.setProperty("hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis", "100");

// Heartbeat configuration - the maximum no-heartbeat time is reduced from its default for faster detection
// Heartbeat interval: how often heartbeats are sent (default 5 seconds)
ClusterProperty.HEARTBEAT_INTERVAL_SECONDS.setSystemProperty("5");
// Maximum time without heartbeat before suspecting a member (default 60 seconds)
// With phi-accrual detector, this acts as an absolute upper bound
ClusterProperty.MAX_NO_HEARTBEAT_SECONDS.setSystemProperty("15");

// Operation timeouts - prevent threads from blocking too long on unresponsive members
// Timeout for remote operations (default 60000ms) - reduced to fail faster
ClusterProperty.OPERATION_CALL_TIMEOUT_MILLIS.setSystemProperty("15000");
// Timeout for backup acknowledgments (default 5000ms)
ClusterProperty.OPERATION_BACKUP_TIMEOUT_MILLIS.setSystemProperty("5000");

// Invocation retry configuration - fail faster instead of retrying indefinitely
// Maximum retry count for failed invocations (default ~250)
ClusterProperty.INVOCATION_MAX_RETRY_COUNT.setSystemProperty("5");
// Pause between retries in milliseconds (default 500)
ClusterProperty.INVOCATION_RETRY_PAUSE.setSystemProperty("1000");

// Slow operation detection - helps identify problematic operations
// Threshold for logging slow operations (default 10000ms)
ClusterProperty.SLOW_OPERATION_DETECTOR_THRESHOLD_MILLIS.setSystemProperty("5000");
// How long to retain slow operation logs (default 60 seconds)
ClusterProperty.SLOW_OPERATION_DETECTOR_LOG_RETENTION_SECONDS.setSystemProperty("300");

// Connection timeouts
config.setProperty("hazelcast.socket.connect.timeout.seconds", "5");
// ===================== End Cluster Stability Configuration =====================

// only add the queue config if the profile "localci" is active
Collection<String> activeProfiles = Arrays.asList(env.getActiveProfiles());
if (activeProfiles.contains(PROFILE_LOCALCI) || activeProfiles.contains(PROFILE_BUILDAGENT)) {
// add queue config for local ci shared queue
configureQueueCluster(config, jHipsterProperties);
Expand All @@ -287,6 +370,72 @@ public HazelcastInstance hazelcastInstance(JHipsterProperties jHipsterProperties
return Hazelcast.newHazelcastInstance(config);
}

/**
 * Builds the Hazelcast client instance through which a build agent talks to the core cluster.
 *
 * <p>
 * The addresses of the core nodes are obtained from {@code hazelcastConnection}. When at least one
 * address is known, the client is started synchronously; when none is known, it is started
 * asynchronously so that application startup is not blocked. In both cases the client is configured
 * to reconnect on its own and to keep retrying with an exponential backoff.
 *
 * <p>
 * Before the client is created, this instance is registered as a client via {@code hazelcastConnection}.
 *
 * @return a HazelcastInstance configured as a client
 */
private HazelcastInstance createHazelcastClient() {
    final ClientConfig config = new ClientConfig();
    config.setInstanceName(instanceName + "-client");

    // The cluster name has to be the same one the core cluster uses
    if (!hazelcastLocalInstances) {
        config.setClusterName("prod");
    }

    // Register every core node address that HazelcastConnection was able to discover (may be none)
    final List<String> coreNodeAddresses = hazelcastConnection.discoverCoreNodeAddresses();
    coreNodeAddresses.forEach(coreNodeAddress -> {
        log.info("Adding core node address for Hazelcast client: {}", coreNodeAddress);
        config.getNetworkConfig().addAddress(coreNodeAddress);
    });

    // Without any known address there is nothing to wait for, so the client must not block the startup
    final boolean startAsynchronously = coreNodeAddresses.isEmpty();

    // Resilience: async start only when no address is known, and always reconnect after a lost connection
    final ClientConnectionStrategyConfig connectionStrategy = config.getConnectionStrategyConfig();
    connectionStrategy.setAsyncStart(startAsynchronously);
    connectionStrategy.setReconnectMode(ClientConnectionStrategyConfig.ReconnectMode.ON);

    // Retry policy: exponential backoff from 1s up to 30s (factor 1.5) with 20% jitter against a
    // thundering herd; the cluster connect timeout of -1 is intended to mean "never give up"
    connectionStrategy.getConnectionRetryConfig()
            .setInitialBackoffMillis(1000)
            .setMaxBackoffMillis(30000)
            .setMultiplier(1.5)
            .setClusterConnectTimeoutMillis(-1)
            .setJitter(0.2);

    // Network: 10 seconds timeout per connection attempt, route operations to all members
    config.getNetworkConfig().setConnectionTimeout(10000);
    config.getNetworkConfig().getClusterRoutingConfig().setRoutingMode(RoutingMode.ALL_MEMBERS);

    // The client needs the same Path serializer as the cluster members
    config.getSerializationConfig().addSerializerConfig(createPathSerializerConfig());

    // Flag this instance as a client in the service registry
    hazelcastConnection.registerAsClient();

    if (startAsynchronously) {
        log.warn("No core nodes found in service registry. Hazelcast client will start asynchronously and retry connection.");
    }
    else {
        log.info("Creating Hazelcast client to connect to core cluster at: {}", coreNodeAddresses);
    }

    return HazelcastClient.newHazelcastClient(config);
}

/**
* Binds the Hazelcast instance strictly to the given network interface by setting it
* as the local and public address. This ensures that Hazelcast does not bind to or listen on
Expand Down
Loading
Loading