Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ hibernate_version=6.5.3.Final
opensaml_version=5.1.6
jwt_version=0.13.0
jaxb_runtime_version=4.0.6
hazelcast_version=5.5.0
hazelcast_version=5.6.0
fasterxml_version=2.20.1
netty_version=4.2.7.Final
jgit_version=7.5.0.202512021534-r
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/de/tum/cit/aet/artemis/ArtemisApp.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import org.springframework.core.env.Environment;
import org.springframework.core.env.Profiles;

import com.hazelcast.spring.HazelcastObjectExtractionConfiguration;

import de.tum.cit.aet.artemis.core.PrintStartupBeansEvent;
import de.tum.cit.aet.artemis.core.config.ArtemisCompatibleVersionsConfiguration;
import de.tum.cit.aet.artemis.core.config.ArtemisConfigHelper;
Expand All @@ -39,7 +41,9 @@
import tech.jhipster.config.DefaultProfileUtil;
import tech.jhipster.config.JHipsterConstants;

@SpringBootApplication
// Exclude HazelcastObjectExtractionConfiguration due to incompatibility with custom Hazelcast configuration
// See: https://github.com/hazelcast/hazelcast/issues/26553
@SpringBootApplication(exclude = HazelcastObjectExtractionConfiguration.class)
@EnableConfigurationProperties({ LiquibaseProperties.class, ProgrammingLanguageConfiguration.class, TheiaConfiguration.class, LicenseConfiguration.class,
ArtemisCompatibleVersionsConfiguration.class })
public class ArtemisApp {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ public class BuildAgentDockerService {
@Value("${artemis.continuous-integration.image-architecture:amd64}")
private String imageArchitecture;

@Value("${artemis.continuous-integration.build-agent.short-name}")
private String buildAgentShortName;

private static final String AMD64_ARCHITECTURE = "amd64";

private static final String ARM64_ARCHITECTURE = "arm64";
Expand Down Expand Up @@ -524,7 +527,7 @@ private long convertMegabytesToBytes(int mb) {
private boolean dockerClientNotAvailable(String additionalLogInfo) {
DockerClient dockerClient = buildAgentConfiguration.getDockerClient();
if (dockerClient == null) {
BuildAgentStatus status = distributedDataAccessService.getLocalBuildAgentStatus();
BuildAgentStatus status = distributedDataAccessService.getBuildAgentStatus(buildAgentShortName);
if ((status == BuildAgentStatus.PAUSED || status == BuildAgentStatus.SELF_PAUSED)) {
log.info("Docker client is not available because the build agent is paused. {} This is expected behavior.", additionalLogInfo);
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,31 +64,40 @@ public void updateLocalBuildAgentInformation(boolean isPaused, boolean isPausedD

/**
* Updates the local build agent information with the most recent build job.
* Uses the build agent's short name as the map key for stable identification,
* since the Hazelcast member address may change after initial client connection.
*
* @param recentBuildJob the most recent build job
* @param isPaused whether the build agent is paused
* @param isPausedDueToFailures whether the build agent is paused due to consecutive failures
* @param consecutiveFailures number of consecutive build failures on the build agent
*/
public void updateLocalBuildAgentInformationWithRecentJob(BuildJobQueueItem recentBuildJob, boolean isPaused, boolean isPausedDueToFailures, int consecutiveFailures) {
String memberAddress = distributedDataAccessService.getLocalMemberAddress();
// Use buildAgentShortName as the stable key - memberAddress can change after Hazelcast client connects
String agentKey = buildAgentShortName;
try {
distributedDataAccessService.getDistributedBuildAgentInformation().lock(memberAddress);
distributedDataAccessService.getDistributedBuildAgentInformation().lock(agentKey);
// Add/update
BuildAgentInformation info = getUpdatedLocalBuildAgentInformation(recentBuildJob, isPaused, isPausedDueToFailures, consecutiveFailures);

log.debug("Updating build agent info: key='{}', name='{}', memberAddress='{}', displayName='{}'", agentKey, info.buildAgent().name(), info.buildAgent().memberAddress(),
info.buildAgent().displayName());

try {
distributedDataAccessService.getDistributedBuildAgentInformation().put(info.buildAgent().memberAddress(), info);
// Use the agent's short name as key for stable identification
distributedDataAccessService.getDistributedBuildAgentInformation().put(agentKey, info);
log.debug("Successfully stored build agent info with key '{}'. Current map size: {}", agentKey,
distributedDataAccessService.getDistributedBuildAgentInformation().size());
}
catch (Exception e) {
log.error("Error while updating build agent information for agent {} with address {}", info.buildAgent().name(), info.buildAgent().memberAddress(), e);
}
}
catch (Exception e) {
log.error("Error while updating build agent information for agent with address {}", memberAddress, e);
log.error("Error while updating build agent information for agent {}", agentKey, e);
}
finally {
distributedDataAccessService.getDistributedBuildAgentInformation().unlock(memberAddress);
distributedDataAccessService.getDistributedBuildAgentInformation().unlock(agentKey);
}
}

Expand All @@ -100,7 +109,8 @@ private BuildAgentInformation getUpdatedLocalBuildAgentInformation(BuildJobQueue
: buildAgentConfiguration.getThreadPoolSize();
boolean hasJobs = numberOfCurrentBuildJobs > 0;
BuildAgentStatus status;
BuildAgentInformation agent = distributedDataAccessService.getDistributedBuildAgentInformation().get(memberAddress);
// Use buildAgentShortName as key since that's what we use to store the agent info
BuildAgentInformation agent = distributedDataAccessService.getDistributedBuildAgentInformation().get(buildAgentShortName);
if (isPaused) {
boolean isAlreadySelfPaused = agent != null && agent.status() == BuildAgentStatus.SELF_PAUSED;
status = (isPausedDueToFailures || isAlreadySelfPaused) ? BuildAgentStatus.SELF_PAUSED : BuildAgentStatus.PAUSED;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,9 @@ public void updateBuildAgentInformation() {

removeOfflineNodes();

// Add build agent information of local hazelcast member to map if not already present
if (!distributedDataAccessService.getBuildAgentInformationMap().containsKey(distributedDataAccessService.getLocalMemberAddress())) {
// Add build agent information to map if not already present
// Use buildAgentShortName as the key since that's what BuildAgentInformationService uses
if (!distributedDataAccessService.getBuildAgentInformationMap().containsKey(buildAgentShortName)) {
buildAgentInformationService.updateLocalBuildAgentInformation(isPaused.get());
}
}
Expand All @@ -284,8 +285,9 @@ private void checkAvailabilityAndProcessNextBuild() {
}
// Check conditions before acquiring the lock to avoid unnecessary locking
if (!nodeIsAvailable()) {
// Add build agent information of local hazelcast member to map if not already present
if (!distributedDataAccessService.getBuildAgentInformationMap().containsKey(distributedDataAccessService.getLocalMemberAddress())) {
// Add build agent information to map if not already present
// Use buildAgentShortName as the key since that's what BuildAgentInformationService uses
if (!distributedDataAccessService.getBuildAgentInformationMap().containsKey(buildAgentShortName)) {
buildAgentInformationService.updateLocalBuildAgentInformation(isPaused.get());
}

Expand Down Expand Up @@ -381,26 +383,92 @@ private BuildJobQueueItem addToProcessingJobs() {
* <p>
* This cleanup is necessary because when a node goes offline unexpectedly (e.g., crash),
* its build agent information and any jobs it was processing remain in the distributed maps.
* This method detects such stale entries by comparing registered agents with current cluster members.
* This method detects such stale entries by comparing the stored member address of each agent
* with current cluster members.
* <p>
* Note: Build agents running as Hazelcast clients (not cluster members) are not cleaned up by this
* method since their addresses are not in the cluster member list. Client-mode agents have addresses
* on ephemeral ports (e.g., [127.0.0.1]:54321) which will never exactly match cluster member addresses
* (which use the configured Hazelcast port like 5701). A separate mechanism (e.g., heartbeat-based
* cleanup) should be used for client-mode agent cleanup if needed.
*/
private void removeOfflineNodes() {
Set<String> memberAddresses = distributedDataAccessService.getClusterMemberAddresses();
for (String key : distributedDataAccessService.getBuildAgentInformationMap().keySet()) {
if (!memberAddresses.contains(key)) {
removeBuildAgentInformationForNode(key);
removeProcessingJobsForNode(key);
var buildAgentMap = distributedDataAccessService.getBuildAgentInformationMap();

log.debug("removeOfflineNodes: cluster member addresses = {}, build agent map keys = {}", memberAddresses, buildAgentMap.keySet());

// Iterate over entries to access both the key (short name) and the stored member address
for (var entry : buildAgentMap.entrySet()) {
String agentKey = entry.getKey();
String storedMemberAddress = entry.getValue().buildAgent().memberAddress();
boolean isClusterMember = isClusterMemberAddress(storedMemberAddress, memberAddresses);
boolean isInMemberSet = memberAddresses.contains(storedMemberAddress);

log.debug("removeOfflineNodes: checking agent '{}' with address '{}': isClusterMemberAddress={}, isInMemberSet={}", agentKey, storedMemberAddress, isClusterMember,
isInMemberSet);

// Only clean up agents whose stored address matches the exact format of current cluster members
// AND is not in the current cluster member set (i.e., the member went offline).
// Client-mode agents have ephemeral port addresses that won't match cluster member addresses,
// so they are safely ignored by this cleanup logic.
if (isClusterMember && !isInMemberSet) {
log.info("removeOfflineNodes: REMOVING agent '{}' with address '{}' (was cluster member but is now offline)", agentKey, storedMemberAddress);
removeBuildAgentInformationForNode(agentKey, storedMemberAddress);
removeProcessingJobsForNode(storedMemberAddress);
}
}
}

/**
 * Decides whether an address looks like it belongs to a cluster member by comparing ports.
 * <p>
 * The address must be in the bracketed {@code [host]:port} form. It is considered a cluster member
 * address when at least one of the given member addresses yields the same port string. The host
 * part is not compared.
 *
 * @param address         the address to check, may be null
 * @param memberAddresses the current set of cluster member addresses
 * @return true if the port of {@code address} equals the port of any entry in {@code memberAddresses},
 *         false if the address is null, not in the bracketed form, has no port, or no port matches
 */
private boolean isClusterMemberAddress(String address, Set<String> memberAddresses) {
    // Only addresses in the bracketed [host]:port form are eligible for the port comparison
    final boolean hasBracketedForm = address != null && address.contains("]:");
    final String candidatePort = hasBracketedForm ? extractPort(address) : null;
    if (candidatePort == null) {
        return false;
    }
    // A member whose port cannot be extracted yields null, which never equals the candidate port
    for (String knownMember : memberAddresses) {
        if (candidatePort.equals(extractPort(knownMember))) {
            return true;
        }
    }
    return false;
}

/**
 * Extracts the port from an address in {@code [host]:port} format (plain {@code host:port} is accepted as well).
 * <p>
 * The port is the text after the last colon. If the address contains a closing bracket, the last colon
 * must directly follow it. Otherwise the colon belongs to the host part (e.g. {@code [::1]} without a port)
 * and no port is returned.
 *
 * @param address the address string, may be null
 * @return the port string, or null if the address is null, has no colon, ends with a colon,
 *         or is a bracketed address without a trailing {@code :port} part
 */
private String extractPort(String address) {
    if (address == null) {
        return null;
    }
    int lastColon = address.lastIndexOf(':');
    if (lastColon < 0 || lastColon == address.length() - 1) {
        // No colon at all, or nothing after the colon
        return null;
    }
    // For bracketed hosts the port separator is the colon right after ']'. A colon anywhere else is part
    // of the host (IPv6 literal), so taking the text after it would return garbage such as "1]".
    int closingBracket = address.lastIndexOf(']');
    if (closingBracket >= 0 && lastColon != closingBracket + 1) {
        return null;
    }
    return address.substring(lastColon + 1);
}

/**
* Removes the build agent information entry for a specific node from the distributed map.
*
* @param memberAddress the Hazelcast member address of the offline node
* @param agentKey the map key (build agent short name) identifying the agent
* @param memberAddress the Hazelcast member address of the offline node (for logging)
*/
private void removeBuildAgentInformationForNode(String memberAddress) {
log.debug("Cleaning up build agent information for offline node: {}", memberAddress);
distributedDataAccessService.getDistributedBuildAgentInformation().remove(memberAddress);
private void removeBuildAgentInformationForNode(String agentKey, String memberAddress) {
log.debug("Cleaning up build agent information for offline node: {} (address: {})", agentKey, memberAddress);
distributedDataAccessService.getDistributedBuildAgentInformation().remove(agentKey);
}

/**
Expand Down
Loading
Loading