Skip to content

Commit bca99f9

Browse files
andythsuAndy Su (Apps)
authored and committed
Add PENDING type to healthchecks
1 parent 53af36e commit bca99f9

15 files changed

+107
-33
lines changed

gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ public abstract class BaseApp
6868
{
6969
private static final Logger logger = Logger.get(BaseApp.class);
7070
private final ImmutableList.Builder<Module> appModules = ImmutableList.builder();
71+
// this injector reference is needed to use reflection in
72+
// TestGatewayHaSingleBackend and TestGatewayMultipleBackend
73+
private Injector injector;
7174

7275
private Module newModule(String clazz, HaGatewayConfiguration configuration, Environment environment)
7376
{
@@ -129,7 +132,7 @@ private void configureGuice(HaGatewayConfiguration configuration, Environment en
129132
{
130133
appModules.add(new MetricRegistryModule(environment.metrics()));
131134
appModules.addAll(addModules(configuration, environment));
132-
Injector injector = Guice.createInjector(appModules.build());
135+
injector = Guice.createInjector(appModules.build());
133136
injector.injectMembers(this);
134137
registerWithInjector(configuration, environment, injector);
135138
}

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStats.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public record ClusterStats(
2424
int runningQueryCount,
2525
int queuedQueryCount,
2626
int numWorkerNodes,
27-
boolean healthy,
27+
TrinoHealthStateType healthState,
2828
String proxyTo,
2929
String externalUrl,
3030
String routingGroup,
@@ -41,7 +41,7 @@ public static final class Builder
4141
private int runningQueryCount;
4242
private int queuedQueryCount;
4343
private int numWorkerNodes;
44-
private boolean healthy;
44+
private TrinoHealthStateType healthState;
4545
private String proxyTo;
4646
private String externalUrl;
4747
private String routingGroup;
@@ -70,9 +70,9 @@ public Builder numWorkerNodes(int numWorkerNodes)
7070
return this;
7171
}
7272

73-
public Builder healthy(boolean healthy)
73+
public Builder healthy(TrinoHealthStateType healthState)
7474
{
75-
this.healthy = healthy;
75+
this.healthState = healthState;
7676
return this;
7777
}
7878

@@ -107,7 +107,7 @@ public ClusterStats build()
107107
runningQueryCount,
108108
queuedQueryCount,
109109
numWorkerNodes,
110-
healthy,
110+
healthState,
111111
proxyTo,
112112
externalUrl,
113113
routingGroup,

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStatsHttpMonitor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
7171
.numWorkerNodes(activeWorkers)
7272
.queuedQueryCount((int) result.get("queuedQueries"))
7373
.runningQueryCount((int) result.get("runningQueries"))
74-
.healthy(activeWorkers > 0)
74+
.healthy(activeWorkers > 0 ? TrinoHealthStateType.HEALTHY : TrinoHealthStateType.UNHEALTHY)
7575
.proxyTo(backend.getProxyTo())
7676
.externalUrl(backend.getExternalUrl())
7777
.routingGroup(backend.getRoutingGroup());

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStatsInfoApiMonitor.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,19 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
4646
.routingGroup(backend.getRoutingGroup()).build();
4747
}
4848

49-
private boolean isReadyStatus(String baseUrl)
49+
private TrinoHealthStateType isReadyStatus(String baseUrl)
5050
{
5151
Request request = prepareGet()
5252
.setUri(uriBuilderFrom(URI.create(baseUrl)).appendPath("/v1/info").build())
5353
.build();
5454

5555
try {
5656
ServerInfo serverInfo = client.execute(request, SERVER_INFO_JSON_RESPONSE_HANDLER);
57-
return !serverInfo.isStarting();
57+
return serverInfo.isStarting() ? TrinoHealthStateType.PENDING : TrinoHealthStateType.HEALTHY;
5858
}
5959
catch (Exception e) {
6060
log.error("Exception checking {} for health: {} ", request.getUri(), e.getMessage());
6161
}
62-
return false;
62+
return TrinoHealthStateType.UNHEALTHY;
6363
}
6464
}

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStatsJdbcMonitor.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
8282
partialState.put(rs.getString("state"), rs.getInt("count"));
8383
}
8484
return clusterStats
85-
.healthy(true)
85+
// at this point we can set cluster to healthy because otherwise
86+
// it wouldn't have gotten worker stats
87+
.healthy(TrinoHealthStateType.HEALTHY)
8688
.queuedQueryCount(partialState.getOrDefault("QUEUED", 0))
8789
.runningQueryCount(partialState.getOrDefault("RUNNING", 0))
8890
.build();

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/HealthChecker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ public HealthChecker(Notifier notifier)
3434
public void observe(List<ClusterStats> clustersStats)
3535
{
3636
for (ClusterStats clusterStats : clustersStats) {
37-
if (!clusterStats.healthy()) {
37+
if (clusterStats.healthState() == TrinoHealthStateType.UNHEALTHY) {
3838
notifyUnhealthyCluster(clusterStats);
3939
}
4040
else {
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.gateway.ha.clustermonitor;
15+
16+
/**
17+
* PENDING is for UI/observability purposes; for routing it is treated as unhealthy
18+
* We should use PENDING when Trino clusters are still spinning up
19+
* HEALTHY is when health checks report clusters as up
20+
* UNHEALTHY is when health checks report clusters as down
21+
*/
22+
public enum TrinoHealthStateType
23+
{
24+
PENDING,
25+
HEALTHY,
26+
UNHEALTHY
27+
}

gateway-ha/src/main/java/io/trino/gateway/ha/resource/EntityEditorResource.java

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
import com.google.common.collect.ImmutableList;
1919
import com.google.inject.Inject;
2020
import io.airlift.log.Logger;
21+
import io.trino.gateway.ha.clustermonitor.ClusterStats;
22+
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
2123
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
24+
import io.trino.gateway.ha.router.BackendStateManager;
2225
import io.trino.gateway.ha.router.GatewayBackendManager;
2326
import io.trino.gateway.ha.router.ResourceGroupsManager;
2427
import io.trino.gateway.ha.router.RoutingManager;
@@ -52,13 +55,19 @@ public class EntityEditorResource
5255
private final GatewayBackendManager gatewayBackendManager;
5356
private final ResourceGroupsManager resourceGroupsManager;
5457
private final RoutingManager routingManager;
58+
private final BackendStateManager backendStateManager;
5559

5660
@Inject
57-
public EntityEditorResource(GatewayBackendManager gatewayBackendManager, ResourceGroupsManager resourceGroupsManager, RoutingManager routingManager)
61+
public EntityEditorResource(
62+
GatewayBackendManager gatewayBackendManager,
63+
ResourceGroupsManager resourceGroupsManager,
64+
RoutingManager routingManager,
65+
BackendStateManager backendStateManager)
5866
{
5967
this.gatewayBackendManager = requireNonNull(gatewayBackendManager, "gatewayBackendManager is null");
6068
this.resourceGroupsManager = requireNonNull(resourceGroupsManager, "resourceGroupsManager is null");
6169
this.routingManager = requireNonNull(routingManager, "routingManager is null");
70+
this.backendStateManager = requireNonNull(backendStateManager, "backendStateManager is null");
6271
}
6372

6473
@GET
@@ -88,6 +97,11 @@ public Response updateEntity(
8897
gatewayBackendManager.updateBackend(backend);
8998
log.info("Turning cluster %s %s", backend.getName(), backend.isActive() ? "on" : "off");
9099
routingManager.updateBackEndHealth(backend.getName(), backend.isActive());
100+
backendStateManager.updateStates(
101+
backend.getName(),
102+
ClusterStats.builder(backend.getName())
103+
.healthy(backend.isActive() ? TrinoHealthStateType.PENDING : TrinoHealthStateType.UNHEALTHY)
104+
.build());
91105
break;
92106
case RESOURCE_GROUP:
93107
ResourceGroupsDetail resourceGroupDetails = OBJECT_MAPPER.readValue(jsonPayload,

gateway-ha/src/main/java/io/trino/gateway/ha/router/QueryCountBasedRouter.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import com.google.common.collect.ImmutableList;
1919
import com.google.errorprone.annotations.concurrent.GuardedBy;
2020
import io.trino.gateway.ha.clustermonitor.ClusterStats;
21+
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
2122
import org.slf4j.Logger;
2223
import org.slf4j.LoggerFactory;
2324

@@ -46,7 +47,7 @@ static class LocalStats
4647
{
4748
private int runningQueryCount;
4849
private int queuedQueryCount;
49-
private boolean healthy;
50+
private TrinoHealthStateType healthState;
5051
private String proxyTo;
5152
private String routingGroup;
5253
private String clusterId;
@@ -57,7 +58,7 @@ static class LocalStats
5758
clusterId = stats.clusterId();
5859
runningQueryCount = stats.runningQueryCount();
5960
queuedQueryCount = stats.queuedQueryCount();
60-
healthy = stats.healthy();
61+
healthState = stats.healthState();
6162
proxyTo = stats.proxyTo();
6263
routingGroup = stats.routingGroup();
6364
if (stats.userQueuedCount() != null) {
@@ -93,14 +94,14 @@ public void queuedQueryCount(int queuedQueryCount)
9394
this.queuedQueryCount = queuedQueryCount;
9495
}
9596

96-
public boolean healthy()
97+
public TrinoHealthStateType healthState()
9798
{
98-
return this.healthy;
99+
return this.healthState;
99100
}
100101

101-
public void healthy(boolean healthy)
102+
public void healthState(TrinoHealthStateType healthState)
102103
{
103-
this.healthy = healthy;
104+
this.healthState = healthState;
104105
}
105106

106107
public String proxyTo()
@@ -187,7 +188,7 @@ private synchronized Optional<LocalStats> getClusterToRoute(String user, String
187188
{
188189
log.debug("sorting cluster stats for {} {}", user, routingGroup);
189190
List<LocalStats> filteredList = clusterStats.stream()
190-
.filter(stats -> stats.healthy())
191+
.filter(stats -> stats.healthState() == TrinoHealthStateType.HEALTHY)
191192
.filter(stats -> routingGroup.equals(stats.routingGroup()))
192193
.collect(Collectors.toList());
193194

gateway-ha/src/main/java/io/trino/gateway/ha/router/RoutingManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import com.google.common.cache.LoadingCache;
1919
import io.airlift.log.Logger;
2020
import io.trino.gateway.ha.clustermonitor.ClusterStats;
21+
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
2122
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
2223
import io.trino.gateway.proxyserver.ProxyServerConfiguration;
2324
import jakarta.ws.rs.HttpMethod;
@@ -133,7 +134,7 @@ public void updateBackEndHealth(String backendId, Boolean value)
133134
public void updateBackEndStats(List<ClusterStats> stats)
134135
{
135136
for (ClusterStats clusterStats : stats) {
136-
updateBackEndHealth(clusterStats.clusterId(), clusterStats.healthy());
137+
updateBackEndHealth(clusterStats.clusterId(), clusterStats.healthState() == TrinoHealthStateType.HEALTHY);
137138
}
138139
}
139140

0 commit comments

Comments
 (0)