Skip to content

Commit 98c8e03

Browse files
andythsuebyhr
authored andcommitted
Add PENDING type to healthchecks
1 parent d68be9f commit 98c8e03

18 files changed

+201
-70
lines changed

docs/routing-rules.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,21 @@ condition: 'request.getHeader("X-Trino-Client-Tags") contains "label=foo"'
134134

135135
If no rules match, then request is routed to adhoc.
136136

137+
### TrinoStatus
138+
139+
This class attempts to track the current state of Trino cluster. It is updated per every healthcheck.
140+
There are three possible states
141+
142+
- PENDING
143+
- A Trino cluster will show this state when it is still starting up. It will be treated as
144+
unhealthy by RoutingManager, and therefore requests will not be routed to PENDING clusters
145+
- HEALTHY
146+
- A Trino cluster will show this state when healthchecks report clusters as healthy and ready.
147+
RoutingManager will only route requests to healthy clusters
148+
- UNHEALTHY
149+
- A Trino cluster will show this state when healthchecks report clusters as unhealthy. RoutingManager
150+
will not route requests to unhealthy clusters.
151+
137152
### TrinoRequestUser
138153

139154
This class attempts to extract the user from a request. In order, it attempts

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStats.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public record ClusterStats(
2424
int runningQueryCount,
2525
int queuedQueryCount,
2626
int numWorkerNodes,
27-
boolean healthy,
27+
TrinoStatus trinoStatus,
2828
String proxyTo,
2929
String externalUrl,
3030
String routingGroup,
@@ -41,7 +41,7 @@ public static final class Builder
4141
private int runningQueryCount;
4242
private int queuedQueryCount;
4343
private int numWorkerNodes;
44-
private boolean healthy;
44+
private TrinoStatus trinoStatus;
4545
private String proxyTo;
4646
private String externalUrl;
4747
private String routingGroup;
@@ -70,9 +70,9 @@ public Builder numWorkerNodes(int numWorkerNodes)
7070
return this;
7171
}
7272

73-
public Builder healthy(boolean healthy)
73+
public Builder trinoStatus(TrinoStatus trinoStatus)
7474
{
75-
this.healthy = healthy;
75+
this.trinoStatus = trinoStatus;
7676
return this;
7777
}
7878

@@ -107,7 +107,7 @@ public ClusterStats build()
107107
runningQueryCount,
108108
queuedQueryCount,
109109
numWorkerNodes,
110-
healthy,
110+
trinoStatus,
111111
proxyTo,
112112
externalUrl,
113113
routingGroup,

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStatsHttpMonitor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
7272
.numWorkerNodes(activeWorkers)
7373
.queuedQueryCount((int) result.get("queuedQueries"))
7474
.runningQueryCount((int) result.get("runningQueries"))
75-
.healthy(activeWorkers > 0)
75+
.trinoStatus(activeWorkers > 0 ? TrinoStatus.HEALTHY : TrinoStatus.UNHEALTHY)
7676
.proxyTo(backend.getProxyTo())
7777
.externalUrl(backend.getExternalUrl())
7878
.routingGroup(backend.getRoutingGroup());

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStatsInfoApiMonitor.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,31 +49,31 @@ public ClusterStatsInfoApiMonitor(HttpClient client, MonitorConfiguration monito
4949
@Override
5050
public ClusterStats monitor(ProxyBackendConfiguration backend)
5151
{
52-
return ClusterStats.builder(backend.getName()).healthy(isReadyStatus(backend.getProxyTo()))
52+
return ClusterStats.builder(backend.getName()).trinoStatus(checkStatus(backend.getProxyTo()))
5353
.proxyTo(backend.getProxyTo())
5454
.externalUrl(backend.getExternalUrl())
5555
.routingGroup(backend.getRoutingGroup()).build();
5656
}
5757

58-
private boolean isReadyStatus(String baseUrl)
58+
private TrinoStatus checkStatus(String baseUrl)
5959
{
60-
return isReadyStatus(baseUrl, retries);
60+
return checkStatus(baseUrl, retries);
6161
}
6262

63-
private boolean isReadyStatus(String baseUrl, int retriesRemaining)
63+
private TrinoStatus checkStatus(String baseUrl, int retriesRemaining)
6464
{
6565
Request request = prepareGet()
6666
.setUri(uriBuilderFrom(URI.create(baseUrl)).appendPath("/v1/info").build())
6767
.build();
6868
try {
6969
ServerInfo serverInfo = client.execute(request, SERVER_INFO_JSON_RESPONSE_HANDLER);
70-
return !serverInfo.isStarting();
70+
return serverInfo.isStarting() ? TrinoStatus.PENDING : TrinoStatus.HEALTHY;
7171
}
7272
catch (UnexpectedResponseException e) {
7373
if (shouldRetry(e.getStatusCode())) {
7474
if (retriesRemaining > 0) {
7575
log.warn("Retrying health check on error: %s, ", e.toString());
76-
return isReadyStatus(baseUrl, retriesRemaining - 1);
76+
return checkStatus(baseUrl, retriesRemaining - 1);
7777
}
7878
else {
7979
log.error("Encountered error %s, no retries remaining", e.toString());
@@ -86,7 +86,7 @@ private boolean isReadyStatus(String baseUrl, int retriesRemaining)
8686
catch (Exception e) {
8787
log.error(e, "Exception checking %s for health", request.getUri());
8888
}
89-
return false;
89+
return TrinoStatus.UNHEALTHY;
9090
}
9191

9292
public static boolean shouldRetry(int statusCode)

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ClusterStatsJdbcMonitor.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,9 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
8383
partialState.put(rs.getString("state"), rs.getInt("count"));
8484
}
8585
return clusterStats
86-
.healthy(true)
86+
// at this point we can set cluster to trinoStatus because otherwise
87+
// it wouldn't have gotten worker stats
88+
.trinoStatus(TrinoStatus.HEALTHY)
8789
.queuedQueryCount(partialState.getOrDefault("QUEUED", 0))
8890
.runningQueryCount(partialState.getOrDefault("RUNNING", 0))
8991
.build();

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/HealthChecker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ public HealthChecker(Notifier notifier)
3434
public void observe(List<ClusterStats> clustersStats)
3535
{
3636
for (ClusterStats clusterStats : clustersStats) {
37-
if (!clusterStats.healthy()) {
37+
if (clusterStats.trinoStatus() == TrinoStatus.UNHEALTHY) {
3838
notifyUnhealthyCluster(clusterStats);
3939
}
4040
else {
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.gateway.ha.clustermonitor;
15+
16+
/**
17+
* PENDING is for ui/observability purpose and functionally it's unhealthy
18+
* We should use PENDING when Trino clusters are still spinning up
19+
* HEALTHY is when health checks report clusters as up
20+
* UNHEALTHY is when health checks report clusters as down
21+
*/
22+
public enum TrinoStatus
23+
{
24+
PENDING,
25+
HEALTHY,
26+
UNHEALTHY
27+
}

gateway-ha/src/main/java/io/trino/gateway/ha/resource/EntityEditorResource.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
import com.google.common.collect.ImmutableList;
1919
import com.google.inject.Inject;
2020
import io.airlift.log.Logger;
21+
import io.trino.gateway.ha.clustermonitor.ClusterStats;
22+
import io.trino.gateway.ha.clustermonitor.TrinoStatus;
2123
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
24+
import io.trino.gateway.ha.router.BackendStateManager;
2225
import io.trino.gateway.ha.router.GatewayBackendManager;
2326
import io.trino.gateway.ha.router.ResourceGroupsManager;
2427
import io.trino.gateway.ha.router.RoutingManager;
@@ -52,13 +55,19 @@ public class EntityEditorResource
5255
private final GatewayBackendManager gatewayBackendManager;
5356
private final ResourceGroupsManager resourceGroupsManager;
5457
private final RoutingManager routingManager;
58+
private final BackendStateManager backendStateManager;
5559

5660
@Inject
57-
public EntityEditorResource(GatewayBackendManager gatewayBackendManager, ResourceGroupsManager resourceGroupsManager, RoutingManager routingManager)
61+
public EntityEditorResource(
62+
GatewayBackendManager gatewayBackendManager,
63+
ResourceGroupsManager resourceGroupsManager,
64+
RoutingManager routingManager,
65+
BackendStateManager backendStateManager)
5866
{
5967
this.gatewayBackendManager = requireNonNull(gatewayBackendManager, "gatewayBackendManager is null");
6068
this.resourceGroupsManager = requireNonNull(resourceGroupsManager, "resourceGroupsManager is null");
6169
this.routingManager = requireNonNull(routingManager, "routingManager is null");
70+
this.backendStateManager = requireNonNull(backendStateManager, "backendStateManager is null");
6271
}
6372

6473
@GET
@@ -87,7 +96,15 @@ public Response updateEntity(
8796
OBJECT_MAPPER.readValue(jsonPayload, ProxyBackendConfiguration.class);
8897
gatewayBackendManager.updateBackend(backend);
8998
log.info("Marking the cluster %s %s", backend.getName(), backend.isActive() ? "active" : "inactive");
90-
routingManager.updateBackEndHealth(backend.getName(), backend.isActive());
99+
// We mark Trino PENDING here so gateway won't immediately route traffic to this cluster yet
100+
// until it is marked healthy by the healthcheck
101+
TrinoStatus trinoStatus = backend.isActive() ? TrinoStatus.PENDING : TrinoStatus.UNHEALTHY;
102+
routingManager.updateBackEndHealth(backend.getName(), trinoStatus);
103+
backendStateManager.updateStates(
104+
backend.getName(),
105+
ClusterStats.builder(backend.getName())
106+
.trinoStatus(trinoStatus)
107+
.build());
91108
break;
92109
case RESOURCE_GROUP:
93110
ResourceGroupsDetail resourceGroupDetails = OBJECT_MAPPER.readValue(jsonPayload,

gateway-ha/src/main/java/io/trino/gateway/ha/router/QueryCountBasedRouter.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import com.google.errorprone.annotations.concurrent.GuardedBy;
2020
import io.airlift.log.Logger;
2121
import io.trino.gateway.ha.clustermonitor.ClusterStats;
22+
import io.trino.gateway.ha.clustermonitor.TrinoStatus;
2223

2324
import java.util.ArrayList;
2425
import java.util.Collections;
@@ -45,7 +46,7 @@ static class LocalStats
4546
{
4647
private int runningQueryCount;
4748
private int queuedQueryCount;
48-
private boolean healthy;
49+
private TrinoStatus trinoStatus;
4950
private String proxyTo;
5051
private String routingGroup;
5152
private String clusterId;
@@ -56,7 +57,7 @@ static class LocalStats
5657
clusterId = stats.clusterId();
5758
runningQueryCount = stats.runningQueryCount();
5859
queuedQueryCount = stats.queuedQueryCount();
59-
healthy = stats.healthy();
60+
trinoStatus = stats.trinoStatus();
6061
proxyTo = stats.proxyTo();
6162
routingGroup = stats.routingGroup();
6263
if (stats.userQueuedCount() != null) {
@@ -92,14 +93,14 @@ public void queuedQueryCount(int queuedQueryCount)
9293
this.queuedQueryCount = queuedQueryCount;
9394
}
9495

95-
public boolean healthy()
96+
public TrinoStatus trinoStatus()
9697
{
97-
return this.healthy;
98+
return this.trinoStatus;
9899
}
99100

100-
public void healthy(boolean healthy)
101+
public void trinoStatus(TrinoStatus trinoStatus)
101102
{
102-
this.healthy = healthy;
103+
this.trinoStatus = trinoStatus;
103104
}
104105

105106
public String proxyTo()
@@ -186,7 +187,7 @@ private synchronized Optional<LocalStats> getClusterToRoute(String user, String
186187
{
187188
log.debug("sorting cluster stats for %s %s", user, routingGroup);
188189
List<LocalStats> filteredList = clusterStats.stream()
189-
.filter(stats -> stats.healthy())
190+
.filter(stats -> stats.trinoStatus() == TrinoStatus.HEALTHY)
190191
.filter(stats -> routingGroup.equals(stats.routingGroup()))
191192
.collect(Collectors.toList());
192193

gateway-ha/src/main/java/io/trino/gateway/ha/router/RoutingManager.java

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import com.google.common.cache.LoadingCache;
1919
import io.airlift.log.Logger;
2020
import io.trino.gateway.ha.clustermonitor.ClusterStats;
21+
import io.trino.gateway.ha.clustermonitor.TrinoStatus;
2122
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
2223
import jakarta.ws.rs.HttpMethod;
2324

@@ -45,7 +46,7 @@ public abstract class RoutingManager
4546
private final LoadingCache<String, String> queryIdBackendCache;
4647
private final ExecutorService executorService = Executors.newFixedThreadPool(5);
4748
private final GatewayBackendManager gatewayBackendManager;
48-
private final ConcurrentHashMap<String, Boolean> backendToHealth;
49+
private final ConcurrentHashMap<String, TrinoStatus> backendToStatus;
4950

5051
public RoutingManager(GatewayBackendManager gatewayBackendManager)
5152
{
@@ -64,7 +65,7 @@ public String load(String queryId)
6465
}
6566
});
6667

67-
this.backendToHealth = new ConcurrentHashMap<String, Boolean>();
68+
this.backendToStatus = new ConcurrentHashMap<>();
6869
}
6970

7071
protected GatewayBackendManager getGatewayBackendManager()
@@ -123,16 +124,16 @@ public String findBackendForQueryId(String queryId)
123124
return backendAddress;
124125
}
125126

126-
public void updateBackEndHealth(String backendId, Boolean value)
127+
public void updateBackEndHealth(String backendId, TrinoStatus value)
127128
{
128129
log.info("backend %s isHealthy %s", backendId, value);
129-
backendToHealth.put(backendId, value);
130+
backendToStatus.put(backendId, value);
130131
}
131132

132133
public void updateBackEndStats(List<ClusterStats> stats)
133134
{
134135
for (ClusterStats clusterStats : stats) {
135-
updateBackEndHealth(clusterStats.clusterId(), clusterStats.healthy());
136+
updateBackEndHealth(clusterStats.clusterId(), clusterStats.trinoStatus());
136137
}
137138
}
138139

@@ -183,14 +184,14 @@ protected String findBackendForUnknownQueryId(String queryId)
183184
// We are returning the unhealthy (not healthy)
184185
private boolean isBackendNotHealthy(String backendId)
185186
{
186-
if (backendToHealth.isEmpty()) {
187+
if (backendToStatus.isEmpty()) {
187188
log.error("backends can not be empty");
188189
return true;
189190
}
190-
Boolean isHealthy = backendToHealth.get(backendId);
191-
if (isHealthy == null) {
191+
TrinoStatus status = backendToStatus.get(backendId);
192+
if (status == null) {
192193
return true;
193194
}
194-
return !isHealthy;
195+
return status != TrinoStatus.HEALTHY;
195196
}
196197
}

0 commit comments

Comments
 (0)