Skip to content

Commit 3a82b83

Browse files
authored
Add health check API endpoints
1 parent e308c93 commit 3a82b83

File tree

6 files changed

+102
-1
lines changed

6 files changed

+102
-1
lines changed

docs/installation.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ Standard Helm options such as `replicaCount`, `image`, `imagePullSecrets`,
347347
More details about the chart are available in the [values
348348
reference documentation](https://github.com/trinodb/charts/blob/main/charts/gateway/README.md)
349349

350-
### Health Checks
350+
### Health checks on Trino clusters
351351

352352
The Trino Gateway periodically performs health checks and maintains
353353
an in-memory TrinoStatus for each backend. If a backend fails a health check,

docs/operation.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ documentation](https://trino.io/docs/current/admin/graceful-shutdown.html) for
3636
more details.
3737

3838
## Query routing options
39+
3940
- The default router selects the backend randomly to route the queries.
4041
- If you want to route the queries to the least loaded backend for a user
4142
i.e. backend with the least number of queries running or queued from a particular user,
@@ -65,3 +66,15 @@ scrape_configs:
6566
- targets:
6667
- gateway1.example.com:8080
6768
```
69+
70+
## Trino Gateway health endpoints
71+
72+
Trino Gateway provides two API endpoints to indicate the current status of the server:
73+
74+
* `/trino-gateway/livez` always returns status code 200, indicating the server is
75+
alive. However, it might not respond if the Trino Gateway is too busy, stuck, or
76+
taking a long time for garbage collection.
77+
* `/trino-gateway/readyz` returns status code 200, indicating the server has
78+
completed initialization and is ready to serve requests. This means the initial
79+
connection to the database and the first round of health checks on Trino clusters
80+
are completed. Otherwise, status code 503 is returned.

gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import io.trino.gateway.ha.module.RouterBaseModule;
2727
import io.trino.gateway.ha.module.StochasticRoutingManagerProvider;
2828
import io.trino.gateway.ha.resource.EntityEditorResource;
29+
import io.trino.gateway.ha.resource.GatewayHealthCheckResource;
2930
import io.trino.gateway.ha.resource.GatewayResource;
3031
import io.trino.gateway.ha.resource.GatewayViewResource;
3132
import io.trino.gateway.ha.resource.GatewayWebAppResource;
@@ -179,6 +180,7 @@ private static void registerResources(Binder binder)
179180
jaxrsBinder(binder).bind(PublicResource.class);
180181
jaxrsBinder(binder).bind(TrinoResource.class);
181182
jaxrsBinder(binder).bind(WebUIStaticResource.class);
183+
jaxrsBinder(binder).bind(GatewayHealthCheckResource.class);
182184
}
183185

184186
private static void registerAuthFilters(Binder binder)

gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ public class ActiveClusterMonitor
3737
public static final int DEFAULT_THREAD_POOL_SIZE = 20;
3838
private static final Logger log = Logger.get(ActiveClusterMonitor.class);
3939

40+
private volatile boolean isInitialized;
4041
private final List<TrinoClusterStatsObserver> clusterStatsObservers;
4142
private final GatewayBackendManager gatewayBackendManager;
4243

@@ -83,6 +84,7 @@ public void start()
8384
observer.observe(stats);
8485
}
8586
}
87+
isInitialized = true;
8688
}
8789
catch (Exception e) {
8890
log.error(e, "Error performing backend monitor tasks");
@@ -96,4 +98,9 @@ public void stop()
9698
executorService.shutdownNow();
9799
scheduledExecutor.shutdownNow();
98100
}
101+
102+
public boolean isInitialized()
103+
{
104+
return isInitialized;
105+
}
99106
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.gateway.ha.resource;
15+
16+
import com.google.inject.Inject;
17+
import io.trino.gateway.ha.clustermonitor.ActiveClusterMonitor;
18+
import jakarta.ws.rs.GET;
19+
import jakarta.ws.rs.Path;
20+
import jakarta.ws.rs.core.Response;
21+
22+
import static java.util.Objects.requireNonNull;
23+
24+
@Path("/trino-gateway")
25+
public class GatewayHealthCheckResource
26+
{
27+
private final ActiveClusterMonitor activeClusterMonitor;
28+
29+
@Inject
30+
public GatewayHealthCheckResource(ActiveClusterMonitor activeClusterMonitor)
31+
{
32+
this.activeClusterMonitor = requireNonNull(activeClusterMonitor, "activeClusterMonitor is null");
33+
}
34+
35+
@GET
36+
@Path("/livez")
37+
public Response liveness()
38+
{
39+
return Response.ok("ok").build();
40+
}
41+
42+
@GET
43+
@Path("/readyz")
44+
public Response readiness()
45+
{
46+
if (!activeClusterMonitor.isInitialized()) {
47+
return Response
48+
.status(Response.Status.SERVICE_UNAVAILABLE)
49+
.entity("Trino Gateway is still initializing")
50+
.build();
51+
}
52+
return Response.ok("ok").build();
53+
}
54+
}

gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import static com.google.common.collect.MoreCollectors.onlyElement;
5050
import static com.google.common.net.HttpHeaders.CONTENT_TYPE;
5151
import static com.google.common.net.MediaType.JSON_UTF_8;
52+
import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
5253
import static org.assertj.core.api.Assertions.assertThat;
5354
import static org.testcontainers.utility.MountableFile.forClasspathResource;
5455

@@ -362,6 +363,30 @@ void testCookieSigning()
362363
assertThat(callbackResponse.code()).isEqualTo(500);
363364
}
364365

366+
@Test
367+
void testHealthCheckEndpoints()
368+
throws IOException
369+
{
370+
Request livenessCheck = new Request.Builder()
371+
.url("http://localhost:" + routerPort + "/trino-gateway/livez")
372+
.build();
373+
Response livenessResponse = httpClient.newCall(livenessCheck).execute();
374+
assertThat(livenessResponse.code()).isEqualTo(200);
375+
376+
Request readinessCheck = new Request.Builder()
377+
.url("http://localhost:" + routerPort + "/trino-gateway/readyz")
378+
.build();
379+
for (int i = 0; i < 100; i++) {
380+
try (Response readinessResponse = httpClient.newCall(readinessCheck).execute()) {
381+
if (readinessResponse.code() == 200) {
382+
return;
383+
}
384+
}
385+
sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
386+
}
387+
throw new IllegalStateException("Trino Gateway health check failed");
388+
}
389+
365390
@AfterAll
366391
void cleanup()
367392
{

0 commit comments

Comments
 (0)