Skip to content

Commit 3e4a6fb

Browse files
committed
add monitorType JMX
1 parent 99cae08 commit 3e4a6fb

File tree

9 files changed

+368
-2
lines changed

9 files changed

+368
-2
lines changed

docs/installation.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,56 @@ monitor:
420420

421421
Other timeout parameters are not applicable to the JDBC connection.
422422

423+
#### JMX
424+
425+
The monitor type `JMX` can be used as an alternative to collect cluster information,
426+
which is required for the `QueryCountBasedRouterProvider`. This uses the `v1/jmx/mbean`
427+
endpoint on Trino clusters.
428+
429+
To enable this:
430+
431+
[JMX monitoring](https://trino.io/docs/current/admin/jmx.html) must be activated on all Trino clusters with:
432+
433+
```properties
434+
jmx.rmiregistry.port=<port>
435+
jmx.rmiserver.port=<port>
436+
```
437+
438+
Allow JMX endpoint access by adding rules to your [file-based access control](https://trino.io/docs/current/security/file-system-access-control.html)
439+
configuration. Example for `user`:
440+
441+
```json
442+
{
443+
"catalogs": [
444+
{
445+
"user": "user",
446+
"catalog": "system",
447+
"allow": "read-only"
448+
}
449+
],
450+
"system_information": [
451+
{
452+
"user": "user",
453+
"allow": ["read"]
454+
}
455+
]
456+
}
457+
```
458+
459+
Ensure that a username and password are configured by adding the `backendState`
460+
section to your configuration. The credentials must be consistent across all
461+
backend clusters, and the user must have `read` access to `system_information`.
462+
463+
```yaml
464+
backendState:
465+
username: "user"
466+
password: "password"
467+
```
468+
469+
The JMX monitor will use these credentials to authenticate against the
470+
JMX endpoint of each Trino cluster and collect metrics like running queries,
471+
queued queries, and the number of active worker nodes.
472+
423473
#### UI_API
424474

425475
This pulls cluster information from the `ui/api/stats` REST endpoint. This is
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.gateway.ha.clustermonitor;
15+
16+
import com.fasterxml.jackson.databind.JsonNode;
17+
import io.airlift.http.client.BasicAuthRequestFilter;
18+
import io.airlift.http.client.HttpClient;
19+
import io.airlift.http.client.HttpRequestFilter;
20+
import io.airlift.http.client.JsonResponseHandler;
21+
import io.airlift.http.client.Request;
22+
import io.airlift.http.client.UnexpectedResponseException;
23+
import io.airlift.log.Logger;
24+
import io.trino.gateway.ha.config.BackendStateConfiguration;
25+
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
26+
27+
import java.net.URI;
28+
import java.util.Map;
29+
import java.util.Optional;
30+
import java.util.stream.Collectors;
31+
32+
import static io.airlift.http.client.HttpUriBuilder.uriBuilderFrom;
33+
import static io.airlift.http.client.JsonResponseHandler.createJsonResponseHandler;
34+
import static io.airlift.http.client.Request.Builder.prepareGet;
35+
import static io.airlift.json.JsonCodec.jsonCodec;
36+
import static java.util.Objects.requireNonNull;
37+
38+
public class ClusterStatsJmxMonitor
39+
implements ClusterStatsMonitor
40+
{
41+
private static final Logger log = Logger.get(ClusterStatsJmxMonitor.class);
42+
private static final JsonResponseHandler<JsonNode> JMX_JSON_RESPONSE_HANDLER = createJsonResponseHandler(jsonCodec(JsonNode.class));
43+
private static final String JMX_PATH = "/v1/jmx/mbean";
44+
45+
private final String username;
46+
private final String password;
47+
private final HttpClient client;
48+
49+
public ClusterStatsJmxMonitor(HttpClient client, BackendStateConfiguration backendStateConfiguration)
50+
{
51+
this.client = requireNonNull(client, "client is null");
52+
this.username = backendStateConfiguration.getUsername();
53+
this.password = backendStateConfiguration.getPassword();
54+
}
55+
56+
private static void updateClusterStatsFromDiscoveryNodeManagerResponse(JmxResponse response, ClusterStats.Builder clusterStats)
57+
{
58+
try {
59+
response.attributes().stream()
60+
.filter(attribute -> "ActiveNodeCount".equals(attribute.name()))
61+
.findFirst()
62+
.ifPresent(attribute -> {
63+
int activeNodes = attribute.value();
64+
TrinoStatus trinoStatus = activeNodes > 0 ? TrinoStatus.HEALTHY : TrinoStatus.UNHEALTHY;
65+
clusterStats.numWorkerNodes(activeNodes);
66+
clusterStats.trinoStatus(trinoStatus);
67+
log.debug("Processed DiscoveryNodeManager: ActiveNodeCount = %d, Health = %s", activeNodes, trinoStatus);
68+
});
69+
}
70+
catch (Exception e) {
71+
log.error(e, "Error parsing DiscoveryNodeManager stats");
72+
clusterStats.trinoStatus(TrinoStatus.UNHEALTHY);
73+
}
74+
}
75+
76+
private static void updateClusterStatsFromQueryManagerResponse(JmxResponse response, ClusterStats.Builder clusterStats)
77+
{
78+
try {
79+
Map<String, Integer> stats = response.attributes().stream()
80+
.filter(attribute -> {
81+
String attributeName = attribute.name();
82+
return "QueuedQueries".equals(attributeName) || "RunningQueries".equals(attributeName);
83+
})
84+
.collect(Collectors.toMap(JmxAttribute::name, JmxAttribute::value));
85+
86+
int queuedQueryCount = stats.getOrDefault("QueuedQueries", 0);
87+
clusterStats.queuedQueryCount(queuedQueryCount);
88+
int runningQueryCount = stats.getOrDefault("RunningQueries", 0);
89+
clusterStats.runningQueryCount(runningQueryCount);
90+
91+
log.debug(String.format("Processed QueryManager: QueuedQueries = %d, RunningQueries = %d", queuedQueryCount, runningQueryCount));
92+
}
93+
catch (Exception e) {
94+
log.error(e, "Error parsing QueryManager stats");
95+
}
96+
}
97+
98+
@Override
99+
public ClusterStats monitor(ProxyBackendConfiguration backend)
100+
{
101+
log.info("Monitoring cluster stats for backend: %s", backend.getProxyTo());
102+
ClusterStats.Builder clusterStatsBuilder = ClusterStatsMonitor.getClusterStatsBuilder(backend);
103+
104+
clusterStatsBuilder.proxyTo(backend.getProxyTo())
105+
.externalUrl(backend.getExternalUrl())
106+
.routingGroup(backend.getRoutingGroup());
107+
108+
Optional<JmxResponse> discoveryResponse = queryJmx(backend, "trino.metadata:name=DiscoveryNodeManager");
109+
Optional<JmxResponse> queryResponse = queryJmx(backend, "trino.execution:name=QueryManager");
110+
111+
if (discoveryResponse.isEmpty() || queryResponse.isEmpty()) {
112+
clusterStatsBuilder.trinoStatus(TrinoStatus.UNHEALTHY);
113+
return clusterStatsBuilder.build();
114+
}
115+
116+
discoveryResponse.ifPresent(response -> updateClusterStatsFromDiscoveryNodeManagerResponse(response, clusterStatsBuilder));
117+
queryResponse.ifPresent(response -> updateClusterStatsFromQueryManagerResponse(response, clusterStatsBuilder));
118+
119+
return clusterStatsBuilder.build();
120+
}
121+
122+
private Optional<JmxResponse> queryJmx(ProxyBackendConfiguration backend, String mbeanName)
123+
{
124+
requireNonNull(backend, "backend is null");
125+
requireNonNull(mbeanName, "mbeanName is null");
126+
127+
String jmxUrl = backend.getProxyTo();
128+
Request preparedRequest = prepareGet()
129+
.setUri(uriBuilderFrom(URI.create(jmxUrl))
130+
.appendPath(JMX_PATH)
131+
.appendPath(mbeanName)
132+
.build())
133+
.addHeader("X-Trino-User", username)
134+
.build();
135+
136+
boolean isHttps = preparedRequest.getUri().getScheme().equalsIgnoreCase("https");
137+
138+
if (isHttps) {
139+
HttpRequestFilter filter = new BasicAuthRequestFilter(username, password);
140+
preparedRequest = filter.filterRequest(preparedRequest);
141+
}
142+
143+
log.debug("Querying JMX at %s for %s", preparedRequest.getUri(), mbeanName);
144+
145+
try {
146+
JsonNode response = client.execute(preparedRequest, JMX_JSON_RESPONSE_HANDLER);
147+
return Optional.ofNullable(response).map(JmxResponse::fromJson);
148+
}
149+
catch (UnexpectedResponseException e) {
150+
log.error(e, "Failed to fetch JMX data for %s, response code: %d", mbeanName, e.getStatusCode());
151+
return Optional.empty();
152+
}
153+
catch (Exception e) {
154+
log.error(e, "Exception while querying JMX at %s", jmxUrl);
155+
return Optional.empty();
156+
}
157+
}
158+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.gateway.ha.clustermonitor;
15+
16+
import com.fasterxml.jackson.databind.JsonNode;
17+
18+
public record JmxAttribute(String name, int value)
19+
{
20+
public static JmxAttribute fromJson(JsonNode json)
21+
{
22+
return new JmxAttribute(
23+
json.get("name").asText(),
24+
json.get("value").asInt());
25+
}
26+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.gateway.ha.clustermonitor;
15+
16+
import com.fasterxml.jackson.databind.JsonNode;
17+
import com.google.common.collect.ImmutableList;
18+
19+
import java.util.List;
20+
import java.util.stream.StreamSupport;
21+
22+
import static com.google.common.collect.ImmutableList.toImmutableList;
23+
24+
public record JmxResponse(List<JmxAttribute> attributes)
25+
{
26+
public JmxResponse
27+
{
28+
attributes = ImmutableList.copyOf(attributes);
29+
}
30+
31+
public static JmxResponse fromJson(JsonNode json)
32+
{
33+
List<JmxAttribute> attributes = StreamSupport.stream(json.get("attributes").spliterator(), false)
34+
.map(JmxAttribute::fromJson)
35+
.collect(toImmutableList());
36+
return new JmxResponse(attributes);
37+
}
38+
}

gateway-ha/src/main/java/io/trino/gateway/ha/config/ClusterStatsMonitorType.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@ public enum ClusterStatsMonitorType
1818
NOOP,
1919
INFO_API,
2020
UI_API,
21-
JDBC
21+
JDBC,
22+
JMX
2223
}

gateway-ha/src/main/java/io/trino/gateway/ha/module/HaGatewayProviderModule.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import io.trino.gateway.ha.clustermonitor.ClusterStatsHttpMonitor;
2323
import io.trino.gateway.ha.clustermonitor.ClusterStatsInfoApiMonitor;
2424
import io.trino.gateway.ha.clustermonitor.ClusterStatsJdbcMonitor;
25+
import io.trino.gateway.ha.clustermonitor.ClusterStatsJmxMonitor;
2526
import io.trino.gateway.ha.clustermonitor.ClusterStatsMonitor;
2627
import io.trino.gateway.ha.clustermonitor.ClusterStatsObserver;
2728
import io.trino.gateway.ha.clustermonitor.ForMonitor;
@@ -228,6 +229,7 @@ public ClusterStatsMonitor getClusterStatsMonitor(@ForMonitor HttpClient httpCli
228229
case INFO_API -> new ClusterStatsInfoApiMonitor(httpClient, configuration.getMonitor());
229230
case UI_API -> new ClusterStatsHttpMonitor(configuration.getBackendState());
230231
case JDBC -> new ClusterStatsJdbcMonitor(configuration.getBackendState(), configuration.getMonitor());
232+
case JMX -> new ClusterStatsJmxMonitor(httpClient, configuration.getBackendState());
231233
case NOOP -> new NoopClusterStatsMonitor();
232234
};
233235
}

gateway-ha/src/test/java/io/trino/gateway/ha/clustermonitor/TestClusterStatsMonitor.java

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,13 @@
1313
*/
1414
package io.trino.gateway.ha.clustermonitor;
1515

16+
import com.google.common.net.MediaType;
17+
import io.airlift.http.client.HttpClient;
1618
import io.airlift.http.client.HttpClientConfig;
19+
import io.airlift.http.client.HttpStatus;
1720
import io.airlift.http.client.jetty.JettyHttpClient;
21+
import io.airlift.http.client.testing.TestingHttpClient;
22+
import io.airlift.http.client.testing.TestingResponse;
1823
import io.airlift.units.Duration;
1924
import io.trino.gateway.ha.config.BackendStateConfiguration;
2025
import io.trino.gateway.ha.config.MonitorConfiguration;
@@ -41,7 +46,8 @@ final class TestClusterStatsMonitor
4146
void setUp()
4247
{
4348
trino = new TrinoContainer("trinodb/trino");
44-
trino.withCopyFileToContainer(forClasspathResource("trino-config.properties"), "/etc/trino/config.properties");
49+
trino.withCopyFileToContainer(forClasspathResource("trino-config-with-rmi.properties"), "/etc/trino/config.properties");
50+
trino.withCopyFileToContainer(forClasspathResource("jvm-with-rmi.config"), "/etc/trino/jvm.config");
4551
trino.start();
4652
}
4753

@@ -65,6 +71,63 @@ void testJdbcMonitor()
6571
testClusterStatsMonitor(backendStateConfiguration -> new ClusterStatsJdbcMonitor(backendStateConfiguration, monitorConfigurationWithTimeout));
6672
}
6773

74+
@Test
75+
void testJmxMonitor()
76+
{
77+
testClusterStatsMonitor(backendStateConfiguration -> new ClusterStatsJmxMonitor(new JettyHttpClient(new HttpClientConfig()), backendStateConfiguration));
78+
}
79+
80+
@Test
81+
void testJmxMonitorWithBadRequest()
82+
{
83+
HttpClient client = new TestingHttpClient(ignored -> TestingResponse
84+
.mockResponse(HttpStatus.BAD_REQUEST, MediaType.PLAIN_TEXT_UTF_8, "Bad Request"));
85+
86+
testClusterStatsMonitorWithClient(client);
87+
}
88+
89+
@Test
90+
void testJmxMonitorWithServerError()
91+
{
92+
HttpClient client = new TestingHttpClient(ignored -> TestingResponse
93+
.mockResponse(HttpStatus.INTERNAL_SERVER_ERROR, MediaType.PLAIN_TEXT_UTF_8, "Internal Server Error"));
94+
95+
testClusterStatsMonitorWithClient(client);
96+
}
97+
98+
@Test
99+
void testJmxMonitorWithInvalidJson()
100+
{
101+
HttpClient client = new TestingHttpClient(ignored -> TestingResponse
102+
.mockResponse(HttpStatus.OK, MediaType.JSON_UTF_8, "{invalid:json}"));
103+
104+
testClusterStatsMonitorWithClient(client);
105+
}
106+
107+
@Test
108+
void testJmxMonitorWithNetworkError()
109+
{
110+
HttpClient client = new TestingHttpClient(ignored -> {
111+
throw new RuntimeException("Network error");
112+
});
113+
114+
testClusterStatsMonitorWithClient(client);
115+
}
116+
117+
private static void testClusterStatsMonitorWithClient(HttpClient client)
118+
{
119+
BackendStateConfiguration backendStateConfiguration = new BackendStateConfiguration();
120+
backendStateConfiguration.setUsername("test_user");
121+
ClusterStatsMonitor monitor = new ClusterStatsJmxMonitor(client, backendStateConfiguration);
122+
123+
ProxyBackendConfiguration proxyBackend = new ProxyBackendConfiguration();
124+
proxyBackend.setProxyTo("http://localhost:8080");
125+
proxyBackend.setName("test_cluster");
126+
127+
ClusterStats stats = monitor.monitor(proxyBackend);
128+
assertThat(stats.trinoStatus()).isEqualTo(TrinoStatus.UNHEALTHY);
129+
}
130+
68131
@Test
69132
void testInfoApiMonitor()
70133
{

0 commit comments

Comments
 (0)