diff --git a/docs/installation.md b/docs/installation.md index 276edef84..8eb954731 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -557,3 +557,23 @@ a username and password using `backendState` as with the `JDBC` option. #### NOOP This option disables health checks. + +### Best-effort routing when all backends are unhealthy + +By default, routing only selects backends that are both ACTIVE and HEALTHY. +However, in environments where health checks may occasionally be flaky, +this behavior can result in “Number of active backends found zero” errors—even when +viable clusters technically exist. + +In reality, if a cluster is truly unhealthy, the query will fail regardless of whether +the gateway routes to it or not. To prevent unnecessary immediate failures, you can enable +best-effort routing. + +When best-effort mode is enabled and no ACTIVE backend in the default routing group is HEALTHY, +the router still chooses among that group's ACTIVE backends as a last resort instead of failing. +A non-default routing group with no healthy backends continues to fall back to the default group. + +```yaml +routing: + bestEffortRouting: true +``` diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/config/RoutingConfiguration.java b/gateway-ha/src/main/java/io/trino/gateway/ha/config/RoutingConfiguration.java index 035218d3d..e99766c7b 100644 --- a/gateway-ha/src/main/java/io/trino/gateway/ha/config/RoutingConfiguration.java +++ b/gateway-ha/src/main/java/io/trino/gateway/ha/config/RoutingConfiguration.java @@ -25,6 +25,10 @@ public class RoutingConfiguration private String defaultRoutingGroup = "adhoc"; + // When true, if all active backends are unhealthy, route among active backends anyway (best-effort). + // Default is false for backward compatibility (strict: healthy-only). 
+    private boolean bestEffortRouting;
+
     public Duration getAsyncTimeout()
     {
         return asyncTimeout;
@@ -54,4 +58,20 @@ public void setDefaultRoutingGroup(String defaultRoutingGroup)
     {
         this.defaultRoutingGroup = defaultRoutingGroup;
     }
+
+    /**
+     * Returns whether routing may fall back to active-but-unhealthy backends; {@code false} unless set.
+     */
+    public boolean isBestEffortRouting()
+    {
+        return bestEffortRouting;
+    }
+
+    /**
+     * Enables or disables best-effort routing.
+     */
+    public void setBestEffortRouting(boolean bestEffortRouting)
+    {
+        this.bestEffortRouting = bestEffortRouting;
+    }
 }
diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/router/BaseRoutingManager.java b/gateway-ha/src/main/java/io/trino/gateway/ha/router/BaseRoutingManager.java
index 996ee158c..a5e3f091a 100644
--- a/gateway-ha/src/main/java/io/trino/gateway/ha/router/BaseRoutingManager.java
+++ b/gateway-ha/src/main/java/io/trino/gateway/ha/router/BaseRoutingManager.java
@@ -54,6 +54,7 @@ public abstract class BaseRoutingManager
     private final GatewayBackendManager gatewayBackendManager;
     private final ConcurrentHashMap backendToStatus;
     private final String defaultRoutingGroup;
+    private final boolean bestEffortRouting;
     private final QueryHistoryManager queryHistoryManager;
     private final LoadingCache queryIdBackendCache;
     private final LoadingCache queryIdRoutingGroupCache;
@@ -63,6 +64,7 @@ public BaseRoutingManager(GatewayBackendManager gatewayBackendManager, QueryHist
     {
         this.gatewayBackendManager = gatewayBackendManager;
         this.defaultRoutingGroup = routingConfiguration.getDefaultRoutingGroup();
+        this.bestEffortRouting = routingConfiguration.isBestEffortRouting();
         this.queryHistoryManager = queryHistoryManager;
         this.queryIdBackendCache = buildCache(this::findBackendForUnknownQueryId);
         this.queryIdRoutingGroupCache = buildCache(this::findRoutingGroupForUnknownQueryId);
@@ -92,10 +94,16 @@ public void setRoutingGroupForQueryId(String queryId, String routingGroup)
      */
     public ProxyBackendConfiguration provideDefaultBackendConfiguration(String user)
     {
-        List<ProxyBackendConfiguration> backends = gatewayBackendManager.getActiveDefaultBackends().stream()
+        List<ProxyBackendConfiguration> activeDefaults = gatewayBackendManager.getActiveDefaultBackends();
+        List<ProxyBackendConfiguration> healthyDefaults = activeDefaults.stream()
                 .filter(backEnd -> isBackendHealthy(backEnd.getName()))
                 .toList();
-        return selectBackend(backends, user).orElseThrow(() -> new IllegalStateException("Number of active backends found zero"));
+        // Best-effort mode: with no healthy default backend, consider every active default instead of failing immediately
+        List<ProxyBackendConfiguration> candidates = healthyDefaults.isEmpty() && bestEffortRouting
+                ? activeDefaults
+                : healthyDefaults;
+        return selectBackend(candidates, user)
+                .orElseThrow(() -> new IllegalStateException("Number of active backends found zero"));
     }
 
     /**
diff --git a/gateway-ha/src/test/java/io/trino/gateway/ha/router/TestBestEffortRouting.java b/gateway-ha/src/test/java/io/trino/gateway/ha/router/TestBestEffortRouting.java
new file mode 100644
index 000000000..351246caf
--- /dev/null
+++ b/gateway-ha/src/test/java/io/trino/gateway/ha/router/TestBestEffortRouting.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.gateway.ha.router;
+
+import io.trino.gateway.ha.clustermonitor.TrinoStatus;
+import io.trino.gateway.ha.config.ProxyBackendConfiguration;
+import io.trino.gateway.ha.config.RoutingConfiguration;
+import io.trino.gateway.ha.persistence.JdbcConnectionManager;
+import org.junit.jupiter.api.Test;
+
+import static io.trino.gateway.ha.TestingJdbcConnectionManager.createTestingJdbcConnectionManager;
+import static org.assertj.core.api.Assertions.assertThat;
+
+final class TestBestEffortRouting
+{
+    @Test
+    void testBestEffortRoutingEnabledAllUnhealthy()
+    {
+        // Best-effort on: every backend of the requested group is unhealthy, yet one of them must still be chosen
+        RoutingConfiguration config = new RoutingConfiguration();
+        config.setBestEffortRouting(true);
+        JdbcConnectionManager jdbc = createTestingJdbcConnectionManager();
+        GatewayBackendManager backendManager = new HaGatewayManager(jdbc.getJdbi(), config);
+        RoutingManager routingManager = new StochasticRoutingManager(backendManager, new HaQueryHistoryManager(jdbc.getJdbi(), false), config);
+
+        String group = "adhoc";
+        addActiveBackend(backendManager, group, "trino-1");
+        addActiveBackend(backendManager, group, "trino-2");
+        routingManager.updateBackEndHealth("trino-1", TrinoStatus.UNHEALTHY);
+        routingManager.updateBackEndHealth("trino-2", TrinoStatus.UNHEALTHY);
+
+        ProxyBackendConfiguration chosen = routingManager.provideBackendConfiguration(group, "user");
+        assertThat(chosen.getName()).isIn("trino-1", "trino-2");
+        assertThat(chosen.getRoutingGroup()).isEqualTo(group);
+    }
+
+    @Test
+    void testFallsBackWhenAllUnhealthyInGroup()
+    {
+        RoutingConfiguration config = new RoutingConfiguration();
+        config.setDefaultRoutingGroup("adhoc");
+        config.setBestEffortRouting(true);
+        JdbcConnectionManager jdbc = createTestingJdbcConnectionManager();
+        GatewayBackendManager backendManager = new HaGatewayManager(jdbc.getJdbi(), config);
+        RoutingManager routingManager = new StochasticRoutingManager(backendManager, new HaQueryHistoryManager(jdbc.getJdbi(), false), config);
+
+        // The requested (non-default) group has no healthy member
+        String vipGroup = "vip";
+        addActiveBackend(backendManager, vipGroup, "vip-1");
+        addActiveBackend(backendManager, vipGroup, "vip-2");
+        routingManager.updateBackEndHealth("vip-1", TrinoStatus.UNHEALTHY);
+        routingManager.updateBackEndHealth("vip-2", TrinoStatus.UNHEALTHY);
+
+        // The default group keeps exactly one healthy member, which is the expected target
+        addActiveBackend(backendManager, "adhoc", "adhoc-1");
+        addActiveBackend(backendManager, "adhoc", "adhoc-2");
+        routingManager.updateBackEndHealth("adhoc-1", TrinoStatus.HEALTHY);
+        routingManager.updateBackEndHealth("adhoc-2", TrinoStatus.UNHEALTHY);
+
+        ProxyBackendConfiguration chosen = routingManager.provideBackendConfiguration(vipGroup, "user");
+        assertThat(chosen.getRoutingGroup()).isEqualTo("adhoc");
+        assertThat(chosen.getName()).isEqualTo("adhoc-1");
+    }
+
+    private static void addActiveBackend(GatewayBackendManager backendManager, String routingGroup, String backendName)
+    {
+        ProxyBackendConfiguration config = new ProxyBackendConfiguration();
+        config.setActive(true);
+        config.setRoutingGroup(routingGroup);
+        config.setName(backendName);
+        config.setProxyTo(backendName + ".trino.example.com");
+        config.setExternalUrl("trino.example.com");
+        backendManager.addBackend(config);
+    }
+}