trinodb · felicity3786 · Dec 12, 2025 · Dec 12, 2025 · Dec 13, 2025 · xkrogen
diff --git a/docs/installation.md b/docs/installation.md
@@ -557,3 +557,23 @@ a username and password using `backendState` as with the `JDBC` option.
 #### NOOP
 
 This option disables health checks.
+
+### Best-effort routing when all backends are unhealthy
+
+By default, the gateway routes queries only to backends that are both active and healthy.
+In environments where health checks are occasionally flaky, this strict behavior
+can cause `Number of active backends found zero` errors even though
+usable clusters exist.
+
+If a cluster really is unhealthy, a query fails whether the gateway rejects it up front or
+routes it to that cluster, so nothing is lost by trying. To avoid rejecting queries that
+could have succeeded, you can enable best-effort routing, which is disabled by default.
+
+When best-effort routing is enabled and no healthy backend is available, not even in the
+default routing group, the router chooses among the active backends of the default routing
+group as a last resort instead of failing the routing decision outright.
+
+```yaml
+routing:
+  bestEffortRouting: true
+```
diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/config/RoutingConfiguration.java b/gateway-ha/src/main/java/io/trino/gateway/ha/config/RoutingConfiguration.java
@@ -25,6 +25,10 @@ public class RoutingConfiguration
 
     private String defaultRoutingGroup = "adhoc";
 
+    // When true, if no active default backend is healthy, route among all active default backends anyway (best-effort).
+    // Default is false for backward compatibility (strict: healthy-only).
+    private boolean bestEffortRouting;
+
     public Duration getAsyncTimeout()
     {
         return asyncTimeout;
@@ -54,4 +58,14 @@ public void setDefaultRoutingGroup(String defaultRoutingGroup)
     {
         this.defaultRoutingGroup = defaultRoutingGroup;
     }
+
+    public boolean isBestEffortRouting()
+    {
+        return bestEffortRouting;
+    }
+
+    public void setBestEffortRouting(boolean bestEffortRouting)
+    {
+        this.bestEffortRouting = bestEffortRouting;
+    }
 }
diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/router/BaseRoutingManager.java b/gateway-ha/src/main/java/io/trino/gateway/ha/router/BaseRoutingManager.java
@@ -54,6 +54,7 @@ public abstract class BaseRoutingManager
     private final GatewayBackendManager gatewayBackendManager;
     private final ConcurrentHashMap<String, TrinoStatus> backendToStatus;
     private final String defaultRoutingGroup;
+    private final boolean bestEffortRouting;
     private final QueryHistoryManager queryHistoryManager;
     private final LoadingCache<String, String> queryIdBackendCache;
     private final LoadingCache<String, String> queryIdRoutingGroupCache;
@@ -63,6 +64,7 @@ public BaseRoutingManager(GatewayBackendManager gatewayBackendManager, QueryHist
     {
         this.gatewayBackendManager = gatewayBackendManager;
         this.defaultRoutingGroup = routingConfiguration.getDefaultRoutingGroup();
+        this.bestEffortRouting = routingConfiguration.isBestEffortRouting();
         this.queryHistoryManager = queryHistoryManager;
         this.queryIdBackendCache = buildCache(this::findBackendForUnknownQueryId);
         this.queryIdRoutingGroupCache = buildCache(this::findRoutingGroupForUnknownQueryId);
@@ -92,10 +94,16 @@ public void setRoutingGroupForQueryId(String queryId, String routingGroup)
      */
     public ProxyBackendConfiguration provideDefaultBackendConfiguration(String user)
     {
-        List<ProxyBackendConfiguration> backends = gatewayBackendManager.getActiveDefaultBackends().stream()
+        List<ProxyBackendConfiguration> activeBackends = gatewayBackendManager.getActiveDefaultBackends();
+        List<ProxyBackendConfiguration> healthyBackends = activeBackends.stream()
                 .filter(backEnd -> isBackendHealthy(backEnd.getName()))
                 .toList();
-        return selectBackend(backends, user).orElseThrow(() -> new IllegalStateException("Number of active backends found zero"));
+        // Strict mode considers healthy backends only. Best-effort mode widens the candidates to
+        // every active default backend, but only when none of them passes the health filter.
+        boolean routeToUnhealthy = bestEffortRouting && healthyBackends.isEmpty();
+        List<ProxyBackendConfiguration> candidates = routeToUnhealthy ? activeBackends : healthyBackends;
+        return selectBackend(candidates, user)
+                .orElseThrow(() -> new IllegalStateException("Number of active backends found zero"));
     }
 
     /**

diff --git a/gateway-ha/src/test/java/io/trino/gateway/ha/router/TestBestEffortRouting.java b/gateway-ha/src/test/java/io/trino/gateway/ha/router/TestBestEffortRouting.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.gateway.ha.router;
+
+import io.trino.gateway.ha.clustermonitor.TrinoStatus;
+import io.trino.gateway.ha.config.ProxyBackendConfiguration;
+import io.trino.gateway.ha.config.RoutingConfiguration;
+import io.trino.gateway.ha.persistence.JdbcConnectionManager;
+import org.junit.jupiter.api.Test;
+
+import static io.trino.gateway.ha.TestingJdbcConnectionManager.createTestingJdbcConnectionManager;
+import static org.assertj.core.api.Assertions.assertThat;
+
+final class TestBestEffortRouting
+{
+    @Test
+    void testBestEffortRoutingEnabledAllUnhealthy()
+    {
+        JdbcConnectionManager jdbcManager = createTestingJdbcConnectionManager();
+        RoutingConfiguration configuration = new RoutingConfiguration();
+        configuration.setBestEffortRouting(true);
+        GatewayBackendManager backendManager = new HaGatewayManager(jdbcManager.getJdbi(), configuration);
+        RoutingManager routingManager = new StochasticRoutingManager(backendManager, new HaQueryHistoryManager(jdbcManager.getJdbi(), false), configuration);
+
+        // Both backends of the group are active but reported as unhealthy
+        addActiveBackend(backendManager, "adhoc", "trino-1");
+        addActiveBackend(backendManager, "adhoc", "trino-2");
+        routingManager.updateBackEndHealth("trino-1", TrinoStatus.UNHEALTHY);
+        routingManager.updateBackEndHealth("trino-2", TrinoStatus.UNHEALTHY);
+
+        // With best-effort routing enabled, one of the unhealthy backends is still returned
+        ProxyBackendConfiguration routed = routingManager.provideBackendConfiguration("adhoc", "user");
+        assertThat(routed.getName()).isIn("trino-1", "trino-2");
+        assertThat(routed.getRoutingGroup()).isEqualTo("adhoc");
+    }
+
+    @Test
+    void testFallsBackWhenAllUnhealthyInGroup()
+    {
+        JdbcConnectionManager jdbcManager = createTestingJdbcConnectionManager();
+        RoutingConfiguration configuration = new RoutingConfiguration();
+        configuration.setBestEffortRouting(true);
+        configuration.setDefaultRoutingGroup("adhoc");
+        GatewayBackendManager backendManager = new HaGatewayManager(jdbcManager.getJdbi(), configuration);
+        RoutingManager routingManager = new StochasticRoutingManager(backendManager, new HaQueryHistoryManager(jdbcManager.getJdbi(), false), configuration);
+
+        // The requested, non-default group contains unhealthy backends only
+        String requestedGroup = "vip";
+        addActiveBackend(backendManager, requestedGroup, "vip-1");
+        addActiveBackend(backendManager, requestedGroup, "vip-2");
+        routingManager.updateBackEndHealth("vip-1", TrinoStatus.UNHEALTHY);
+        routingManager.updateBackEndHealth("vip-2", TrinoStatus.UNHEALTHY);
+
+        // The default group has one healthy backend next to an unhealthy one
+        addActiveBackend(backendManager, "adhoc", "adhoc-1");
+        addActiveBackend(backendManager, "adhoc", "adhoc-2");
+        routingManager.updateBackEndHealth("adhoc-1", TrinoStatus.HEALTHY);
+        routingManager.updateBackEndHealth("adhoc-2", TrinoStatus.UNHEALTHY);
+
+        ProxyBackendConfiguration routed = routingManager.provideBackendConfiguration(requestedGroup, "user");
+        assertThat(routed.getRoutingGroup()).isEqualTo("adhoc");
+        assertThat(routed.getName()).isEqualTo("adhoc-1");
+    }
+
+    private static void addActiveBackend(GatewayBackendManager backendManager, String routingGroup, String backendName)
+    {
+        ProxyBackendConfiguration configuration = new ProxyBackendConfiguration();
+        configuration.setName(backendName);
+        configuration.setRoutingGroup(routingGroup);
+        configuration.setActive(true);
+        configuration.setProxyTo(backendName + ".trino.example.com");
+        configuration.setExternalUrl("trino.example.com");
+        backendManager.addBackend(configuration);
+    }
+}