trinodb
diff --git a/‎docs/config.yaml‎
Lines changed: 8 additions & 0 deletions b/‎docs/config.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/installation.md‎
Lines changed: 26 additions & 0 deletions b/‎docs/installation.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎docs/operation.md‎
Lines changed: 16 additions & 2 deletions b/‎docs/operation.md‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎docs/valkey-configuration.md‎
Lines changed: 165 additions & 0 deletions b/‎docs/valkey-configuration.md‎
Lines changed: 165 additions & 0 deletions
diff --git a/‎gateway-ha/config.yaml‎
Lines changed: 8 additions & 0 deletions b/‎gateway-ha/config.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎gateway-ha/pom.xml‎
Lines changed: 6 additions & 0 deletions b/‎gateway-ha/pom.xml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎gateway-ha/src/main/java/io/trino/gateway/ha/config/HaGatewayConfiguration.java‎
Lines changed: 12 additions & 0 deletions b/‎gateway-ha/src/main/java/io/trino/gateway/ha/config/HaGatewayConfiguration.java‎
Lines changed: 12 additions & 0 deletions
@@ -11,3 +11,11 @@ dataStore:
 
 clusterStatsConfiguration:
   monitorType: INFO_API
+
+# Valkey distributed cache (optional - for multi-instance deployments)
+valkeyConfiguration:
+  enabled: false
+  host: localhost
+  port: 6379
+  # password: ${ENV:VALKEY_PASSWORD}  # Uncomment if Valkey requires AUTH
+  # cacheTtlSeconds: 1800  # Cache TTL in seconds (default: 1800 = 30 minutes)
@@ -161,6 +161,32 @@ For additional configurations, use the `log.*` properties from the
 [Trino logging properties documentation](https://trino.io/docs/current/admin/properties-logging.html) and specify
 the properties in `serverConfig`.
 
+### Configure distributed cache (optional)
+
+For multi-instance deployments, Trino Gateway supports distributed caching
+using Valkey (or Redis) to share query metadata across gateway instances.
+This improves query routing and enables horizontal scaling.
+
+For single gateway deployments, distributed caching is not needed - the
+local cache is sufficient.
+
+```yaml
+valkeyConfiguration:
+  enabled: true
+  host: valkey.internal.prod
+  port: 6379
+  password: ${ENV:VALKEY_PASSWORD}
+  cacheTtlSeconds: 1800  # Cache TTL (default: 1800 = 30 minutes)
+```
+
+**Optional parameters**: You can customize `cacheTtlSeconds` based on your query duration:
+
+- Short queries (< 5 min): 600 seconds (10 minutes)
+- Default queries: 1800 seconds (30 minutes)
+- Long-running queries: 3600 seconds (1 hour)
+
+See [Valkey distributed cache configuration](valkey-configuration.md) for
+detailed configuration options, deployment scenarios, and performance tuning.
+
 ### Proxying additional paths
 
 By default, Trino Gateway only proxies requests to paths starting with
 
@@ -58,8 +58,8 @@ monitor:
 
 ## Monitoring <a name="monitoring"></a>
 
-Trino Gateway provides a metrics endpoint that uses the OpenMetrics format at 
-`/metrics`. Use it to monitor Trino Gateway instances with Prometheus and 
+Trino Gateway provides a metrics endpoint that uses the OpenMetrics format at
+`/metrics`. Use it to monitor Trino Gateway instances with Prometheus and
 other compatible systems with the following Prometheus configuration:
 
 ```yaml
@@ -70,6 +70,20 @@ scrape_configs:
         - gateway1.example.com:8080
 ```
 
+### Multi-instance deployments
+
+When running multiple Trino Gateway instances, enable the Valkey distributed
+cache to share query metadata across instances. This ensures consistent query
+routing regardless of which gateway instance receives the request.
+
+Monitor the distributed cache performance by checking:
+
+- Cache hit rate (target: 85-95%)
+- Cache errors (should be near 0)
+- Valkey server connectivity and memory usage
+
+See [Valkey distributed cache configuration](valkey-configuration.md) for
+setup instructions and monitoring details.
+
 ## Trino Gateway health endpoints
 
 Trino Gateway provides two API endpoints to indicate the current status of the server:
 
@@ -0,0 +1,165 @@
+
+## Performance Tuning
+
+### Cache TTL (`cacheTtlSeconds`)
+
+- **Default (1800s / 30min):** Good for typical workloads
+- **Short-lived queries (<5min):** Use 600s (10min)
+- **Long-running queries (hours):** Use 3600s (1 hour) or more
+- **Interactive development:** Use 300s (5min)
+
+### Health Check Interval (`healthCheckIntervalMs`)
+
+- **Default (30000ms / 30s):** Balanced check frequency
+- **Unstable network:** Increase to 60000ms (1 min)
+- **Critical systems:** Decrease to 10000ms (10s)
+
+### Connection Timeouts (`timeoutMs`)
+
+- **Default (2000ms):** Good for local/same-datacenter Valkey
+- **Cross-region:** Increase to 5000ms
+- **High latency network:** Increase to 10000ms
+
+---
+
+## Monitoring
+
+The Valkey cache exposes the following metrics (accessible via the `ValkeyDistributedCache` instance):
+
+```java
+long hits = cache.getCacheHits();
+long misses = cache.getCacheMisses();
+long writes = cache.getCacheWrites();
+long errors = cache.getCacheErrors();
+double hitRate = cache.getCacheHitRate();  // Percentage
+```
+
+### Expected Metrics (Healthy System)
+
+- **Cache Hit Rate:** 85-95%
+- **Cache Errors:** 0 (or very low)
+- **Cache Writes:** ~Equal to query submission rate
+
+### Troubleshooting
+
+**Low Hit Rate (<70%)**
+
+- Check TTL settings (may be too short)
+- Verify Valkey isn't evicting entries (check memory)
+- Check whether multiple gateway versions are deployed (cache key mismatch)
+
+**High Error Rate**
+
+- Check Valkey connectivity
+- Verify password/AUTH configuration
+- Review Valkey server logs
+
+**Connection Pool Exhaustion**
+
+- Increase `maxTotal` setting
+- Check for connection leaks (should be none with try-with-resources)
+
+---
+
+## Security Considerations
+
+### Production Deployment Checklist
+
+- [ ] **Enable AUTH:** Set `password` in configuration
+- [ ] **Use Environment Variables:** Don't hardcode passwords
+- [ ] **Network Security:** Deploy Valkey in private VPC/network
+- [ ] **Encryption at Rest:** Enable Valkey persistence encryption
+- [ ] **TLS/SSL:** (Future enhancement - not yet supported)
+- [ ] **Access Control:** Restrict Valkey port (6379) to gateway instances only
+
+### Example Production Setup
+
+```yaml
+# config.yaml
+valkeyConfiguration:
+  enabled: true
+  host: ${ENV:VALKEY_INTERNAL_HOST}
+  port: 6379
+  password: ${ENV:VALKEY_PASSWORD}
+```
+
+```bash
+# Environment variables (set in deployment)
+export VALKEY_INTERNAL_HOST=valkey.vpc.internal
+export VALKEY_PASSWORD=$(vault read -field=password secret/valkey)
+```
+
+---
+
+## Architecture
+
+### 3-Tier Caching
+
+```
+Request Flow:
+1. Check L1 (Local Guava Cache) → 10k entries, 30min TTL
+   ├─ Hit: Return immediately (~1ms)
+   └─ Miss: Continue to L2
+
+2. Check L2 (Valkey Distributed Cache) → Shared across gateways
+   ├─ Hit: Populate L1, return (~5ms)
+   └─ Miss: Continue to L3
+
+3. Check L3 (PostgreSQL Database) → Source of truth
+   ├─ Found: Populate L2 + L1, return (~50ms)
+   └─ Not Found: Search all backends via HTTP (~200ms)
+```
+
+### Cache Keys
+
+```
+Backend:        trino:query:backend:{queryId}
+Routing Group:  trino:query:routinggroup:{queryId}
+External URL:   trino:query:externalurl:{queryId}
+```
+
+---
+
+## Migration Guide
+
+### From Single Gateway to Multi-Gateway
+
+1. **Deploy Valkey server** (standalone or cluster)
+2. **Update config.yaml** on all gateways:
+   ```yaml
+   valkeyConfiguration:
+     enabled: true
+     host: valkey.internal
+     port: 6379
+     password: ${ENV:VALKEY_PASSWORD}
+   ```
+3. **Restart gateways** (rolling restart recommended)
+4. **Monitor metrics** to verify cache hit rates
+
+No data migration needed - cache will populate automatically.
+
+---
+
+## FAQ
+
+**Q: Do I need Valkey if I only have one gateway?**
+A: No. Local Guava cache is sufficient for single-instance deployments.
+
+**Q: What happens if Valkey goes down?**
+A: Graceful degradation - queries continue working, falling back to database. Performance may degrade slightly.
+
+**Q: Can I use Redis instead of Valkey?**
+A: Yes! Valkey is a Redis fork with a compatible protocol. Just point to your Redis server.
+
+**Q: How much memory does Valkey need?**
+A: Rough estimate: `(queries per minute) × (average query lifetime in minutes) × 500 bytes`
+   Example: 1000 q/min × 30 min × 500 bytes = ~15 MB
+
+**Q: Can I clear the cache?**
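+A: Yes, via the Valkey CLI (`valkey-cli`, or the compatible `redis-cli`): `valkey-cli -h <host> -a <password> FLUSHDB`. Note that `FLUSHDB` removes every key in the selected database, not only gateway entries.
+   Or selectively (`DEL` does not accept glob patterns, so scan for the matching keys first): `valkey-cli -h <host> -a <password> --scan --pattern 'trino:query:backend:*' | xargs valkey-cli -h <host> -a <password> DEL`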
+
+---
+
+## Support
+
+For issues or questions:
+- GitHub Issues: https://github.com/trinodb/trino-gateway/issues
+- Trino Community Slack: #trino-gateway channel
@@ -20,3 +20,11 @@ clusterStatsConfiguration:
 monitor:
   taskDelay: 1m
   clusterMetricsRegistryRefreshPeriod: 30s
+
+# Valkey distributed cache (optional - for multi-instance deployments)
+valkeyConfiguration:
+  enabled: false  # Set to true to enable distributed caching
+  host: localhost
+  port: 6379
+  # password: ${ENV:VALKEY_PASSWORD}  # Uncomment if Valkey requires AUTH
+  # cacheTtlSeconds: 1800  # Cache TTL in seconds (default: 1800 = 30 minutes)
@@ -191,6 +191,12 @@
             <version>${dep.trino.version}</version>
         </dependency>
 
+        <dependency>
+            <groupId>io.valkey</groupId>
+            <artifactId>valkey-java</artifactId>
+            <version>5.5.0</version>
+        </dependency>
+
         <dependency>
             <groupId>jakarta.annotation</groupId>
             <artifactId>jakarta.annotation-api</artifactId>
 
@@ -48,6 +48,8 @@ public class HaGatewayConfiguration
 
     private UIConfiguration uiConfiguration = new UIConfiguration();
 
+    private ValkeyConfiguration valkeyConfiguration = new ValkeyConfiguration();
+
     // List of Modules with FQCN (Fully Qualified Class Name)
     private List<String> modules;
 
@@ -278,6 +280,16 @@ public void setProxyResponseConfiguration(ProxyResponseConfiguration proxyRespon
         this.proxyResponseConfiguration = proxyResponseConfiguration;
     }
 
+    public ValkeyConfiguration getValkeyConfiguration()
+    {
+        return valkeyConfiguration;
+    }
+
+    public void setValkeyConfiguration(ValkeyConfiguration valkeyConfiguration)
+    {
+        this.valkeyConfiguration = valkeyConfiguration;
+    }
+
     private void validateStatementPath(String statementPath, List<String> statementPaths)
     {
         if (statementPath.startsWith(V1_STATEMENT_PATH) ||