
Commit 1c88766

Add max_cache_length to PredictRequest.RequestOptions.

When set and supported by the servable, the model server will cache the prefix of the request up to this length.

PiperOrigin-RevId: 817414641

1 parent 6d2fd08

File tree: 1 file changed, 4 additions (+), 0 deletions (-)

tensorflow_serving/apis/predict.proto

Lines changed: 4 additions & 0 deletions

@@ -71,6 +71,10 @@ message PredictRequest {
     // response if the model stops at them. The model may stop at other tokens,
     // but will not return them in the response.
     repeated int64 return_stoptokens = 4;
+
+    // When set and supported by servable, the model server will cache the
+    // prefix of request up to this length.
+    optional int64 max_cache_length = 6;
   }
 
   optional RequestOptions request_options = 7;
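
For context, a minimal client-side sketch of how the new option might be set once Python bindings are regenerated from this revision of predict.proto. The servable name "my_llm", the input key "prompt", the cache length of 1024, and the server address are all illustrative assumptions, not part of the commit.

    # Sketch only: assumes bindings regenerated from this predict.proto revision
    # and a gRPC ModelServer listening on localhost:8500.
    import grpc
    import tensorflow as tf
    from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc

    channel = grpc.insecure_channel("localhost:8500")
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    request = predict_pb2.PredictRequest()
    request.model_spec.name = "my_llm"  # hypothetical servable name
    request.inputs["prompt"].CopyFrom(  # hypothetical input key
        tf.make_tensor_proto(["Once upon a time"]))

    # New in this commit: ask the server to cache the request prefix up to
    # 1024 tokens, if the servable supports prefix caching. Servables that do
    # not support it are expected to ignore the option.
    request.request_options.max_cache_length = 1024

    response = stub.Predict(request, timeout=10.0)

Whether caching takes effect depends on the servable; the field only expresses the client's request.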
