
Commit d546795

Performance improvements for prefix cache routing (#933)
* Performance improvements for prefix cache routing

Signed-off-by: Varun Gupta <varungup90@gmail.com>
1 parent 4f25c42 commit d546795

37 files changed: +1027 −656

benchmarks/client/client.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -53,6 +53,7 @@ async def send_request_streaming(client: openai.AsyncOpenAI,
     start_time = asyncio.get_event_loop().time()
     first_response_time = None
     target_pod = ""
+    target_request_id = ""
     try:
         cur_time = time.time()
         logging.warning(f"send_request_streaming: Prepare to launch task after {target_time - cur_time}")
@@ -66,6 +67,7 @@ async def send_request_streaming(client: openai.AsyncOpenAI,
         )
         if hasattr(response_stream, 'response') and hasattr(response_stream.response, 'headers'):
             target_pod = response_stream.response.headers.get('target-pod')
+            target_request_id = response_stream.response.headers.get('request-id')
 
         text_chunks = []
         prompt_tokens = 0
@@ -117,6 +119,7 @@ async def send_request_streaming(client: openai.AsyncOpenAI,
             "ttft": ttft,
             "tpot": tpot,
             "target_pod": target_pod,
+            "target_request_id": target_request_id,
             "session_id": session_id,
         }
 
@@ -141,6 +144,7 @@ async def send_request_streaming(client: openai.AsyncOpenAI,
             "start_time": start_time,
             "end_time": error_time,
             "target_pod": target_pod,
+            "target_request_id": target_request_id,
             "session_id": session_id,
         }
         logging.error(f"Request {request_id}: Error ({error_type}): {str(e)}")
@@ -375,4 +379,3 @@ def main(args):
 
     args = parser.parse_args()
     main(args)
-
```

config/gateway/gateway-plugin/gateway-plugin.yaml

Lines changed: 8 additions & 0 deletions

```diff
@@ -58,6 +58,14 @@ spec:
           value: "6379"
         - name: AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS
           value: "50"
+        - name: AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE
+          value: "character"
+        - name: AIBRIX_PREFIX_CACHE_BLOCK_SIZE
+          value: "128"
+        - name: AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT
+          value: "16"
+        - name: AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR
+          value: "2"
         # - name: AIBRIX_PREFIX_CACHE_EVICTION_DURATION_MINS
         #   value: "1"
         - name: POD_NAME
```

config/overlays/release/default_patch.yaml

Lines changed: 11 additions & 1 deletion

```diff
@@ -17,4 +17,14 @@ spec:
           memory: 8Gi
       env:
         - name: AIBRIX_GPU_OPTIMIZER_TRACING_FLAG
-          value: "false"
+          value: "false"
+        - name: AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE
+          value: "character"
+        - name: AIBRIX_PREFIX_CACHE_BLOCK_SIZE
+          value: "128"
+        - name: AIBRIX_PREFIX_CACHE_BLOCK_NUMBER
+          value: "200000"
+        - name: AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT
+          value: "16"
+        - name: AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR
+          value: "2"
```

pkg/cache/cache_init.go

Lines changed: 9 additions & 7 deletions

```diff
@@ -91,22 +91,24 @@ func New(redisClient *redis.Client, prometheusApi prometheusv1.API) *Store {
     }
 }
 
-func NewTestCacheWithPods(pods []*v1.Pod) *Store {
+func NewTestCacheWithPods(pods []*v1.Pod, model string) *Store {
     c := &Store{}
     for _, pod := range pods {
         pod.Labels = make(map[string]string)
-        pod.Labels[modelIdentifier] = "modelName"
+        pod.Labels[modelIdentifier] = model
         c.addPod(pod)
     }
     return c
 }
 
-func NewTestCacheWithPodsMetrics(pods []*v1.Pod, podMetrics map[string]map[string]metrics.MetricValue) *Store {
-    c := NewTestCacheWithPods(pods)
+func NewTestCacheWithPodsMetrics(pods []*v1.Pod, model string, podMetrics map[string]map[string]metrics.MetricValue) *Store {
+    c := NewTestCacheWithPods(pods, model)
     c.metaPods.Range(func(podName string, metaPod *Pod) bool {
-        if metrics, ok := podMetrics[podName]; ok {
-            for metricName, metric := range metrics {
-                metaPod.Metrics.Store(metricName, metric)
+        if podmetrics, ok := podMetrics[podName]; ok {
+            for metricName, metric := range podmetrics {
+                if err := c.updatePodRecord(metaPod, model, metricName, metrics.PodMetricScope, metric); err != nil {
+                    return false
+                }
             }
         }
         return true
```
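
For orientation, here is a hypothetical test call site under the new model-aware signatures; the import paths and model name are assumptions from the repository layout, though `RealtimeNumRequestsRunning` and `SimpleMetricValue` appear elsewhere in this commit. Metrics supplied this way now flow through `updatePodRecord` instead of being stored on the pod directly.

```go
// Hypothetical sketch of building a test cache with the new signatures.
package cache_test

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/vllm-project/aibrix/pkg/cache"
	"github.com/vllm-project/aibrix/pkg/metrics"
)

func newTestStore() *cache.Store {
	pods := []*v1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "p1"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "p2"}},
	}
	podMetrics := map[string]map[string]metrics.MetricValue{
		// recorded via updatePodRecord with PodMetricScope under the hood
		"p1": {metrics.RealtimeNumRequestsRunning: &metrics.SimpleMetricValue{Value: 3}},
	}
	// the model name now flows into pod labels and per-pod metric records
	return cache.NewTestCacheWithPodsMetrics(pods, "example-model", podMetrics)
}
```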

pkg/cache/cache_metrics.go

Lines changed: 0 additions & 3 deletions

```diff
@@ -329,9 +329,6 @@ func (c *Store) queryUpdatePromQLMetrics(metric metrics.Metric, queryLabels map[
 // TODO: replace in-place metric update podMetrics and podModelMetrics to fresh copy for preventing stale metric keys
 func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, scope metrics.MetricScope, metricValue metrics.MetricValue) error {
     if scope == metrics.PodMetricScope {
-        if modelName != "" {
-            return fmt.Errorf("modelName should be empty for scope %v", scope)
-        }
         pod.Metrics.Store(metricName, metricValue)
     } else if scope == metrics.PodModelMetricScope {
         if modelName == "" {
```
Lines changed: 106 additions & 0 deletions

# Routing Algorithms

## Prefix Cache Aware

Below is the pseudo-code for prefix-cache aware routing.

```shell
func prefix_cache_routing(input_tokens, ready_pods []*v1.Pod) {
  if check_load_imbalance(ready_pods) {
    target_pod = select_pod_with_least_running_requests(ready_pods)
  } else {
    match_pods, prefix_hashes = match_prefix(input_tokens, ready_pods)
    if len(match_pods) > 0 {
      target_pod = select_least_loaded_match_pod(match_pods, ready_pods)
    }
  }

  // if no target pod is selected, fall back to the pod with the least running requests
  if target_pod == nil {
    target_pod = select_pod_with_least_running_requests(ready_pods)
  }
}

func check_load_imbalance(ready_pods) {
  // find the pods with the min and max number of running requests
  min_pod = select_pod_min_running_requests()
  max_pod = select_pod_max_running_requests()

  // if the difference between the max & min running request counts exceeds the
  // configurable ABS_RUNNING_REQUEST_COUNT (see
  // AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT below),
  // the load is imbalanced
  if max_pod.running_requests - min_pod.running_requests > ABS_RUNNING_REQUEST_COUNT {
    return true
  }
  return false
}

func match_prefix(input_tokens, ready_pods) {
  // input_tokens are split into blocks of the configurable block_size and
  // a hash is calculated for each token block
  hashes = calculate_hashes(input_tokens)

  // check whether each token block already exists on the ready_pods (prefix match);
  // if present, compute pod_name -> prefix_match_percent
  match_pods_with_prefix_match_percent = check_hashes_on_ready_pods(hashes, ready_pods)
  return match_pods_with_prefix_match_percent, hashes
}

func select_least_loaded_match_pod(match_pods_with_prefix_match_percent, ready_pods) {
  mean = calculate_mean_running_request(ready_pods)
  std_dev = calculate_std_dev_running_request(ready_pods)

  // sort match_pods by decreasing prefix_match_percent and, for equal
  // prefix_match_percent, by increasing running_request count
  sort(match_pods_with_prefix_match_percent)

  // select the match pod with the highest prefix match whose
  // running_request <= (mean + load_factor * std_dev)
  for _, pod := range match_pods_with_prefix_match_percent {
    if pod.running_request <= mean + load_factor*std_dev {
      return pod
    }
  }
}

// selects the pod with the minimum running requests, similar to the least-request routing algorithm
func select_pod_with_least_running_requests(ready_pods) {
  return select_pod_min_running_requests()
}
```
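
The pseudo-code leaves `calculate_hashes` abstract. As a rough illustration only (not the gateway's actual hashing code), one common construction chains each block's hash with its predecessor's, so a match on the i-th hash implies the entire prefix up to block i matches. A minimal Go sketch, assuming FNV-1a as the hash:

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// calculateHashes splits tokens into fixed-size blocks and emits one hash per
// full block. Each hash folds in the previous block's hash, so matching the
// i-th hash implies the whole prefix up to block i matches. Illustrative
// construction only, not the gateway's implementation.
func calculateHashes(tokens []byte, blockSize int) []uint64 {
	var hashes []uint64
	var prev uint64
	for start := 0; start+blockSize <= len(tokens); start += blockSize {
		h := fnv.New64a()
		fmt.Fprintf(h, "%d:", prev) // chain the previous block's hash
		h.Write(tokens[start : start+blockSize])
		prev = h.Sum64()
		hashes = append(hashes, prev)
	}
	return hashes
}

func main() {
	// with the character tokenizer, the "tokens" are simply the prompt's characters
	prompt := []byte("You are a helpful assistant. Summarize the document below.")
	fmt.Println(calculateHashes(prompt, 16)) // small block size for the demo
}
```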

## Configurations

- **_AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE_**

  The AIBrix gateway implements two tokenizers, **_character_** and **_tiktoken_**. The default tokenizer is <ins>**_character_**</ins>.

  | Tokenizer Type | Details |
  | ------------- | ------------- |
  | character | splits input text into characters |
  | tiktoken | open-source openai/tiktoken [tokenizer](https://github.com/openai/tiktoken) |

- **_AIBRIX_PREFIX_CACHE_BLOCK_SIZE_**

  The tokenized input request is split into blocks, and the hash value of each block is cached for future matches. The block size (i.e. the number of tokens per block) determines how effective the prefix match will be. The default is the <ins>**_character tokenizer with a block size of 128 (tokens per block)_**</ins>.

  | Tokenizer Type | Block Size Recommendation |
  | ------------- | ------------- |
  | character | 128 |
  | tiktoken | 16 |
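
  As a worked example: with the character tokenizer and a block size of 128, a 300-character prompt yields ⌊300 / 128⌋ = 2 hashable blocks; the trailing 44 characters never fill a block and so cannot contribute to a match. The tiktoken recommendation is smaller because each BPE token typically spans several characters, so 16 tokens cover a comparable stretch of text.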

- **AIBRIX_PREFIX_CACHE_BLOCK_NUMBER**

  Maximum number of prefix cache blocks. The default is <ins>**_200000_**</ins>.

- **AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT**

  Before evaluating the prefix cache match, the router checks whether running requests are imbalanced across pods. Imbalance is measured as the absolute difference between the max and min running request counts across pods. For example, if imbalance_abs_count = 16 and the running requests per pod are [p1: 1, p2: 2, p3: 20], the scenario is flagged as imbalanced. In that case the prefix match is ignored and the request is routed to the pod with the least running requests, which in the example above is pod p1. The default is <ins>**_16_**</ins> and should be adjusted based on GPU hardware & prompt length.
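
  A minimal Go sketch of this check, reusing the example counts above (names are illustrative, not the gateway's code):

  ```go
  package main

  import "fmt"

  // checkLoadImbalance reports whether the spread between the busiest and the
  // least busy pod exceeds the configured absolute count (illustrative sketch).
  func checkLoadImbalance(running map[string]int, imbalanceAbsCount int) bool {
      first := true
      var min, max int
      for _, n := range running {
          if first {
              min, max = n, n
              first = false
              continue
          }
          if n < min {
              min = n
          }
          if n > max {
              max = n
          }
      }
      return max-min > imbalanceAbsCount
  }

  func main() {
      running := map[string]int{"p1": 1, "p2": 2, "p3": 20}
      fmt.Println(checkLoadImbalance(running, 16)) // true: 20 - 1 = 19 > 16
  }
  ```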

- **AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR**

  After prefix matching, the pods with a matching prefix cache are re-evaluated to prevent a hotspot scenario in which the bulk of prefix-matching requests is routed to the same pod. Imbalance is checked as follows:

  <pre>
  prefix_match_pod.running_requests <= mean + <b>load_factor</b> * standard_deviation
  </pre>

  **load_factor** determines the number of standard deviations. The default is <ins>**_2_**</ins>.
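
A Go sketch of this guard with assumed types (illustrative only; the real router first sorts candidates by prefix match percent, as in the pseudo-code above):

```go
package main

import (
	"fmt"
	"math"
)

type matchPod struct {
	name            string
	matchPercent    float64
	runningRequests int
}

// selectLeastLoadedMatchPod returns the first candidate (assumed pre-sorted by
// decreasing matchPercent, then increasing runningRequests) whose load stays
// within mean + loadFactor*stddev of all ready pods; nil means "fall back".
func selectLeastLoadedMatchPod(candidates []matchPod, readyCounts []int, loadFactor float64) *matchPod {
	var mean, sq float64
	for _, c := range readyCounts {
		mean += float64(c)
	}
	mean /= float64(len(readyCounts))
	for _, c := range readyCounts {
		d := float64(c) - mean
		sq += d * d
	}
	stddev := math.Sqrt(sq / float64(len(readyCounts)))

	for i := range candidates {
		if float64(candidates[i].runningRequests) <= mean+loadFactor*stddev {
			return &candidates[i]
		}
	}
	return nil
}

func main() {
	// eight idle pods and one hot pod holding the best prefix match
	ready := []int{1, 1, 1, 1, 1, 1, 1, 1, 30}
	best := selectLeastLoadedMatchPod([]matchPod{
		{"p9", 0.9, 30}, // best match, but beyond mean + 2*stddev (~22.4)
		{"p1", 0.5, 1},
	}, ready, 2)
	fmt.Println(best.name) // p1
}
```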

pkg/plugins/gateway/algorithms/least_request.go

Lines changed: 45 additions & 43 deletions

```diff
@@ -52,56 +52,20 @@ func NewLeastRequestRouter() (types.Router, error) {
     }, nil
 }
 
+// Routes request based on the least active request count among input ready pods
 func (r leastRequestRouter) Route(ctx *types.RoutingContext, pods types.PodList) (string, error) {
-    var targetPod *v1.Pod
-    minCount := math.MaxFloat64
-
-    if pods.Len() == 0 {
-        return "", fmt.Errorf("no pods to forward request")
-    }
-
-    readyPods := utils.FilterRoutablePods(pods.All())
-    if len(readyPods) == 0 {
-        return "", fmt.Errorf("no ready pods available for fallback")
-    }
-
-    for _, pod := range readyPods {
-        runningReq, err := r.cache.GetMetricValueByPodModel(pod.Name, ctx.Model, metrics.NumRequestsRunning)
-        if err != nil {
-            klog.Error(err)
-            continue
-        }
-        waitingReq, err := r.cache.GetMetricValueByPodModel(pod.Name, ctx.Model, metrics.NumRequestsWaiting)
-        if err != nil {
-            klog.Error(err)
-            continue
-        }
-        swappedReq, err := r.cache.GetMetricValueByPodModel(pod.Name, ctx.Model, metrics.NumRequestsSwapped)
-        if err != nil {
-            klog.Error(err)
-            continue
-        }
-
-        totalReq := runningReq.GetSimpleValue() + waitingReq.GetSimpleValue() + swappedReq.GetSimpleValue()
-        klog.V(4).Infof("pod: %v, podIP: %v, runningReq: %v, waitingReq: %v, swappedReq: %v, totalReq: %v",
-            pod.Name, pod.Status.PodIP, runningReq, waitingReq, swappedReq, totalReq)
-
-        if totalReq <= minCount {
-            minCount = totalReq
-            targetPod = pod
-        }
-    }
+    targetPod := selectTargetPodWithLeastRequestCount(r.cache, pods.All())
 
     // Use fallback if no valid metrics
     if targetPod == nil {
-        klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback")
+        klog.Warning("no pods with valid metrics found for least-request routing strategy; selecting a pod randomly as fallback",
+            "requestID", ctx.RequestID)
         var err error
         targetPod, err = selectRandomPod(pods.All(), rand.Intn)
         if err != nil {
             return "", err
         }
     }
-
     if targetPod == nil {
         return "", fmt.Errorf("no pods to forward request")
     }
@@ -112,8 +76,46 @@ func (r leastRequestRouter) Route(ctx *types.RoutingContext, pods types.PodList)
 
 func (r *leastRequestRouter) SubscribedMetrics() []string {
     return []string{
-        metrics.NumRequestsRunning,
-        metrics.NumRequestsWaiting,
-        metrics.NumRequestsSwapped,
+        metrics.RealtimeNumRequestsRunning,
+    }
+}
+
+func selectTargetPodWithLeastRequestCount(cache cache.Cache, readyPods []*v1.Pod) *v1.Pod {
+    var targetPod *v1.Pod
+    targetPods := []string{}
+
+    minCount := math.MaxInt32
+    podRequestCount := getRequestCounts(cache, readyPods)
+    for _, totalReq := range podRequestCount {
+        if totalReq <= minCount {
+            minCount = totalReq
+        }
     }
+    for podname, totalReq := range podRequestCount {
+        if totalReq == minCount {
+            targetPods = append(targetPods, podname)
+        }
+    }
+    if len(targetPods) > 0 {
+        targetPod, _ = utils.FilterPodByName(targetPods[rand.Intn(len(targetPods))], readyPods)
+    }
+    return targetPod
+}
+
+// getRequestCounts returns the running request count for each pod tracked by the gateway.
+// Note: currently each gateway instance tracks active running request counts per pod locally;
+// if multiple gateway instances are active, this state is not shared across them.
+// It is advised to run on the leader gateway instance.
+// TODO: Support stateful information sync across gateway instances: https://github.com/vllm-project/aibrix/issues/761
+func getRequestCounts(cache cache.Cache, readyPods []*v1.Pod) map[string]int {
+    podRequestCount := map[string]int{}
+    for _, pod := range readyPods {
+        runningReq, err := cache.GetMetricValueByPod(pod.Name, metrics.RealtimeNumRequestsRunning)
+        if err != nil {
+            runningReq = &metrics.SimpleMetricValue{Value: 0}
+        }
+        podRequestCount[pod.Name] = int(runningReq.GetSimpleValue())
+    }
+
+    return podRequestCount
 }
```
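
One design note: `selectTargetPodWithLeastRequestCount` makes two passes, first finding the minimum count and then collecting every pod at that minimum, so ties are broken uniformly at random rather than always favoring map iteration order. A trimmed, self-contained sketch of the same pattern (hypothetical names):

```go
package main

import (
	"fmt"
	"math/rand"
)

// pickLeastLoaded returns one pod name with the minimum request count,
// chosen uniformly at random among ties to avoid herding onto one pod.
func pickLeastLoaded(counts map[string]int) string {
	minCount := -1
	for _, n := range counts {
		if minCount < 0 || n < minCount {
			minCount = n
		}
	}
	var ties []string
	for name, n := range counts {
		if n == minCount {
			ties = append(ties, name)
		}
	}
	return ties[rand.Intn(len(ties))]
}

func main() {
	fmt.Println(pickLeastLoaded(map[string]int{"p1": 3, "p2": 3, "p3": 7})) // p1 or p2
}
```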
