[V1] Add prefix caching hit rate #11942

Closed · wants to merge 1 commit
vllm/v1/core/kv_cache_manager.py (20 additions, 1 deletion)
```diff
@@ -4,7 +4,7 @@
 from vllm.logger import init_logger
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+                                         KVCacheBlock, PrefixCachingMetrics,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
@@ -69,6 +69,12 @@ def __init__(
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}

+        # Prefix cache metrics.
+        self.prefix_caching_metrics: PrefixCachingMetrics = {
+            "query_total": 0,
+            "query_hit": 0,
+        }
+
     def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
         """Get the computed (cached) blocks for the request.
         Note that the computed blocks must be full.
@@ -101,6 +107,8 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
             else:
                 break

+        self.prefix_caching_metrics["query_total"] += len(block_hashes)
+        self.prefix_caching_metrics["query_hit"] += len(computed_blocks)
         return computed_blocks

     def append_slots(
@@ -328,6 +336,17 @@ def get_num_common_prefix_blocks(
                 break
         return num_common_blocks

+    def get_prefix_caching_hit_rate(self) -> float:
+        """Get the hit rate of prefix caching.
+
+        Returns:
+            The hit rate of prefix caching.
+        """
+        if self.prefix_caching_metrics["query_total"] == 0:
+            return 0.0
+        return self.prefix_caching_metrics[
+            "query_hit"] / self.prefix_caching_metrics["query_total"]
+
     def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
         """Get new blocks from the free block pool.
```
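To make the accounting easier to follow in isolation, here is a minimal, self-contained sketch of what this file now does: every full-block hash of an incoming request bumps `query_total`, every block found in the cache bumps `query_hit`, and the hit rate is their ratio with a zero guard. The `HitRateTracker` class and `record_lookup` method below are illustrative stand-ins, not the actual `KVCacheManager` API; only the counter names and `get_prefix_caching_hit_rate` come from the diff.

```python
from typing import List, TypedDict


class PrefixCachingMetrics(TypedDict):
    query_total: int
    query_hit: int


class HitRateTracker:
    """Illustrative stand-in for the counters added to KVCacheManager."""

    def __init__(self) -> None:
        self.prefix_caching_metrics: PrefixCachingMetrics = {
            "query_total": 0,
            "query_hit": 0,
        }

    def record_lookup(self, block_hashes: List[str],
                      computed_blocks: List[str]) -> None:
        # Every full-block hash of the request counts as a query; every
        # block found in the prefix cache counts as a hit.
        self.prefix_caching_metrics["query_total"] += len(block_hashes)
        self.prefix_caching_metrics["query_hit"] += len(computed_blocks)

    def get_prefix_caching_hit_rate(self) -> float:
        if self.prefix_caching_metrics["query_total"] == 0:
            return 0.0
        return (self.prefix_caching_metrics["query_hit"] /
                self.prefix_caching_metrics["query_total"])


tracker = HitRateTracker()
tracker.record_lookup(block_hashes=["h0", "h1", "h2", "h3"],
                      computed_blocks=["h0", "h1"])  # 2 of 4 blocks cached
print(tracker.get_prefix_caching_hit_rate())  # 0.5
```

Note that, as the diff stands, the counters are never reset, so the reported value behaves as a lifetime running average rather than a per-logging-interval rate.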
vllm/v1/core/kv_cache_utils.py (11 additions, 1 deletion)
```diff
@@ -1,7 +1,7 @@
 """KV-Cache Utilities."""
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, List, NamedTuple, Optional, Tuple
+from typing import Any, List, NamedTuple, Optional, Tuple, TypedDict

 from vllm.logger import init_logger
 from vllm.v1.request import Request
@@ -24,6 +24,16 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None


+class PrefixCachingMetrics(TypedDict):
+    """Metrics for prefix caching."""
+
+    query_total: int
+    """The total number of queries."""
+
+    query_hit: int
+    """The number of queries that hit the prefix cache."""
+
+
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""
```
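Declaring the metrics dict as a `TypedDict` rather than a plain `Dict[str, int]` lets static checkers verify the two keys without adding any runtime cost. A brief illustration, not part of the PR; the `metrics` variable and the example numbers are hypothetical:

```python
from typing import TypedDict


class PrefixCachingMetrics(TypedDict):
    """Metrics for prefix caching."""

    query_total: int
    query_hit: int


# Well-formed: both keys present with int values.
metrics: PrefixCachingMetrics = {"query_total": 0, "query_hit": 0}
metrics["query_total"] += 128
metrics["query_hit"] += 96

# A static checker such as mypy would reject these at analysis time,
# even though a plain dict accepts them at runtime:
#   bad: PrefixCachingMetrics = {"query_total": 0}   # missing "query_hit"
#   metrics["query_hits"] = 1                        # unknown key (typo)

print(metrics["query_hit"] / metrics["query_total"])  # 0.75
```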
vllm/v1/engine/core.py (8 additions, 1 deletion)
```diff
@@ -244,10 +244,17 @@ def _log_stats(self):
         now = time.time()

         if now - self._last_logging_time > LOGGING_TIME_S:
+            prefix_caching_hit_rate = ""
+            if (hit_rate := self.scheduler.kv_cache_manager.
+                    get_prefix_caching_hit_rate()) > 0:
+                prefix_caching_hit_rate = (
+                    f" | PrefixCachingHitRate: {hit_rate:.2f}")
+
             logger.info(
-                "RUNNING: %s | WAITING: %s",
+                "RUNNING: %s | WAITING: %s%s",
                 len(self.scheduler.running),
                 len(self.scheduler.waiting),
+                prefix_caching_hit_rate,
             )

             self._last_logging_time = now
```
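The suffix is only appended when the hit rate is positive, so the periodic stats line keeps its original shape until prefix caching actually serves a hit. A small sketch of the resulting message, reproducing the formatting from `_log_stats` with made-up numbers:

```python
# Mirror the message construction from _log_stats.
# hit_rate, running, and waiting are illustrative values, not from a real run.
hit_rate = 0.5
running, waiting = 24, 3

suffix = f" | PrefixCachingHitRate: {hit_rate:.2f}" if hit_rate > 0 else ""
print("RUNNING: %s | WAITING: %s%s" % (running, waiting, suffix))
# -> RUNNING: 24 | WAITING: 3 | PrefixCachingHitRate: 0.50
```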