[V1] Add prefix caching hit rate #11942

Closed · wants to merge 1 commit
vllm/v1/core/kv_cache_manager.py (20 additions, 1 deletion)
```diff
@@ -4,7 +4,7 @@
 from vllm.logger import init_logger
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+                                         KVCacheBlock, PrefixCachingMetrics,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
@@ -69,6 +69,12 @@ def __init__(
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}

+        # Prefix cache metrics.
+        self.prefix_caching_metrics: PrefixCachingMetrics = {
+            "query_total": 0,
+            "query_hit": 0,
+        }
+
     def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
         """Get the computed (cached) blocks for the request.
         Note that the computed blocks must be full.
@@ -101,6 +107,8 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
             else:
                 break

+        self.prefix_caching_metrics["query_total"] += len(block_hashes)
+        self.prefix_caching_metrics["query_hit"] += len(computed_blocks)
         return computed_blocks

     def append_slots(
@@ -328,6 +336,17 @@ def get_num_common_prefix_blocks(
                 break
         return num_common_blocks

+    def get_prefix_caching_hit_rate(self) -> float:
+        """Get the hit rate of prefix caching.
+
+        Returns:
+            The hit rate of prefix caching.
+        """
+        if self.prefix_caching_metrics["query_total"] == 0:
+            return 0.0
+        return self.prefix_caching_metrics[
+            "query_hit"] / self.prefix_caching_metrics["query_total"]
+
     def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
         """Get new blocks from the free block pool.
```
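To make the accounting easier to follow in isolation, here is a minimal, self-contained sketch of what this file now does: every full-block hash of an incoming request bumps `query_total`, every block found in the cache bumps `query_hit`, and the hit rate is their ratio with a zero guard. The `HitRateTracker` class and `record_lookup` method below are illustrative stand-ins, not the actual `KVCacheManager` API; only the counter names and `get_prefix_caching_hit_rate` come from the diff.

```python
from typing import List, TypedDict


class PrefixCachingMetrics(TypedDict):
    query_total: int
    query_hit: int


class HitRateTracker:
    """Illustrative stand-in for the counters added to KVCacheManager."""

    def __init__(self) -> None:
        self.prefix_caching_metrics: PrefixCachingMetrics = {
            "query_total": 0,
            "query_hit": 0,
        }

    def record_lookup(self, block_hashes: List[str],
                      computed_blocks: List[str]) -> None:
        # Every full-block hash of the request counts as a query; every
        # block found in the prefix cache counts as a hit.
        self.prefix_caching_metrics["query_total"] += len(block_hashes)
        self.prefix_caching_metrics["query_hit"] += len(computed_blocks)

    def get_prefix_caching_hit_rate(self) -> float:
        if self.prefix_caching_metrics["query_total"] == 0:
            return 0.0
        return (self.prefix_caching_metrics["query_hit"] /
                self.prefix_caching_metrics["query_total"])


tracker = HitRateTracker()
tracker.record_lookup(block_hashes=["h0", "h1", "h2", "h3"],
                      computed_blocks=["h0", "h1"])  # 2 of 4 blocks cached
print(tracker.get_prefix_caching_hit_rate())  # 0.5
```

Note that, as the diff stands, the counters are never reset, so the reported value behaves as a lifetime running average rather than a per-logging-interval rate.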
vllm/v1/core/kv_cache_utils.py (11 additions, 1 deletion)
```diff
@@ -1,7 +1,7 @@
 """KV-Cache Utilities."""
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, List, NamedTuple, Optional, Tuple
+from typing import Any, List, NamedTuple, Optional, Tuple, TypedDict

 from vllm.logger import init_logger
 from vllm.v1.request import Request
@@ -24,6 +24,16 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None


+class PrefixCachingMetrics(TypedDict):
+    """Metrics for prefix caching."""
+
+    query_total: int
+    """The total number of queries."""
+
+    query_hit: int
+    """The number of queries that hit the prefix cache."""
+
+
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""
```
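Declaring the metrics dict as a `TypedDict` rather than a plain `Dict[str, int]` lets static checkers verify the two keys without adding any runtime cost. A brief illustration, not part of the PR; the `metrics` variable and the example numbers are hypothetical:

```python
from typing import TypedDict


class PrefixCachingMetrics(TypedDict):
    """Metrics for prefix caching."""

    query_total: int
    query_hit: int


# Well-formed: both keys present with int values.
metrics: PrefixCachingMetrics = {"query_total": 0, "query_hit": 0}
metrics["query_total"] += 128
metrics["query_hit"] += 96

# A static checker such as mypy would reject these at analysis time,
# even though a plain dict accepts them at runtime:
#   bad: PrefixCachingMetrics = {"query_total": 0}   # missing "query_hit"
#   metrics["query_hits"] = 1                        # unknown key (typo)

print(metrics["query_hit"] / metrics["query_total"])  # 0.75
```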
vllm/v1/engine/core.py (8 additions, 1 deletion)
```diff
@@ -244,10 +244,17 @@ def _log_stats(self):
         now = time.time()

         if now - self._last_logging_time > LOGGING_TIME_S:
+            prefix_caching_hit_rate = ""
+            if (hit_rate := self.scheduler.kv_cache_manager.
+                    get_prefix_caching_hit_rate()) > 0:
+                prefix_caching_hit_rate = (
+                    f" | PrefixCachingHitRate: {hit_rate:.2f}")
+
             logger.info(
-                "RUNNING: %s | WAITING: %s",
+                "RUNNING: %s | WAITING: %s%s",
                 len(self.scheduler.running),
                 len(self.scheduler.waiting),
+                prefix_caching_hit_rate,
             )

             self._last_logging_time = now
```
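The suffix is only appended when the hit rate is positive, so the periodic stats line keeps its original shape until prefix caching actually serves a hit. A small sketch of the resulting message, reproducing the formatting from `_log_stats` with made-up numbers:

```python
# Mirror the message construction from _log_stats.
# hit_rate, running, and waiting are illustrative values, not from a real run.
hit_rate = 0.5
running, waiting = 24, 3

suffix = f" | PrefixCachingHitRate: {hit_rate:.2f}" if hit_rate > 0 else ""
print("RUNNING: %s | WAITING: %s%s" % (running, waiting, suffix))
# -> RUNNING: 24 | WAITING: 3 | PrefixCachingHitRate: 0.50
```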