@@ -98,9 +98,9 @@ def test_prefill():
9898 # Incomplete 1 block (6 tokens)
9999 unique_token_ids = [3 ] * 6
100100 req2 = make_request ("2" , common_token_ids + unique_token_ids )
101- computed_block = manager .get_computed_blocks (req2 )
101+ computed_blocks = manager .get_computed_blocks (req2 )
102102 assert len (req2 .kv_block_hashes ) == 3
103- assert [b .block_id for b in computed_block ] == [0 , 1 , 2 ]
103+ assert [b .block_id for b in computed_blocks ] == [0 , 1 , 2 ]
104104 num_new_tokens = 53 - 3 * 16
105105 blocks = manager .allocate_slots (req2 , num_new_tokens , computed_blocks )
106106 assert [b .block_id for b in blocks ] == [7 , 8 ]
@@ -500,3 +500,62 @@ def test_mm_prefix_caching():
500500 mm_hashes = mm_hashes )
501501 computed_blocks = manager .get_computed_blocks (req1 )
502502 assert len (computed_blocks ) == 3
503+
504+
505+ def test_prefill_not_enough_free_blocks_with_computed_blocks ():
506+ """
507+ This is a unit test that tests the correctness of the allocate_slots
508+ when there is not enough free blocks. Specifically, when a request
509+ has computed blocks but cannot be allocated due to not enough free blocks,
510+ the computed blocks should not be touched.
511+ """
512+ block_size = 16
513+ manager = KVCacheManager (
514+ block_size = block_size ,
515+ num_gpu_blocks = 10 ,
516+ max_model_len = 8192 ,
517+ sliding_window = None ,
518+ enable_caching = True ,
519+ num_preallocate_tokens = 0 ,
520+ )
521+ # Complete 3 blocks (48 tokens)
522+ # | Common-0 | Common-1 | Common-2 | ... |
523+ common_token_ids = [i for i in range (3 ) for _ in range (16 )]
524+ req0 = make_request ("0" , common_token_ids )
525+ computed_blocks = manager .get_computed_blocks (req0 )
526+ assert not computed_blocks
527+ manager .allocate_slots (req0 , 48 , computed_blocks )
528+ block_part0 = manager .req_to_blocks [req0 .request_id ]
529+
530+ # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
531+ req1 = make_request ("1" , common_token_ids * 2 )
532+ computed_blocks = manager .get_computed_blocks (req1 )
533+ assert computed_blocks == block_part0
534+ manager .allocate_slots (req1 , 48 , computed_blocks )
535+ block_part1 = manager .req_to_blocks [req1 .request_id ]
536+ # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
537+ # | Req1-5(F)| ... |
538+ manager .free (req1 )
539+ assert {block .ref_cnt for block in block_part1 [:3 ]} == {1 }
540+ assert {block .ref_cnt for block in block_part1 [3 :]} == {0 }
541+
542+ # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
543+ # | Req1-5(F)| Req2-0 | Req2-1 | ... |
544+ req2 = make_request ("2" , [7 ] * block_size * 2 )
545+ computed_blocks = manager .get_computed_blocks (req2 )
546+ assert not computed_blocks
547+ manager .allocate_slots (req2 , block_size * 2 , computed_blocks )
548+
549+ # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
550+ # but it cannot be allocated due to insufficient free blocks (2).
551+ # In this case, the ref_cnt of the computed blocks should not be changed.
552+ assert manager .free_block_queue .num_free_blocks == 5
553+ req3 = make_request ("3" , common_token_ids * 3 )
554+ computed_blocks = manager .get_computed_blocks (req3 )
555+ assert computed_blocks == block_part1
556+ # Req3 cannot be allocated.
557+ assert manager .allocate_slots (req3 , 48 , computed_blocks ) is None
558+ # Block 0-2 are used by Req 1.
559+ assert {block .ref_cnt for block in block_part1 [:3 ]} == {1 }
560+ # Block 3-5 are free.
561+ assert {block .ref_cnt for block in block_part1 [3 :]} == {0 }
0 commit comments