@@ -309,6 +309,60 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
309
309
ov::hint::kv_cache_precision.name (),
310
310
" . Supported values: u8, bf16, f16, f32" );
311
311
}
312
+ } else if (key == ov::key_cache_precision.name ()) {
313
+ try {
314
+ keyCachePrecisionSetExplicitly = true ;
315
+ auto const prec = val.as <ov::element::Type>();
316
+ if (one_of (prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
317
+ keyCachePrecision = prec;
318
+ } else {
319
+ OPENVINO_THROW (" keyCachePrecision doesn't support value " , prec);
320
+ }
321
+ } catch (ov::Exception&) {
322
+ OPENVINO_THROW (" Wrong value " ,
323
+ val.as <std::string>(),
324
+ " for property key " ,
325
+ ov::key_cache_precision.name (),
326
+ " . Supported values: u8, bf16, f16, f32" );
327
+ }
328
+ } else if (key == ov::value_cache_precision.name ()) {
329
+ try {
330
+ valueCachePrecisionSetExplicitly = true ;
331
+ auto const prec = val.as <ov::element::Type>();
332
+ if (one_of (prec,
333
+ ov::element::f32,
334
+ ov::element::f16,
335
+ ov::element::bf16,
336
+ ov::element::u8,
337
+ ov::element::u4)) {
338
+ valueCachePrecision = prec;
339
+ } else {
340
+ OPENVINO_THROW (" valueCachePrecision doesn't support value " , prec);
341
+ }
342
+ } catch (ov::Exception&) {
343
+ OPENVINO_THROW (" Wrong value " ,
344
+ val.as <std::string>(),
345
+ " for property key " ,
346
+ ov::value_cache_precision.name (),
347
+ " . Supported values: u4, u8, bf16, f16, f32" );
348
+ }
349
+ } else if (key == ov::key_cache_group_size.name () || key == ov::value_cache_group_size.name ()) {
350
+ try {
351
+ auto const groupSize = val.as <uint64_t >();
352
+ if (key == ov::key_cache_group_size.name ()) {
353
+ keyCacheGroupSizeSetExplicitly = true ;
354
+ keyCacheGroupSize = groupSize;
355
+ } else {
356
+ valueCacheGroupSizeSetExplicitly = true ;
357
+ valueCacheGroupSize = groupSize;
358
+ }
359
+ } catch (ov::Exception&) {
360
+ OPENVINO_THROW (" Wrong value " ,
361
+ val.as <std::string>(),
362
+ " for property key " ,
363
+ key,
364
+ " . Expected only unsinged integer numbers" );
365
+ }
312
366
} else if (key == ov::cache_encryption_callbacks.name ()) {
313
367
try {
314
368
const auto & encryption_callbacks = val.as <EncryptionCallbacks>();
@@ -344,6 +398,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
344
398
aclFastMath = true ;
345
399
}
346
400
#endif
401
+ // Explicitly set key/value cache precisions take priority; a side that was not set explicitly inherits kvCachePrecision, but only when kvCachePrecision itself was set explicitly
402
+ if (!keyCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) {
403
+ keyCachePrecision = kvCachePrecision;
404
+ }
405
+ if (!valueCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) {
406
+ valueCachePrecision = kvCachePrecision;
407
+ }
347
408
// disable dynamic quantization and kv quantization for best accuracy
348
409
if (executionMode == ov::hint::ExecutionMode::ACCURACY) {
349
410
if (!fcDynamicQuantizationGroupSizeSetExplicitly) {
@@ -352,6 +413,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
352
413
if (!kvCachePrecisionSetExplicitly) {
353
414
kvCachePrecision = ov::element::f32;
354
415
}
416
+ if (!keyCachePrecisionSetExplicitly) {
417
+ keyCachePrecision = ov::element::f32;
418
+ }
419
+ if (!valueCachePrecisionSetExplicitly) {
420
+ valueCachePrecision = ov::element::f32;
421
+ }
355
422
}
356
423
357
424
if (!prop.empty ())
@@ -398,14 +465,31 @@ void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
398
465
// if user sets explicitly, it will be higher priority than rt_info
399
466
if (!kvCachePrecisionSetExplicitly &&
400
467
model->has_rt_info ({" runtime_options" , ov::hint::kv_cache_precision.name ()})) {
401
- this ->kvCachePrecision =
468
+ this ->kvCachePrecision = this -> keyCachePrecision = this -> valueCachePrecision =
402
469
model->get_rt_info <ov::element::Type>({" runtime_options" , ov::hint::kv_cache_precision.name ()});
403
470
}
404
471
if (!fcDynamicQuantizationGroupSizeSetExplicitly &&
405
472
model->has_rt_info ({" runtime_options" , ov::hint::dynamic_quantization_group_size.name ()})) {
406
473
this ->fcDynamicQuantizationGroupSize =
407
474
model->get_rt_info <uint64_t >({" runtime_options" , ov::hint::dynamic_quantization_group_size.name ()});
408
475
}
476
+ if (!keyCachePrecisionSetExplicitly && model->has_rt_info ({" runtime_options" , ov::key_cache_precision.name ()})) {
477
+ this ->keyCachePrecision =
478
+ model->get_rt_info <ov::element::Type>({" runtime_options" , ov::key_cache_precision.name ()});
479
+ }
480
+ if (!valueCachePrecisionSetExplicitly &&
481
+ model->has_rt_info ({" runtime_options" , ov::value_cache_precision.name ()})) {
482
+ this ->valueCachePrecision =
483
+ model->get_rt_info <ov::element::Type>({" runtime_options" , ov::value_cache_precision.name ()});
484
+ }
485
+ if (!keyCacheGroupSizeSetExplicitly && model->has_rt_info ({" runtime_options" , ov::key_cache_group_size.name ()})) {
486
+ this ->keyCacheGroupSize = model->get_rt_info <uint64_t >({" runtime_options" , ov::key_cache_group_size.name ()});
487
+ }
488
+ if (!valueCacheGroupSizeSetExplicitly &&
489
+ model->has_rt_info ({" runtime_options" , ov::value_cache_group_size.name ()})) {
490
+ this ->valueCacheGroupSize =
491
+ model->get_rt_info <uint64_t >({" runtime_options" , ov::value_cache_group_size.name ()});
492
+ }
409
493
}
410
494
411
495
} // namespace intel_cpu
0 commit comments