@@ -297,12 +297,20 @@ bool is_cw_compressed(const std::shared_ptr<ov::Model>& model) {
297
297
// Snapshot of the NPU device traits that the default-config helpers in this
// file consult when deciding which options to inject.
//
// Every member carries a default so that a default-constructed NPUDesc never
// exposes indeterminate values; aggregate initialization such as
// NPUDesc{arch, max_tiles, compiler_dq} keeps working unchanged.
struct NPUDesc {
    // Device architecture string as reported by the plugin (compared against
    // literal architecture names by the config helpers).
    std::string arch{};
    // Tile count reported by the plugin. The default prefill config skips
    // NPU_DPU_GROUPS when this is -1, so -1 is used as the "unknown" default.
    int64_t max_tiles = -1;
    // True when the device advertises compiler-side dynamic quantization;
    // the default configs then set NPUW_DQ_FULL to NO.
    bool compiler_dq = false;
};
301
302
302
303
// Builds an NPUDesc from properties queried on the given plugin.
//
// Reads three plugin properties (each with an empty argument map):
//   - ov::device::architecture   -> NPUDesc::arch
//   - ov::intel_npu::max_tiles   -> NPUDesc::max_tiles
//   - ov::device::capabilities   -> NPUDesc::compiler_dq, which is true only
//     when the capability list contains "COMPILER_DYNAMIC_QUANTIZATION".
//
// The returned optional is always engaged on this path. `plugin` is
// dereferenced unconditionally, so it must not be null. Any exception raised
// by get_property() or by the ov::Any conversions propagates to the caller.
std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IPlugin>& plugin) {
    const std::string arch = plugin->get_property(ov::device::architecture.name(), ov::AnyMap{}).as<std::string>();
    const int64_t max_tiles = plugin->get_property(ov::intel_npu::max_tiles.name(), ov::AnyMap{}).as<int64_t>();

    const auto device_caps =
        plugin->get_property(ov::device::capabilities.name(), ov::AnyMap{}).as<std::vector<std::string>>();
    // The capability name must match exactly: no surrounding whitespace,
    // otherwise the lookup silently never succeeds.
    const bool compiler_dq =
        std::find(device_caps.begin(), device_caps.end(), "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end();

    return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
}
307
315
308
316
std::optional<ov::Any> pop_option (ov::AnyMap& config, const std::string& option_name) {
@@ -349,6 +357,9 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model, c
349
357
if (npudesc.has_value () && npudesc->arch == " 4000" && npudesc->max_tiles != -1 ) {
350
358
config.emplace (" NPU_DPU_GROUPS" , npudesc->max_tiles );
351
359
}
360
+ if (npudesc.has_value () && npudesc->compiler_dq ) {
361
+ config.emplace (" NPUW_DQ_FULL" , " NO" );
362
+ }
352
363
return config;
353
364
}
354
365
@@ -364,6 +375,12 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
364
375
if (npudesc.has_value () && npudesc->arch == " 4000" ) {
365
376
config.emplace (" NPU_DPU_GROUPS" , 4 );
366
377
}
378
+ if (hint == ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE) {
379
+ config.emplace (" NPUW_UNFOLD_IREQS" , " YES" );
380
+ }
381
+ if (npudesc.has_value () && npudesc->compiler_dq ) {
382
+ config.emplace (" NPUW_DQ_FULL" , " NO" );
383
+ }
367
384
return config;
368
385
}
369
386
@@ -468,13 +485,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
468
485
OPENVINO_THROW (" GENERATE_HINT is only applicable for default generate config!" );
469
486
}
470
487
auto generate_config =
471
- generate_config_opt.value_or (get_default_generate_config (model, npudesc, generate_hint)).as <ov::AnyMap>();
488
+ generate_config_opt.value_or (get_default_generate_config (kvcache_model, npudesc, generate_hint))
489
+ .as <ov::AnyMap>();
472
490
473
491
merge_config_with (prefill_config, other_props);
474
492
merge_config_with (generate_config, other_props);
475
493
476
- m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
477
- m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
494
+ m_kvcache_compiled = std::dynamic_pointer_cast<ov::npuw::CompiledModel>(
495
+ ov::npuw::ICompiledModel::create (kvcache_model, plugin, generate_config));
496
+ OPENVINO_ASSERT (m_kvcache_compiled,
497
+ " Can't create ov::npuw::CompiledModel for passed kvcache "
498
+ " model and its config, please check passed config." );
499
+ m_prefill_compiled = std::dynamic_pointer_cast<ov::npuw::CompiledModel>(
500
+ ov::npuw::ICompiledModel::create (prefill_model, plugin, prefill_config));
501
+ OPENVINO_ASSERT (m_prefill_compiled,
502
+ " Can't create ov::npuw::CompiledModel for passed prefill "
503
+ " model and its config, please check passed config." );
478
504
479
505
implement_properties ();
480
506
LOG_DEBUG (" Done" );
0 commit comments