From 966d89ee544028cb73aa8bfe6f5688939093b084 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 30 Dec 2024 11:53:16 +0100
Subject: [PATCH] Enabled CB by default

---
 src/cpp/src/llm_pipeline.cpp | 141 ++++++++++++++++++++++++++---------
 1 file changed, 105 insertions(+), 36 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 5022595da1..5172780453 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@
 
 #include
 
+#include "openvino/core/visibility.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/perf_metrics.hpp"
@@ -18,9 +19,9 @@ namespace genai {
 
 namespace {
 
-/* 
+/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files. 
+* from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in properties.
 * This function pops ModelDesc from the properties and returns a pair of updated properties and ModelDesc.
 */
@@ -37,7 +38,7 @@ std::pair split_model_descr(const ov::An
     pop_property(main_properties, "name_or_path", model_descr.name_or_path);
     pop_property(main_properties, "type", model_descr.type);
     pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-    
+
     return {main_properties, model_descr};
 }
@@ -62,7 +63,7 @@ std::pair draft_model(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
-    
+
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
     auto generation_config = utils::from_config_json_if_exists(models_path);
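
The draft_model helper above is one of the explicit CB triggers: ov::genai::draft_model(...) injects utils::DRAFT_MODEL_ARG_NAME into the property map, which the constructors below check for. A minimal speculative-decoding sketch, assuming a main and a draft model already exported to OpenVINO format (the paths and token budget are placeholders, not part of the patch):

    #include <filesystem>
    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Placeholder paths to exported OpenVINO models.
        std::filesystem::path main_path = "Llama-2-7b-chat-hf-ov";
        std::filesystem::path draft_path = "TinyLlama-1.1B-Chat-v1.0-ov";

        // draft_model() inserts DRAFT_MODEL_ARG_NAME into the properties,
        // so this pipeline is routed through the CB adapter below.
        ov::genai::LLMPipeline pipe(main_path, "CPU",
                                    ov::genai::draft_model(draft_path, "CPU"));

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 64;
        config.num_assistant_tokens = 4; // tokens the draft model proposes per step

        std::cout << pipe.generate("Sun is yellow because", config) << std::endl;
        return 0;
    }
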
@@ -99,35 +100,80 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // use CB only on x86, since on other architectures like arm64 or RISC-V we can create
+            // a Paged Attention based model, but cannot run inference on it later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
+
     m_pimpl->save_load_time(start_time);
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
     const std::filesystem::path& models_path,
     const std::string& device,
-    const ov::AnyMap& config) {
+    const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (config.find(ov::genai::scheduler_config.name()) != config.end() ||
-        config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() ||
-        config.find(ov::genai::prompt_lookup.name()) != config.end()) {
-        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
+        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
-    } else {
-        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
     }
+
+    if (m_pimpl == nullptr && device == "NPU") {
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
+    }
+
+    // try to create the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // use CB only on x86, since on other architectures like arm64 or RISC-V we can create
+            // a Paged Attention based model, but cannot run inference on it later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
+    }
+
     m_pimpl->save_load_time(start_time);
 }
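
The two constructors above establish the new selection order: explicit CB properties win and re-throw on failure, NPU keeps its dedicated static pipeline, otherwise CB is attempted silently on x86_64 and the stateful pipeline remains the last resort. A hedged sketch of both entry points (the model path is a placeholder, and cache_size is an assumed SchedulerConfig field for sizing the KV cache):

    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Implicit path: no CB-related properties, so on x86_64 this now tries
        // the Paged Attention backed CB adapter first and silently falls back
        // to the stateful pipeline if that construction throws.
        ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0-ov", "CPU");

        // Explicit path: passing a SchedulerConfig forces the CB adapter, and
        // any construction error is re-thrown instead of being swallowed.
        ov::genai::SchedulerConfig scheduler;
        scheduler.cache_size = 1; // assumed field: KV cache size, in GB
        ov::genai::LLMPipeline cb_pipe("TinyLlama-1.1B-Chat-v1.0-ov", "CPU",
                                       ov::genai::scheduler_config(scheduler));

        std::cout << pipe.generate("What is OpenVINO?", ov::genai::greedy()) << std::endl;
        return 0;
    }
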
@@ -136,48 +182,71 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::Tensor& weights_tensor,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
-    const ov::AnyMap& config,
+    const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config) {
-    auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-
+    auto [core_properties, plugin_properties] = ov::genai::utils::split_core_compile_config(properties);
     auto start_time = std::chrono::steady_clock::now();
-    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() ||
-        plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() ||
-        plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
-        auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
+
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (plugin_properties.find(ov::genai::scheduler_config.name()) != plugin_properties.end() ||
+        plugin_properties.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_properties.end() ||
+        plugin_properties.find(ov::genai::prompt_lookup.name()) != plugin_properties.end()){
+
+        auto [plugin_properties_, scheduler_config] = utils::split_scheduler_config(plugin_properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
-                                                              tokenizer, scheduler_config, device, plugin_config_, generation_config);
-    } else if (device == "NPU") {
+                                                              tokenizer, scheduler_config, device, plugin_properties_, generation_config);
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
-        // NPU reads some properties from the config file, but when LLMPipeline is initialized
-        // from the model_str and weights_tensor, there is no files.
+        // NPU reads some properties from the config file, but when LLMPipeline is initialized
+        // from the model_str and weights_tensor, there are no files.
         // Therefore, we need to pass these properties manually.
-        // This is necessary only for NPU, for other plugins can be ommited.
+        // This is necessary only for NPU; for other plugins it can be omitted.
         // Example of usage:
-        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
-        //                                      {"type", "llama"},
+        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
+        //                                      {"type", "llama"},
         //                                      {"num_key_value_heads", 32}};
         // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
         // This will convert from AnyMap to ModelDesc.
-        auto [properties, model_descr] = split_model_descr(plugin_config);
+        auto [npu_properties, model_descr] = split_model_descr(plugin_properties);
 
         m_pimpl = std::make_unique<StaticLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             model_descr,
             tokenizer,
             device,
-            properties,
+            npu_properties,
             generation_config
         );
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // use CB only on x86, since on other architectures like arm64 or RISC-V we can create
+            // a Paged Attention based model, but cannot run inference on it later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
+                                                                  default_config, device, plugin_properties, generation_config);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,
             device,
-            plugin_config,
+            plugin_properties,
            generation_config);
     }
+
     m_pimpl->save_load_time(start_time);
 }
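
For completeness, a sketch of the in-memory NPU path that the workaround above serves. model_str, weights_tensor, and tokenizer are assumed to be already prepared by the caller, and the metadata values repeat the patch comment's own example:

    #include "openvino/genai/llm_pipeline.hpp"

    // Hedged sketch: builds an NPU pipeline from an in-memory model string
    // and weights tensor instead of files on disk.
    void run_npu_pipeline(const std::string& model_str,
                          const ov::Tensor& weights_tensor,
                          const ov::genai::Tokenizer& tokenizer) {
        // These three keys are popped by split_model_descr() into a ModelDesc,
        // while the remaining entries are forwarded to the NPU plugin.
        ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
                                             {"type", "llama"},
                                             {"num_key_value_heads", 32}};

        ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer,
                                    "NPU", model_descr_properties);
        pipe.generate("Hello", ov::genai::greedy());
    }
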