diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 11efed8b32..09dfb260d3 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@
 #include <nlohmann/json.hpp>
 
+#include "openvino/core/visibility.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/perf_metrics.hpp"
@@ -18,7 +19,7 @@ namespace genai {
 
 namespace {
 
-/* 
+/*
  * NPU reads some properties from the config file, but when LLMPipeline is initialized
  * from the model_str and weights_tensor, there are no files.
  * In the later case ModelDesc is stored in properties.
@@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(
     pop_property(main_properties, "name_or_path", model_descr.name_or_path);
     pop_property(main_properties, "type", model_descr.type);
     pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-    
+
     return {main_properties, model_descr};
 }
@@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
-    
+
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
     auto generation_config = utils::from_config_json_if_exists(models_path);
@@ -99,16 +100,40 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safe guard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only for x86, as for other architectures like arm64 or risc-v
+            // we can create a Paged Attention based model, but cannot perform its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
+
     m_pimpl->save_load_time(start_time);
 }
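With this hunk, backend selection for the path-plus-tokenizer constructor becomes a three-stage chain: explicit continuous batching (a scheduler config, draft model, or prompt lookup entry in the properties) with errors re-thrown, then the NPU static pipeline, then an opportunistic x86_64-only CB attempt ahead of the stateful fallback. A minimal caller-side sketch of the two entry points; the model directory and prompt are placeholders, and the property helpers (`ov::genai::scheduler_config`, `ov::genai::max_new_tokens`) are GenAI's public ones:

```cpp
#include <iostream>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Implicit path: on x86_64 this now tries the CB adapter with the default
    // scheduler config and silently falls back to the stateful pipeline on failure.
    ov::genai::LLMPipeline pipe("model_dir", "CPU");

    // Explicit path: passing a SchedulerConfig selects the CB adapter directly,
    // and any internal exception is re-thrown instead of being swallowed.
    ov::genai::SchedulerConfig scheduler;
    scheduler.enable_prefix_caching = true;
    ov::genai::LLMPipeline cb_pipe("model_dir", "CPU", ov::genai::scheduler_config(scheduler));

    std::cout << pipe.generate("Hello", ov::genai::max_new_tokens(16)) << '\n';
}
```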
@@ -118,14 +143,36 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safe guard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only for x86, as for other architectures like arm64 or risc-v
+            // we can create a Paged Attention based model, but cannot perform its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }
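The constructor without a tokenizer mirrors the same chain. The key refactoring in both is replacing the `else if` cascade with repeated `m_pimpl == nullptr` checks, which lets the opportunistic CB attempt fail without disturbing the explicit branches. A distilled, self-contained sketch of that dispatch pattern (all names hypothetical, not the pipeline's real classes):

```cpp
#include <iostream>
#include <memory>
#include <stdexcept>

struct Impl { virtual ~Impl() = default; };

// Preferred backend whose construction may throw (stand-in for the CB adapter
// failing on an unsupported platform).
struct PreferredImpl : Impl {
    PreferredImpl() { throw std::runtime_error("backend unsupported"); }
};

// Guaranteed backend (stand-in for the stateful pipeline).
struct FallbackImpl : Impl {};

std::unique_ptr<Impl> create_impl(bool explicitly_requested) {
    std::unique_ptr<Impl> impl;

    // Stage 1: explicit request, exceptions propagate to the caller.
    if (explicitly_requested)
        impl = std::make_unique<PreferredImpl>();

    // Stage 2: opportunistic attempt, failure falls through silently.
    if (impl == nullptr) {
        try {
            impl = std::make_unique<PreferredImpl>();
        } catch (const std::exception&) {
            // swallow: the guaranteed backend will take over
        }
    }

    // Stage 3: guaranteed fallback.
    if (impl == nullptr)
        impl = std::make_unique<FallbackImpl>();

    return impl;
}

int main() {
    auto impl = create_impl(false);          // succeeds via the fallback
    std::cout << (impl != nullptr) << '\n';  // prints: 1
}
```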
@@ -141,36 +188,59 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::genai::GenerationConfig& generation_config) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()){
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer, scheduler_config, device, device_properties, generation_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
-        // NPU reads some properties from the config file, but when LLMPipeline is initialized 
-        // from the model_str and weights_tensor, there is no files. 
+        // NPU reads some properties from the config file, but when LLMPipeline is initialized
+        // from the model_str and weights_tensor, there are no files.
         // Therefore, we need to pass these properties manually.
         // This is necessary only for NPU, for other plugins can be ommited.
        // Example of usage:
-        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, 
-        //                                      {"type", "llama"}, 
+        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
+        //                                      {"type", "llama"},
         //                                      {"num_key_value_heads", 32}};
         // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
         // This will convert from AnyMap to ModelDesc.
-        auto [filtered_properties, model_descr] = split_model_descr(properties);
+        auto [device_properties, model_descr] = split_model_descr(properties);
         m_pimpl = static_llm::LLMPipelineFactory::create(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             model_descr,
             tokenizer,
             device,
-            filtered_properties,
+            device_properties,
             generation_config
         );
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safe guard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only for x86, as for other architectures like arm64 or risc-v
+            // we can create a Paged Attention based model, but cannot perform its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
+                                                                  default_config, device, properties, generation_config);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(
             utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,