From 966d89ee544028cb73aa8bfe6f5688939093b084 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 30 Dec 2024 11:53:16 +0100
Subject: [PATCH] Enabled CB by default

---
 src/cpp/src/llm_pipeline.cpp | 141 ++++++++++++++++++++++++++---------
 1 file changed, 105 insertions(+), 36 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 5022595da1..5172780453 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@
 
 #include
 
+#include "openvino/core/visibility.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/perf_metrics.hpp"
@@ -18,9 +19,9 @@ namespace genai {
 
 namespace {
 
-/* 
+/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files. 
+* from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in properties.
 * This function pops ModelDesc from the properties and returns a pair of updated properties and ModelDesc.
 */
@@ -37,7 +38,7 @@ std::pair split_model_descr(const ov::An
     pop_property(main_properties, "name_or_path", model_descr.name_or_path);
     pop_property(main_properties, "type", model_descr.type);
     pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-    
+
     return {main_properties, model_descr};
 }
@@ -62,7 +63,7 @@ std::pair draft_model(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
-    
+
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
     auto generation_config = utils::from_config_json_if_exists(models_path);
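
The draft_model helper above is one of the explicit CB triggers: ov::genai::draft_model(...) injects utils::DRAFT_MODEL_ARG_NAME into the property map, which the constructors below check for. A minimal speculative-decoding sketch, assuming a main and a draft model already exported to OpenVINO format (the paths and token budget are placeholders, not part of the patch):

    #include <filesystem>
    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Placeholder paths to exported OpenVINO models.
        std::filesystem::path main_path = "Llama-2-7b-chat-hf-ov";
        std::filesystem::path draft_path = "TinyLlama-1.1B-Chat-v1.0-ov";

        // draft_model() inserts DRAFT_MODEL_ARG_NAME into the properties,
        // so this pipeline is routed through the CB adapter below.
        ov::genai::LLMPipeline pipe(main_path, "CPU",
                                    ov::genai::draft_model(draft_path, "CPU"));

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 64;
        config.num_assistant_tokens = 4; // tokens the draft model proposes per step

        std::cout << pipe.generate("Sun is yellow because", config) << std::endl;
        return 0;
    }
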
@@ -99,35 +100,80 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // use CB only on x86, since on other architectures like arm64 or RISC-V we can create
+            // a Paged Attention based model, but cannot run inference on it later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
+
     m_pimpl->save_load_time(start_time);
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
     const std::filesystem::path& models_path,
     const std::string& device,
-    const ov::AnyMap& config) {
+    const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (config.find(ov::genai::scheduler_config.name()) != config.end() ||
-        config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() ||
-        config.find(ov::genai::prompt_lookup.name()) != config.end()) {
-        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
+        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
-    } else {
-        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
     }
+
+    if (m_pimpl == nullptr && device == "NPU") {
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
+    }
+
+    // try to create the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // use CB only on x86, since on other architectures like arm64 or RISC-V we can create
+            // a Paged Attention based model, but cannot run inference on it later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
+    }
+
     m_pimpl->save_load_time(start_time);
 }
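
The two constructors above establish the new selection order: explicit CB properties win and re-throw on failure, NPU keeps its dedicated static pipeline, otherwise CB is attempted silently on x86_64 and the stateful pipeline remains the last resort. A hedged sketch of both entry points (the model path is a placeholder, and cache_size is an assumed SchedulerConfig field for sizing the KV cache):

    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Implicit path: no CB-related properties, so on x86_64 this now tries
        // the Paged Attention backed CB adapter first and silently falls back
        // to the stateful pipeline if that construction throws.
        ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0-ov", "CPU");

        // Explicit path: passing a SchedulerConfig forces the CB adapter, and
        // any construction error is re-thrown instead of being swallowed.
        ov::genai::SchedulerConfig scheduler;
        scheduler.cache_size = 1; // assumed field: KV cache size, in GB
        ov::genai::LLMPipeline cb_pipe("TinyLlama-1.1B-Chat-v1.0-ov", "CPU",
                                       ov::genai::scheduler_config(scheduler));

        std::cout << pipe.generate("What is OpenVINO?", ov::genai::greedy()) << std::endl;
        return 0;
    }
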
@@ -136,48 +182,71 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::Tensor& weights_tensor,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
-    const ov::AnyMap& config,
+    const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config) {
-    auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-
+    auto [core_properties, plugin_properties] = ov::genai::utils::split_core_compile_config(properties);
     auto start_time = std::chrono::steady_clock::now();
-    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() ||
-        plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() ||
-        plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
-        auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
+
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (plugin_properties.find(ov::genai::scheduler_config.name()) != plugin_properties.end() ||
+        plugin_properties.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_properties.end() ||
+        plugin_properties.find(ov::genai::prompt_lookup.name()) != plugin_properties.end()){
+
+        auto [plugin_properties_, scheduler_config] = utils::split_scheduler_config(plugin_properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
-                                                              tokenizer, scheduler_config, device, plugin_config_, generation_config);
-    } else if (device == "NPU") {
+                                                              tokenizer, scheduler_config, device, plugin_properties_, generation_config);
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
-        // NPU reads some properties from the config file, but when LLMPipeline is initialized
-        // from the model_str and weights_tensor, there is no files.
+        // NPU reads some properties from the config file, but when LLMPipeline is initialized
+        // from the model_str and weights_tensor, there are no files.
         // Therefore, we need to pass these properties manually.
-        // This is necessary only for NPU, for other plugins can be ommited.
+        // This is necessary only for NPU; for other plugins it can be omitted.
         // Example of usage:
-        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
-        //                                      {"type", "llama"},
+        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
+        //                                      {"type", "llama"},
         //                                      {"num_key_value_heads", 32}};
         // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
         // This will convert from AnyMap to ModelDesc.
-        auto [properties, model_descr] = split_model_descr(plugin_config);
+        auto [npu_properties, model_descr] = split_model_descr(plugin_properties);
 
         m_pimpl = std::make_unique<StaticLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             model_descr,
             tokenizer,
             device,
-            properties,
+            npu_properties,
             generation_config
         );
-    } else {
+    }
+
+    // try to create the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // use CB only on x86, since on other architectures like arm64 or RISC-V we can create
+            // a Paged Attention based model, but cannot run inference on it later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
+                                                                  default_config, device, plugin_properties, generation_config);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,
             device,
-            plugin_config,
+            plugin_properties,
            generation_config);
     }
+
     m_pimpl->save_load_time(start_time);
 }
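
For completeness, a sketch of the in-memory NPU path that the workaround above serves. model_str, weights_tensor, and tokenizer are assumed to be already prepared by the caller, and the metadata values repeat the patch comment's own example:

    #include "openvino/genai/llm_pipeline.hpp"

    // Hedged sketch: builds an NPU pipeline from an in-memory model string
    // and weights tensor instead of files on disk.
    void run_npu_pipeline(const std::string& model_str,
                          const ov::Tensor& weights_tensor,
                          const ov::genai::Tokenizer& tokenizer) {
        // These three keys are popped by split_model_descr() into a ModelDesc,
        // while the remaining entries are forwarded to the NPU plugin.
        ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
                                             {"type", "llama"},
                                             {"num_key_value_heads", 32}};

        ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer,
                                    "NPU", model_descr_properties);
        pipe.generate("Hello", ov::genai::greedy());
    }
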