[LLM] Enabled CB by default #1455

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 93 additions & 23 deletions src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@

#include <nlohmann/json.hpp>

#include "openvino/core/visibility.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

@@ -18,9 +19,9 @@ namespace genai {

namespace {

/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
 * from the model_str and weights_tensor, there are no files.
 * In the latter case, ModelDesc is stored in the properties.
 * This function pops ModelDescr from the properties and returns a pair of the updated properties and the ModelDescr.
 */
@@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::An
pop_property(main_properties, "name_or_path", model_descr.name_or_path);
pop_property(main_properties, "type", model_descr.type);
pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);

return {main_properties, model_descr};
}
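For context, here is a minimal caller-side sketch of how the descriptor keys and regular plugin properties travel in the same ov::AnyMap before split_model_descr separates them (the PERFORMANCE_HINT entry is an illustrative stand-in for any ordinary plugin property, not something this PR adds):

    ov::AnyMap properties = {
        {"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},  // popped into ModelConfigDesc
        {"type", "llama"},                                  // popped into ModelConfigDesc
        {"num_key_value_heads", 32},                        // popped into ModelConfigDesc
        {"PERFORMANCE_HINT", "LATENCY"}                     // left untouched in main_properties
    };
    auto [main_properties, model_descr] = split_model_descr(properties);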

@@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
const std::string& device,
const ov::AnyMap& properties) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
@@ -99,16 +100,40 @@ ov::genai::LLMPipeline::LLMPipeline(
const std::string& device,
const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||

// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
} else {
}

// try to call CB adapter one more time, but with safe guard to silent exception
if (m_pimpl == nullptr) {
try {
// we need use CB only for x86, as for other architectures like arm64 or risc-v we can create Paged Attention based model
// but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}

m_pimpl->save_load_time(start_time);
}
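With this change, plain construction becomes continuous batching by default on x86-64. A minimal usage sketch, assuming a converted model in a local directory (the path is hypothetical); generate() and the max_new_tokens property are the existing public API:

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // No scheduler_config, draft model, or prompt_lookup is passed, so on x86-64
        // this now routes through ContinuousBatchingAdapter with the default config
        // (unlimited batched tokens, prefix caching on); if CB construction throws,
        // the pipeline silently falls back to StatefulLLMPipeline.
        ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");
        std::cout << pipe.generate("What is OpenVINO?", ov::genai::max_new_tokens(100)) << '\n';
    }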

@@ -118,14 +143,36 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
} else {
}

// try to call CB adapter one more time, but with safe guard to silent exception
if (m_pimpl == nullptr) {
try {
// we need use CB only for x86, as for other architectures like arm64 or risc-v we can create Paged Attention based model
// but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}
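For comparison, passing any of the three trigger properties (scheduler_config, a draft model, or prompt_lookup) still opts into CB explicitly: the adapter is created unconditionally and construction errors propagate instead of being swallowed. A sketch with an explicit SchedulerConfig (the cache_size value is illustrative):

    ov::genai::SchedulerConfig scheduler_config;
    scheduler_config.cache_size = 1;  // KV-cache budget; pick a value suited to your model

    // Explicit opt-in: no silent StatefulLLMPipeline fallback, CB errors re-throw.
    ov::genai::LLMPipeline pipe(models_path, "CPU",
                                ov::genai::scheduler_config(scheduler_config));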

@@ -141,36 +188,59 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::genai::GenerationConfig& generation_config) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()){

auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there is no files.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there is no files.
// Therefore, we need to pass these properties manually.
// This is necessary only for NPU, for other plugins can be ommited.
// Example of usage:
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
auto [filtered_properties, model_descr] = split_model_descr(properties);
auto [device_properties, model_descr] = split_model_descr(properties);

m_pimpl = std::make_unique<StaticLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
filtered_properties,
device_properties,
generation_config
);
} else {
}

// try to call CB adapter one more time, but with safe guard to silent exception
if (m_pimpl == nullptr) {
try {
// we need use CB only for x86, as for other architectures like arm64 or risc-v we can create Paged Attention based model
// but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,