Enabled CB by default
ilya-lavrenov committed Dec 30, 2024
1 parent 0c5f03b commit 966d89e
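This change makes continuous batching (CB) the default backend: when no CB-specific property is passed, each LLMPipeline constructor now tries a ContinuousBatchingAdapter first (on x86_64 only) and quietly falls back to StatefulLLMPipeline if the Paged Attention path cannot be used. A minimal sketch of the affected user-facing path follows; the model directory and token limit are illustrative, not part of this commit:

#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // No SchedulerConfig, draft model, or prompt-lookup property is passed,
    // so this construction now routes through ContinuousBatchingAdapter by
    // default (x86_64) before falling back to StatefulLLMPipeline.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");
    std::cout << pipe.generate("What is OpenVINO?", ov::genai::max_new_tokens(64)) << std::endl;
    return 0;
}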
Showing 1 changed file with 105 additions and 36 deletions.
src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@

#include <nlohmann/json.hpp>

#include "openvino/core/visibility.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

@@ -18,9 +19,9 @@ namespace genai {

namespace {

/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
 * from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in properties.
 * This function pops ModelDescr from the properties and returns a pair of updated properties and ModelDescr.
 */
@@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::An
    pop_property(main_properties, "name_or_path", model_descr.name_or_path);
    pop_property(main_properties, "type", model_descr.type);
    pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);

    return {main_properties, model_descr};
}

@@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
    const std::string& device,
    const ov::AnyMap& properties) {
    auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

    std::filesystem::path openvino_model_name = "openvino_model.xml";
    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
    auto generation_config = utils::from_config_json_if_exists(models_path);
@@ -99,35 +100,80 @@ ov::genai::LLMPipeline::LLMPipeline(
    const std::string& device,
    const ov::AnyMap& properties) {
    auto start_time = std::chrono::steady_clock::now();

    // If CB is invoked explicitly, create CB adapter as is and re-throw in case of internal issues
    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
    }

    if (m_pimpl == nullptr && device == "NPU") {
        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
    }

    // Try to call the CB adapter one more time, but with a safeguard that silences exceptions
    if (m_pimpl == nullptr) {
        try {
            // We need to use CB only for x86, as for other architectures like arm64 or risc-v
            // we can create a Paged Attention based model but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
            SchedulerConfig default_config;
            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size

            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
        } catch (ov::Exception&) {
            // ignore exceptions from PA
        }
    }

    if (m_pimpl == nullptr) {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
    }

    m_pimpl->save_load_time(start_time);
}

ov::genai::LLMPipeline::LLMPipeline(
    const std::filesystem::path& models_path,
    const std::string& device,
    const ov::AnyMap& properties) {
    auto start_time = std::chrono::steady_clock::now();

    // If CB is invoked explicitly, create CB adapter as is and re-throw in case of internal issues
    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
    }

    if (m_pimpl == nullptr && device == "NPU") {
        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
    }

    // Try to call the CB adapter one more time, but with a safeguard that silences exceptions
    if (m_pimpl == nullptr) {
        try {
            // We need to use CB only for x86, as for other architectures like arm64 or risc-v
            // we can create a Paged Attention based model but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
            SchedulerConfig default_config;
            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size

            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
        } catch (ov::Exception&) {
            // ignore exceptions from PA
        }
    }

    if (m_pimpl == nullptr) {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
    }

    m_pimpl->save_load_time(start_time);
}

@@ -136,48 +182,71 @@ ov::genai::LLMPipeline::LLMPipeline(
    const ov::Tensor& weights_tensor,
    const ov::genai::Tokenizer& tokenizer,
    const std::string& device,
    const ov::AnyMap& properties,
    const ov::genai::GenerationConfig& generation_config) {

    auto [core_properties, plugin_properties] = ov::genai::utils::split_core_compile_config(properties);
    auto start_time = std::chrono::steady_clock::now();

    // If CB is invoked explicitly, create CB adapter as is and re-throw in case of internal issues
    if (plugin_properties.find(ov::genai::scheduler_config.name()) != plugin_properties.end() ||
        plugin_properties.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_properties.end() ||
        plugin_properties.find(ov::genai::prompt_lookup.name()) != plugin_properties.end()) {

        auto [plugin_properties_, scheduler_config] = utils::split_scheduler_config(plugin_properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
            tokenizer, scheduler_config, device, plugin_properties_, generation_config);
    }

    if (m_pimpl == nullptr && device == "NPU") {
        // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
        // NPU reads some properties from the config file, but when LLMPipeline is initialized
        // from the model_str and weights_tensor, there are no files.
        // Therefore, we need to pass these properties manually.
        // This is necessary only for NPU; for other plugins it can be omitted.
        // Example of usage:
        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
        //                                      {"type", "llama"},
        //                                      {"num_key_value_heads", 32}};
        // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
        // This will convert from AnyMap to ModelDesc.
        auto [npu_properties, model_descr] = split_model_descr(plugin_properties);

        m_pimpl = std::make_unique<StaticLLMPipeline>(
            utils::singleton_core().read_model(model_str, weights_tensor),
            model_descr,
            tokenizer,
            device,
            npu_properties,
            generation_config
        );
    }

    // Try to call the CB adapter one more time, but with a safeguard that silences exceptions
    if (m_pimpl == nullptr) {
        try {
            // We need to use CB only for x86, as for other architectures like arm64 or risc-v
            // we can create a Paged Attention based model but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
            SchedulerConfig default_config;
            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size

            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
                default_config, device, plugin_properties, generation_config);
#endif
        } catch (ov::Exception&) {
            // ignore exceptions from PA
        }
    }

    if (m_pimpl == nullptr) {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(
            utils::singleton_core().read_model(model_str, weights_tensor),
            tokenizer,
            device,
            plugin_properties,
            generation_config);
    }

    m_pimpl->save_load_time(start_time);
}
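Note: passing an explicit SchedulerConfig (or a draft model / prompt-lookup property) still selects the CB adapter unconditionally, and internal errors are re-thrown rather than silenced. A sketch of the explicit opt-in; the model directory and cache size are illustrative values, not part of this commit:

ov::genai::SchedulerConfig scheduler_config;
scheduler_config.cache_size = 1; // KV-cache size in GB; illustrative value
ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU",
                            ov::genai::scheduler_config(scheduler_config));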
