From e4637b3a197cd03281e9b68f0e2f3a100c47b28a Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Mon, 15 Jul 2024 13:48:22 +0400 Subject: [PATCH 01/54] Workaround (#618) Workaround Python_VERSION_MAJOR and MINOR not being set by replasing Python3 with Python Disable generation of some of the COMPONENTs not needed for GenAI. There are still unwanted empty archives, but they are generated uncounditionally by rapidjson. --- CMakeLists.txt | 3 +++ src/python/CMakeLists.txt | 43 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8965e8b3e0..be8e03548a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,9 @@ project(OpenVINOGenAI HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai" LANGUAGES CXX) +option(INSTALL_GTEST "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" OFF) +option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF) + # Find OpenVINODeveloperPackage first to compile with SDL flags find_package(OpenVINODeveloperPackage QUIET PATHS "${OpenVINO_DIR}") diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 1867c72fa5..bcbdb77b49 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -11,9 +11,50 @@ FetchContent_Declare( FetchContent_GetProperties(pybind11) # search for FindPython3.cmake instead of legacy modules set(PYBIND11_FINDPYTHON ON) + +# Wouraround Python_VERSION_MAJOR and MINOR not being set by finding +# Python package instead of Python3 +macro(ov_find_python_no_3 find_package_mode) + # Settings for FindPython3.cmake + if(NOT DEFINED Python3_USE_STATIC_LIBS) + set(Python3_USE_STATIC_LIBS OFF) + endif() + + if(NOT DEFINED Python3_FIND_VIRTUALENV) + set(Python3_FIND_VIRTUALENV FIRST) + endif() + + if(NOT DEFINED Python3_FIND_IMPLEMENTATIONS) + set(Python3_FIND_IMPLEMENTATIONS CPython PyPy) + endif() + + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + set(python3_development_component Development.Module) + else() + set(python3_development_component Development) + endif() + + if(CMAKE_CROSSCOMPILING AND LINUX) + # allow to find python headers from host in case of cross-compilation + # e.g. 
install libpython3-dev: and finds its headers + set(_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) + ov_cross_compile_define_debian_arch() + endif() + + find_package(Python ${find_package_mode} COMPONENTS Interpreter ${python3_development_component}) + + if(CMAKE_CROSSCOMPILING AND LINUX) + ov_cross_compile_define_debian_arch_reset() + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) + endif() + + unset(python3_development_component) +endmacro() + # the following two calls are required for cross-compilation if(OpenVINODeveloperPackage_DIR) - ov_find_python3(REQUIRED) + ov_find_python_no_3(REQUIRED) ov_detect_python_module_extension() else() if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) From 423c8e36d8a7a9b8489f852e1a3845b7e2a32944 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 14:40:20 +0400 Subject: [PATCH 02/54] Revert to python3 --- src/python/CMakeLists.txt | 42 +-------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index bcbdb77b49..f03f2f58d1 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -12,49 +12,9 @@ FetchContent_GetProperties(pybind11) # search for FindPython3.cmake instead of legacy modules set(PYBIND11_FINDPYTHON ON) -# Wouraround Python_VERSION_MAJOR and MINOR not being set by finding -# Python package instead of Python3 -macro(ov_find_python_no_3 find_package_mode) - # Settings for FindPython3.cmake - if(NOT DEFINED Python3_USE_STATIC_LIBS) - set(Python3_USE_STATIC_LIBS OFF) - endif() - - if(NOT DEFINED Python3_FIND_VIRTUALENV) - set(Python3_FIND_VIRTUALENV FIRST) - endif() - - if(NOT DEFINED Python3_FIND_IMPLEMENTATIONS) - set(Python3_FIND_IMPLEMENTATIONS CPython PyPy) - endif() - - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - set(python3_development_component Development.Module) - else() - set(python3_development_component Development) - endif() - - if(CMAKE_CROSSCOMPILING AND LINUX) - # allow to find python headers from host in case of cross-compilation - # e.g. 
install libpython3-dev: and finds its headers - set(_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) - ov_cross_compile_define_debian_arch() - endif() - - find_package(Python ${find_package_mode} COMPONENTS Interpreter ${python3_development_component}) - - if(CMAKE_CROSSCOMPILING AND LINUX) - ov_cross_compile_define_debian_arch_reset() - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) - endif() - - unset(python3_development_component) -endmacro() - # the following two calls are required for cross-compilation if(OpenVINODeveloperPackage_DIR) - ov_find_python_no_3(REQUIRED) + ov_find_python3(REQUIRED) ov_detect_python_module_extension() else() if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) From 1b1b2f0ffdb96c7a5a77c40859c096b89a1da04a Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Mon, 15 Jul 2024 16:28:59 +0400 Subject: [PATCH 03/54] Fix cmake Python var name (#624) --- src/python/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index f03f2f58d1..3d03a0d7a8 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -65,10 +65,10 @@ endif() install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION python/openvino_genai - COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) install(TARGETS py_generate_pipeline LIBRARY DESTINATION python/openvino_genai - COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION openvino_genai From 70b74ad79eff75b14da04cfc2bbacdf2c0cd7e90 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Mon, 15 Jul 2024 19:38:42 +0400 Subject: [PATCH 04/54] Add ContinuousBatchingPipeline constructor similar to LLMPipeline (#604) That allows LLMPipeline to create ContinuousBatchingPipeline as a backend. There's also a constructor accepting ireq, which can be used if the model was already transformed appropriately for ContinuousBatchingPipeline. But it feels it's going to be misleading and it simpler just to throw if such constructor is called with ContinuousBatchingPipeline backend. 
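
For illustration, a minimal usage sketch of the new constructor (the paths, the prompt, and the greedy() preset below are placeholders for this message, not part of the change itself):

    // Sketch only: construct the pipeline with a manually created Tokenizer,
    // e.g. when the tokenizer files live in a different directory than the model.
    #include "openvino/genai/continuous_batching_pipeline.hpp"
    #include <iostream>

    int main() {
        ov::genai::SchedulerConfig scheduler_config;  // defaults; tune cache/batch limits as needed
        ov::genai::ContinuousBatchingPipeline pipe(
            "./model_dir",                            // dir with openvino_model.xml and generation_config.json
            ov::genai::Tokenizer{"./tokenizer_dir"},  // tokenizer loaded from a different dir
            scheduler_config);                        // device defaults to "CPU"
        auto results = pipe.generate(
            {"What is OpenVINO?"},                    // prompts
            {ov::genai::greedy()});                   // one GenerationConfig per prompt
        std::cout << results[0].m_generation_ids[0] << '\n';
    }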
--- .github/workflows/causal_lm_cpp.yml | 2 +- .../continuous_batching_accuracy.cpp | 4 ++- .../genai/continuous_batching_pipeline.hpp | 19 ++++++++++++- .../include/openvino/genai/llm_pipeline.hpp | 4 +-- src/cpp/include/openvino/genai/tokenizer.hpp | 2 +- src/cpp/src/continuous_batching_pipeline.cpp | 27 +++++++++++++------ src/python/py_generate_pipeline.cpp | 10 ++++--- tests/python_tests/common.py | 2 +- tests/python_tests/test_sampling.py | 4 +-- 9 files changed, 54 insertions(+), 20 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 80089a4e81..18cc89a8f0 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -648,7 +648,7 @@ jobs: python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Releas -S ./ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Run gtests run: | diff --git a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp index 6e0cb5034f..77485e36db 100644 --- a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp +++ b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp @@ -78,7 +78,9 @@ int main(int argc, char* argv[]) try { // vLLM specific params scheduler_config.max_num_seqs = 2; - ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config); + // It's possible to construct a Tokenizer from a different path. + // If the Tokenizer isn't specified, it's loaded from the same folder. + ov::genai::ContinuousBatchingPipeline pipe(models_path, ov::genai::Tokenizer{models_path}, scheduler_config); std::vector generation_results = pipe.generate(prompts, sampling_params); for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) { diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index e30892f9c3..be9a5fd8c1 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -32,7 +32,24 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { const std::string& device = "CPU", const ov::AnyMap& plugin_config = {}); - std::shared_ptr get_tokenizer(); + /** + * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. 
+ * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param scheduler_config + * @param tokenizer manually initialized ov::genai::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + ContinuousBatchingPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + ov::genai::Tokenizer get_tokenizer(); ov::genai::GenerationConfig get_config() const; diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index b36eab7238..84dc02bd58 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -116,10 +116,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { ); /** - * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. + * @brief Constructs a LLMPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. * * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json - * @param tokenizer manually initialized ov::Tokenizer + * @param tokenizer manually initialized ov::genai::Tokenizer * @param device optional device * @param plugin_config optional plugin_config */ diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 4af45e7cfd..5a1e181e21 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -26,7 +26,7 @@ struct TokenizedInputs { class OPENVINO_GENAI_EXPORTS Tokenizer { public: /** - * @brief ov::Tokenizer constructor. + * @brief ov::genai::Tokenizer constructor. 
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path */ Tokenizer(const std::string& tokenizer_path); diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index dbacf3c243..27c183ddd8 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -19,7 +19,7 @@ using namespace ov::genai; void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config); class ContinuousBatchingPipeline::Impl { - std::shared_ptr m_tokenizer; + ov::genai::Tokenizer m_tokenizer; std::shared_ptr m_scheduler; std::shared_ptr m_cache_manager; std::shared_ptr m_model_runner; @@ -69,9 +69,9 @@ class ContinuousBatchingPipeline::Impl { } public: - Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string device, const ov::AnyMap& plugin_config) { + Impl(const std::string& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) : + m_tokenizer{tokenizer} { ov::Core core; - m_tokenizer = std::make_shared(models_path); // The model can be compiled for GPU as well std::shared_ptr model = core.read_model(models_path + "/openvino_model.xml"); @@ -104,6 +104,9 @@ class ContinuousBatchingPipeline::Impl { // read default generation config } + Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) + : Impl{models_path, Tokenizer(models_path), scheduler_config, device, plugin_config} {} + ov::genai::GenerationConfig get_config() const { return m_generation_config; } @@ -112,19 +115,19 @@ class ContinuousBatchingPipeline::Impl { return m_pipeline_metrics; } - std::shared_ptr get_tokenizer() { + ov::genai::Tokenizer get_tokenizer() { return m_tokenizer; } GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { - sampling_params.set_eos_token_id(m_tokenizer->get_eos_token_id()); + sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); ov::Tensor input_ids; { static ManualTimer timer("tokenize"); timer.start(); - input_ids = m_tokenizer->encode(prompt).input_ids; + input_ids = m_tokenizer.encode(prompt).input_ids; timer.end(); } @@ -262,7 +265,7 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - std::string output_text = m_tokenizer->decode(generation_output.generated_token_ids); + std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids); result.m_generation_ids.push_back(output_text); result.m_scores.push_back(generation_output.score); } @@ -282,7 +285,15 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& model m_impl = std::make_shared(models_path, scheduler_config, device, plugin_config); } -std::shared_ptr ContinuousBatchingPipeline::get_tokenizer() { +ContinuousBatchingPipeline::ContinuousBatchingPipeline( + const std::string& model_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config +) : 
m_impl{std::make_shared(model_path, tokenizer, scheduler_config, device, plugin_config)} {} + +ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() { return m_impl->get_tokenizer(); } diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 8e475329f1..d7b2aab29c 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -596,10 +596,14 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); py::class_(m, "ContinuousBatchingPipeline") - .def(py::init([](const std::string& model_path, const SchedulerConfig& config) { + .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { ScopedVar env_manager(ov_tokenizers_module_path()); - return std::make_unique(model_path, config); - })) + return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(plugin_config)); + }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) + .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { + ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, tokenizer, scheduler_config, device, properties_to_any_map(plugin_config)); + }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("add_request", &ContinuousBatchingPipeline::add_request) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 9b53a6b78b..2ec96f671c 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -273,7 +273,7 @@ def run_continuous_batching( prompts: List[str], generation_configs : List[GenerationConfig] ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}) output = pipe.generate(prompts, generation_configs) del pipe shutil.rmtree(model_path) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index f4f35deace..c02804527b 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -7,7 +7,7 @@ import sys from dataclasses import dataclass from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, GenerationConfig +from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ @@ -205,7 +205,7 @@ def test_post_oom_health(tmp_path): model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(output)) From 
f0c26772d613cc1a31c7c1491484aef41a706996 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 15 Jul 2024 19:20:26 +0200 Subject: [PATCH 05/54] Clear beam search info when generate() is finished. (#630) Port of PR: https://github.com/openvinotoolkit/openvino.genai/pull/615 --- src/cpp/src/continuous_batching_pipeline.cpp | 1 + src/cpp/src/sampler.hpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 27c183ddd8..ddfebc5926 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -61,6 +61,7 @@ class ContinuousBatchingPipeline::Impl { for (const auto& sequence: request->get_sequences()) { m_scheduler->free_sequence(sequence->get_id()); } + m_sampler->clear_beam_search_info(request->get_request_id()); requests_iterator = m_requests.erase(requests_iterator); } else { requests_iterator++; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 095c795a42..dc631c68ac 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -247,6 +247,8 @@ class Sampler { SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits); void set_seed(size_t seed) { rng_engine.seed(seed); } + + void clear_beam_search_info(uint64_t request_id); }; SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor logits) { @@ -578,4 +580,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp } } } + +void Sampler::clear_beam_search_info(uint64_t request_id) { + m_beam_search_info.erase(request_id); +} } From 73badf67a1a533afa1d94f6fca57a7604a0f4dc9 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Tue, 16 Jul 2024 09:51:55 +0200 Subject: [PATCH 06/54] Update nncf_utils.py (#616) (#633) Updated default configurations based on results from CVS-143530. 
(cherry picked from commit f460002dcc24171f279e032b4f91df3feab00c35) --- llm_bench/python/utils/nncf_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index 51d2c67979..25ef8aff18 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -38,10 +38,9 @@ def get_compressed_path(output_dir: str, base_precision, option: str): INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True}, "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, - "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6}, "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8, "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, @@ -58,7 +57,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72}, "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, - "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, + "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True}, "falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, "orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True, "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": False}}, @@ -70,7 +69,13 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9}, "llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, "opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, - "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True}, "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "gpt-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.5, "scale": True}, + "longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, + "starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, + "tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "stablelm-7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.6, "scale": True}, + "phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, } From 
25655e3f63705424a6de2180b3d49d8653a62f2e Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Tue, 16 Jul 2024 13:25:27 +0400 Subject: [PATCH 07/54] Workaround cmake packaging (#634) Remove unwanted archives --- CMakeLists.txt | 19 +++++++++++++++++++ src/python/CMakeLists.txt | 11 ----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be8e03548a..908e353484 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,20 @@ find_file(spda_to_pa_header sdpa_to_paged_attention.hpp include(cmake/features.cmake) +if(ENABLE_PYTHON) + # the following two calls are required for cross-compilation + if(OpenVINODeveloperPackage_DIR) + ov_find_python3(REQUIRED) + ov_detect_python_module_extension() + else() + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) + else() + find_package(Python3 REQUIRED COMPONENTS Interpreter Development) + endif() + endif() +endif() + add_subdirectory(thirdparty) add_subdirectory(src) add_subdirectory(samples) @@ -52,4 +66,9 @@ install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LIC install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) +# Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 +set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_licenses) +if(ENABLE_PYTHON) + list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) +endif() include(CPack) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 3d03a0d7a8..7427b624a5 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -12,17 +12,6 @@ FetchContent_GetProperties(pybind11) # search for FindPython3.cmake instead of legacy modules set(PYBIND11_FINDPYTHON ON) -# the following two calls are required for cross-compilation -if(OpenVINODeveloperPackage_DIR) - ov_find_python3(REQUIRED) - ov_detect_python_module_extension() -else() - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) - else() - find_package(Python3 REQUIRED COMPONENTS Interpreter Development) - endif() -endif() if(NOT pybind11_POPULATED) FetchContent_Populate(pybind11) add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) From 754f6d772003c4d9ceb17f85d535bfe1f1648803 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Tue, 16 Jul 2024 14:29:34 +0400 Subject: [PATCH 08/54] Save licensing_genai into docs to align with OpenVINO (#637) --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 908e353484..5f7390f981 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,8 +62,8 @@ add_subdirectory(src) add_subdirectory(samples) add_subdirectory(tests/cpp) -install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) -install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) +install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) 
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) # Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 From e5247e048c2c74e4236e0333dc1825adf1fccf7c Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Tue, 16 Jul 2024 18:54:46 +0400 Subject: [PATCH 09/54] Update submodule (#638) --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 880d569cd2..04795c1b78 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb +Subproject commit 04795c1b78c61e3294d1744c78a8ebb5e129256c From 2d1fa3b33fc3308f4cce9917829ad24346cc0901 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 17 Jul 2024 15:51:54 +0400 Subject: [PATCH 10/54] Add Llama3 (#620) Co-authored-by: Yaroslav Tarkan --- samples/cpp/beam_search_causal_lm/README.md | 2 +- samples/cpp/chat_sample/README.md | 2 +- samples/cpp/greedy_causal_lm/README.md | 2 +- samples/cpp/multinomial_causal_lm/README.md | 2 +- samples/cpp/prompt_lookup_decoding_lm/README.md | 2 +- samples/cpp/speculative_decoding_lm/README.md | 2 +- samples/python/beam_search_causal_lm/README.md | 2 +- samples/python/chat_sample/README.md | 2 +- samples/python/greedy_causal_lm/README.md | 2 +- samples/python/multinomial_causal_lm/README.md | 2 +- src/docs/SUPPORTED_MODELS.md | 14 +++++++++++++- 11 files changed, 23 insertions(+), 11 deletions(-) diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md index a104288911..82232c42f6 100644 --- a/samples/cpp/beam_search_causal_lm/README.md +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation C++ sample that supports most popular models like LLaMA 2 +# Text generation C++ sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a differnt one, GPU for example, from the command line interface. The sample fearures `ov::genai::LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md index 4baa8385ef..8a24b20005 100644 --- a/samples/cpp/chat_sample/README.md +++ b/samples/cpp/chat_sample/README.md @@ -1,4 +1,4 @@ -# C++ chat_sample that supports most popular models like LLaMA 2 +# C++ chat_sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. 
diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md index 3c0758ee6b..c0a7d5f3c4 100644 --- a/samples/cpp/greedy_causal_lm/README.md +++ b/samples/cpp/greedy_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 2 +# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md index 731d03e3c1..4478579919 100644 --- a/samples/cpp/multinomial_causal_lm/README.md +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 2 +# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md index 980c0cd19c..89a5e2c585 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/README.md +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -1,4 +1,4 @@ -# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 2 +# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3 [Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. 
diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md index 7abcb6782a..c86bd8b617 100644 --- a/samples/cpp/speculative_decoding_lm/README.md +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -1,4 +1,4 @@ -# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 2 +# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3 Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alonside with the main model. diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md index ff5286d010..5e80aa69da 100644 --- a/samples/python/beam_search_causal_lm/README.md +++ b/samples/python/beam_search_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation Python sample that supports most popular models like LLaMA 2 +# Text generation Python sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a differnt one, GPU for example, from the command line interface. The sample fearures `openvino_genai.LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md index 34d71fab8a..983789d0eb 100644 --- a/samples/python/chat_sample/README.md +++ b/samples/python/chat_sample/README.md @@ -1,4 +1,4 @@ -# Python chat_sample that supports most popular models like LLaMA 2 +# Python chat_sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md index 7c87b04aad..97b044eb51 100644 --- a/samples/python/greedy_causal_lm/README.md +++ b/samples/python/greedy_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 2 +# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. 
The sample fearures `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md index d76b933663..d39142f3de 100644 --- a/samples/python/multinomial_causal_lm/README.md +++ b/samples/python/multinomial_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 2 +# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 0e6099db03..3eb2af17b4 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -45,7 +45,19 @@ - LlamaForCausalLM + LlamaForCausalLM + Llama 3 + + + + + + Llama 2
    From 489a87d7c46960a0cb9920ac93333394c91d5306 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 17 Jul 2024 16:54:42 +0400 Subject: [PATCH 11/54] nightly->rc1 (#621) --- .github/workflows/causal_lm_cpp.yml | 66 +++++++++---------- .github/workflows/genai_package.yml | 18 ++--- .github/workflows/genai_python_lib.yml | 12 ++-- .github/workflows/lcm_dreamshaper_cpp.yml | 8 +-- .../workflows/stable_diffusion_1_5_cpp.yml | 4 +- src/README.md | 2 +- src/docs/BUILD.md | 8 +-- 7 files changed, 59 insertions(+), 59 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 18cc89a8f0..85bef624c8 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -13,9 +13,9 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240708_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240708_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -34,8 +34,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -77,8 +77,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -208,8 +208,8 @@ jobs: - name: Download, convert and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -253,8 +253,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -280,8 +280,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -308,8 +308,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r 
./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -336,8 +336,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -364,8 +364,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -401,8 +401,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -445,8 +445,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -493,8 +493,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -543,8 +543,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -603,8 +603,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release 
-j @@ -645,8 +645,8 @@ jobs: - name: Install dependencies and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -684,8 +684,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 06e589dfb9..2535e423d9 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240708_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240708_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip jobs: ubuntu_genai_package: strategy: @@ -28,8 +28,8 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && 
cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace @@ -57,8 +57,8 @@ jobs: - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace @@ -99,8 +99,8 @@ jobs: shell: bash - run: call ov\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: call ov\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - run: 
call ov\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: call ov\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: call ov\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 34d5fbf924..e0c43bddd5 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/l_openvino_toolkit_centos7_2024.3.0.dev20240708_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240708_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip + l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240711_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip jobs: ubuntu_genai_python_lib: # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. @@ -29,7 +29,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . 
--verbose - run: python -m pytest ./tests/python_tests/ @@ -52,7 +52,7 @@ jobs: - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . --verbose - run: python -c "from openvino_genai import LLMPipeline" @@ -79,7 +79,7 @@ jobs: shell: bash - run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j - - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/ - run: call ./ov/setupvars.bat && python -m pip install . 
--verbose diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 2d450ad9c8..82a74f8cdf 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -50,8 +50,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -95,8 +95,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index cda567c23b..5197b27da8 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -49,8 +49,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/src/README.md b/src/README.md index c67a60eaec..445b88aa58 100644 --- a/src/README.md +++ b/src/README.md @@ -23,7 +23,7 @@ To build OpenVINO™ GenAI library from source, refer to the [Build Instructions > git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git > cd openvino.genai > # Install python dependencies - > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release > python -m pip install 
--upgrade-strategy eager -r ./samples/requirements.txt > ``` diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 710428139e..1aee73bfb0 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -18,7 +18,7 @@ 2. Download OpenVINO archive and install dependencies: ```sh mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15805-6138d624dc1/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240626_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh ``` 3. Build the project: @@ -48,9 +48,9 @@ 2. Download OpenVINO archive and install dependencies: ```sh mkdir ./ov/ - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15805-6138d624dc1/w_openvino_toolkit_windows_2024.3.0.dev20240626_x86_64.zip + curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip unzip ov.zip - mklink /D ov w_openvino_toolkit_windows_2024.3.0.dev20240626_x86_64 + mklink /D ov w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64 ``` 3. Build the project: ```sh @@ -85,7 +85,7 @@ 2. Download OpenVINO archive and install dependencies: ```sh mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15805-6138d624dc1/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240626_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz ``` 3. 
Build the project: ```sh From 67f04675a03b2774babc1e0358407ba67f49cb45 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 17 Jul 2024 19:17:11 +0400 Subject: [PATCH 12/54] Add OpenVINOGenAITargets to core_genai_dev COMPONENT (#642) OpenVINOGenAITargets.cmake was excluded from packaging because CPACK_COMPONENTS_ALL is custom now and doesn't install Unspecified component --- CMakeLists.txt | 2 +- src/cpp/CMakeLists.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f7390f981..7059324d84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT lice set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) # Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 -set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_licenses) +set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_docs) if(ENABLE_PYTHON) list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) endif() diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 454c53b944..c140bf9ac7 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -103,7 +103,8 @@ install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake - NAMESPACE openvino:: DESTINATION runtime/cmake) + NAMESPACE openvino:: DESTINATION runtime/cmake + COMPONENT core_genai_dev) include(CMakePackageConfigHelpers) configure_package_config_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/OpenVINOGenAIConfig.cmake.in" From 19691609512f7c7d344cdf19cd8d36db30b6c574 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Mon, 22 Jul 2024 12:46:25 +0400 Subject: [PATCH 13/54] Apply todo, initialize detokenizer's cache (#647) --- src/cpp/src/tokenizer.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 9b4a206a1e..ac6b925dcb 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -98,8 +98,11 @@ class Tokenizer::TokenizerImpl { device).create_infer_request(); // Get special token ids by inference if they are not defined. - // todo: do not call until CVS-143410 is resolved - // infer_special_tokens_if_necessary(); + infer_special_tokens_if_necessary(); + // Initialize tokenizer's cache to save time later. + // infer_special_tokens_if_necessary() already could do that + // but it didn't run decode() for sure. 
+ decode(encode("").input_ids); } // load special tokens ids from config.json From 0e0f6a9c6cde08835dd579b20e76149f9fc17545 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Mon, 22 Jul 2024 11:04:19 +0100 Subject: [PATCH 14/54] Cherry-pick static LLM pipeline changes (#654) Co-authored-by: Pavel Esir --- samples/cpp/chat_sample/chat_sample.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 88 +++++++++++++++++++------ src/cpp/src/llm_pipeline_static.hpp | 12 ++-- 3 files changed, 74 insertions(+), 28 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index d9d9c2b2de..ae4dad88a2 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -10,7 +10,7 @@ int main(int argc, char* argv[]) try { std::string prompt; std::string model_path = argv[1]; - std::string device = "CPU"; // GPU can be used as well + std::string device = "CPU"; // GPU, NPU can be used as well ov::genai::LLMPipeline pipe(model_path, "CPU"); ov::genai::GenerationConfig config; diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 3a9ea4d1d9..3f50d30ec9 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -8,6 +8,8 @@ #include "text_callback_streamer.hpp" #include "utils.hpp" +#include + namespace { std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr& model) { @@ -75,25 +77,42 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void fill_tensor(ov::Tensor tensor, int64_t fill_val) { +void fill_tensor(ov::Tensor tensor, int64_t fill_val, size_t offset = 0u) { int64_t* tensor_data = tensor.data(); - std::fill(tensor_data, tensor_data + tensor.get_size(), fill_val); + std::fill(tensor_data + offset, tensor_data + tensor.get_size(), fill_val); } -void copy_with_left_offset(const ov::Tensor& orig, ov::Tensor& padded) { - const auto orig_size = orig.get_size(); - const auto padded_size = padded.get_size(); - const auto kLeftOffset = padded_size - orig_size; +void copy_with_offset(const ov::Tensor& orig, const int32_t offset, ov::Tensor& padded) { int64_t* orig_data = orig.data(); int64_t* padded_data = padded.data(); - std::copy(orig_data, orig_data + orig_size, padded_data + kLeftOffset); + std::copy(orig_data, orig_data + orig.get_size(), padded_data + offset); } -ov::AnyMap extract_config_or_empty(const ov::AnyMap& config, const std::string& config_name) { +ov::AnyMap extract_config_or_default(const ov::AnyMap& config, const std::string& config_name) { ov::AnyMap stage_cfg; if (auto it = config.find(config_name); it != config.end()) { const auto& map = it->second.as>(); stage_cfg = { map.begin(), map.end() }; + } else if (config_name == "PREFILL_CONFIG") { + std::map prefill_config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } + }; + stage_cfg.insert(prefill_config.begin(), prefill_config.end()); + } else if (config_name == "GENERATE_CONFIG") { + std::map generate_config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, + { "NPUW_PARALLEL_COMPILE", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" } + }; + stage_cfg.insert(generate_config.begin(), generate_config.end()); } return stage_cfg; } @@ -126,7 +145,8 @@ 
StaticLLMPipeline::StaticLLMPipeline( ov::Core core; // (1) Read the template model - this will be kvcache model auto kvcache_model = core.read_model(path / "openvino_model.xml"); - // (2) TODO: Expose KV-cache input and output layers from kvcache model + // (2) Expose KV-cache input and output layers from kvcache model + ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Clone the model - this will be prefill auto prefill_model = kvcache_model->clone(); prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); @@ -140,10 +160,10 @@ StaticLLMPipeline::StaticLLMPipeline( kvcache_model = add_slices_to_kvcache_inputs(kvcache_model); // (6) Compile both model m_prefill_request = core.compile_model( - prefill_model, device, extract_config_or_empty(config, "PREFILL_CONFIG") + prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( - kvcache_model, device, extract_config_or_empty(config, "GENERATE_CONFIG") + kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") ).create_infer_request(); // (7) Initialize tensors prepare_for_new_conversation(); @@ -156,6 +176,18 @@ StaticLLMPipeline::StaticLLMPipeline( ) : StaticLLMPipeline(path, path.string(), device, config) { } +void StaticLLMPipeline::start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; +}; + +void StaticLLMPipeline::finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); +}; + void StaticLLMPipeline::prepare_for_new_conversation() { fill_tensor(m_prefill_request.get_tensor("input_ids"), m_tokenizer.get_pad_token_id()); fill_tensor(m_prefill_request.get_tensor("position_ids"), 0u); @@ -175,9 +207,23 @@ DecodedResults StaticLLMPipeline::generate( } OPENVINO_ASSERT(std::holds_alternative(inputs)); - auto tokenized_input = m_tokenizer.encode(std::get(inputs)); + auto& prompt = std::get(inputs); + + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + + auto tokenized_input = m_tokenizer.encode(prompt); auto encoded_results = generate(tokenized_input, config, streamer); - return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + + if (m_is_chat_conversation) { + auto answer = decoded_results.texts[0]; + m_history.push_back({{"role", "assistant"}, {"content", answer}}); + } + return decoded_results; } EncodedResults StaticLLMPipeline::generate( @@ -222,22 +268,25 @@ EncodedResults StaticLLMPipeline::generate( ov::genai::EncodedResults results; // NB: Only batch=1 is supported now results.scores.resize(1u); + results.scores[0] = 0u; results.tokens.resize(1u); - // NB: Check if input prompt less than maximum size + // NB: Check if there is enough space in KV-cache to process input prompt auto prompt_len = input_ids.get_size(); if (prompt_len > m_kvcache_desc.total_size) { OPENVINO_THROW("Currently static pipeline only process up to " + std::to_string(m_kvcache_desc.total_size) + " tokens"); } - // NB: Reset tensors on every generate call - chat conversation isn't supported yet! 
+ // NB: From the "generate" perspective, every call is treated as start of new conversation, + // but if continuation is needed, prompt contains information about the entire conversation. prepare_for_new_conversation(); auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); - copy_with_left_offset(input_ids, padded_input_ids); + const size_t offset = padded_input_ids.get_size() - input_ids.get_size(); + copy_with_offset(input_ids, offset, padded_input_ids); auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask"); - copy_with_left_offset(attention_mask, padded_attention_mask); + fill_tensor(padded_attention_mask, 1u, offset); auto padded_position_ids = m_prefill_request.get_tensor("position_ids"); auto* padded_pos_data = padded_position_ids.data(); @@ -248,13 +297,13 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += prompt_len; int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; } padded_attention_mask.copy_to(m_kvcache_request.get_tensor("attention_mask")); - // Inputs: input_ids, attention_mask, position_ids, ... // Outputs: logits, ... const auto kStartInputKVCacheLayers = 3u; @@ -286,13 +335,12 @@ EncodedResults StaticLLMPipeline::generate( last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); results.tokens[0].push_back(last_token); - results.scores[0] = 0u; if (streamer_ptr && streamer_ptr->put(last_token)) { break; } - if (last_token == m_generation_config.eos_token_id) { + if (last_token == config.eos_token_id && !config.ignore_eos) { break; } diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 8c2f19ffa7..85488e1880 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -35,13 +35,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { StreamerVariant streamer ) override; - void start_chat(const std::string& system_message) override { - OPENVINO_THROW("Currently chat conversation mode isn't supported"); - }; - void finish_chat() override { - OPENVINO_THROW("Currently chat conversation mode isn't supported"); - }; - + void start_chat(const std::string& system_message) override; + void finish_chat() override; private: void prepare_for_new_conversation(); @@ -54,6 +49,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; + + bool m_is_chat_conversation = false; + ChatHistory m_history; }; } // namespace genai From cb100cb3bc7459bb489154937b3a076c5bd9f1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Thu, 11 Jul 2024 16:50:27 +0200 Subject: [PATCH 15/54] [Continuous batching] Replace standard max_element call with custom loop for greedy sampling (#607) Searching for max element in a custom loop gives better performance than using std::max_element --- src/cpp/src/sampler.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index dc631c68ac..6390fc8725 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -219,8 +219,13 @@ class Sampler { } Token _greedy_sample(const std::vector& logit_vector) const { - auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, 
const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); - return *out_token; + Token max_token{-std::numeric_limits<float>::infinity() , 0}; + for (const auto& logit : logit_vector) { + if (logit.m_log_prob > max_token.m_log_prob) { + max_token = logit; + } + } + return max_token; } std::vector<Token> _multinomial_sample(const std::vector<Token>& logit_vector, size_t num_tokens_per_sequence) { From f0e41909ab06e22c569f1af54654aad521ce4a6e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 12 Jul 2024 12:21:38 +0200 Subject: [PATCH 16/54] wip --- samples/CMakeLists.txt | 1 + .../benchmark_vanilla_genai/CMakeLists.txt | 25 +++++++ samples/cpp/benchmark_vanilla_genai/README.md | 2 + .../benchmark_vanilla_genai.cpp | 65 +++++++++++++++++++ .../openvino/genai/generation_metrics.hpp | 40 ++++++++++++ .../include/openvino/genai/llm_pipeline.hpp | 4 ++ src/cpp/src/generation_metrics.cpp | 62 ++++++++++++++++++ src/cpp/src/greedy_decoding.cpp | 17 ++++- src/cpp/src/llm_pipeline.cpp | 10 ++- src/cpp/src/llm_pipeline_base.hpp | 2 + 10 files changed, 223 insertions(+), 5 deletions(-) create mode 100644 samples/cpp/benchmark_vanilla_genai/CMakeLists.txt create mode 100644 samples/cpp/benchmark_vanilla_genai/README.md create mode 100644 samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp create mode 100644 src/cpp/include/openvino/genai/generation_metrics.hpp create mode 100644 src/cpp/src/generation_metrics.cpp diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 0839d58428..44f8d580b2 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) +add_subdirectory(cpp/benchmark_vanilla_genai) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) diff --git a/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt b/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt new file mode 100644 index 0000000000..e871f5a33a --- /dev/null +++ b/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO.
+) + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp) +target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_vanilla_genai PROPERTIES + COMPILE_PDB_NAME benchmark_vanilla_genai + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11) +install(TARGETS benchmark_vanilla_genai + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_vanilla_genai/README.md b/samples/cpp/benchmark_vanilla_genai/README.md new file mode 100644 index 0000000000..739c2e950c --- /dev/null +++ b/samples/cpp/benchmark_vanilla_genai/README.md @@ -0,0 +1,2 @@ +# benchmark OpenVINO GenAI sample + diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp new file mode 100644 index 0000000000..ccb7650b84 --- /dev/null +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include <cxxopts.hpp> + +int main(int argc, char* argv[]) try { + cxxopts::Options options("benchmark_vanilla_genai", "Help command"); + + options.add_options() + ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because")) + ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(".")) + ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(1))) + ("d,device", "device", cxxopts::value<std::string>()->default_value("CPU")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() << "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + std::string prompt = result["prompt"].as<std::string>(); + const std::string model_path = result["model"].as<std::string>(); + std::string device = result["device"].as<std::string>(); + size_t num_warmup = result["num_warmup"].as<size_t>(); + size_t num_iter = result["num_iter"].as<size_t>(); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + + ov::genai::LLMPipeline pipe(model_path, device); + + for (size_t i = 0; i < num_warmup; i++) + pipe.generate(prompt, config); + + ov::genai::GenerationMetrics metrics; + for (size_t i = 0; i < num_iter; i++) { + ov::genai::DecodedResults res = pipe.generate(prompt, config); + metrics = metrics + res.metrics; + metrics.load_time = res.metrics.load_time; + } + + std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; + std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; + std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl; + std::cout << "Tokens/s: " << metrics.get_tokens_per_sec().first << std::endl; + + return 0; +} catch (const std::exception& error) { + std::cerr << error.what() << '\n';
return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/src/cpp/include/openvino/genai/generation_metrics.hpp b/src/cpp/include/openvino/genai/generation_metrics.hpp new file mode 100644 index 0000000000..7129e5c52b --- /dev/null +++ b/src/cpp/include/openvino/genai/generation_metrics.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +namespace ov { +namespace genai { + +using TimePoints = std::vector; + +struct GenerationMetrics { + GenerationMetrics() = default; + + GenerationMetrics(const TimePoints& tok_times, size_t batch_size = 1); + GenerationMetrics(const std::vector& durations, const std::vector& times_to_first_token, size_t batch_size = 1); + + // First token time. + float mean_ttft; + float std_ttft; + std::vector times_to_first_token; + + // Time per output token. + float mean_tpot; + float std_tpot; + std::vector durations; + + std::pair get_tokens_per_sec() const; + size_t batch_size; + float load_time; + + GenerationMetrics operator+(GenerationMetrics const& metrics) const; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 84dc02bd58..9f0c9fba97 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -5,11 +5,13 @@ #include #include +#include #include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/generation_metrics.hpp" namespace ov { namespace genai { @@ -34,6 +36,7 @@ class EncodedResults { public: std::vector> tokens; std::vector scores; + GenerationMetrics metrics; }; /** @@ -47,6 +50,7 @@ class DecodedResults { public: std::vector texts; std::vector scores; + GenerationMetrics metrics; // @brief Convert DecodedResults to a string. 
operator std::string() const { diff --git a/src/cpp/src/generation_metrics.cpp b/src/cpp/src/generation_metrics.cpp new file mode 100644 index 0000000000..8ca8e0a07d --- /dev/null +++ b/src/cpp/src/generation_metrics.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/generation_metrics.hpp" +#include + +namespace { + +std::pair calc_mean_and_std(const std::vector& durations) { + float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); + + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const float& duration) -> float { + return acc + duration * duration; + }); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + return {mean, std}; +} + +} // namespace + +namespace ov { +namespace genai { + + +GenerationMetrics::GenerationMetrics(const TimePoints& tok_times, size_t batch_size) { + this->batch_size = batch_size; + durations = std::vector(tok_times.size() - 1); + for (size_t i = 1; i < tok_times.size(); ++i) { + durations[i - 1] = std::chrono::duration_cast(tok_times[i] - tok_times[i - 1]).count(); + } + times_to_first_token.emplace_back(durations[0]); + + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); +} + +GenerationMetrics::GenerationMetrics(const std::vector& durations_, const std::vector& times_to_first_token_, size_t batch_size) + : durations(durations_), times_to_first_token(times_to_first_token_) { + this->batch_size = batch_size; + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); +} + +GenerationMetrics GenerationMetrics::operator+(GenerationMetrics const& metrics) const { + std::vector new_durations = durations; + std::vector new_times_to_first_token = times_to_first_token; + new_durations.insert(new_durations.end(), metrics.durations.begin(), metrics.durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), metrics.times_to_first_token.begin(), metrics.times_to_first_token.end()); + + return GenerationMetrics(new_durations, new_times_to_first_token); +} + +std::pair GenerationMetrics::get_tokens_per_sec() const { + auto mean_tps = 1000.0f * batch_size / mean_tpot; + auto std_tps = 1000.0f * std_tpot / (mean_tpot * mean_tpot); + return {mean_tps, std_tps}; +} + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 9170c7d2f9..dad93a0e6e 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -19,12 +19,18 @@ EncodedResults greedy_decoding( const size_t batch_size = prompts_shape[0]; size_t running_batch_size = batch_size; size_t prompt_len = prompts_shape[1]; + size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); EncodedResults results; + // Time before the first token generated as a reference point. 
+ ov::genai::TimePoints tok_times; + tok_times.reserve(max_new_tokens); + tok_times.emplace_back(std::chrono::steady_clock::now()); + results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); - + m_model_runner.set_tensor("input_ids", input_ids); m_model_runner.set_tensor("attention_mask", attention_mask); if (position_ids.has_value()) @@ -50,6 +56,8 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + tok_times.emplace_back(std::chrono::steady_clock::now()); + if (streamer && streamer->put(token_iter_results[0])) { return results; } @@ -58,8 +66,8 @@ EncodedResults greedy_decoding( if (!generation_config.ignore_eos && all_are_eos) return results; - size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); - for (size_t i = 0; i < max_tokens - 1; ++i) { + + for (size_t i = 0; i < max_new_tokens - 1; ++i) { if (position_ids.has_value()) utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); @@ -80,6 +88,7 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + tok_times.emplace_back(std::chrono::steady_clock::now()); if (streamer && streamer->put(token_iter_results[0])) return results; @@ -106,6 +115,8 @@ EncodedResults greedy_decoding( if (streamer) { streamer->end(); } + + results.metrics = GenerationMetrics(tok_times); return results; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 507d988a6a..918e744286 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -9,6 +9,7 @@ #include #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/generation_metrics.hpp" #include "llm_pipeline_base.hpp" #include "llm_pipeline_static.hpp" #include "utils.hpp" @@ -155,6 +156,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + decoded_results.metrics = std::move(encoded_results.metrics); + decoded_results.metrics.load_time = m_load_time_ms; return decoded_results; } @@ -253,7 +256,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } - return result; } @@ -350,6 +352,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ) { + if (device == "NPU") { m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); } else { @@ -361,12 +364,15 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -) { +) { + auto start_time = std::chrono::steady_clock::now(); if (device == "NPU") { m_pimpl = make_unique(std::filesystem::path(path), device, config); } else { m_pimpl = make_unique(std::filesystem::path(path), device, config); } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp index 9df6442b35..7e58cd3b37 100644 --- 
a/src/cpp/src/llm_pipeline_base.hpp +++ b/src/cpp/src/llm_pipeline_base.hpp @@ -36,6 +36,8 @@ class LLMPipelineImplBase { Tokenizer m_tokenizer; GenerationConfig m_generation_config; + + float m_load_time_ms = 0; }; } // namespace genai From 7cab496c63a598dcb96027c9a88d3c96ef1b5b48 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 19 Jul 2024 13:01:02 +0200 Subject: [PATCH 17/54] add detokenization metric; refactor split to perf_conter & perf_metrics --- .../benchmark_vanilla_genai.cpp | 8 +- .../openvino/genai/generation_metrics.hpp | 40 --------- .../include/openvino/genai/llm_pipeline.hpp | 6 +- .../include/openvino/genai/perf_metrics.hpp | 50 ++++++++++++ src/cpp/src/generation_metrics.cpp | 62 -------------- src/cpp/src/greedy_decoding.cpp | 19 ++--- src/cpp/src/group_beam_searcher.cpp | 19 +++-- src/cpp/src/llm_pipeline.cpp | 30 +++++-- src/cpp/src/perf_counters.cpp | 21 +++++ src/cpp/src/perf_counters.hpp | 44 ++++++++++ src/cpp/src/perf_metrics.cpp | 81 +++++++++++++++++++ src/cpp/src/tokenizer.cpp | 2 + src/cpp/src/utils.hpp | 14 ++++ src/python/py_generate_pipeline.cpp | 14 ++++ tests/python_tests/ov_genai_test_utils.py | 2 + 15 files changed, 282 insertions(+), 130 deletions(-) delete mode 100644 src/cpp/include/openvino/genai/generation_metrics.hpp create mode 100644 src/cpp/include/openvino/genai/perf_metrics.hpp delete mode 100644 src/cpp/src/generation_metrics.cpp create mode 100644 src/cpp/src/perf_counters.cpp create mode 100644 src/cpp/src/perf_counters.hpp create mode 100644 src/cpp/src/perf_metrics.cpp diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp index ccb7650b84..6489282b0b 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -37,23 +37,25 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = 100; + config.num_beam_groups = 3; + config.num_beams = 15; ov::genai::LLMPipeline pipe(model_path, device); for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, config); - ov::genai::GenerationMetrics metrics; + ov::genai::PerfMetrics metrics; for (size_t i = 0; i < num_iter; i++) { ov::genai::DecodedResults res = pipe.generate(prompt, config); metrics = metrics + res.metrics; metrics.load_time = res.metrics.load_time; } - + std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl; - std::cout << "Tokens/s: " << metrics.get_tokens_per_sec().first << std::endl; + std::cout << "Tokens/s: " << metrics.mean_throughput << std::endl; return 0; } catch (const std::exception& error) { diff --git a/src/cpp/include/openvino/genai/generation_metrics.hpp b/src/cpp/include/openvino/genai/generation_metrics.hpp deleted file mode 100644 index 7129e5c52b..0000000000 --- a/src/cpp/include/openvino/genai/generation_metrics.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include - -namespace ov { -namespace genai { - -using TimePoints = std::vector; - -struct GenerationMetrics { - GenerationMetrics() = default; - - GenerationMetrics(const TimePoints& tok_times, size_t batch_size = 1); - GenerationMetrics(const std::vector& 
durations, const std::vector& times_to_first_token, size_t batch_size = 1); - - // First token time. - float mean_ttft; - float std_ttft; - std::vector times_to_first_token; - - // Time per output token. - float mean_tpot; - float std_tpot; - std::vector durations; - - std::pair get_tokens_per_sec() const; - size_t batch_size; - float load_time; - - GenerationMetrics operator+(GenerationMetrics const& metrics) const; -}; - -} // namespace genai -} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 9f0c9fba97..4db3c613e7 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -11,7 +11,7 @@ #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" -#include "openvino/genai/generation_metrics.hpp" +#include "openvino/genai/perf_metrics.hpp" namespace ov { namespace genai { @@ -36,7 +36,7 @@ class EncodedResults { public: std::vector> tokens; std::vector scores; - GenerationMetrics metrics; + PerfMetrics metrics; }; /** @@ -50,7 +50,7 @@ class DecodedResults { public: std::vector texts; std::vector scores; - GenerationMetrics metrics; + PerfMetrics metrics; // @brief Convert DecodedResults to a string. operator std::string() const { diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp new file mode 100644 index 0000000000..a11c4e0374 --- /dev/null +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "openvino/genai/visibility.hpp" +#include +#include + +namespace ov { +namespace genai { + +using TimePoint = std::chrono::steady_clock::time_point; + +struct PerfCounters; + +struct OPENVINO_GENAI_EXPORTS PerfMetrics { + // First token time. + float mean_ttft; + float std_ttft; + + // Time per output token. 
+ float mean_tpot; + float std_tpot; + + float load_time; + float start_time; + + float mean_generate_duration; + float mean_decoding_duration; + float mean_encoding_duration; + + float mean_throughput; + float std_throughput; + + size_t num_generated_tokens; + size_t num_input_tokens; + + std::shared_ptr m_counters; + void evaluate(TimePoint start_time); + + PerfMetrics operator+(const PerfMetrics& metrics) const; + PerfMetrics& operator+=(const PerfMetrics& right); + + +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/generation_metrics.cpp b/src/cpp/src/generation_metrics.cpp deleted file mode 100644 index 8ca8e0a07d..0000000000 --- a/src/cpp/src/generation_metrics.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "openvino/genai/generation_metrics.hpp" -#include - -namespace { - -std::pair calc_mean_and_std(const std::vector& durations) { - float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); - - float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, - [](const float& acc, const float& duration) -> float { - return acc + duration * duration; - }); - float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); - return {mean, std}; -} - -} // namespace - -namespace ov { -namespace genai { - - -GenerationMetrics::GenerationMetrics(const TimePoints& tok_times, size_t batch_size) { - this->batch_size = batch_size; - durations = std::vector(tok_times.size() - 1); - for (size_t i = 1; i < tok_times.size(); ++i) { - durations[i - 1] = std::chrono::duration_cast(tok_times[i] - tok_times[i - 1]).count(); - } - times_to_first_token.emplace_back(durations[0]); - - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); -} - -GenerationMetrics::GenerationMetrics(const std::vector& durations_, const std::vector& times_to_first_token_, size_t batch_size) - : durations(durations_), times_to_first_token(times_to_first_token_) { - this->batch_size = batch_size; - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); -} - -GenerationMetrics GenerationMetrics::operator+(GenerationMetrics const& metrics) const { - std::vector new_durations = durations; - std::vector new_times_to_first_token = times_to_first_token; - new_durations.insert(new_durations.end(), metrics.durations.begin(), metrics.durations.end()); - new_times_to_first_token.insert(new_times_to_first_token.end(), metrics.times_to_first_token.begin(), metrics.times_to_first_token.end()); - - return GenerationMetrics(new_durations, new_times_to_first_token); -} - -std::pair GenerationMetrics::get_tokens_per_sec() const { - auto mean_tps = 1000.0f * batch_size / mean_tpot; - auto std_tps = 1000.0f * std_tpot / (mean_tpot * mean_tpot); - return {mean_tps, std_tps}; -} - - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index dad93a0e6e..0802b87e66 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -1,7 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include "perf_counters.hpp" #include "utils.hpp" namespace ov { @@ -22,11 +23,8 @@ EncodedResults greedy_decoding( 
size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); EncodedResults results; - // Time before the first token generated as a reference point. - ov::genai::TimePoints tok_times; - tok_times.reserve(max_new_tokens); - tok_times.emplace_back(std::chrono::steady_clock::now()); - + auto& perf_counters = results.metrics.m_counters; + results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); @@ -56,8 +54,8 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - tok_times.emplace_back(std::chrono::steady_clock::now()); - + perf_counters->add_timestamp(running_batch_size); + if (streamer && streamer->put(token_iter_results[0])) { return results; } @@ -88,7 +86,7 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - tok_times.emplace_back(std::chrono::steady_clock::now()); + perf_counters->add_timestamp(running_batch_size); if (streamer && streamer->put(token_iter_results[0])) return results; @@ -116,9 +114,8 @@ EncodedResults greedy_decoding( streamer->end(); } - results.metrics = GenerationMetrics(tok_times); return results; } } //namespace genai -} //namespace ov \ No newline at end of file +} //namespace ov diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 8695aeac02..4f5cb79f2a 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -362,14 +362,20 @@ std::pair beam_search(ov::InferRequest& lm, std::optional selected_beam_idx) { OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); - - // Initialize beam search + auto batch_size = input_ids.get_shape().at(0); + auto sequence_length = input_ids.get_shape().at(1); + + // Initialize time metric counters. + // ov::genai::TimePoints tok_times; + // tok_times.reserve(config.get_max_new_tokens(sequence_length)); + // tok_times.emplace_back(std::chrono::steady_clock::now()); + + // Initialize beam search. const int64_t* prompt_data = input_ids.data(); std::vector> prompts; prompts.reserve(batch_size); for (size_t batch = 0; batch < batch_size; batch++) { - size_t sequence_length = input_ids.get_shape().at(1); size_t batch_offset = batch * sequence_length; const int64_t* prompt_start = prompt_data + batch_offset; prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); @@ -389,7 +395,7 @@ std::pair beam_search(ov::InferRequest& lm, lm.set_tensor("beam_idx", beam_idx); Parameters parameters{std::move(prompts)}; - parameters.max_new_tokens = config.max_new_tokens; + parameters.max_new_tokens = config.get_max_new_tokens(sequence_length); parameters.eos_token_id = config.eos_token_id; parameters.n_groups = config.num_beam_groups; parameters.group_size = config.num_beams / config.num_beam_groups; @@ -406,6 +412,8 @@ std::pair beam_search(ov::InferRequest& lm, lm.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + // tok_times.emplace_back(std::chrono::steady_clock::now()); + if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { // Break the cycle before masks are extended in update_attention_mask_with_beams. // If generation is continued, attention_mask length should be equal to KV cache size. 
@@ -462,7 +470,8 @@ std::pair beam_search(ov::InferRequest& lm, results.tokens.push_back(std::move(beam->get().tokens)); } } - + + // results.metrics = PerfCounters(tok_times); return {results, res_selected_beam_idx}; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 918e744286..81f807c149 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include "perf_counters.hpp" #include #include #include @@ -9,7 +10,7 @@ #include #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" -#include "openvino/genai/generation_metrics.hpp" +#include "openvino/genai/perf_metrics.hpp" #include "llm_pipeline_base.hpp" #include "llm_pipeline_static.hpp" #include "utils.hpp" @@ -111,8 +112,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + auto start_time = std::chrono::steady_clock::now(); GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; - EncodedInputs encoded_input; + TokenizedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { encoded_input = m_tokenizer.encode(*input_vector); @@ -144,9 +146,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { encoded_input = m_tokenizer.encode(prompt); } } + auto encode_stop_time = std::chrono::steady_clock::now(); + auto encoded_results = generate(encoded_input, config, streamer); - auto encoded_results = generate(encoded_input, config, streamer); + auto decode_start_time = std::chrono::steady_clock::now(); DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); if (is_chat_conversation) { // Tail of chat template is missing in KV cache. 
@@ -155,9 +160,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_templated_chat_history.append(answer); m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + + auto& metrics = encoded_results.metrics; + // metrics.tokenization_duration = std::chrono::duration_cast(encode_stop_time - start_time).count(); + // metrics.detokenization_duration = std::chrono::duration_cast(decode_stop_time - decode_start_time).count(); - decoded_results.metrics = std::move(encoded_results.metrics); - decoded_results.metrics.load_time = m_load_time_ms; + // auto stop_time = std::chrono::steady_clock::now(); + // metrics.generate_durations.emplace_back(std::chrono::duration_cast(stop_time - start_time).count()); + decoded_results.metrics = std::move(metrics); return decoded_results; } @@ -166,9 +176,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + auto start_time = std::chrono::steady_clock::now(); ov::Tensor input_ids; ov::Tensor attention_mask; - if (auto data = std::get_if(&inputs)) { input_ids = *data; attention_mask = ov::genai::utils::init_attention_mask(input_ids); @@ -256,6 +266,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } + + + + auto& metrics = result.metrics; + // metrics.batch_size = batch_size; + // metrics.num_generated_tokens = (metrics.m_durations.size() + 1) * batch_size; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(0); + result.metrics = std::move(metrics); return result; } diff --git a/src/cpp/src/perf_counters.cpp b/src/cpp/src/perf_counters.cpp new file mode 100644 index 0000000000..c9dac6eca0 --- /dev/null +++ b/src/cpp/src/perf_counters.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "perf_counters.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include "openvino/openvino.hpp" +#include +#include +#include + +namespace ov { +namespace genai { + +void PerfCounters::add_timestamp(size_t batch_size) { + m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + m_batch_sizes.emplace_back(batch_size); +} + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/perf_counters.hpp b/src/cpp/src/perf_counters.hpp new file mode 100644 index 0000000000..7d33490205 --- /dev/null +++ b/src/cpp/src/perf_counters.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace ov { +namespace genai { + +struct PerfCounters { + std::vector generate_durations; + std::vector tokenization_duration; + std::vector detokenization_duration; + size_t num_generated_tokens; + size_t num_input_tokens; + + std::vector m_batch_sizes; + std::vector m_durations; + std::vector m_times_to_first_token; + std::vector m_new_token_times; + void add_timestamp(size_t batch_size); + // void add_gen_finish_timestamp(size_t batch_size); + +}; + +// class StopWatch { +// TimePoint m_start; +// public: +// StopWatch& start() { +// m_start = std::chrono::steady_clock::now(); +// return *this; +// } + +// float split() { +// std::chrono::steady_clock::time_point curr_time = std::chrono::steady_clock::now(); +// return std::chrono::duration_cast(curr_time - m_start).count(); +// } +// }; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp new file mode 
100644 index 0000000000..4a8b1d76c6 --- /dev/null +++ b/src/cpp/src/perf_metrics.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/perf_metrics.hpp" +#include "perf_counters.hpp" +#include "openvino/openvino.hpp" +#include +#include +#include + +namespace { + +std::pair calc_mean_and_std(const std::vector& durations) { + float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); + + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const float& duration) -> float { + return acc + duration * duration; + }); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + return {mean, std}; +} + + +} // namespace + +namespace ov { +namespace genai { + +void PerfMetrics::evaluate(TimePoint start_time) { + + auto& tok_times = m_counters->m_new_token_times; + auto& batch_sizes = m_counters->m_batch_sizes; + m_counters->m_durations = std::vector(tok_times.size()); + + auto ttft = std::chrono::duration_cast(tok_times[0] - start_time).count(); + m_counters->m_times_to_first_token.emplace_back(ttft); + + for (size_t i = 0; i < tok_times.size(); ++i) { + m_counters->m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time).count(); + // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. + // todo: float check that it's valid for batch > 1. + m_counters->m_durations[i] /= batch_sizes[i]; + start_time = tok_times[i]; + } + + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(m_counters->m_durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(m_counters->m_times_to_first_token); +} + +PerfMetrics PerfMetrics::operator+(const PerfMetrics& metrics) const { + PerfMetrics nm; // new metrics + nm.m_counters = m_counters; + auto& new_counters = nm.m_counters; + + auto& new_durations = new_counters->m_durations; + auto& new_times_to_first_token = new_counters->m_times_to_first_token; + + auto& counters_to_appnd = metrics.m_counters; + new_durations.insert(new_durations.end(), counters_to_appnd->m_durations.begin(), counters_to_appnd->m_durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), counters_to_appnd->m_times_to_first_token.begin(), counters_to_appnd->m_times_to_first_token.end()); + + OPENVINO_ASSERT(metrics.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); + + std::tie(nm.mean_tpot, nm.std_tpot) = calc_mean_and_std(new_counters->m_durations); + std::tie(nm.mean_ttft, nm.std_ttft) = calc_mean_and_std(new_counters->m_times_to_first_token); + + // todo: add tokenization statistics concatenation. 
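// Editor's note: a standalone sketch of what evaluate() above derives from the raw counters:
// time to first token (TTFT), per-step durations normalized by batch size (TPOT samples), and
// their mean/std via the identity Var(X) = E[X^2] - E[X]^2. The helper name and the timestamps
// are fabricated for illustration; this is not the pipeline code itself.
#include <chrono>
#include <cmath>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

using Clock = std::chrono::steady_clock;

static std::pair<float, float> mean_and_std(const std::vector<float>& xs) {
    const float mean = std::accumulate(xs.begin(), xs.end(), 0.0f) / xs.size();
    const float sum_sq = std::accumulate(xs.begin(), xs.end(), 0.0f,
                                         [](float acc, float x) { return acc + x * x; });
    return {mean, std::sqrt(sum_sq / xs.size() - mean * mean)};
}

int main() {
    const auto start = Clock::now();
    // Pretend the first token arrived after 80 ms and two more steps finished 50 ms apart,
    // one token per step.
    std::vector<Clock::time_point> token_times{start + std::chrono::milliseconds(80),
                                               start + std::chrono::milliseconds(130),
                                               start + std::chrono::milliseconds(180)};
    std::vector<size_t> batch_sizes{1, 1, 1};

    const float ttft_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(token_times[0] - start).count();

    std::vector<float> durations_ms;
    auto prev = start;
    for (size_t i = 0; i < token_times.size(); ++i) {
        float d = std::chrono::duration_cast<std::chrono::milliseconds>(token_times[i] - prev).count();
        durations_ms.push_back(d / batch_sizes[i]);  // normalize by tokens produced in this step
        prev = token_times[i];
    }

    auto [mean_tpot, std_tpot] = mean_and_std(durations_ms);
    std::cout << "TTFT: " << ttft_ms << " ms, TPOT: " << mean_tpot << " +/- " << std_tpot << " ms\n";
    return 0;
}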
+ + return nm; +} + +PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { + *this = *this + right; + return *this; +} + + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index ac6b925dcb..501d0e86cf 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -323,6 +323,8 @@ class Tokenizer::TokenizerImpl { // Replace what jinja2cpp doesn't support std::pair replace_str_map[] = { + {"{-", "{"}, + {"{%-", "{%"}, {"'}", "' }"}, {"{'", "{ '"}, {".strip()", ""} diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 25acc1c87f..446ef8549b 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -12,6 +12,20 @@ namespace ov { namespace genai { namespace utils { +#include +#include +#include + +// Templated function to measure execution time of an object method. +template +std::pair execution_time_wrapper(T& instance, Ret(T::*method)(Args...), Args&&... args) { + auto start = std::chrono::steady_clock::now(); + Ret result = (instance.*method)(std::forward(args)...); + auto end = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(end - start).count(); + return {result, duration}; +} + Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index d7b2aab29c..c78c760b6c 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -21,6 +21,7 @@ using ov::genai::GenerationConfig; using ov::genai::GenerationResult; using ov::genai::LLMPipeline; using ov::genai::OptionalGenerationConfig; +using ov::genai::PerfMetrics; using ov::genai::SchedulerConfig; using ov::genai::StopCriteria; using ov::genai::StreamerBase; @@ -536,6 +537,19 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("scores", &DecodedResults::scores) .def("__str__", &DecodedResults::operator std::string);; + py::class_(m, "PerfMetrics") + .def(py::init<>()) + .def_readonly("mean_generate_duration", &PerfMetrics::mean_generate_duration) + .def_readonly("mean_decoding_duration", &PerfMetrics::mean_decoding_duration) + .def_readonly("mean_encoding_duration", &PerfMetrics::mean_encoding_duration) + .def_readonly("mean_tpot", &PerfMetrics::mean_tpot) + .def_readonly("mean_ttft", &PerfMetrics::mean_ttft) + .def_readonly("std_tpot", &PerfMetrics::std_tpot) + .def_readonly("std_ttft", &PerfMetrics::std_ttft) + .def_readonly("load_time", &PerfMetrics::load_time) + .def("__add__", &PerfMetrics::operator+) + .def("__iadd__", &PerfMetrics::operator+=); + py::class_(m, "TokenizedInputs") .def(py::init()) .def_readwrite("input_ids", &TokenizedInputs::input_ids) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 4ba71a1d48..5d038e65e2 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -81,6 +81,8 @@ def get_chat_templates(): # but skips some models that currently are not processed correctly. skipped_models = { + "berkeley-nest/Starling-LM-7B-alpha", # TODO: Need to enable and unskip, since it's preset in continious batching and has ~30 000 downloads. + # These models fail even on HF so no need to check if applying chat matches. 
"vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", From bb1113ce69dc0126a1b83a66394f63d09146044a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 22 Jul 2024 13:10:03 +0200 Subject: [PATCH 18/54] refactor structure, add python sample --- samples/cpp/benchmark_vanilla_genai/README.md | 1 + .../benchmark_vanilla_genai.cpp | 22 ++-- .../python/benchmark_vanilla_genai/README.md | 66 ++++++++++++ .../benchmark_vanilla_genai.py | 50 +++++++++ .../include/openvino/genai/llm_pipeline.hpp | 2 + .../include/openvino/genai/perf_metrics.hpp | 37 +++++-- src/cpp/src/greedy_decoding.cpp | 10 +- src/cpp/src/group_beam_searcher.cpp | 20 ++-- src/cpp/src/llm_pipeline.cpp | 31 +++--- src/cpp/src/perf_counters.cpp | 21 ---- src/cpp/src/perf_counters.hpp | 44 -------- src/cpp/src/perf_metrics.cpp | 100 +++++++++++------- src/cpp/src/tokenizer.cpp | 2 - src/python/py_generate_pipeline.cpp | 25 ++++- tests/python_tests/ov_genai_test_utils.py | 2 - 15 files changed, 279 insertions(+), 154 deletions(-) create mode 100644 samples/python/benchmark_vanilla_genai/README.md create mode 100755 samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py delete mode 100644 src/cpp/src/perf_counters.cpp delete mode 100644 src/cpp/src/perf_counters.hpp diff --git a/samples/cpp/benchmark_vanilla_genai/README.md b/samples/cpp/benchmark_vanilla_genai/README.md index 739c2e950c..50197dad1d 100644 --- a/samples/cpp/benchmark_vanilla_genai/README.md +++ b/samples/cpp/benchmark_vanilla_genai/README.md @@ -1,2 +1,3 @@ # benchmark OpenVINO GenAI sample +TODO: adapt from python sample to c++ \ No newline at end of file diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp index 6489282b0b..6d96d24fc5 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -11,7 +11,8 @@ int main(int argc, char* argv[]) try { ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) - ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(5))) + ("mt,max_new_tokens", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) ("d,device", "device", cxxopts::value()->default_value("CPU")) ("h,help", "Print usage"); @@ -36,26 +37,27 @@ int main(int argc, char* argv[]) try { size_t num_iter = result["num_iter"].as(); ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - config.num_beam_groups = 3; - config.num_beams = 15; + config.max_new_tokens = result["max_new_tokens"].as(); ov::genai::LLMPipeline pipe(model_path, device); for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, config); - ov::genai::PerfMetrics metrics; - for (size_t i = 0; i < num_iter; i++) { - ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::PerfMetrics metrics = res.metrics; + for (size_t i = 0; i < num_iter - 1; i++) { + res = pipe.generate(prompt, config); metrics = metrics + res.metrics; - metrics.load_time = res.metrics.load_time; } std::cout << "Load time: " << 
metrics.load_time << " ms" << std::endl; + std::cout << "Generate time: " << metrics.mean_generate_duration << " ± " << metrics.std_generate_duration << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.mean_tokenization_duration << " ± " << metrics.std_tokenization_duration << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.mean_detokenization_duration << " ± " << metrics.std_detokenization_duration << " ms" << std::endl; std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; - std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl; - std::cout << "Tokens/s: " << metrics.mean_throughput << std::endl; + std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms " << std::endl; + std::cout << "Tokens/s: " << metrics.mean_throughput << " ± " << metrics.std_throughput << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_vanilla_genai/README.md new file mode 100644 index 0000000000..af66ea545d --- /dev/null +++ b/samples/python/benchmark_vanilla_genai/README.md @@ -0,0 +1,66 @@ +# Benchmark Vanilla GenAI + +This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +# ov.genai.PerfMetrics structure +ov.genai.PerfMetrics is a structure which holds performance metrics for each generate call. Each generate call calculates the following metrics: +- mean_ttft + - std_ttft + - mean_tpot + - std_tpot + - load_time + - mean_generate_duration + - std_generate_duration + - mean_tokenization_duration + - std_tokenization_duration + - mean_detokenization_duration + - std_detokenization_duration + - mean_throughput + - std_throughput + - num_generated_tokens + - num_input_tokens + +Performance metrics can be added to one another and accumulated using the += operator or the + operator. In that case the mean values are calculated over all of the accumulated generate calls, as shown in the snippet after the options list below. + + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +python benchmark_vanilla_genai.py [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on.
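Metrics from several runs can be combined before reading the aggregated values. A minimal sketch mirroring the accumulation pattern used by the sample below (the model path and prompt are placeholders):

```python
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

config = ov_genai.GenerationConfig()
config.max_new_tokens = 20

res = pipe.generate(["The Sky is blue because"], config)
metrics = res.metrics                # PerfMetrics of the first call
for _ in range(2):
    res = pipe.generate(["The Sky is blue because"], config)
    metrics += res.metrics           # accumulate raw counters and recompute the means

print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms")
```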
+ +### Output: + +``` +python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/ +``` + +``` +Load time: 3446 ms +Generate time: 876.2 ± 3.30719 ms +Tokenization time: 0 ± 0 ms +Detokenization time: 0 ± 0 ms +ttft: 168 ± 0 ms +tpot: 174.68 ± 4.08671 ms +Tokens/s: 5.72475 ± 0.133933 +``` diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py new file mode 100755 index 0000000000..4c87234179 --- /dev/null +++ b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py @@ -0,0 +1,50 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai +import pdb + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + prompt = [args.prompt] + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + + config = ov_genai.GenerationConfig() + config.max_new_tokens = args.num_new_tokens + + pipe = ov_genai.LLMPipeline(model_path, device) + + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + metrics = res.metrics + for _ in range(num_iter - 1): + # pdb.set_trace() + res = pipe.generate(prompt, config) + metrics += res.metrics + + print(f"Load time: {metrics.load_time} ms") + print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms") + print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms") + print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms") + print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms") + print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms") + print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}") + +if __name__ == "__main__": + main() diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 4db3c613e7..14100d4f16 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -31,6 +31,7 @@ using StringInputs = std::variant>; * * @param tokens sequence of resulting tokens * @param scores sum of logarithmic probabilities of all tokens in the sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics */ class EncodedResults { public: @@ -45,6 +46,7 @@ class EncodedResults { * * @param texts vector of resulting sequences * @param scores scores for each sequence +* @param metrics performance metrics with tpot, ttft, etc. 
of type ov::genai::PerfMetrics */ class DecodedResults { public: diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index a11c4e0374..e66c917e81 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -7,14 +7,34 @@ #include "openvino/genai/visibility.hpp" #include #include +#include namespace ov { namespace genai { using TimePoint = std::chrono::steady_clock::time_point; -struct PerfCounters; +/** +* @brief Structure with raw performance metrics for each generation before any statistics calculated. +*/ +struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { + std::vector generate_durations; + std::vector tokenization_durations; + std::vector detokenization_durations; + + std::vector m_times_to_first_token; + std::vector m_new_token_times; + std::vector m_batch_sizes; + std::vector m_durations; + size_t num_generated_tokens; + size_t num_input_tokens; +}; + +/** +* @brief Structure to store performance metric for each generation +* +*/ struct OPENVINO_GENAI_EXPORTS PerfMetrics { // First token time. float mean_ttft; @@ -25,11 +45,13 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { float std_tpot; float load_time; - float start_time; float mean_generate_duration; - float mean_decoding_duration; - float mean_encoding_duration; + float std_generate_duration; + float mean_tokenization_duration; + float std_tokenization_duration; + float mean_detokenization_duration; + float std_detokenization_duration; float mean_throughput; float std_throughput; @@ -37,13 +59,12 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_generated_tokens; size_t num_input_tokens; - std::shared_ptr m_counters; - void evaluate(TimePoint start_time); - + void evaluate_statistics(std::optional start_time = std::nullopt); + static float get_duration_ms(std::chrono::steady_clock::duration duration); PerfMetrics operator+(const PerfMetrics& metrics) const; PerfMetrics& operator+=(const PerfMetrics& right); - + RawPerfMetrics raw_counters; }; } // namespace genai diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 0802b87e66..c5bf10a2d1 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "openvino/genai/perf_metrics.hpp" -#include "perf_counters.hpp" +// #include "perf_counters.hpp" #include "utils.hpp" namespace ov { @@ -23,7 +23,7 @@ EncodedResults greedy_decoding( size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); EncodedResults results; - auto& perf_counters = results.metrics.m_counters; + auto& raw_perf_counters = results.metrics.raw_counters; results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); @@ -54,7 +54,8 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - perf_counters->add_timestamp(running_batch_size); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer && streamer->put(token_iter_results[0])) { return results; @@ -86,7 +87,8 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - perf_counters->add_timestamp(running_batch_size); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + 
raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer && streamer->put(token_iter_results[0])) return results; diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 4f5cb79f2a..784ff1a915 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -366,11 +366,6 @@ std::pair beam_search(ov::InferRequest& lm, auto batch_size = input_ids.get_shape().at(0); auto sequence_length = input_ids.get_shape().at(1); - // Initialize time metric counters. - // ov::genai::TimePoints tok_times; - // tok_times.reserve(config.get_max_new_tokens(sequence_length)); - // tok_times.emplace_back(std::chrono::steady_clock::now()); - // Initialize beam search. const int64_t* prompt_data = input_ids.data(); std::vector> prompts; @@ -407,12 +402,19 @@ std::pair beam_search(ov::InferRequest& lm, std::vector next_tokens; std::vector next_beams; - + + // Reserve for performance counters. + std::vector new_token_times; + std::vector batch_sizes; + new_token_times.reserve(parameters.max_new_tokens); + batch_sizes.reserve(parameters.max_new_tokens); + for (size_t length_count = 0; ; ++length_count) { lm.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - // tok_times.emplace_back(std::chrono::steady_clock::now()); + new_token_times.emplace_back(std::chrono::steady_clock::now()); + batch_sizes.emplace_back(batch_size); if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { // Break the cycle before masks are extended in update_attention_mask_with_beams. @@ -442,6 +444,9 @@ std::pair beam_search(ov::InferRequest& lm, int32_t res_selected_beam_idx = 0; results.scores.reserve(config.num_return_sequences * result.size()); results.tokens.reserve(config.num_return_sequences * result.size()); + auto& raw_perf_counters = results.metrics.raw_counters; + raw_perf_counters.m_new_token_times = new_token_times; + raw_perf_counters.m_batch_sizes = batch_sizes; // align output with HF for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { @@ -471,7 +476,6 @@ std::pair beam_search(ov::InferRequest& lm, } } - // results.metrics = PerfCounters(tok_times); return {results, res_selected_beam_idx}; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 81f807c149..5241142afe 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "perf_counters.hpp" #include #include #include @@ -160,14 +159,18 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_templated_chat_history.append(answer); m_history.push_back({{"role", "assistant"}, {"content", answer}}); } - - auto& metrics = encoded_results.metrics; - // metrics.tokenization_duration = std::chrono::duration_cast(encode_stop_time - start_time).count(); - // metrics.detokenization_duration = std::chrono::duration_cast(decode_stop_time - decode_start_time).count(); - // auto stop_time = std::chrono::steady_clock::now(); - // metrics.generate_durations.emplace_back(std::chrono::duration_cast(stop_time - start_time).count()); - decoded_results.metrics = std::move(metrics); + // generate_durations + decoded_results.metrics = encoded_results.metrics; + + auto& raw_counters = decoded_results.metrics.raw_counters; + auto stop_time = std::chrono::steady_clock::now(); + + raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - 
start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time)); + + decoded_results.metrics.evaluate_statistics(start_time); return decoded_results; } @@ -267,13 +270,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_is_cache_empty = false; } - - + // If is called without tokenization then that stat will not be reported. auto& metrics = result.metrics; - // metrics.batch_size = batch_size; - // metrics.num_generated_tokens = (metrics.m_durations.size() + 1) * batch_size; - metrics.num_input_tokens = batch_size * input_ids.get_shape().at(0); - result.metrics = std::move(metrics); + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.evaluate_statistics(start_time); return result; } @@ -390,7 +391,7 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl = make_unique(std::filesystem::path(path), device, config); } auto stop_time = std::chrono::steady_clock::now(); - m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); + m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time); } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/perf_counters.cpp b/src/cpp/src/perf_counters.cpp deleted file mode 100644 index c9dac6eca0..0000000000 --- a/src/cpp/src/perf_counters.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "perf_counters.hpp" -#include "openvino/genai/perf_metrics.hpp" -#include "openvino/openvino.hpp" -#include -#include -#include - -namespace ov { -namespace genai { - -void PerfCounters::add_timestamp(size_t batch_size) { - m_new_token_times.emplace_back(std::chrono::steady_clock::now()); - m_batch_sizes.emplace_back(batch_size); -} - - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/perf_counters.hpp b/src/cpp/src/perf_counters.hpp deleted file mode 100644 index 7d33490205..0000000000 --- a/src/cpp/src/perf_counters.hpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -namespace ov { -namespace genai { - -struct PerfCounters { - std::vector generate_durations; - std::vector tokenization_duration; - std::vector detokenization_duration; - size_t num_generated_tokens; - size_t num_input_tokens; - - std::vector m_batch_sizes; - std::vector m_durations; - std::vector m_times_to_first_token; - std::vector m_new_token_times; - void add_timestamp(size_t batch_size); - // void add_gen_finish_timestamp(size_t batch_size); - -}; - -// class StopWatch { -// TimePoint m_start; -// public: -// StopWatch& start() { -// m_start = std::chrono::steady_clock::now(); -// return *this; -// } - -// float split() { -// std::chrono::steady_clock::time_point curr_time = std::chrono::steady_clock::now(); -// return std::chrono::duration_cast(curr_time - m_start).count(); -// } -// }; - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 4a8b1d76c6..3947793802 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "openvino/genai/perf_metrics.hpp" -#include 
"perf_counters.hpp" #include "openvino/openvino.hpp" #include #include @@ -17,7 +16,7 @@ std::pair calc_mean_and_std(const std::vector& durations) { [](const float& acc, const float& duration) -> float { return acc + duration * duration; }); - float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); return {mean, std}; } @@ -26,48 +25,77 @@ std::pair calc_mean_and_std(const std::vector& durations) { namespace ov { namespace genai { - -void PerfMetrics::evaluate(TimePoint start_time) { - - auto& tok_times = m_counters->m_new_token_times; - auto& batch_sizes = m_counters->m_batch_sizes; - m_counters->m_durations = std::vector(tok_times.size()); - auto ttft = std::chrono::duration_cast(tok_times[0] - start_time).count(); - m_counters->m_times_to_first_token.emplace_back(ttft); +float PerfMetrics::get_duration_ms(std::chrono::steady_clock::duration duration) { + return std::chrono::duration_cast(duration).count(); +} - for (size_t i = 0; i < tok_times.size(); ++i) { - m_counters->m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time).count(); - // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. - // todo: float check that it's valid for batch > 1. - m_counters->m_durations[i] /= batch_sizes[i]; - start_time = tok_times[i]; - } +void PerfMetrics::evaluate_statistics(std::optional start_time) { + // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. + if (start_time.has_value()) { + auto start_time_val = *start_time; + auto& tok_times = raw_counters.m_new_token_times; + auto& batch_sizes = raw_counters.m_batch_sizes; + raw_counters.m_durations = std::vector(tok_times.size()); - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(m_counters->m_durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(m_counters->m_times_to_first_token); -} + auto ttft = std::chrono::duration_cast(tok_times[0] - start_time_val).count(); + raw_counters.m_times_to_first_token = std::vector(); + raw_counters.m_times_to_first_token.emplace_back(ttft); + num_generated_tokens = 0; + for (size_t i = 0; i < tok_times.size(); ++i) { + raw_counters.m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time_val).count(); + + // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. + // todo: float check that it's valid for batch > 1. 
+ raw_counters.m_durations[i] /= batch_sizes[i]; + num_generated_tokens += batch_sizes[i]; + start_time_val = tok_times[i]; + } + } -PerfMetrics PerfMetrics::operator+(const PerfMetrics& metrics) const { - PerfMetrics nm; // new metrics - nm.m_counters = m_counters; - auto& new_counters = nm.m_counters; + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_counters.m_durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_counters.m_times_to_first_token); - auto& new_durations = new_counters->m_durations; - auto& new_times_to_first_token = new_counters->m_times_to_first_token; - - auto& counters_to_appnd = metrics.m_counters; - new_durations.insert(new_durations.end(), counters_to_appnd->m_durations.begin(), counters_to_appnd->m_durations.end()); - new_times_to_first_token.insert(new_times_to_first_token.end(), counters_to_appnd->m_times_to_first_token.begin(), counters_to_appnd->m_times_to_first_token.end()); + std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_counters.generate_durations); + std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_counters.tokenization_durations); + std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_counters.detokenization_durations); - OPENVINO_ASSERT(metrics.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); + mean_throughput = 1000.0f / mean_tpot; + std_throughput = (std_tpot * 1000.0f) / (mean_tpot * mean_tpot); +} + +PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { + OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); - std::tie(nm.mean_tpot, nm.std_tpot) = calc_mean_and_std(new_counters->m_durations); - std::tie(nm.mean_ttft, nm.std_ttft) = calc_mean_and_std(new_counters->m_times_to_first_token); + // Copy left value to res. + PerfMetrics res = *this; + + // Concatenate duration and first token times. + auto& new_durations = res.raw_counters.m_durations; + auto& new_times_to_first_token = res.raw_counters.m_times_to_first_token; + auto& right_durations = right.raw_counters.m_durations; + auto& right_times_to_first_token = right.raw_counters.m_times_to_first_token; - // todo: add tokenization statistics concatenation. + new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); + + // Concatenate tokenization/detokenization and total generation times. 
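// Editor's note on the throughput statistics computed above: mean_throughput is tokens/s derived
// from the mean per-token time in ms (1000 / mean_tpot), and std_throughput follows from
// first-order error propagation for f(x) = 1000 / x, i.e. sigma_f ~= |f'(x)| * sigma_x
// = 1000 * sigma_x / x^2. A small numeric check of that identity (the values are made up):
#include <iostream>

int main() {
    const float mean_tpot_ms = 70.0f, std_tpot_ms = 3.0f;
    const float mean_throughput = 1000.0f / mean_tpot_ms;                                   // ~14.29 tok/s
    const float std_throughput = (std_tpot_ms * 1000.0f) / (mean_tpot_ms * mean_tpot_ms);   // ~0.61 tok/s
    std::cout << mean_throughput << " +/- " << std_throughput << " tokens/s\n";
    return 0;
}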
+ auto& new_tok_durations = res.raw_counters.tokenization_durations; + auto& new_detok_durations = res.raw_counters.detokenization_durations; + auto& new_gen_durations = res.raw_counters.generate_durations; + auto& right_tok_durations = right.raw_counters.tokenization_durations; + auto& right_detok_durations = right.raw_counters.detokenization_durations; + auto& right_gen_durations = right.raw_counters.generate_durations; - return nm; + new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); + new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); + new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); + + res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens; + res.num_input_tokens = num_generated_tokens + right.num_input_tokens; + res.load_time = load_time; + res.evaluate_statistics(); + return res; } PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { @@ -75,7 +103,5 @@ PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { return *this; } - - } // namespace genai } // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 501d0e86cf..ac6b925dcb 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -323,8 +323,6 @@ class Tokenizer::TokenizerImpl { // Replace what jinja2cpp doesn't support std::pair replace_str_map[] = { - {"{-", "{"}, - {"{%-", "{%"}, {"'}", "' }"}, {"{'", "{ '"}, {".strip()", ""} diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index c78c760b6c..860d3c3592 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -22,6 +22,7 @@ using ov::genai::GenerationResult; using ov::genai::LLMPipeline; using ov::genai::OptionalGenerationConfig; using ov::genai::PerfMetrics; +using ov::genai::RawPerfMetrics; using ov::genai::SchedulerConfig; using ov::genai::StopCriteria; using ov::genai::StreamerBase; @@ -535,13 +536,30 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) + .def_readonly("metrics", &DecodedResults::metrics) .def("__str__", &DecodedResults::operator std::string);; + py::class_(m, "RawPerfMetrics") + .def(py::init<>()) + .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) + .def_readonly("tokenization_durations", &RawPerfMetrics::tokenization_durations) + .def_readonly("detokenization_durations", &RawPerfMetrics::detokenization_durations) + .def_readonly("m_times_to_first_token", &RawPerfMetrics::m_times_to_first_token) + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) + .def_readonly("m_durations", &RawPerfMetrics::m_durations) + .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) + .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + py::class_(m, "PerfMetrics") .def(py::init<>()) .def_readonly("mean_generate_duration", &PerfMetrics::mean_generate_duration) - .def_readonly("mean_decoding_duration", &PerfMetrics::mean_decoding_duration) - .def_readonly("mean_encoding_duration", &PerfMetrics::mean_encoding_duration) + .def_readonly("std_generate_duration", &PerfMetrics::std_generate_duration) + .def_readonly("mean_tokenization_duration", &PerfMetrics::mean_tokenization_duration) + 
.def_readonly("std_tokenization_duration", &PerfMetrics::std_tokenization_duration) + .def_readonly("mean_detokenization_duration", &PerfMetrics::mean_detokenization_duration) + .def_readonly("std_detokenization_duration", &PerfMetrics::std_detokenization_duration) + .def_readonly("mean_throughput", &PerfMetrics::mean_throughput) + .def_readonly("std_throughput", &PerfMetrics::std_throughput) .def_readonly("mean_tpot", &PerfMetrics::mean_tpot) .def_readonly("mean_ttft", &PerfMetrics::mean_ttft) .def_readonly("std_tpot", &PerfMetrics::std_tpot) @@ -557,7 +575,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "EncodedResults") .def_readonly("tokens", &EncodedResults::tokens) - .def_readonly("scores", &EncodedResults::scores); + .def_readonly("scores", &EncodedResults::scores) + .def_readonly("metrics", &EncodedResults::metrics); py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 5d038e65e2..4ba71a1d48 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -81,8 +81,6 @@ def get_chat_templates(): # but skips some models that currently are not processed correctly. skipped_models = { - "berkeley-nest/Starling-LM-7B-alpha", # TODO: Need to enable and unskip, since it's preset in continious batching and has ~30 000 downloads. - # These models fail even on HF so no need to check if applying chat matches. "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", From 7bf42f1f12f55f1ae30610267897c22a98545f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Mon, 22 Jul 2024 17:03:49 +0200 Subject: [PATCH 19/54] Cherry-pick custom max_element loop (#662) Cherry picked from master --- src/cpp/src/sampler.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index dc631c68ac..6390fc8725 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -219,8 +219,13 @@ class Sampler { } Token _greedy_sample(const std::vector& logit_vector) const { - auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); - return *out_token; + Token max_token{-std::numeric_limits::infinity() , 0}; + for (const auto& logit : logit_vector) { + if (logit.m_log_prob > max_token.m_log_prob) { + max_token = logit; + } + } + return max_token; } std::vector _multinomial_sample(const std::vector& logit_vector, size_t num_tokens_per_sequence) { From 0a8f0d95dcd37e59cced6a959de719d8a53e5c98 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 22 Jul 2024 17:24:33 +0200 Subject: [PATCH 20/54] add more preicise durations --- .../benchmark_vanilla_genai.cpp | 2 +- .../python/benchmark_vanilla_genai/README.md | 15 ++++++------ .../benchmark_vanilla_genai.py | 9 ++++--- .../include/openvino/genai/perf_metrics.hpp | 11 +++++---- src/cpp/src/greedy_decoding.cpp | 1 + src/cpp/src/llm_pipeline.cpp | 6 +++-- src/cpp/src/multinomial_decoding.cpp | 8 ++++++- src/cpp/src/perf_metrics.cpp | 24 ++++++++++++------- src/cpp/src/sampler.hpp | 9 ++----- src/cpp/src/utils.hpp | 14 ----------- src/python/py_generate_pipeline.cpp | 6 +++-- 11 files changed, 52 insertions(+), 53 deletions(-) diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp 
b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp index 6d96d24fc5..a9bc07f641 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try { ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) - ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(5))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) ("mt,max_new_tokens", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) ("d,device", "device", cxxopts::value()->default_value("CPU")) ("h,help", "Print usage"); diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_vanilla_genai/README.md index af66ea545d..13666a7de9 100644 --- a/samples/python/benchmark_vanilla_genai/README.md +++ b/samples/python/benchmark_vanilla_genai/README.md @@ -56,11 +56,12 @@ python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/ ``` ``` -Load time: 3446 ms -Generate time: 876.2 ± 3.30719 ms -Tokenization time: 0 ± 0 ms -Detokenization time: 0 ± 0 ms -ttft: 168 ± 0 ms -tpot: 174.68 ± 4.08671 ms -Tokens/s: 5.72475 ± 0.133933 +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 ``` +s \ No newline at end of file diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py index 4c87234179..9e4debe847 100755 --- a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py +++ b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py @@ -10,7 +10,7 @@ def main(): parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") - parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations") + parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") @@ -22,9 +22,8 @@ def main(): num_warmup = args.num_warmup num_iter = args.num_iter - config = ov_genai.GenerationConfig() - config.max_new_tokens = args.num_new_tokens + config.max_new_tokens = args.max_new_tokens pipe = ov_genai.LLMPipeline(model_path, device) @@ -37,8 +36,8 @@ def main(): # pdb.set_trace() res = pipe.generate(prompt, config) metrics += res.metrics - - print(f"Load time: {metrics.load_time} ms") + + print(f"Load time: {metrics.load_time:.2f} ms") print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms") print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms") print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± 
{metrics.std_detokenization_duration:.2f} ms") diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index e66c917e81..5779b9b080 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -13,19 +13,20 @@ namespace ov { namespace genai { using TimePoint = std::chrono::steady_clock::time_point; +using MicroSeconds = std::chrono::duration>; /** * @brief Structure with raw performance metrics for each generation before any statistics calculated. */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { - std::vector generate_durations; - std::vector tokenization_durations; - std::vector detokenization_durations; + std::vector generate_durations; + std::vector tokenization_durations; + std::vector detokenization_durations; - std::vector m_times_to_first_token; + std::vector m_times_to_first_token; std::vector m_new_token_times; std::vector m_batch_sizes; - std::vector m_durations; + std::vector m_durations; size_t num_generated_tokens; size_t num_input_tokens; diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index c5bf10a2d1..c8fd36cbdd 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -22,6 +22,7 @@ EncodedResults greedy_decoding( size_t prompt_len = prompts_shape[1]; size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); + // Initialize results and performance metrics. EncodedResults results; auto& raw_perf_counters = results.metrics.raw_counters; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5241142afe..adac9110e1 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -165,7 +165,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto& raw_counters = decoded_results.metrics.raw_counters; auto stop_time = std::chrono::steady_clock::now(); - + raw_counters.generate_durations = std::vector(); raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time)); raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time)); @@ -269,11 +269,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } + auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. 
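// Editor's note: this patch switches the raw counters from integer millisecond counts to a
// float-based microsecond duration (the MicroSeconds alias above), which is what makes
// sub-millisecond tokenization/detokenization times representable. A standalone sketch of that
// chrono pattern; the alias is written out here as a float-based microsecond duration, which is
// what the header change appears to define, and the sleep workload is fake.
#include <chrono>
#include <iostream>
#include <thread>

using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

int main() {
    const auto start = std::chrono::steady_clock::now();
    std::this_thread::sleep_for(std::chrono::microseconds(500));  // ~0.5 ms of "work"
    const auto stop = std::chrono::steady_clock::now();

    // steady_clock's integer-based duration converts implicitly to the float-based type,
    // so no precision is discarded before converting to milliseconds for reporting.
    const MicroSeconds elapsed = stop - start;
    std::cout << "elapsed: " << elapsed.count() / 1000.0f << " ms\n";
    return 0;
}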
auto& metrics = result.metrics; metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); metrics.load_time = this->m_load_time_ms; + metrics.raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); metrics.evaluate_statistics(start_time); return result; } @@ -391,7 +393,7 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl = make_unique(std::filesystem::path(path), device, config); } auto stop_time = std::chrono::steady_clock::now(); - m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time); + m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time) / 1000.0f; } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index fd16e948c1..fc59f00e12 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -162,7 +162,9 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner size_t prompt_len = prompts_shape[1]; - ov::genai::EncodedResults results; + // Initialize results and performance metrics. + EncodedResults results; + auto& raw_perf_counters = results.metrics.raw_counters; results.scores.resize(batch_size, 0); results.tokens.resize(batch_size); @@ -179,6 +181,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("beam_idx").data()[0] = 0; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); auto logits_tensor = m_model_runner.get_tensor("logits"); @@ -222,6 +226,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("input_ids").data()[0] = out_token.id; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); logits = m_model_runner.get_tensor("logits").data(); out_token = sampling.get_out_token(logits, vocab_size, tokens); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 3947793802..d4dc6c8de6 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -9,12 +9,18 @@ namespace { -std::pair calc_mean_and_std(const std::vector& durations) { - float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); +// std::pair calc_mean_and_std(const std::vector& durations) { +std::pair calc_mean_and_std(const std::vector& durations) { + float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + return acc + duration.count(); + }); + mean /= durations.size(); + mean /= 1000.f; float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, - [](const float& acc, const float& duration) -> float { - return acc + duration * duration; + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + return acc + duration.count() * duration.count() / 1000000.0f; }); float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); return {mean, std}; @@ -27,7 +33,7 @@ namespace ov { namespace genai { float PerfMetrics::get_duration_ms(std::chrono::steady_clock::duration duration) { - return std::chrono::duration_cast(duration).count(); + return 
std::chrono::duration_cast(duration).count(); } void PerfMetrics::evaluate_statistics(std::optional start_time) { @@ -36,14 +42,14 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { auto start_time_val = *start_time; auto& tok_times = raw_counters.m_new_token_times; auto& batch_sizes = raw_counters.m_batch_sizes; - raw_counters.m_durations = std::vector(tok_times.size()); + raw_counters.m_durations = std::vector(tok_times.size()); - auto ttft = std::chrono::duration_cast(tok_times[0] - start_time_val).count(); - raw_counters.m_times_to_first_token = std::vector(); + auto ttft = tok_times[0] - start_time_val; + raw_counters.m_times_to_first_token = std::vector(); raw_counters.m_times_to_first_token.emplace_back(ttft); num_generated_tokens = 0; for (size_t i = 0; i < tok_times.size(); ++i) { - raw_counters.m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time_val).count(); + raw_counters.m_durations[i] = tok_times[i] - start_time_val; // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. // todo: float check that it's valid for batch > 1. diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 6390fc8725..dc631c68ac 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -219,13 +219,8 @@ class Sampler { } Token _greedy_sample(const std::vector& logit_vector) const { - Token max_token{-std::numeric_limits::infinity() , 0}; - for (const auto& logit : logit_vector) { - if (logit.m_log_prob > max_token.m_log_prob) { - max_token = logit; - } - } - return max_token; + auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); + return *out_token; } std::vector _multinomial_sample(const std::vector& logit_vector, size_t num_tokens_per_sequence) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 446ef8549b..25acc1c87f 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -12,20 +12,6 @@ namespace ov { namespace genai { namespace utils { -#include -#include -#include - -// Templated function to measure execution time of an object method. -template -std::pair execution_time_wrapper(T& instance, Ret(T::*method)(Args...), Args&&... 
args) { - auto start = std::chrono::steady_clock::now(); - Ret result = (instance.*method)(std::forward(args)...); - auto end = std::chrono::steady_clock::now(); - auto duration = std::chrono::duration_cast(end - start).count(); - return {result, duration}; -} - Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 860d3c3592..e2f89cd962 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -537,7 +537,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) .def_readonly("metrics", &DecodedResults::metrics) - .def("__str__", &DecodedResults::operator std::string);; + .def("__str__", &DecodedResults::operator std::string); py::class_(m, "RawPerfMetrics") .def(py::init<>()) @@ -566,7 +566,9 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("std_ttft", &PerfMetrics::std_ttft) .def_readonly("load_time", &PerfMetrics::load_time) .def("__add__", &PerfMetrics::operator+) - .def("__iadd__", &PerfMetrics::operator+=); + .def("__iadd__", &PerfMetrics::operator+=) + .def_readonly("raw_counters", &PerfMetrics::raw_counters) + ; py::class_(m, "TokenizedInputs") .def(py::init()) From bad01b94e2c21abce6d211c8c85db00f9af7f6c0 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 22 Jul 2024 19:35:25 +0200 Subject: [PATCH 21/54] Add note for pybind ov::Tensor issue (#659) --- src/docs/BUILD.md | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 1aee73bfb0..3b89995dc2 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -1,5 +1,8 @@ # How to Build OpenVINO™ GenAI +> **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. +The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment. Or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). + ## Build for Linux Systems ### Software Requirements @@ -10,20 +13,16 @@ ### Build Instructions -1. Clone OpenVINO GenAI repository and init submodules: +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). +The path to the openvino install directory is referred as throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - ```sh - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - sudo ./ov/install_dependencies/install_openvino_dependencies.sh - ``` 3. 
Build the project: ```sh - source ./ov/setupvars.sh + source /setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target package -j cmake --install ./build/ --config Release --prefix ov @@ -40,21 +39,16 @@ ### Build Instructions -1. Clone OpenVINO GenAI repository and init submodules: +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) +The path to the openvino install directory is referred as throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - ```sh - mkdir ./ov/ - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip - unzip ov.zip - mklink /D ov w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64 - ``` 3. Build the project: ```sh - call ov\setupvars.bat + call \setupvars.bat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target package -j cmake --install ./build/ --config Release --prefix ov @@ -77,19 +71,16 @@ ### Build Instructions -1. Clone OpenVINO GenAI repository and init submodules: +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) +The path to the openvino install directory is referred as throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - ```sh - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - ``` 3. 
Build the project: ```sh - source ./ov/setupvars.sh + source /setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target package -j cmake --install ./build/ --config Release --prefix ov From cb0da0ad7a2e35f686d7f529489d83ce01783989 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 23 Jul 2024 01:57:33 +0800 Subject: [PATCH 22/54] [OV 24.3]Fix multinomial sample CMakeList (#658) @Wovchena, retarget to OV 24.3 release branch --- samples/cpp/multinomial_causal_lm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt index efcac50f09..98bc76ee3c 100644 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -11,7 +11,7 @@ set_target_properties(multinomial_causal_lm PROPERTIES COMPILE_PDB_NAME multinomial_causal_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) install(TARGETS multinomial_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin From bc9224884963ff89c99b7c73b30404fd6e3b0f40 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jul 2024 08:31:59 +0200 Subject: [PATCH 23/54] add Readme for tests (#664) - Added Readme for python tests - Added `--model_ids` option to run selectively only on specific models --------- Co-authored-by: Zlobin Vladimir --- tests/python_tests/README.md | 47 ++++++++++++++++++++ tests/python_tests/conftest.py | 7 ++- tests/python_tests/ov_genai_test_utils.py | 5 ++- tests/python_tests/test_chat_generate_api.py | 4 ++ tests/python_tests/test_generate_api.py | 32 +++++++++++++ 5 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 tests/python_tests/README.md diff --git a/tests/python_tests/README.md b/tests/python_tests/README.md new file mode 100644 index 0000000000..e5381708de --- /dev/null +++ b/tests/python_tests/README.md @@ -0,0 +1,47 @@ +# OpenVINO™ GenAI Tests + +This tests aim to validate support for vanilla and continuous batching GenAI APIs. + +## Setup environemnt + +In order to run tests first of all build or install OpenVINO GenAI library, follow instructions [GenAI Library README](../../src/README.md). + +Then install requirements for tests: +```sh +pip install -r tests/python_tests/requirements.txt +``` + +## Run Tests + +```sh +python -m pytest tests/python_tests/ -m precommit +``` + +During the test downloaded HuggingFace (HF) models will be saved into the current directory. If you wish to place them somewhere else you can specify `GENAI_MODELS_PATH_PREFIX` environenment variable, e.g. +```sh +GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit +``` + +If you have built GenAI library by yourself instead of using wheel please set `PYTHONPATH` so that test could find library, e.g. +```sh +PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit +``` + +## Customise tests run + +Tests have `precommit` and `nightly` set of models. `precommit` contains lightweight models which can be quickly inferred, `nightly` models are heavier and required more time for interence. 
If you wish to run specific tests only for nightly models, you can use `-k` option, for example to run only multibatch and chat tests: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat" +``` + +If you wish to run all tests except beam search do the following: +```sh +python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search" +``` + +Argument `--model_ids` can be used to run tests selectively only for specific models. HF model ids should be separated by space, e.g: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct" +``` + +List of currently supported `nightly` and `precommit` models can be found in tests/python_tests/ov_genai_test_utils.py:get_models_list diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index 66212468af..f98f47ecf3 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -14,6 +14,11 @@ def pytest_make_parametrize_id(config, val, argname): return f'{argname}={val}' return None -def pytest_configure(config): +def pytest_addoption(parser): + parser.addoption("--model_ids", help="Select models to run") + +def pytest_configure(config: pytest.Config): marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' pytest.run_marker = marker + pytest.selected_model_ids = config.getoption('--model_ids', default=None) + diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 4ba71a1d48..bc95418aff 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -49,7 +49,10 @@ def get_models_list(): model_ids = precommit_models else: model_ids = nightly_models - + + if pytest.selected_model_ids: + model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + # pytest.set_trace() prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 94de8f6cc2..5a73d481d3 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -33,6 +33,7 @@ @pytest.mark.parametrize("generation_config", configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_chat_compare_with_HF(model_descr, generation_config: Dict): device = 'CPU' chat_history_hf = [] @@ -69,6 +70,7 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict): @pytest.mark.parametrize("generation_config", configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): # compares with HF when history in ov_genai is save as a text device = 'CPU' @@ -104,6 +106,7 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict) @pytest.mark.parametrize("generation_config", configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): # Check that when history is stored in KV cache results are the same as when history stored in a text. 
device ='CPU' @@ -144,6 +147,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: {'role': 'user', 'content': 'What was my first question?'}, ] @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize('chat_config', get_chat_templates()) def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): tokenizer_config = chat_config[1] diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 40eba92277..e2395cf8d7 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -151,6 +151,7 @@ def hf_ov_genai_tensors_comparison( @pytest.mark.parametrize("generation_config,prompt", test_cases) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_decoding(model_descr, generation_config, prompt): run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) @@ -168,6 +169,7 @@ def test_decoding(model_descr, generation_config, prompt): condition=sys.platform == "linux" ) @pytest.mark.precommit +@pytest.mark.nightly def test_ov_tensors(model_descr, inputs): hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) @@ -182,6 +184,7 @@ def test_ov_tensors(model_descr, inputs): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=TypeError, reason="pybind was unable to find ov::Tensor from openvino yet", @@ -217,6 +220,7 @@ def test_genai_tokenizer_encode(model_descr, prompt): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.parametrize("encoded_prompt", encoded_prompts) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=TypeError, reason="pybind was unable to find ov::Tensor from openvino yet", @@ -252,6 +256,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt): @pytest.mark.parametrize("prompts", batched_prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_multibatch(model_descr, generation_config, prompts): run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) @@ -264,6 +269,7 @@ def test_multibatch(model_descr, generation_config, prompts): @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): generation_config = dict( @@ -281,6 +287,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size, @pytest.mark.parametrize("max_new_tokens", [10, 80]) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence # while genai ends sentence with @@ -323,6 +330,7 @@ def user_defined_callback(subword): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() @@ -332,6 +340,7 @@ def test_callback_one_string(callback): 
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): @@ -340,12 +349,14 @@ def test_callback_batch_fail(callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_one_string(callback): pipe = read_model(get_models_list()[0])[4] pipe.generate('table is made of', max_new_tokens=10, streamer=callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("model_descr", get_models_list()) def test_callback_decoding_metallama(model_descr, callback): # On metallam this prompt generates output which can shorten after adding new tokens. @@ -359,6 +370,7 @@ def test_callback_decoding_metallama(model_descr, callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): @@ -380,6 +392,7 @@ def end(self): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_one_string(): pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() @@ -389,6 +402,7 @@ def test_streamer_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -397,6 +411,7 @@ def test_streamer_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -404,6 +419,7 @@ def test_streamer_kwargs_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -412,6 +428,7 @@ def test_streamer_kwargs_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] @@ -421,6 +438,7 @@ def test_operator_with_callback_one_string(callback): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] @@ -429,6 +447,7 @@ def test_operator_with_callback_batch_fail(callback): @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -436,6 +455,7 @@ def test_operator_with_streamer_kwargs_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -444,6 +464,7 @@ def test_operator_with_streamer_kwargs_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_ids_1(model_tmp_path): # test when there is an available config.json config_json = 
{ @@ -458,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_str_2(model_tmp_path): # test with special_tokens_map special_tokens_map_json = { @@ -472,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_3_(model_tmp_path): # special_tokens_map is not available # but tokenize_config.json exists @@ -498,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_3(model_tmp_path): # both config.json is availabel and tokenizer_config.json available # check that it does not read int values from tokenizer_config.json if they are in config.json @@ -532,6 +556,7 @@ def test_load_special_tokens_3(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=AssertionError, reason="CVS-143410 ov tokenizer should be aligned with hf", @@ -575,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path): ] @pytest.mark.parametrize("generation_config", invalid_configs) @pytest.mark.precommit +@pytest.mark.nightly def test_invalid_configs(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path config_json = {} @@ -584,6 +610,7 @@ def test_invalid_configs(model_tmp_path, generation_config): @pytest.mark.precommit +@pytest.mark.nightly def test_valid_configs(model_tmp_path): model_id, temp_path = model_tmp_path pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) @@ -602,6 +629,7 @@ def test_valid_configs(model_tmp_path): dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k ] @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("generation_config", invalid_py_configs) def test_python_generation_config_validation(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path @@ -615,6 +643,7 @@ def test_python_generation_config_validation(model_tmp_path, generation_config): @pytest.mark.precommit +@pytest.mark.nightly def test_unicode_pybind_decoding_1(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. @@ -626,6 +655,7 @@ def test_unicode_pybind_decoding_1(): @pytest.mark.precommit +@pytest.mark.nightly def test_unicode_pybind_decoding_2(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. @@ -636,6 +666,7 @@ def test_unicode_pybind_decoding_2(): @pytest.mark.precommit +@pytest.mark.nightly def test_unicode_pybind_decoding_3(): # On this model this prompt generates unfinished utf-8 string # and streams it. Test that pybind will not fail while we pass string to python. 
@@ -648,6 +679,7 @@ def test_unicode_pybind_decoding_3(): @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") def test_left_pad(): # test left pad tokenizer post processing implementation From 90320f411257e215d06bcdf100d37bbe20f1622e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jul 2024 21:57:11 +0200 Subject: [PATCH 24/54] add cpp Readme, ensured correct batch processing, add PerfMetrics to Readme --- samples/CMakeLists.txt | 2 +- .../CMakeLists.txt | 12 ++-- samples/cpp/benchmark_genai/README.md | 47 +++++++++++++ .../benchmark_genai.cpp} | 12 ++-- samples/cpp/benchmark_vanilla_genai/README.md | 3 - .../README.md | 30 ++------- .../benchmark_genai.py} | 24 ++++--- .../benchmark_genai_automatic.py | 62 +++++++++++++++++ src/README.md | 49 ++++++++++++++ .../include/openvino/genai/llm_pipeline.hpp | 4 +- .../include/openvino/genai/perf_metrics.hpp | 24 ++++--- src/cpp/src/greedy_decoding.cpp | 2 +- src/cpp/src/group_beam_searcher.cpp | 2 +- src/cpp/src/llm_pipeline.cpp | 18 ++--- src/cpp/src/multinomial_decoding.cpp | 2 +- src/cpp/src/perf_metrics.cpp | 67 ++++++++++--------- src/python/py_generate_pipeline.cpp | 33 +++++++-- 17 files changed, 278 insertions(+), 115 deletions(-) rename samples/cpp/{benchmark_vanilla_genai => benchmark_genai}/CMakeLists.txt (64%) create mode 100644 samples/cpp/benchmark_genai/README.md rename samples/cpp/{benchmark_vanilla_genai/benchmark_vanilla_genai.cpp => benchmark_genai/benchmark_genai.cpp} (90%) delete mode 100644 samples/cpp/benchmark_vanilla_genai/README.md rename samples/python/{benchmark_vanilla_genai => benchmark_genai}/README.md (64%) rename samples/python/{benchmark_vanilla_genai/benchmark_vanilla_genai.py => benchmark_genai/benchmark_genai.py} (58%) create mode 100755 samples/python/benchmark_genai/benchmark_genai_automatic.py diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 44f8d580b2..5339817c1f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) -add_subdirectory(cpp/benchmark_vanilla_genai) +add_subdirectory(cpp/benchmark_genai) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) diff --git a/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt similarity index 64% rename from samples/cpp/benchmark_vanilla_genai/CMakeLists.txt rename to samples/cpp/benchmark_genai/CMakeLists.txt index e871f5a33a..bfa1592f61 100644 --- a/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -12,14 +12,14 @@ FetchContent_Declare(cxxopts URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) FetchContent_MakeAvailable(cxxopts) -add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp) -target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts) -set_target_properties(benchmark_vanilla_genai PROPERTIES - COMPILE_PDB_NAME benchmark_vanilla_genai +add_executable(benchmark_genai benchmark_genai.cpp) +target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_genai PROPERTIES + COMPILE_PDB_NAME benchmark_genai # Ensure out of 
box LC_RPATH on macOS with SIP
    INSTALL_RPATH_USE_LINK_PATH ON)
-# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11)
-install(TARGETS benchmark_vanilla_genai
+# target_compile_features(benchmark_genai PRIVATE cxx_std_11)
+install(TARGETS benchmark_genai
         RUNTIME DESTINATION samples_bin/
         COMPONENT samples_bin
         EXCLUDE_FROM_ALL)
diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md
new file mode 100644
index 0000000000..bac16c2f7d
--- /dev/null
+++ b/samples/cpp/benchmark_genai/README.md
@@ -0,0 +1,47 @@
+# Benchmarking Vanilla GenAI
+
+This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.
+
+## Download and convert the model and tokenizers
+
+The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+
+It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.
+
+```sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
+```
+
+## Usage
+
+```sh
+benchmark_genai [OPTIONS]
+```
+
+### Options
+
+- `-m, --model`: Path to the model and tokenizers base directory.
+- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
+- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
+- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
+- `-n, --num_iter` (default: `3`): Number of iterations.
+- `-d, --device` (default: `"CPU"`): Device to run the model on.
+
+### Output:
+
+```
+benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10
+```
+
+```
+Load time: 3405.69 ms
+Generate time: 1430.77 ± 3.04 ms
+Tokenization time: 0.51 ± 0.02 ms
+Detokenization time: 0.37 ± 0.01 ms
+TTFT: 81.60 ± 0.54 ms
+TPOT: 71.52 ± 2.72 ms
+Throughput tokens/s: 13.98 ± 0.53
+```
+
+For more information on how performance metrics are calculated, please follow the [performance metrics tutorial](../../../src/README.md#performance-metrics).
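As a quick sanity check on the example output above, the reported throughput follows directly from TPOT; this mirrors the formula used in `perf_metrics.cpp` later in this patch series, worked through here with the numbers from the sample run:

```math
\mathrm{throughput_{mean}} = \frac{1000}{\mathrm{TPOT_{mean}}} \approx \frac{1000}{71.52\ \mathrm{ms}} \approx 13.98\ \mathrm{tokens/s},
\qquad
\mathrm{throughput_{std}} = \frac{1000 \cdot \mathrm{TPOT_{std}}}{\mathrm{TPOT_{mean}}^{2}} \approx \frac{1000 \cdot 2.72}{71.52^{2}} \approx 0.53
```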
diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp similarity index 90% rename from samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp rename to samples/cpp/benchmark_genai/benchmark_genai.cpp index a9bc07f641..9610aabe54 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -8,11 +8,11 @@ int main(int argc, char* argv[]) try { cxxopts::Options options("benchmark_vanilla_genai", "Help command"); options.add_options() - ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) + ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) - ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) - ("mt,max_new_tokens", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(3))) + ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value()->default_value(std::to_string(20))) ("d,device", "device", cxxopts::value()->default_value("CPU")) ("h,help", "Print usage"); @@ -38,6 +38,8 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); + config.num_beam_groups = 3; + config.num_beams = 15; ov::genai::LLMPipeline pipe(model_path, device); @@ -45,10 +47,10 @@ int main(int argc, char* argv[]) try { pipe.generate(prompt, config); ov::genai::DecodedResults res = pipe.generate(prompt, config); - ov::genai::PerfMetrics metrics = res.metrics; + ov::genai::PerfMetrics metrics = res.perf_metrics; for (size_t i = 0; i < num_iter - 1; i++) { res = pipe.generate(prompt, config); - metrics = metrics + res.metrics; + metrics = metrics + res.perf_metrics; } std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; diff --git a/samples/cpp/benchmark_vanilla_genai/README.md b/samples/cpp/benchmark_vanilla_genai/README.md deleted file mode 100644 index 50197dad1d..0000000000 --- a/samples/cpp/benchmark_vanilla_genai/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# benchmark OpenVINO GenAI sample - -TODO: adapt from python sample to c++ \ No newline at end of file diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_genai/README.md similarity index 64% rename from samples/python/benchmark_vanilla_genai/README.md rename to samples/python/benchmark_genai/README.md index 13666a7de9..fa4fa85576 100644 --- a/samples/python/benchmark_vanilla_genai/README.md +++ b/samples/python/benchmark_genai/README.md @@ -1,28 +1,7 @@ -# Benchmark Vanilla GenAI +# Benchmarking Vanilla GenAI This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. -# ov.genai.PerfMetrics structure -ov.genai.PerfMetrics is a structure which holds performance metric for each generate call. 
Each generate call calcualtes the following metrics: -- mean_ttft - - std_ttft - - mean_tpot - - std_tpot - - load_time - - mean_generate_duration - - std_generate_duration - - mean_tokenization_duration - - std_tokenization_duration - - mean_detokenization_duration - - std_detokenization_duration - - mean_throughput - - std_throughput - - num_generated_tokens - - num_input_tokens - -Performance metrics can be added to one another and accumulated using the += operator or the + operator. In that case the mean values accumulated by several generate calls will be calculated. - - ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -45,14 +24,14 @@ python benchmark_vanilla_genai.py [OPTIONS] - `-m, --model`: Path to the model and tokenizers base directory. - `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. - `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. - `-n, --num_iter` (default: `3`): Number of iterations. +- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. - `-d, --device` (default: `"CPU"`): Device to run the model on. ### Output: ``` -python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/ +python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 ``` ``` @@ -64,4 +43,5 @@ TTFT: 81.60 ± 0.54 ms TPOT: 71.52 ± 2.72 ms Throughput tokens/s: 13.98 ± 0.53 ``` -s \ No newline at end of file + +For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_genai/benchmark_genai.py similarity index 58% rename from samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py rename to samples/python/benchmark_genai/benchmark_genai.py index 9e4debe847..06bd8b0f48 100755 --- a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -3,7 +3,6 @@ import argparse import openvino_genai as ov_genai -import pdb def main(): parser = argparse.ArgumentParser(description="Help command") @@ -16,6 +15,8 @@ def main(): args = parser.parse_args() + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. 
prompt = [args.prompt] model_path = args.model device = args.device @@ -24,6 +25,8 @@ def main(): config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens + config.num_beam_groups = 3 + config.num_beams = 15 pipe = ov_genai.LLMPipeline(model_path, device) @@ -31,19 +34,18 @@ def main(): pipe.generate(prompt, config) res = pipe.generate(prompt, config) - metrics = res.metrics + perf_metrics = res.perf_metrics for _ in range(num_iter - 1): - # pdb.set_trace() res = pipe.generate(prompt, config) - metrics += res.metrics + perf_metrics += res.perf_metrics - print(f"Load time: {metrics.load_time:.2f} ms") - print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms") - print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms") - print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms") - print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms") - print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms") - print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}") + print(f"Load time: {perf_metrics.load_time:.2f} ms") + print(f"Generate time: {perf_metrics.mean_generate_duration:.2f} ± {perf_metrics.std_generate_duration:.2f} ms") + print(f"Tokenization time: {perf_metrics.mean_tokenization_duration:.2f} ± {perf_metrics.std_tokenization_duration:.2f} ms") + print(f"Detokenization time: {perf_metrics.mean_detokenization_duration:.2f} ± {perf_metrics.std_detokenization_duration:.2f} ms") + print(f"TTFT: {perf_metrics.mean_ttft:.2f} ± {perf_metrics.std_ttft:.2f} ms") + print(f"TPOT: {perf_metrics.mean_tpot:.2f} ± {perf_metrics.std_tpot:.2f} ms") + print(f"Throughput tokens/s: {perf_metrics.mean_throughput:.2f} ± {perf_metrics.std_throughput:.2f}") if __name__ == "__main__": main() diff --git a/samples/python/benchmark_genai/benchmark_genai_automatic.py b/samples/python/benchmark_genai/benchmark_genai_automatic.py new file mode 100755 index 0000000000..98a00a8c99 --- /dev/null +++ b/samples/python/benchmark_genai/benchmark_genai_automatic.py @@ -0,0 +1,62 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai +import pdb + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n", "--num_iter", type=int, default=5, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. 
+ + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + config = ov_genai.GenerationConfig() + config.max_new_tokens = 20 + # config.num_beam_groups = 3 + # config.num_beams = 15 + + pipe = ov_genai.LLMPipeline(model_path, device) + + import pandas as pd + metrics_df = pd.DataFrame(columns=['batch_size', 'throughput', 'ttft', 'tpot', 'std_throughput', 'std_ttft', 'std_tpot']) + + batch_sizes = [1, 2, 4, 16, 32, 64, 256] + for batch_size in batch_sizes: + prompt = [args.prompt] * batch_size + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + metrics = res.metrics + for _ in range(num_iter - 1): + res = pipe.generate(prompt, config) + metrics += res.metrics + # pdb.set_trace() + metrics_df = metrics_df._append({ + 'batch_size': batch_size, + 'throughput': metrics.mean_throughput, + 'ttft': metrics.mean_ttft, + 'tpot': metrics.mean_tpot, + 'std_throughput': metrics.std_throughput, + 'std_ttft': metrics.std_ttft, + 'std_tpot': metrics.std_tpot, + }, ignore_index=True) + + metrics_df.to_csv('metrics.csv', index=False) + +if __name__ == "__main__": + main() diff --git a/src/README.md b/src/README.md index 445b88aa58..a5530ea578 100644 --- a/src/README.md +++ b/src/README.md @@ -196,6 +196,55 @@ int main(int argc, char* argv[]) { } ``` +### Performance Metrics + +`ov.genai.PerfMetrics` (referred to as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` hold fields with mean and standard deviations for the following metrics: +- `ttft` +- `tpot` +- `load_time` +- `generate_duration` +- `tokenization_duration` +- `detokenization_duration` +- `throughput` + +and: +- `num_generated_tokens` +- `num_input_tokens` + +Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. Additionally to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `ov.genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. + +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +res = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +perf_metrics = res.perf_metrics +print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') +print(f'ttft: {perf_metrics.mean_ttft:.2f}') +print(f'tpot: {perf_metrics.mean_tpot:.2f}') +``` +output: +```sh +mean_generate_duration: 76.28 +mean_ttft: 42.58 +mean_tpot 3.80 +``` + +>**Note**: If the input prompt is just a string, the generate function will return only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. + +Several `perf_metrics` can be added with each other. In that case `raw_metrics` will be concatenated and mean/std values will be recalculated. This enhances benchmarking and accumulating statistics from several calls. 
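As a minimal sketch of the custom-statistics idea mentioned above (assuming `model_path` points to an exported model directory, and noting that the unit of the raw duration values depends on how the binding converts them, so the numbers are best read comparatively), medians and percentiles can be computed directly from `raw_metrics`; the example of adding metrics from several `generate` calls follows right after.

```python
import statistics

import openvino_genai as ov_genai

# model_path is a placeholder for an exported OpenVINO model directory (assumption).
pipe = ov_genai.LLMPipeline(model_path, "CPU")
res = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)

raw = res.perf_metrics.raw_metrics
token_durations = list(raw.m_durations)        # duration of each new-token generation step
ttfts = list(raw.m_times_to_first_token)       # time(s) to the first token

print(f"median token duration: {statistics.median(token_durations):.2f}")
print(f"p90 token duration: {statistics.quantiles(token_durations, n=10)[-1]:.2f}")
print(f"median TTFT: {statistics.median(ttfts):.2f}")
```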
+ +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) +perf_metrics = res_1.perf_metrics + res_2.perf_metrics + +print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') +print(f'ttft: {perf_metrics.mean_ttft:.2f}') +print(f'tpot: {perf_metrics.mean_tpot:.2f}') +``` + ## How It Works For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 14100d4f16..4be298128e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -37,7 +37,7 @@ class EncodedResults { public: std::vector> tokens; std::vector scores; - PerfMetrics metrics; + PerfMetrics perf_metrics; }; /** @@ -52,7 +52,7 @@ class DecodedResults { public: std::vector texts; std::vector scores; - PerfMetrics metrics; + PerfMetrics perf_metrics; // @brief Convert DecodedResults to a string. operator std::string() const { diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 5779b9b080..44535cf3a2 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -37,23 +37,25 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { * */ struct OPENVINO_GENAI_EXPORTS PerfMetrics { - // First token time. + // Load time in ms. + float load_time; + + // First token time (in ms). float mean_ttft; float std_ttft; - // Time per output token. + // Time (in ms) per output token. float mean_tpot; float std_tpot; - float load_time; - float mean_generate_duration; float std_generate_duration; - float mean_tokenization_duration; - float std_tokenization_duration; - float mean_detokenization_duration; - float std_detokenization_duration; - + float mean_tokenization_duration = -1; + float std_tokenization_duration = -1; + float mean_detokenization_duration = -1; + float std_detokenization_duration = -1; + + // Tokens per second. float mean_throughput; float std_throughput; @@ -61,11 +63,11 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_input_tokens; void evaluate_statistics(std::optional start_time = std::nullopt); - static float get_duration_ms(std::chrono::steady_clock::duration duration); + static float get_microsec(std::chrono::steady_clock::duration duration); PerfMetrics operator+(const PerfMetrics& metrics) const; PerfMetrics& operator+=(const PerfMetrics& right); - RawPerfMetrics raw_counters; + RawPerfMetrics raw_metrics; }; } // namespace genai diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index c8fd36cbdd..8b0cf19c1f 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -24,7 +24,7 @@ EncodedResults greedy_decoding( // Initialize results and performance metrics. 
EncodedResults results; - auto& raw_perf_counters = results.metrics.raw_counters; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 784ff1a915..1b9729b2f6 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -444,7 +444,7 @@ std::pair beam_search(ov::InferRequest& lm, int32_t res_selected_beam_idx = 0; results.scores.reserve(config.num_return_sequences * result.size()); results.tokens.reserve(config.num_return_sequences * result.size()); - auto& raw_perf_counters = results.metrics.raw_counters; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; raw_perf_counters.m_new_token_times = new_token_times; raw_perf_counters.m_batch_sizes = batch_sizes; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index adac9110e1..1c1bd5ccd8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -161,16 +161,16 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } // generate_durations - decoded_results.metrics = encoded_results.metrics; + decoded_results.perf_metrics = encoded_results.perf_metrics; - auto& raw_counters = decoded_results.metrics.raw_counters; + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; auto stop_time = std::chrono::steady_clock::now(); raw_counters.generate_durations = std::vector(); - raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); - raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time)); - raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time)); + raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); - decoded_results.metrics.evaluate_statistics(start_time); + decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } @@ -272,10 +272,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. 
- auto& metrics = result.metrics; + auto& metrics = result.perf_metrics; metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); metrics.load_time = this->m_load_time_ms; - metrics.raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); metrics.evaluate_statistics(start_time); return result; } @@ -393,7 +393,7 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl = make_unique(std::filesystem::path(path), device, config); } auto stop_time = std::chrono::steady_clock::now(); - m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time) / 1000.0f; + m_pimpl->m_load_time_ms = PerfMetrics::get_microsec(stop_time - start_time) / 1000.0f; } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index fc59f00e12..b00c62aed7 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -164,7 +164,7 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner // Initialize results and performance metrics. EncodedResults results; - auto& raw_perf_counters = results.metrics.raw_counters; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; results.scores.resize(batch_size, 0); results.tokens.resize(batch_size); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index d4dc6c8de6..c319032449 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -9,18 +9,18 @@ namespace { -// std::pair calc_mean_and_std(const std::vector& durations) { std::pair calc_mean_and_std(const std::vector& durations) { + // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { - return acc + duration.count(); + return acc + duration.count() / 1000.0f; }); mean /= durations.size(); - mean /= 1000.f; float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { - return acc + duration.count() * duration.count() / 1000000.0f; + auto d = duration.count() / 1000.0f; + return acc + d * d; }); float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); return {mean, std}; @@ -32,7 +32,7 @@ std::pair calc_mean_and_std(const std::vector(duration).count(); } @@ -40,33 +40,33 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. 
if (start_time.has_value()) { auto start_time_val = *start_time; - auto& tok_times = raw_counters.m_new_token_times; - auto& batch_sizes = raw_counters.m_batch_sizes; - raw_counters.m_durations = std::vector(tok_times.size()); + auto& tok_times = raw_metrics.m_new_token_times; + auto& batch_sizes = raw_metrics.m_batch_sizes; + raw_metrics.m_durations = std::vector(tok_times.size()); auto ttft = tok_times[0] - start_time_val; - raw_counters.m_times_to_first_token = std::vector(); - raw_counters.m_times_to_first_token.emplace_back(ttft); + raw_metrics.m_times_to_first_token = std::vector(); + raw_metrics.m_times_to_first_token.emplace_back(ttft); num_generated_tokens = 0; for (size_t i = 0; i < tok_times.size(); ++i) { - raw_counters.m_durations[i] = tok_times[i] - start_time_val; + raw_metrics.m_durations[i] = tok_times[i] - start_time_val; - // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. - // todo: float check that it's valid for batch > 1. - raw_counters.m_durations[i] /= batch_sizes[i]; + // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 tok/ms. + raw_metrics.m_durations[i] /= batch_sizes[i]; num_generated_tokens += batch_sizes[i]; start_time_val = tok_times[i]; } } + + // calc_mean_and_std will convert microsecond to milliseconds. + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_metrics.m_durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_metrics.m_times_to_first_token); - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_counters.m_durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_counters.m_times_to_first_token); - - std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_counters.generate_durations); - std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_counters.tokenization_durations); - std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_counters.detokenization_durations); + std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_metrics.generate_durations); + std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_metrics.tokenization_durations); + std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_metrics.detokenization_durations); - mean_throughput = 1000.0f / mean_tpot; + mean_throughput = 1000.0f / mean_tpot; // tokens per second std_throughput = (std_tpot * 1000.0f) / (mean_tpot * mean_tpot); } @@ -76,22 +76,25 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { // Copy left value to res. PerfMetrics res = *this; - // Concatenate duration and first token times. - auto& new_durations = res.raw_counters.m_durations; - auto& new_times_to_first_token = res.raw_counters.m_times_to_first_token; - auto& right_durations = right.raw_counters.m_durations; - auto& right_times_to_first_token = right.raw_counters.m_times_to_first_token; + // Concatenate durations, batch_sizes first token times. 
+ auto& new_durations = res.raw_metrics.m_durations; + auto& new_batch_sizes = res.raw_metrics.m_batch_sizes; + auto& new_times_to_first_token = res.raw_metrics.m_times_to_first_token; + auto& right_durations = right.raw_metrics.m_durations; + auto& right_batch_sizes = right.raw_metrics.m_batch_sizes; + auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token; new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); + new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end()); // Concatenate tokenization/detokenization and total generation times. - auto& new_tok_durations = res.raw_counters.tokenization_durations; - auto& new_detok_durations = res.raw_counters.detokenization_durations; - auto& new_gen_durations = res.raw_counters.generate_durations; - auto& right_tok_durations = right.raw_counters.tokenization_durations; - auto& right_detok_durations = right.raw_counters.detokenization_durations; - auto& right_gen_durations = right.raw_counters.generate_durations; + auto& new_tok_durations = res.raw_metrics.tokenization_durations; + auto& new_detok_durations = res.raw_metrics.detokenization_durations; + auto& new_gen_durations = res.raw_metrics.generate_durations; + auto& right_tok_durations = right.raw_metrics.tokenization_durations; + auto& right_detok_durations = right.raw_metrics.detokenization_durations; + auto& right_gen_durations = right.raw_metrics.generate_durations; new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index e2f89cd962..6c88b3ffcc 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -38,6 +38,17 @@ using PyBindStreamerVariant = std::variant, std::sh template struct overloaded : Ts... { using Ts::operator()...; }; template overloaded(Ts...) -> overloaded; +template +std::vector get_ms(const T& instance, U T::*member) { + // Converts c++ duration to float so that it can be used in Python. 
+ std::vector res; + const auto& durations = instance.*member; + res.reserve(durations.size()); + std::transform(durations.begin(), durations.end(), std::back_inserter(res), + [](const auto& duration) { return duration.count(); }); + return res; +} + namespace { auto generate_docstring = R"( @@ -536,17 +547,25 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) - .def_readonly("metrics", &DecodedResults::metrics) + .def_readonly("perf_metrics", &DecodedResults::perf_metrics) .def("__str__", &DecodedResults::operator std::string); py::class_(m, "RawPerfMetrics") .def(py::init<>()) .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) - .def_readonly("tokenization_durations", &RawPerfMetrics::tokenization_durations) - .def_readonly("detokenization_durations", &RawPerfMetrics::detokenization_durations) - .def_readonly("m_times_to_first_token", &RawPerfMetrics::m_times_to_first_token) + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::tokenization_durations); + }) + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::detokenization_durations); + }) + .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_times_to_first_token); + }) + .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_durations); + }) .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) - .def_readonly("m_durations", &RawPerfMetrics::m_durations) .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); @@ -567,7 +586,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("load_time", &PerfMetrics::load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) - .def_readonly("raw_counters", &PerfMetrics::raw_counters) + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics) ; py::class_(m, "TokenizedInputs") @@ -578,7 +597,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "EncodedResults") .def_readonly("tokens", &EncodedResults::tokens) .def_readonly("scores", &EncodedResults::scores) - .def_readonly("metrics", &EncodedResults::metrics); + .def_readonly("perf_metrics", &EncodedResults::perf_metrics); py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) From aeec730c4ebd14c90c081df40e50fd49d3c66f0d Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jul 2024 23:08:39 +0200 Subject: [PATCH 25/54] use MeanStdPair --- .../cpp/benchmark_genai/benchmark_genai.cpp | 14 +++-- .../python/benchmark_genai/benchmark_genai.py | 14 +++-- .../include/openvino/genai/perf_metrics.hpp | 51 +++++++++++-------- src/cpp/src/perf_metrics.cpp | 18 ++++--- src/python/py_generate_pipeline.cpp | 27 +++++----- 5 files changed, 64 insertions(+), 60 deletions(-) diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 9610aabe54..24b9491219 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -38,8 +38,6 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = 
result["max_new_tokens"].as(); - config.num_beam_groups = 3; - config.num_beams = 15; ov::genai::LLMPipeline pipe(model_path, device); @@ -54,12 +52,12 @@ int main(int argc, char* argv[]) try { } std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; - std::cout << "Generate time: " << metrics.mean_generate_duration << " ± " << metrics.std_generate_duration << " ms" << std::endl; - std::cout << "Tokenization time: " << metrics.mean_tokenization_duration << " ± " << metrics.std_tokenization_duration << " ms" << std::endl; - std::cout << "Detokenization time: " << metrics.mean_detokenization_duration << " ± " << metrics.std_detokenization_duration << " ms" << std::endl; - std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; - std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms " << std::endl; - std::cout << "Tokens/s: " << metrics.mean_throughput << " ± " << metrics.std_throughput << std::endl; + std::cout << "Generate time: " << metrics.generate_duration.mean << " ± " << metrics.generate_duration.std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.tokenization_duration.mean << " ± " << metrics.tokenization_duration.std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.detokenization_duration.mean << " ± " << metrics.detokenization_duration.std << " ms" << std::endl; + std::cout << "ttft: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; + std::cout << "tpot: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms " << std::endl; + std::cout << "Tokens/s: " << metrics.throughput.mean << " ± " << metrics.throughput.std << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index 06bd8b0f48..c29c508bf4 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -25,8 +25,6 @@ def main(): config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens - config.num_beam_groups = 3 - config.num_beams = 15 pipe = ov_genai.LLMPipeline(model_path, device) @@ -40,12 +38,12 @@ def main(): perf_metrics += res.perf_metrics print(f"Load time: {perf_metrics.load_time:.2f} ms") - print(f"Generate time: {perf_metrics.mean_generate_duration:.2f} ± {perf_metrics.std_generate_duration:.2f} ms") - print(f"Tokenization time: {perf_metrics.mean_tokenization_duration:.2f} ± {perf_metrics.std_tokenization_duration:.2f} ms") - print(f"Detokenization time: {perf_metrics.mean_detokenization_duration:.2f} ± {perf_metrics.std_detokenization_duration:.2f} ms") - print(f"TTFT: {perf_metrics.mean_ttft:.2f} ± {perf_metrics.std_ttft:.2f} ms") - print(f"TPOT: {perf_metrics.mean_tpot:.2f} ± {perf_metrics.std_tpot:.2f} ms") - print(f"Throughput tokens/s: {perf_metrics.mean_throughput:.2f} ± {perf_metrics.std_throughput:.2f}") + print(f"Generate time: {perf_metrics.generate_duration.mean:.2f} ± {perf_metrics.generate_duration.std:.2f} ms") + print(f"Tokenization time: {perf_metrics.tokenization_duration.mean:.2f} ± {perf_metrics.tokenization_duration.std:.2f} ms") + print(f"Detokenization time: {perf_metrics.detokenization_duration.mean:.2f} ± {perf_metrics.detokenization_duration.std:.2f} ms") + print(f"TTFT: {perf_metrics.ttft.mean:.2f} ± {perf_metrics.ttft.std:.2f} ms") + print(f"TPOT: {perf_metrics.tpot.mean:.2f} ± {perf_metrics.tpot.std:.2f} ms") + print(f"Throughput 
tokens/s: {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f}") if __name__ == "__main__": main() diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 44535cf3a2..8715761792 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -33,36 +33,43 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { }; /** -* @brief Structure to store performance metric for each generation -* +* @brief Structure to store mean and standart deviation values. */ -struct OPENVINO_GENAI_EXPORTS PerfMetrics { - // Load time in ms. - float load_time; - - // First token time (in ms). - float mean_ttft; - float std_ttft; +struct OPENVINO_GENAI_EXPORTS MeanStdPair { + float mean; + float std; +}; - // Time (in ms) per output token. - float mean_tpot; - float std_tpot; +/** +* @brief Structure to store performance metric for each generation. +* +* @param +*/ +struct OPENVINO_GENAI_EXPORTS PerfMetrics { + float load_time; // Load time in ms. + MeanStdPair ttft; // Time to the first token (in ms) (TTTFT). + MeanStdPair tpot; // Time (in ms) per output token (TPOT). + MeanStdPair throughput; // Tokens per second. - float mean_generate_duration; - float std_generate_duration; - float mean_tokenization_duration = -1; - float std_tokenization_duration = -1; - float mean_detokenization_duration = -1; - float std_detokenization_duration = -1; - - // Tokens per second. - float mean_throughput; - float std_throughput; + MeanStdPair generate_duration; + MeanStdPair tokenization_duration = {-1, -1}; + MeanStdPair detokenization_duration = {-1. -1}; size_t num_generated_tokens; size_t num_input_tokens; + /** + * @brief calculates mean/std values from raw_metrics. + * + * @param start_time optional start_time in case if duration needs to be updated. + */ void evaluate_statistics(std::optional start_time = std::nullopt); + + /** + * @brief convert duration to microseconds + * + * @param duration duration in + */ static float get_microsec(std::chrono::steady_clock::duration duration); PerfMetrics operator+(const PerfMetrics& metrics) const; PerfMetrics& operator+=(const PerfMetrics& right); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index c319032449..bc394fae52 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -9,7 +9,7 @@ namespace { -std::pair calc_mean_and_std(const std::vector& durations) { +ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { @@ -59,15 +59,17 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { } // calc_mean_and_std will convert microsecond to milliseconds. 
- std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_metrics.m_durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_metrics.m_times_to_first_token); + tpot = calc_mean_and_std(raw_metrics.m_durations); + ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token); - std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_metrics.generate_durations); - std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_metrics.tokenization_durations); - std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_metrics.detokenization_durations); + generate_duration = calc_mean_and_std(raw_metrics.generate_durations); + generate_duration = calc_mean_and_std(raw_metrics.generate_durations); + + tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations); + detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations); - mean_throughput = 1000.0f / mean_tpot; // tokens per second - std_throughput = (std_tpot * 1000.0f) / (mean_tpot * mean_tpot); + // tokens per second + throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)}; } PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 6c88b3ffcc..e744179c34 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -20,6 +20,7 @@ using ov::genai::EncodedResults; using ov::genai::GenerationConfig; using ov::genai::GenerationResult; using ov::genai::LLMPipeline; +using ov::genai::MeanStdPair; using ov::genai::OptionalGenerationConfig; using ov::genai::PerfMetrics; using ov::genai::RawPerfMetrics; @@ -569,25 +570,23 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + py::class_(m, "MeanStdPair") + .def(py::init<>()) + .def_readonly("mean", &MeanStdPair::mean) + .def_readonly("std", &MeanStdPair::std); + py::class_(m, "PerfMetrics") .def(py::init<>()) - .def_readonly("mean_generate_duration", &PerfMetrics::mean_generate_duration) - .def_readonly("std_generate_duration", &PerfMetrics::std_generate_duration) - .def_readonly("mean_tokenization_duration", &PerfMetrics::mean_tokenization_duration) - .def_readonly("std_tokenization_duration", &PerfMetrics::std_tokenization_duration) - .def_readonly("mean_detokenization_duration", &PerfMetrics::mean_detokenization_duration) - .def_readonly("std_detokenization_duration", &PerfMetrics::std_detokenization_duration) - .def_readonly("mean_throughput", &PerfMetrics::mean_throughput) - .def_readonly("std_throughput", &PerfMetrics::std_throughput) - .def_readonly("mean_tpot", &PerfMetrics::mean_tpot) - .def_readonly("mean_ttft", &PerfMetrics::mean_ttft) - .def_readonly("std_tpot", &PerfMetrics::std_tpot) - .def_readonly("std_ttft", &PerfMetrics::std_ttft) + .def_readonly("generate_duration", &PerfMetrics::generate_duration) + .def_readonly("tokenization_duration", &PerfMetrics::tokenization_duration) + .def_readonly("detokenization_duration", &PerfMetrics::detokenization_duration) + .def_readonly("throughput", &PerfMetrics::throughput) + .def_readonly("tpot", &PerfMetrics::tpot) + .def_readonly("ttft", &PerfMetrics::ttft) .def_readonly("load_time", &PerfMetrics::load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) - 
.def_readonly("raw_metrics", &PerfMetrics::raw_metrics) - ; + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); py::class_(m, "TokenizedInputs") .def(py::init()) From 56eeafcd1117b7a57ff396f96a6656e97d2bb6bd Mon Sep 17 00:00:00 2001 From: Yaroslav Tarkan Date: Wed, 24 Jul 2024 10:50:02 +0300 Subject: [PATCH 26/54] [2024.3] Fix symbol encode error (#629) Symbols that cause errors: - `\u0643` - `\u25aa` --- .github/workflows/causal_lm_cpp.yml | 4 ++++ .github/workflows/genai_package.yml | 1 + .github/workflows/genai_python_lib.yml | 1 + samples/cpp/beam_search_causal_lm/README.md | 14 +++++++++++++- samples/cpp/chat_sample/README.md | 14 +++++++++++++- samples/cpp/greedy_causal_lm/README.md | 14 +++++++++++++- samples/cpp/multinomial_causal_lm/README.md | 14 +++++++++++++- samples/cpp/prompt_lookup_decoding_lm/README.md | 14 +++++++++++++- samples/cpp/speculative_decoding_lm/README.md | 14 +++++++++++++- samples/python/beam_search_causal_lm/README.md | 14 +++++++++++++- samples/python/chat_sample/README.md | 14 +++++++++++++- samples/python/greedy_causal_lm/README.md | 14 +++++++++++++- samples/python/multinomial_causal_lm/README.md | 14 +++++++++++++- 13 files changed, 136 insertions(+), 10 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 85bef624c8..527259f203 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -191,6 +191,8 @@ jobs: cpp-greedy_causal_lm-windows: runs-on: windows-latest + env: + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd @@ -626,6 +628,8 @@ jobs: cpp-continuous-batching-windows: runs-on: windows-latest + env: + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 2535e423d9..cf604b4bcc 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -80,6 +80,7 @@ jobs: runs-on: windows-latest env: CMAKE_BUILD_PARALLEL_LEVEL: null + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index e0c43bddd5..257a9c2f57 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -63,6 +63,7 @@ jobs: runs-on: windows-latest env: CMAKE_BUILD_PARALLEL_LEVEL: null + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md index 82232c42f6..0d2ee83bfc 100644 --- a/samples/cpp/beam_search_causal_lm/README.md +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md index 8a24b20005..a2eccb4d3d 100644 --- a/samples/cpp/chat_sample/README.md +++ b/samples/cpp/chat_sample/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `chat_sample TinyLlama-1.1B-Chat-v1.0` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md index c0a7d5f3c4..79852e0d10 100644 --- a/samples/cpp/greedy_causal_lm/README.md +++ b/samples/cpp/greedy_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
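Editorial aside: the troubleshooting sections above describe a `UnicodeEncodeError` raised when the Windows console code page cannot represent the printed characters. A minimal Python reproduction is shown below; `sys.stdout.reconfigure` is an in-process alternative to setting `PYTHONIOENCODING="utf8"`, not something the samples themselves do.

```python
import sys

text = "\u25aa openvino"   # one of the characters reported in this patch
try:
    text.encode("cp1252")  # typical default code page on a Windows console
except UnicodeEncodeError as err:
    print("Would fail on a default Windows console:", err)

sys.stdout.reconfigure(encoding="utf-8")  # same effect as PYTHONIOENCODING="utf8" (Python 3.7+)
print(text)
```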
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md index 4478579919..21c9a07e77 100644 --- a/samples/cpp/multinomial_causal_lm/README.md +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md index 89a5e2c585..c5517c5bf6 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/README.md +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -20,8 +20,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
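Editorial aside: the README context above repeatedly suggests running larger models on a discrete GPU. For the Python samples this amounts to passing a different device string to the pipeline, roughly as in this sketch (model folder name as used elsewhere in these READMEs):

```python
import openvino_genai as ov_genai

# Same call as in the samples, with the device switched from "CPU" to "GPU".
pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "GPU")
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=100))
```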
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md index c86bd8b617..644ebd2c94 100644 --- a/samples/cpp/speculative_decoding_lm/README.md +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -24,8 +24,20 @@ optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-ch `speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md index 5e80aa69da..7e412db379 100644 --- a/samples/python/beam_search_causal_lm/README.md +++ b/samples/python/beam_search_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md index 983789d0eb..aee8783b5f 100644 --- a/samples/python/chat_sample/README.md +++ b/samples/python/chat_sample/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `chat_sample.py TinyLlama-1.1B-Chat-v1.0` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md index 97b044eb51..1f0eb333ea 100644 --- a/samples/python/greedy_causal_lm/README.md +++ b/samples/python/greedy_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md index d39142f3de..0778868e6a 100644 --- a/samples/python/multinomial_causal_lm/README.md +++ b/samples/python/multinomial_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. From 8934a0e8991d0dfd9a14630c8ef0a907e44d138e Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Wed, 24 Jul 2024 11:55:56 +0200 Subject: [PATCH 27/54] [release branch] Add infer request queue for tokenizers and allow for optional plugin_config in tokenizer (#669) This improves performance of CB lib when tested within OVMS. 
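Editorial aside: the optional tokenizer `plugin_config` introduced by this commit is a property map forwarded to the tokenizer/detokenizer compilation. A hypothetical Python usage sketch follows; the property shown is only an example, not something the patch requires:

```python
import openvino_genai as ov_genai

# Hypothetical: second argument is the new optional plugin_config map.
tok = ov_genai.Tokenizer("TinyLlama-1.1B-Chat-v1.0", {"PERFORMANCE_HINT": "THROUGHPUT"})
tokens = tok.encode("Why is the Sun yellow?")
print(tokens.input_ids.shape)
```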
Already merged to master: https://github.com/openvinotoolkit/openvino.genai/pull/651 This is cherry-pick --- .../genai/continuous_batching_pipeline.hpp | 3 +- src/cpp/include/openvino/genai/tokenizer.hpp | 2 +- src/cpp/src/circular_buffer_queue.hpp | 100 ++++++++++++++++++ src/cpp/src/continuous_batching_pipeline.cpp | 9 +- src/cpp/src/tokenizer.cpp | 98 ++++++++++------- src/python/py_generate_pipeline.cpp | 12 +-- tests/python_tests/common.py | 2 +- tests/python_tests/ov_genai_test_utils.py | 2 +- tests/python_tests/test_sampling.py | 2 +- 9 files changed, 179 insertions(+), 51 deletions(-) create mode 100644 src/cpp/src/circular_buffer_queue.hpp diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index be9a5fd8c1..f5f8c53309 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -30,7 +30,8 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { ContinuousBatchingPipeline(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device = "CPU", - const ov::AnyMap& plugin_config = {}); + const ov::AnyMap& llm_plugin_config = {}, + const ov::AnyMap& tokenizer_plugin_config = {}); /** * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 5a1e181e21..425c30128b 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -29,7 +29,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @brief ov::genai::Tokenizer constructor. 
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path */ - Tokenizer(const std::string& tokenizer_path); + Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config = {}); /** * @brief encode a single prompt diff --git a/src/cpp/src/circular_buffer_queue.hpp b/src/cpp/src/circular_buffer_queue.hpp new file mode 100644 index 0000000000..086854e68e --- /dev/null +++ b/src/cpp/src/circular_buffer_queue.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace ov::genai { + +// From OVMS: +// https://github.com/openvinotoolkit/model_server/blob/d73e85cbb8ac1d761754cb2064a00551a9ffc655/src/queue.hpp#L34 +template +class CircularBufferQueue +{ + int m_front_idx; + std::atomic m_back_idx; + std::vector m_values; + std::queue> m_promises; + std::vector m_data; + std::mutex m_front_mut; + std::mutex m_queue_mutex; + +public: + + CircularBufferQueue(size_t length, const std::function& create_fn) : + m_values(length), + m_front_idx{0}, + m_back_idx{0} { + std::iota(m_values.begin(), m_values.end(), 0); + m_data.reserve(length); + for (size_t i = 0; i < length; i++) { + m_data.emplace_back(std::move(create_fn())); + } + } + + CircularBufferQueue(const CircularBufferQueue&) = delete; + CircularBufferQueue(const CircularBufferQueue&&) = delete; + CircularBufferQueue& operator=(const CircularBufferQueue&) = delete; + + T& get(int value) { + return m_data[value]; + } + + std::future get_idle() { + int value; + std::promise idle_promise; + std::future idle_future = idle_promise.get_future(); + std::unique_lock lk(m_front_mut); + if (m_values[m_front_idx] < 0) { + std::unique_lock queueLock(m_queue_mutex); + m_promises.push(std::move(idle_promise)); + } else { + value = m_values[m_front_idx]; + m_values[m_front_idx] = -1; + m_front_idx = (m_front_idx + 1) % m_values.size(); + lk.unlock(); + idle_promise.set_value(value); + } + return idle_future; + } + + void return_to(int value) { + std::unique_lock lk(m_queue_mutex); + if (m_promises.size()) { + std::promise promise = std::move(m_promises.front()); + m_promises.pop(); + lk.unlock(); + promise.set_value(value); + return; + } + int old_back = m_back_idx.load(); + while (!m_back_idx.compare_exchange_weak( + old_back, + (old_back + 1) % m_values.size(), + std::memory_order_relaxed)) { + } + m_values[old_back] = value; + } +}; + +template +class CircularBufferQueueElementGuard { + CircularBufferQueue* m_queue; + int m_value; +public: + CircularBufferQueueElementGuard(CircularBufferQueue* queue) : m_queue(queue) { + m_value = m_queue->get_idle().get(); // blocking until we get the element + } + + T& get() { + return m_queue->get(m_value); + } + + ~CircularBufferQueueElementGuard() { + m_queue->return_to(m_value); + } +}; + +} diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index ddfebc5926..55100f3cb4 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -105,8 +105,8 @@ class ContinuousBatchingPipeline::Impl { // read default generation config } - Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) - : Impl{models_path, Tokenizer(models_path), scheduler_config, device, plugin_config} {} + Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& llm_plugin_config, const 
ov::AnyMap& tokenizer_plugin_config) + : Impl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {} ov::genai::GenerationConfig get_config() const { return m_generation_config; @@ -282,8 +282,9 @@ class ContinuousBatchingPipeline::Impl { ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& plugin_config ) { - m_impl = std::make_shared(models_path, scheduler_config, device, plugin_config); + const ov::AnyMap& llm_plugin_config, + const ov::AnyMap& tokenizer_plugin_config) { + m_impl = std::make_shared(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config); } ContinuousBatchingPipeline::ContinuousBatchingPipeline( diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index ac6b925dcb..b1e36033ee 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -7,7 +7,9 @@ #include #include #include "tokenizers_path.hpp" +#include "circular_buffer_queue.hpp" #include +#include namespace { @@ -55,10 +57,12 @@ namespace genai { class Tokenizer::TokenizerImpl { public: - ov::InferRequest m_tokenizer_request; - ov::InferRequest m_detokenizer_request; - std::mutex m_tokenizer_mutex; - std::mutex m_detokenizer_mutex; + ov::CompiledModel m_tokenizer; + ov::CompiledModel m_detokenizer; + + std::unique_ptr> m_ireq_queue_tokenizer; + std::unique_ptr> m_ireq_queue_detokenizer; + int64_t m_pad_token_id = -1; int64_t m_bos_token_id = -1; int64_t m_eos_token_id = -1; @@ -71,7 +75,7 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; - TokenizerImpl(std::filesystem::path tokenizer_path) + TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& plugin_config) : m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} { ov::Core core; @@ -92,10 +96,23 @@ class Tokenizer::TokenizerImpl { read_tokenizer_config_if_necessary(tokenizer_path); auto device = "CPU"; // currently openvino_tokenizer supports only CPU - m_tokenizer_request = core.compile_model(tokenizer_path / "openvino_tokenizer.xml", - device).create_infer_request(); - m_detokenizer_request = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", - device).create_infer_request(); + m_tokenizer = core.compile_model(tokenizer_path / "openvino_tokenizer.xml", + device, plugin_config); + m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", + device, plugin_config); + + + const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests); + m_ireq_queue_tokenizer = std::make_unique>( + INFER_REQUEST_QUEUE_SIZE, + [this]() -> ov::InferRequest { + return std::move(this->m_tokenizer.create_infer_request()); + }); + m_ireq_queue_detokenizer = std::make_unique>( + INFER_REQUEST_QUEUE_SIZE, + [this]() -> ov::InferRequest { + return std::move(this->m_detokenizer.create_infer_request()); + }); // Get special token ids by inference if they are not defined. 
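Editorial aside: the `CircularBufferQueue`/`CircularBufferQueueElementGuard` pair used above is essentially a fixed pool of reusable infer requests, handed out to callers and returned when the guard goes out of scope. An illustrative Python analogue of the same acquire/return pattern (not part of the patch):

```python
import queue
from contextlib import contextmanager

class Pool:
    def __init__(self, items):
        self._idle = queue.Queue()
        for item in items:
            self._idle.put(item)

    @contextmanager
    def acquire(self):
        item = self._idle.get()      # blocks until an idle element is available
        try:
            yield item
        finally:
            self._idle.put(item)     # hand the element back to the pool

pool = Pool(["ireq-0", "ireq-1"])    # stand-ins for ov.InferRequest objects
with pool.acquire() as ireq:
    print("running inference on", ireq)
```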
infer_special_tokens_if_necessary(); @@ -231,29 +248,35 @@ class Tokenizer::TokenizerImpl { } TokenizedInputs encode(std::string prompt) { + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); size_t batch_size = 1; - std::unique_lock lock(m_tokenizer_mutex); - m_tokenizer_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); - m_tokenizer_request.infer(); - return get_copied_results(); + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + return get_copied_results( + infer_request_guard.get().get_tensor("input_ids"), + infer_request_guard.get().get_tensor("attention_mask") + ); } TokenizedInputs encode(std::vector& prompts) { TokenizedInputs unpadded; { - std::unique_lock lock(m_tokenizer_mutex); - m_tokenizer_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - auto size_ = m_tokenizer_request.get_input_tensor().get_shape(); - m_tokenizer_request.infer(); - - unpadded = get_copied_results(); + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = infer_request_guard.get().get_input_tensor().get_shape(); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + + unpadded = get_copied_results( + infer_request_guard.get().get_tensor("input_ids"), + infer_request_guard.get().get_tensor("attention_mask") + ); } return pad_left(unpadded.input_ids, unpadded.attention_mask); } - TokenizedInputs get_copied_results() { - auto input_ids = m_tokenizer_request.get_tensor("input_ids"); - auto attention_mask = m_tokenizer_request.get_tensor("attention_mask"); + TokenizedInputs get_copied_results(ov::Tensor input_ids, ov::Tensor attention_mask) { ov::Tensor input_ids_ = ov::Tensor(input_ids.get_element_type(), input_ids.get_shape()); ov::Tensor attention_mask_ = ov::Tensor(attention_mask.get_element_type(), attention_mask.get_shape()); input_ids.copy_to(input_ids_); @@ -263,22 +286,24 @@ class Tokenizer::TokenizerImpl { } std::string decode(std::vector tokens) { + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); size_t batch_size = 1; - std::unique_lock lock(m_detokenizer_mutex); - m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); - m_detokenizer_request.infer(); - return m_detokenizer_request.get_output_tensor().data()[0]; + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + return infer_request_guard.get().get_output_tensor().data()[0]; } std::vector decode(ov::Tensor tokens) { OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64"); OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]"); - std::unique_lock lock(m_detokenizer_mutex); - m_detokenizer_request.set_input_tensor(tokens); - m_detokenizer_request.infer(); + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + infer_request_guard.get().set_input_tensor(tokens); + infer_request_guard.get().start_async(); + 
infer_request_guard.get().wait(); - auto res = m_detokenizer_request.get_output_tensor(); + auto res = infer_request_guard.get().get_output_tensor(); auto res_data = res.data(); return std::vector(res_data, res_data + res.get_shape()[0]); } @@ -299,10 +324,11 @@ class Tokenizer::TokenizerImpl { std::fill(tokens_data + i * max_len + line_len, tokens_data + (i + 1) * max_len, m_pad_token_id); } - std::unique_lock lock(m_detokenizer_mutex); - m_detokenizer_request.set_input_tensor(tokens); - m_detokenizer_request.infer(); - auto res = m_detokenizer_request.get_output_tensor(); + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + infer_request_guard.get().set_input_tensor(tokens); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + auto res = infer_request_guard.get().get_output_tensor(); auto res_data = res.data(); return std::vector(res_data, res_data + res.get_shape()[0]); } @@ -411,9 +437,9 @@ class Tokenizer::TokenizerImpl { }; -Tokenizer::Tokenizer(const std::string& tokenizer_path) { +Tokenizer::Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config) { ScopedVar env_manager(tokenizers_relative_to_genai().string()); - m_pimpl = std::make_shared(tokenizer_path); + m_pimpl = std::make_shared(tokenizer_path, plugin_config); } TokenizedInputs Tokenizer::encode(const std::string prompt) { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index d7b2aab29c..8a1a226bc1 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -436,10 +436,10 @@ PYBIND11_MODULE(py_generate_pipeline, m) { R"(openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model.)") - .def(py::init([](const std::string& tokenizer_path) { + .def(py::init([](const std::string& tokenizer_path, const std::map& plugin_config) { ScopedVar env_manager(ov_tokenizers_module_path()); - return std::make_unique(tokenizer_path); - }), py::arg("tokenizer_path")) + return std::make_unique(tokenizer_path, properties_to_any_map(plugin_config)); + }), py::arg("tokenizer_path"), py::arg("plugin_config") = ov::AnyMap({})) .def("encode", [](Tokenizer& tok, std::vector& prompts) { return tok.encode(prompts); }, py::arg("prompts"), @@ -596,10 +596,10 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); py::class_(m, "ContinuousBatchingPipeline") - .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { + .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { ScopedVar env_manager(ov_tokenizers_module_path()); - return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(plugin_config)); - }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) + return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(llm_plugin_config), properties_to_any_map(tokenizer_plugin_config)); + }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("llm_plugin_config") = ov::AnyMap({}), py::arg("tokenizer_plugin_config") = ov::AnyMap({})) .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& 
tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { ScopedVar env_manager(ov_tokenizers_module_path()); return std::make_unique(model_path, tokenizer, scheduler_config, device, properties_to_any_map(plugin_config)); diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 2ec96f671c..1eb7c27911 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -273,7 +273,7 @@ def run_continuous_batching( prompts: List[str], generation_configs : List[GenerationConfig] ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) output = pipe.generate(prompts, generation_configs) del pipe shutil.rmtree(model_path) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index bc95418aff..ccd5d1397d 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -205,7 +205,7 @@ def load_tok(configs: List[Tuple], temp_path): for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return ov_genai.Tokenizer(str(temp_path)) + return ov_genai.Tokenizer(str(temp_path), {}) def load_pipe(configs: List[Tuple], temp_path): diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index c02804527b..9e1d358011 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -205,7 +205,7 @@ def test_post_oom_health(tmp_path): model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(output)) From 12f8e4472884c3521770da5ada46a5f341ff935a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 24 Jul 2024 12:13:51 +0200 Subject: [PATCH 28/54] Add max_new_tokens to every generate call in src/README.md (#670) [mixtral-8x7b-instruct-v0.1-int4-ov](https://huggingface.co/OpenVINO/mixtral-8x7b-instruct-v0.1-int4-ov/) didn't have `generation_config.json` therefore generation continued ininitely. EOS_TOKEN_ID was red correctly but during generation it was not met. Updated docs so in every generate call max_new_tokens is set either in arguments or via default generation config `pipe.set_generation_config({'max_new_tokens': 100, 'num_beam_groups': 3, ...)` tickets: CVS-146933 CVS-146324 --- src/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/README.md b/src/README.md index 445b88aa58..b404794977 100644 --- a/src/README.md +++ b/src/README.md @@ -42,7 +42,7 @@ A simple example: ```python import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -print(pipe.generate("The Sun is yellow because")) +print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) ``` Calling generate with custom generation config parameters, e.g. config for grouped beam search: @@ -50,7 +50,7 @@ Calling generate with custom generation config parameters, e.g. 
config for group import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -result = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) +result = pipe.generate("The Sun is yellow because", max_new_tokens=100, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) print(result) ``` @@ -73,7 +73,7 @@ while True:     prompt = input() if prompt == 'Stop!':         break -    print(pipe(prompt)) +    print(pipe(prompt, max_new_tokens=200)) pipe.finish_chat() ``` @@ -89,7 +89,7 @@ A simple example: int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(256)); } ``` @@ -159,7 +159,7 @@ int main(int argc, char* argv[]) { // false means continue generation. return false; }; - std::cout << pipe.generate("The Sun is yellow bacause", ov::genai::streamer(streamer)); + std::cout << pipe.generate("The Sun is yellow bacause", ov::genai::streamer(streamer), ov::genai::max_new_tokens(200)); } ``` @@ -192,7 +192,7 @@ int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer)); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer), ov::genai::max_new_tokens(200)); } ``` From f9e45e1ffbc4ae671bd6a01384191ade89f932dc Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 24 Jul 2024 17:27:51 +0400 Subject: [PATCH 29/54] Add CB naive chat (#644) Co-authored-by: Anastasiia Pnevskaia --- .../genai/continuous_batching_pipeline.hpp | 20 +- .../openvino/genai/generation_handle.hpp | 15 ++ .../include/openvino/genai/llm_pipeline.hpp | 2 +- .../openvino/genai/scheduler_config.hpp | 2 +- src/cpp/src/continuous_batching_pipeline.cpp | 146 ++++++++++++--- src/cpp/src/generation_handle.cpp | 4 + src/cpp/src/generation_stream.hpp | 3 + src/cpp/src/llm_pipeline.cpp | 176 ++++++++++++++++-- src/cpp/src/synchronized_queue.hpp | 6 + src/python/py_generate_pipeline.cpp | 18 +- tests/python_tests/ov_genai_test_utils.py | 5 + tests/python_tests/test_chat_generate_api.py | 20 +- tests/python_tests/test_generate_api.py | 42 ++++- thirdparty/openvino_tokenizers | 2 +- 14 files changed, 411 insertions(+), 50 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index f5f8c53309..626a51c5da 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -10,6 +10,8 @@ #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/streamer_base.hpp" #include "openvino/genai/visibility.hpp" namespace ov::genai { @@ -56,13 +58,27 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { PipelineMetrics get_metrics() const; - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params); + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params); + GenerationHandle add_request(uint64_t request_id, const 
std::string& prompt, const ov::genai::GenerationConfig& sampling_params); void step(); bool has_non_finished_requests(); // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& prompts, std::vector sampling_params); + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + std::vector generate(const std::vector& prompts, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + + /** + * @brief start chat with keeping history in kv cache. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. + */ + void finish_chat(); }; } diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index d0ddbc3a32..8d00ae0e9b 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -18,6 +18,20 @@ enum class GenerationStatus { DROPPED_BY_HANDLE = 4 // Status set when generation handle is dropped }; +struct EncodedGenerationResult { + // request ID - obsolete when handle API is approved as handle will connect results with prompts. + uint64_t m_request_id; + + // in a generic case we have multiple generation results per initial prompt + // depending on sampling parameters (e.g. beam search or parallel sampling) + std::vector> m_generation_ids; + // scores + std::vector m_scores; + + // Status of generation + GenerationStatus m_status = GenerationStatus::RUNNING; +}; + struct GenerationResult { // request ID - obsolete when handle API is approved as handle will connect results with prompts. uint64_t m_request_id; @@ -60,6 +74,7 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { bool can_read(); + GenerationOutputs back(); // Reads result of a generation for single iteration GenerationOutputs read(); // Reads all generated tokens for all sequences diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 84dc02bd58..abd4ee5a44 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -14,7 +14,7 @@ namespace ov { namespace genai { -// Return flag corresponds whether generation should be stopped: false means continue generation, true means stop. +// Return flag correspods whether generation should be stopped: false means continue generation, true means stop. 
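Editorial aside: the streamer contract referenced in the comment above (return false to keep generating, true to stop) looks roughly like this from Python, assuming the Python `generate` accepts a callable streamer as in the GenAI samples:

```python
import openvino_genai as ov_genai

def streamer(subword: str) -> bool:
    print(subword, end="", flush=True)
    return False  # False - continue generation, True - stop early

pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")
pipe.generate("The Sun is yellow because", streamer=streamer, max_new_tokens=100)
```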
using StreamerVariant = std::variant, std::shared_ptr, std::monostate>; using OptionalGenerationConfig = std::optional; using EncodedInputs = std::variant; diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index 787060d07e..9d808fd424 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -16,7 +16,7 @@ struct SchedulerConfig { std::size_t num_kv_blocks = 0; // total size of KV cache in GB - std::size_t cache_size = 0; + std::size_t cache_size = 1; // block size for KV cache std::size_t block_size = 32; diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 55100f3cb4..a66a88cad4 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -6,16 +6,21 @@ #include #include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" #include "openvino/genai/tokenizer.hpp" #include "cache_manager.hpp" #include "sampler.hpp" #include "model_runner.hpp" #include "scheduler.hpp" +#include "text_callback_streamer.hpp" #include "timer.hpp" #include "debug_utils.hpp" using namespace ov::genai; +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) -> overloaded; + void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config); class ContinuousBatchingPipeline::Impl { @@ -51,6 +56,8 @@ class ContinuousBatchingPipeline::Impl { std::vector m_awaiting_requests; // Mutex protecting access to m_awaiting_requests, so add_request and step methods can be called from different threads std::mutex m_awaiting_requests_mutex; + bool m_is_chat_conversation = false; + ChatHistory m_history; void _free_non_running_requests() { @@ -120,18 +127,9 @@ class ContinuousBatchingPipeline::Impl { return m_tokenizer; } - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); - - ov::Tensor input_ids; - { - static ManualTimer timer("tokenize"); - timer.start(); - input_ids = m_tokenizer.encode(prompt).input_ids; - timer.end(); - } - SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, sampling_params, m_scheduler->get_config().block_size); { @@ -141,6 +139,14 @@ class ContinuousBatchingPipeline::Impl { return std::make_unique(sequence_group->get_generation_stream(), sampling_params); } + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) { + static ManualTimer timer("tokenize"); + timer.start(); + ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids; + timer.end(); + return add_request(request_id, input_ids, sampling_params); + } + void step() { static ManualTimer step_timer("step()"); step_timer.start(); @@ -238,25 +244,47 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector prompts, std::vector sampling_params) { + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called 
while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request"); - OPENVINO_ASSERT(prompts.size() == sampling_params.size()); + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + const std::shared_ptr& streamer_ptr = std::visit(overloaded{ + [](std::monostate) -> std::shared_ptr { + return nullptr; + }, + [](const std::shared_ptr& streamer) { + return streamer; + }, + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); + } + }, streamer); std::vector generations; - for (size_t request_id = 0; request_id < prompts.size(); ++request_id) { - generations.push_back(add_request(request_id, prompts[request_id], sampling_params[request_id])); + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id])); } - std::vector results; + std::vector results; results.reserve(m_awaiting_requests.size()); - while (has_non_finished_requests()) { + bool continue_generation = true; + while (has_non_finished_requests() && continue_generation) { step(); + if (streamer_ptr) { + std::unordered_map token = generations.at(0).get()->back(); + OPENVINO_ASSERT(1 == token.size()); + OPENVINO_ASSERT(1 == token.begin()->second.generated_token_ids.size()); + continue_generation = !streamer_ptr->put(token.begin()->second.generated_token_ids.at(0)); + } + } + if (streamer_ptr) { + streamer_ptr->end(); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; - GenerationResult result; + EncodedGenerationResult result; result.m_request_id = 1; std::vector generation_outputs = generation->read_all(); std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) { @@ -266,17 +294,69 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids); - result.m_generation_ids.push_back(output_text); + result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids)); result.m_scores.push_back(generation_output.score); } result.m_status = generation->get_status(); - results.push_back(result); + results.push_back(std::move(result)); } - OPENVINO_ASSERT(results.size() == prompts.size()); + OPENVINO_ASSERT(results.size() == input_ids.size()); return results; } + + std::vector generate(const std::vector& prompts, std::vector sampling_params, const StreamerVariant& streamer) { + std::vector input_ids; + static ManualTimer timer("tokenize"); + if (m_is_chat_conversation) { + OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); + m_history.push_back({{"role", "user"}, {"content", prompts.at(0)}}); + constexpr bool add_generation_prompt = true; + std::string history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + timer.start(); + input_ids.push_back(m_tokenizer.encode(history).input_ids); + timer.end(); + } else { + input_ids.reserve(prompts.size()); + for (const 
std::string& prompt : prompts) { + timer.start(); + input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + timer.end(); + } + } + std::vector encoded = generate(input_ids, sampling_params, streamer); + std::vector decoded; + decoded.reserve(encoded.size()); + for (EncodedGenerationResult& res : encoded) { + std::vector generated; + generated.reserve(res.m_generation_ids.size()); + for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) { + generated.push_back(m_tokenizer.decode(res.m_generation_ids.at(idx))); + if (m_is_chat_conversation && 0 == idx) { + m_history.push_back({{"role", "assistant"}, {"content", generated.back()}}); + } + } + decoded.push_back(GenerationResult{ + res.m_request_id, + std::move(generated), + std::move(res.m_scores), + res.m_status + }); + } + return decoded; + } + + void start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; + }; + + void finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); + }; }; ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, @@ -307,10 +387,14 @@ PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{ return m_impl->get_metrics(); } -GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params) { return m_impl->add_request(request_id, prompt, sampling_params); } +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, input_ids, sampling_params); +} + void ContinuousBatchingPipeline::step() { m_impl->step(); } @@ -319,6 +403,18 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, std::vector sampling_params) { - return m_impl->generate(prompts, sampling_params); -} \ No newline at end of file +std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(input_ids, sampling_params, streamer); +} + +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(prompts, sampling_params, streamer); +} + +void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { + m_impl->start_chat(system_message); +}; + +void ContinuousBatchingPipeline::finish_chat() { + m_impl->finish_chat(); +}; diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index a0187025ec..26cc12604f 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -20,6 +20,10 @@ bool GenerationHandleImpl::can_read() { return m_generation_stream->can_read(); } +std::unordered_map GenerationHandleImpl::back() { + return m_generation_stream->back(); +} + std::unordered_map GenerationHandleImpl::read() { return m_generation_stream->read(); } diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp index 0d51897e82..1ac2eefef9 100644 
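Editorial aside: the `start_chat`/`finish_chat` pair wired in above keeps the conversation history inside the pipeline between `generate` calls. The resulting chat loop has the same shape as the LLMPipeline example documented in src/README.md, sketched here:

```python
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")
pipe.start_chat()
for prompt in ["Why is the Sun yellow?", "And why is the sky blue?"]:
    print(pipe.generate(prompt, max_new_tokens=100))
pipe.finish_chat()  # drops the accumulated history
```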
--- a/src/cpp/src/generation_stream.hpp +++ b/src/cpp/src/generation_stream.hpp @@ -31,6 +31,9 @@ class GenerationStream { } // Retriving vector of pairs as we can generate multiple outputs for a single prompt + GenerationOutputs back() { + return m_output_queue.back(); + } GenerationOutputs read() { return m_output_queue.pull(); } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 507d988a6a..1d68d4c746 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "llm_pipeline_base.hpp" @@ -114,6 +115,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { EncodedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); encoded_input = m_tokenizer.encode(*input_vector); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; @@ -334,7 +336,151 @@ std::pair generation_config(const GenerationConfig& config) { } // namespace genai } // namespace ov -using namespace std; +namespace { +using namespace ov::genai; + +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) -> overloaded; + +Tokenizer dont_construct() { + OPENVINO_THROW("Continuous Batching backend can't be constructed" + "from ireq because the model must be transformed"); +} + +class ContinuousBatchingAdapter final : public LLMPipelineImplBase { +public: + ContinuousBatchingPipeline m_impl; + + ContinuousBatchingAdapter( + const ov::InferRequest& request, + const Tokenizer& tokenizer, + OptionalGenerationConfig generation_config + ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{tokenizer}, m_impl{ + model_path.string(), + tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{Tokenizer(model_path.string())}, m_impl{ + model_path.string(), + m_tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector prompts = std::visit(overloaded{ + [](const std::string& prompt) { + return std::vector{prompt}; + }, + [](std::vector& prompts) { + return prompts; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. 
+ std::vector generated = m_impl.generate( + prompts, + std::vector{prompts.size(), config}, + streamer + ); + std::vector plain_replies; + std::vector plain_scores; + for (GenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_replies), std::move(plain_scores)}; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector input_ids = std::visit(overloaded{ + [](const ov::Tensor& inp) { + size_t batch_size = inp.get_shape().at(0); + if (1 == batch_size) { + return std::vector{inp}; + } + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.get_shape().at(1); + const int64_t* const source = inp.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + std::copy_n(source + batch_id * max_len, max_len, destination); + } + return input_ids; + }, + [](const TokenizedInputs& inp) { + size_t batch_size = inp.input_ids.get_shape().at(0); + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.input_ids.get_shape().at(1); + const int64_t* const source = inp.input_ids.data(); + const int64_t* const attention_mask = inp.attention_mask.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + size_t copy_count = 0; + for (size_t idx = 0; idx < max_len; ++idx) { + if (1 == attention_mask[batch_id * max_len + idx]) { + destination[copy_count++] = source[batch_id * max_len + idx]; + } + } + input_ids.back().set_shape({1, copy_count}); + } + return input_ids; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. 
+ std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}, streamer); + std::vector> plain_tokens; + std::vector plain_scores; + for (EncodedGenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_tokens), std::move(plain_scores)}; + } + + void start_chat(const std::string& system_message) override { + m_impl.start_chat(); + }; + + void finish_chat() override { + m_impl.finish_chat(); + }; +}; +} ov::genai::LLMPipeline::LLMPipeline( const ov::InferRequest& request, @@ -349,25 +495,27 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config -) { - if (device == "NPU") { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); - } else { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); +): m_pimpl{[&]() -> std::unique_ptr { + if ("CB" == device) { + return std::make_unique(model_path, tokenizer, "CPU", plugin_config); + } if ("NPU" == device) { + return std::make_unique(model_path, tokenizer, device, plugin_config); } -} + return std::make_unique(model_path, tokenizer, device, plugin_config); +}()} {} ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -) { - if (device == "NPU") { - m_pimpl = make_unique(std::filesystem::path(path), device, config); - } else { - m_pimpl = make_unique(std::filesystem::path(path), device, config); +): m_pimpl{[&]() -> std::unique_ptr { + if ("CB" == device) { + return std::make_unique(path, "CPU", config); + } if ("NPU" == device) { + return std::make_unique(path, device, config); } -} + return std::make_unique(path, device, config); +}()} {} ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; @@ -386,7 +534,7 @@ void ov::genai::LLMPipeline::finish_chat() { } void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) { - int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;; + int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id; m_pimpl->m_generation_config = config; // if eos_token_id was not provided in config forward from default config if (config.eos_token_id == -1) diff --git a/src/cpp/src/synchronized_queue.hpp b/src/cpp/src/synchronized_queue.hpp index 0c2cd3180d..bd025f1b7d 100644 --- a/src/cpp/src/synchronized_queue.hpp +++ b/src/cpp/src/synchronized_queue.hpp @@ -17,6 +17,12 @@ class SynchronizedQueue SynchronizedQueue(const SynchronizedQueue&&) = delete; SynchronizedQueue& operator=(const SynchronizedQueue&) = delete; + T back() { + std::unique_lock lock(m_mutex); + m_cv.wait(lock, [this]{return !m_queue.empty();}); + return m_queue.back(); + } + T pull() { std::unique_lock lock(m_mutex); m_cv.wait(lock, [this]{return !m_queue.empty();}); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 8a1a226bc1..47f38788d2 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -606,8 +606,22 @@ PYBIND11_MODULE(py_generate_pipeline, m) { }), py::arg("model_path"), py::arg("tokenizer"), 
py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) - .def("add_request", &ContinuousBatchingPipeline::add_request) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) - .def("generate", &ContinuousBatchingPipeline::generate); + .def( + "generate", + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("input_ids"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ) + .def( + "generate", + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("prompts"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ); } diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index ccd5d1397d..edfadb0988 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -218,3 +218,8 @@ def load_pipe(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) return ov_genai.LLMPipeline(str(temp_path)) + + +@functools.lru_cache(1) +def get_continuous_batching(path): + return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 5a73d481d3..bd1d45d18f 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -1,6 +1,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import math import openvino import openvino_tokenizers import openvino_genai as ov_genai @@ -12,7 +13,8 @@ read_model, load_tok, model_tmp_path, - get_chat_templates + get_chat_templates, + get_continuous_batching, ) @@ -167,3 +169,19 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): print(f'hf reference: {full_history_str_hf}') print(f'ov_genai out: {full_history_str}') assert full_history_str == full_history_str_hf + + +@pytest.mark.parametrize("generation_config", configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): + model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb = get_continuous_batching(path) + stateful.start_chat() + cb.start_chat() + for question in quenstions: + generated = cb.generate(question, **generation_config) + reference = stateful.generate(question, **generation_config) + assert generated == reference + # Test that finish_chat() doesn't fail just in case. 
+ cb.finish_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index e2395cf8d7..a18bc517d9 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,6 +11,7 @@ import sys from pathlib import Path import torch +import math from ov_genai_test_utils import ( get_models_list, read_model, @@ -18,11 +19,11 @@ load_tok, model_tmp_path, STOP_CRITERIA_MAP, + get_continuous_batching, ) def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 @@ -67,7 +68,6 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro assert hf_output == ov_output def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -75,7 +75,7 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str if 'do_sample' not in config: # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = None @@ -705,3 +705,39 @@ def test_left_pad(): models[2].pad_token = models[2].eos_token run_hf_ov_genai_comparison_batched(models, config, prompts) + + +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + config = ov_genai.GenerationConfig() + config.max_new_tokens = 100 + cb = get_continuous_batching(path) + generated = cb.generate(prompt, **generation_config) + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_return_sequences", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. 
+ for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0003) + +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +def test_cb_streamer_vs_return_vs_stateful(model_descr, prompt): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + cb = get_continuous_batching(path) + streamed = [] + generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = stateful.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) + assert "".join(streamed) == reference diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 04795c1b78..880d569cd2 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 04795c1b78c61e3294d1744c78a8ebb5e129256c +Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb From 03590c52c5abe364c0de8963f4bee384b379f5e3 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 24 Jul 2024 18:11:08 +0200 Subject: [PATCH 30/54] return back py::object -> AnyMap (#679) AnyMap can contains another AnyMap inside. Added handling such case. This is needed for NPU --- src/python/py_generate_pipeline.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 47f38788d2..f8888ba258 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -151,6 +151,33 @@ OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfi return res_config; } +ov::Any py_object_to_any(const py::object& py_obj); + +bool py_object_is_any_map(const py::object& py_obj) { + if (!py::isinstance(py_obj)) { + return false; + } + auto dict = py::cast(py_obj); + return std::all_of(dict.begin(), dict.end(), [&](const std::pair& elem) { + return py::isinstance(elem.first); + }); +} + +ov::AnyMap py_object_to_any_map(const py::object& py_obj) { + OPENVINO_ASSERT(py_object_is_any_map(py_obj), "Unsupported attribute type."); + ov::AnyMap return_value = {}; + for (auto& item : py::cast(py_obj)) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + if (py_object_is_any_map(value)) { + return_value[key] = py_object_to_any_map(value); + } else { + return_value[key] = py_object_to_any(value); + } + } + return return_value; +} + ov::Any py_object_to_any(const py::object& py_obj) { // Python types py::object float_32_type = py::module_::import("numpy").attr("float32"); @@ -213,6 +240,8 @@ ov::Any py_object_to_any(const py::object& py_obj) { } // OV types + } else if (py_object_is_any_map(py_obj)) { + return py_object_to_any_map(py_obj); } else if (py::isinstance(py_obj)) { return py::cast(py_obj); } else if (py::isinstance(py_obj)) { From 53945f73dc4a9b0cb6a284c4b5866a7eb1ca13a3 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 24 Jul 2024 20:20:06 +0400 Subject: [PATCH 31/54] Update openvino_tokenizers (#680) --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 880d569cd2..fb0157c30a 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb +Subproject commit 
fb0157c30a8a7f6538471fe622b8b52a3800278a From a769b336e12db039579993e211fbbeca731f6fb8 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 24 Jul 2024 21:21:54 +0400 Subject: [PATCH 32/54] Allow dev and rc tokenizers (#681) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c7f4f9eaf7..7cfa564ef9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "openvino_tokenizers~=2024.3.0.0" + "openvino_tokenizers~=2024.3.0.0.dev" ] [tool.py-build-cmake.module] From e449ffed5e8d23f2bb442da2a9a6faf71caf55f7 Mon Sep 17 00:00:00 2001 From: Yaroslav Tarkan Date: Thu, 25 Jul 2024 14:25:32 +0300 Subject: [PATCH 33/54] Fix chat templates with slices, add tokenizer config for `mistralai/Mistral-7B-Instruct-v0.1` (#648) --- src/cpp/src/tokenizer.cpp | 63 ++++++++++------------- tests/python_tests/ov_genai_test_utils.py | 23 ++++----- tests/python_tests/tokenizer_configs.py | 7 +++ 3 files changed, 45 insertions(+), 48 deletions(-) diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index b1e36033ee..c6039d87bd 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -6,6 +6,7 @@ #include "utils.hpp" #include #include +#include #include "tokenizers_path.hpp" #include "circular_buffer_queue.hpp" #include @@ -368,40 +369,32 @@ class Tokenizer::TokenizerImpl { bool add_generation_prompt, const std::string& chat_template) const { auto chat_tpl = chat_template.empty() ? m_chat_template : chat_template; - // Jinja2Cpp does not support slicing, e.g. [1:]. - // In templates slicing is used typically in the header to find system prompt. - // If header containts that typical expression we update template and - // extract system message manually from ChatHistory. - std::string header_with_slice = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}"; - std::string replacement_string = "{% if false %}{% set placeholder = false %}"; - - std::string system_message = ""; - size_t pos = chat_tpl.find(header_with_slice); - if (pos != std::string::npos) { - chat_tpl.replace(pos, header_with_slice.length(), replacement_string); - - if (!history.empty() && history[0].at("role") == "system") { - system_message = history[0].at("content"); - history.erase(history.begin()); - } + + // Jinja2Cpp does not support Python-style slicing, e.g. [1:]. + // If chat template contains such slicing, we replace it with custom function `slice()` (user-defined callable) + // that is defined below and does the same list slicing logic. + std::string slice_string = "messages[1:]"; + std::string replacement_slice_string = "slice(messages, 1)"; + size_t slice_pos = chat_tpl.find(slice_string); + if (slice_pos != std::string::npos) { + chat_tpl.replace(slice_pos, slice_string.length(), replacement_slice_string); } - - // Jinja2Cpp accepts system_message only as a string and incorrectly handles it as a bool. - // Both this patters are found frequently in chat templates, replace so that jinja2cpp - // will not stumble on them. 
- std::pair replace_str_map[] = { - {"{% set system_message = false %}", ""}, - {"system_message != false", "true"}, - }; - if (!system_message.empty()) { - for (const auto& [from, to] : replace_str_map) { - size_t pos = 0; - while ((pos = chat_tpl.find(from, pos)) != std::string::npos) { - chat_tpl.replace(pos, from.size(), to); - pos += to.size(); + jinja2::UserCallable slice_callable = jinja2::MakeCallable( + [](const jinja2::ValuesList& list, const int64_t start) { + if (list.empty()) + return jinja2::Value(); + jinja2::ValuesList result; + int64_t stop = list.size(); + int64_t step = 1; + for (int64_t i = start; i < stop && i < list.size(); i += step) + { + result.push_back(list.at(i)); } - } - } + + return jinja2::Value(result); + }, + jinja2::ArgInfo{"list"}, jinja2::ArgInfo{"start"} + ); jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; @@ -421,13 +414,13 @@ class Tokenizer::TokenizerImpl { {"bos_token", m_bos_token}, {"eos_token", m_eos_token}, {"pad_token", m_pad_token}, - {"system_message", system_message.empty() ? jinja2::EmptyValue() : jinja2::Value{system_message}}, {"add_generation_prompt", add_generation_prompt}, + {"slice", slice_callable}, }; - + try { return tpl.RenderAsString(params).value(); - } catch (const std::bad_alloc& error) { + } catch (const std::exception& error) { OPENVINO_THROW("Chat template for the current model is not supported by Jinja2Cpp. " "Please apply template manually to your prompt before calling generate. " "For exmaple: user{user_prompt}model"); diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index edfadb0988..ad5b7254cd 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -99,11 +99,11 @@ def get_chat_templates(): # TODO: Need to support chat templates in more models: CVS-145963 # Either ov_genai is unable to parse chat_template or results do not match with HF. 
"meta-llama/Meta-Llama-3-8B-Instruct", - "databricks/dbrx-instruct", + "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp "mosaicml/mpt-30b-chat", - "deepseek-ai/deepseek-coder-6.7b-instruct", - "maldv/winter-garden-7b-alpha", - "ishorn5/RTLCoder-Deepseek-v1.1", + "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp + "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp + "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp "openchat/openchat-3.5-0106", "casperhansen/llama-3-70b-instruct-awq", "TheBloke/deepseek-coder-33B-instruct-GPTQ", @@ -111,26 +111,23 @@ def get_chat_templates(): "google/gemma-7b-it", "THUDM/cogvlm2-llama3-chat-19B", "KnutJaegersberg/internlm-20b-llama", - "alpindale/WizardLM-2-8x22B", "maywell/Synatra-Mixtral-8x7B", "MediaTek-Research/Breeze-7B-Instruct-v1_0", "bofenghuang/vigostral-7b-chat", - "meetkai/functionary-small-v2.5", - "nvidia/Llama3-ChatQA-1.5-8B", + "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp "openchat/openchat-3.6-8b-20240522", "tenyx/TenyxChat-7B-v1", "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", "yam-peleg/Hebrew-Gemma-11B-V2", - "shenzhi-wang/Llama3-8B-Chinese-Chat", + "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError "nlpai-lab/KULLM3", "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", - "MediaTek-Research/Breeze-7B-Instruct-v0_1", - "shanchen/llama3-8B-slerp-biomed-chat-chinese", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError "MLP-KTLim/llama-3-Korean-Bllossom-8B", - "lucyknada/microsoft_WizardLM-2-7B", - "aloobun/CosmicBun-8B", + "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp "codellama/CodeLlama-70b-Instruct-hf", - "gorilla-llm/gorilla-openfunctions-v2", + "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp "BramVanroy/Llama-2-13b-chat-dutch" } from tokenizer_configs import get_tokenizer_configs diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py index eb83f50836..4e8197ff5f 100644 --- a/tests/python_tests/tokenizer_configs.py +++ b/tests/python_tests/tokenizer_configs.py @@ -980,5 +980,12 @@ def get_tokenizer_configs(): "pad_token": None, "unk_token": "", "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "mistralai/Mistral-7B-Instruct-v0.1": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" } } \ No newline at end of file From 406393f93063f2a82bc16d106ffb6df8893511d1 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 26 Jul 2024 08:51:32 +0200 Subject: [PATCH 34/54] Prefix caching. (#675) Port of https://github.com/openvinotoolkit/openvino.genai/pull/639 --- .../openvino/genai/scheduler_config.hpp | 8 + src/cpp/src/block_manager.hpp | 259 +++++++++++++++++- src/cpp/src/scheduler.hpp | 28 +- src/cpp/src/sequence_group.hpp | 21 ++ src/python/py_generate_pipeline.cpp | 4 +- tests/cpp/CMakeLists.txt | 5 +- tests/cpp/block_manager.cpp | 32 ++- tests/cpp/evictor.cpp | 54 ++++ tests/cpp/scheduler.cpp | 66 +++++ 9 files changed, 442 insertions(+), 35 deletions(-) create mode 100644 tests/cpp/evictor.cpp diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index 9d808fd424..aca823fa63 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -30,5 +30,13 @@ struct SchedulerConfig { // max number of scheduled sequences (you can think of it as "max batch size") std::size_t max_num_seqs = 256; + + // Enable caching of KV-blocks. + // When turned on all previously calculated KV-caches are kept in memory for future usages. + // KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. + // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + // When turend off only KV-cache required for batch calculation is kept in memory and + // when a sequence has finished genegartion its cache is released. 
+ bool enable_prefix_caching = false; }; } diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index ab60b7f5ff..8c9c3ed512 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "sequence_group.hpp" @@ -13,13 +14,17 @@ namespace ov::genai { class KVCacheBlock { int m_ref_count; int m_index; + size_t m_hash; + size_t m_num_hashed_tokens; + std::chrono::time_point m_timestamp; public: using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; explicit KVCacheBlock(int index) : m_ref_count(0), - m_index(index) { } + m_index(index), + m_timestamp(std::chrono::system_clock::now()) { } int get_index() const { return m_index; @@ -34,6 +39,7 @@ class KVCacheBlock { } void release() { + OPENVINO_ASSERT(m_ref_count > 0); --m_ref_count; } @@ -44,15 +50,79 @@ class KVCacheBlock { int get_references_count() const { return m_ref_count; } + + size_t get_hash() const { + return m_hash; + } + + size_t get_num_hashed_tokens() const { + return m_num_hashed_tokens; + } + + void set_hash(size_t hash, size_t num_hashed_tokens) { + m_hash = hash; + m_num_hashed_tokens = num_hashed_tokens; + } + + void set_timestamp(const std::chrono::time_point& timestamp) { + m_timestamp = timestamp; + } + + std::chrono::time_point get_timestamp() { + return m_timestamp; + } +}; + + +class Evictor { + std::map blocks; +public: + void add(size_t hash, KVCacheBlock::Ptr block) { + blocks[hash] = block; + } + + static bool block_is_less(const std::pair& lhs, const std::pair& rhs) { + return lhs.second->get_timestamp() < rhs.second->get_timestamp(); + } + + KVCacheBlock::Ptr get_block(size_t hash) { + if (blocks.find(hash)== blocks.end()) + { + return nullptr; + } + KVCacheBlock::Ptr block = blocks[hash]; + block->set_timestamp(std::chrono::system_clock::now()); + block->increment(); + blocks.erase(hash); + return block; + } + + KVCacheBlock::Ptr get_lru_block() { + if (!blocks.size()) { + return nullptr; + } + auto hash_block = std::min_element(std::begin(blocks), std::end(blocks), block_is_less); + auto block = hash_block->second; + block->set_timestamp(std::chrono::system_clock::now()); + block->increment(); + blocks.erase(hash_block->first); + return block; + } + + size_t num_blocks() const { + return blocks.size(); + } }; class BlockAllocator { std::list m_free_blocks; + ov::genai::Evictor m_evictor; int m_total_num_blocks; + bool m_enable_prefix_caching; public: - BlockAllocator(int num_blocks) : - m_total_num_blocks(num_blocks) { + BlockAllocator(int num_blocks, bool enable_prefix_caching) : + m_total_num_blocks(num_blocks), m_enable_prefix_caching(enable_prefix_caching) { for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { m_free_blocks.push_back(std::make_shared(block_id)); } @@ -64,21 +134,28 @@ class BlockAllocator { } size_t num_free_blocks() const { - return m_free_blocks.size(); + return m_free_blocks.size() + m_evictor.num_blocks(); } bool can_allocate_blocks(size_t num_blocks) const { - return num_blocks <= m_free_blocks.size(); + return num_blocks <= num_free_blocks(); } void free(KVCacheBlock::Ptr block) { block->release(); if (block->is_free()) { - m_free_blocks.push_back(block); + if (m_enable_prefix_caching) + { + m_evictor.add(block->get_hash(), block); + } + else { + m_free_blocks.push_back(block); + } } } KVCacheBlock::Ptr allocate_block() { + OPENVINO_ASSERT(!m_enable_prefix_caching); OPENVINO_ASSERT(can_allocate_blocks(1)); KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); 
allocated_block->increment(); @@ -86,20 +163,84 @@ class BlockAllocator { return allocated_block; } + KVCacheBlock::Ptr allocate_block(size_t hash, size_t num_hashed_tokens, std::map& cached_blocks) { + OPENVINO_ASSERT(m_enable_prefix_caching); + OPENVINO_ASSERT(can_allocate_blocks(1)); + auto block = m_evictor.get_block(hash); + if (block != nullptr) { + // use cached block from evictor + cached_blocks[hash] = block; + return block; + } + // TODO: Currently we cache all allocated blocks which might be redundant for beam search, + // where blocks of non-used candidates are not needed in cache. + // This part can be improved if we cache only blocks for prompt. + if (cached_blocks.find(hash) != cached_blocks.end()) { + // use cashed block from cached_blocks + block = cached_blocks[hash]; + cached_blocks[hash]->increment(); + return block; + } + if (m_free_blocks.size() > 0) { + // allocate new empty block + KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); + allocated_block->increment(); + allocated_block->set_hash(hash, num_hashed_tokens); + cached_blocks[hash] = allocated_block; + + m_free_blocks.pop_front(); + return allocated_block; + } + if (m_evictor.num_blocks() > 0) { + // get least resently used block from evictor and reuse it + KVCacheBlock::Ptr block = m_evictor.get_lru_block(); + cached_blocks.erase(block->get_hash()); + + // update block with new hash + block->set_hash(hash, num_hashed_tokens); + cached_blocks[hash] = block; + return block; + } + // out of memory + return nullptr; + } + + KVCacheBlock::Ptr get_cached_block(size_t hash, std::map& cached_blocks) { + auto block = m_evictor.get_block(hash); + if (block != nullptr) { + // use cashed block from evictor + cached_blocks[hash] = block; + return block; + } + if (cached_blocks.find(hash) != cached_blocks.end()) { + // use cashed block from cached_blocks + // TODO: add tokens validation in case of hash collision + block = cached_blocks[hash]; + cached_blocks[hash]->increment(); + return block; + } + return nullptr; + } + + float get_used_percentage() const { - return static_cast(m_total_num_blocks - m_free_blocks.size()) / m_total_num_blocks; + return static_cast(m_total_num_blocks - num_free_blocks()) / m_total_num_blocks; } }; class BlockManager { BlockAllocator m_allocator; + bool m_enable_prefix_caching; + size_t m_block_size; + // TODO: caching time can probably be improved if we use the prefix tree + std::map cached_blocks; // stores blocks for each sequence (not sequence group) // the same block can be seen in multiple block_tables for different sequences std::map> m_block_table; public: - BlockManager(int num_blocks) - : m_allocator(num_blocks) { } + BlockManager(int num_blocks, bool enable_prefix_caching, size_t block_size) + : m_allocator(num_blocks, enable_prefix_caching), m_enable_prefix_caching(enable_prefix_caching), m_block_size(block_size) { } ~BlockManager() { // sanity check that all sequences are freed @@ -195,11 +336,32 @@ class BlockManager { return m_allocator.can_allocate_blocks(num_blocks); } - void allocate(uint64_t sequence_id, size_t num_blocks) { + void allocate(ov::genai::Sequence::CPtr sequence, size_t num_blocks, const ov::genai::TokenIds& prompt_ids = {}) { OPENVINO_ASSERT(num_blocks > 0 && can_allocate_blocks(num_blocks)); + if (m_enable_prefix_caching) { + OPENVINO_ASSERT(prompt_ids.size() > 0, "prompt_ids should be set for hash calculation."); + } + auto sequence_id = sequence->get_id(); + auto block_table = m_block_table[sequence_id]; + auto content_length = 
sequence->get_generated_len() + prompt_ids.size(); + size_t num_hashed_tokens = block_table.size() * m_block_size; for (size_t i = 0; i < num_blocks; ++i) { - m_block_table[sequence_id].push_back(m_allocator.allocate_block()); + + ov::genai::KVCacheBlock::Ptr block = nullptr; + if (m_enable_prefix_caching) { + num_hashed_tokens += m_block_size; + if (num_hashed_tokens > content_length) { + num_hashed_tokens = content_length; + } + auto hash = sequence->get_hash(num_hashed_tokens, prompt_ids); + block = m_allocator.allocate_block(hash, num_hashed_tokens, cached_blocks); + } + else { + block = m_allocator.allocate_block(); + } + OPENVINO_ASSERT(block != nullptr); + m_block_table[sequence_id].push_back(block); } } @@ -324,21 +486,36 @@ class BlockManager { if (num_logical_blocks > num_physical_blocks) { OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); - allocate(seq_id, num_logical_blocks - num_physical_blocks); + allocate(sequence, num_logical_blocks - num_physical_blocks, seq_group->get_prompt_ids()); } else { OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); KVCacheBlock::Ptr last_block = block_table.back(); - if (last_block->copy_on_write()) { // we need to fork current block, because reference counter is more than 1 - KVCacheBlock::Ptr new_block = m_allocator.allocate_block(); + KVCacheBlock::Ptr new_block = nullptr; + if (m_enable_prefix_caching) { + auto hash = sequence->get_hash(seq_group->get_context_len(), seq_group->get_prompt_ids()); + new_block = m_allocator.allocate_block(hash, seq_group->get_context_len(), cached_blocks); + cached_blocks[hash] = new_block; + } + else { + new_block = m_allocator.allocate_block(); + } block_table[num_physical_blocks - 1] = new_block; // write information about block forking for later usage in CacheManager copy_blocks_map[last_block->get_index()].push_back(new_block->get_index()); // release `last_block` usage m_allocator.free(last_block); } else { - // nothing to do, because we are the only users of this block + // we are the only users of this block + if (m_enable_prefix_caching) { + // update hash of block + auto prev_hash = last_block->get_hash(); + auto hash = sequence->get_hash(seq_group->get_context_len(), seq_group->get_prompt_ids()); + last_block->set_hash(hash, seq_group->get_context_len()); + cached_blocks.erase(prev_hash); + cached_blocks[hash] = last_block; + } } } } @@ -346,5 +523,57 @@ class BlockManager { // it returns information which blocks should be forked by CacheManager return copy_blocks_map; } + + + void _restore_cached_blocks(SequenceGroup::Ptr group, size_t block_size) { + auto prompt_ids = group->get_prompt_ids(); + auto sequences = group->get_not_finished_sequences(); + OPENVINO_ASSERT(sequences.size() == 1); + auto sequence = sequences[0]; + auto seq_id = sequence->get_id(); + auto& block_table = m_block_table[seq_id]; + + size_t content_len = 0; + while (content_len < prompt_ids.size()) { + size_t prev_iteration_content_len = content_len; + content_len += block_size; + if (content_len > prompt_ids.size()) { + content_len = prompt_ids.size(); + } + // restore fully filled blocks + auto hash = sequence->get_hash(content_len, prompt_ids); + auto block = m_allocator.get_cached_block(hash, cached_blocks); + if (block != nullptr) { + block->set_timestamp(std::chrono::system_clock::now()); + m_block_table[seq_id].push_back(block); + group->update_processed_tokens_num(content_len); + } + else { + // restore 
partially filled block + for (size_t i = 1; i < block_size; i++) { + if (prev_iteration_content_len + i > prompt_ids.size()) { + break; + } + auto hash = sequence->get_hash(prev_iteration_content_len + i, prompt_ids); + auto block = m_allocator.get_cached_block(hash, cached_blocks); + if (block != nullptr) { + block->set_timestamp(std::chrono::system_clock::now()); + m_block_table[seq_id].push_back(block); + group->update_processed_tokens_num(prev_iteration_content_len + i); + + size_t new_tokens_count_in_block = std::min(content_len, prev_iteration_content_len + block_size); + if (new_tokens_count_in_block > prev_iteration_content_len + i) { + cached_blocks.erase(hash); + auto new_hash = sequence->get_hash(new_tokens_count_in_block, prompt_ids); + cached_blocks[new_hash] = block; + } + + break; + } + } + break; + } + } + } }; } diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index ca749137db..cbd6668f90 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -10,7 +10,6 @@ #include "openvino/genai/scheduler_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" -#include "block_manager.hpp" namespace ov::genai { class Scheduler { @@ -34,11 +33,14 @@ class Scheduler { }; explicit Scheduler(const SchedulerConfig & config = {}) : - m_config(config), m_block_manager(m_config.num_kv_blocks) { } + m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, m_config.block_size) { } Output schedule(std::vector& sequence_groups) { Output scheduler_output; + if (m_config.enable_prefix_caching) + _restore_cached_blocks(sequence_groups); + if (m_config.dynamic_split_fuse) { // deepspeed-mii case // generation phase is always scheduled first @@ -167,6 +169,15 @@ class Scheduler { return std::numeric_limits::max(); } + void _restore_cached_blocks(const std::vector& sequence_groups) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + if (sequence_group->can_generate_tokens() || sequence_group->num_running_seqs() != 1) + continue; + m_block_manager._restore_cached_blocks(sequence_group, m_config.block_size); + } + } + void _apply_preemption(size_t sequence_group_id, const std::vector& sequence_groups) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; @@ -222,7 +233,7 @@ class Scheduler { if (num_scheduled_tokens > 0) { // allocate KV blocks if required if (num_scheduled_blocks > 0) - m_block_manager.allocate(seq_id, num_scheduled_blocks); + m_block_manager.allocate(sequence, num_scheduled_blocks, sequence_group->get_prompt_ids()); // and schedule tokens sequence_group->schedule_tokens(num_scheduled_tokens); @@ -326,7 +337,8 @@ class Scheduler { // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); // here we also assume that sequence must be scheduler in a single shot and has no already generated context - OPENVINO_ASSERT(sequence_group->get_context_len() == 0); + if (!m_config.enable_prefix_caching) + OPENVINO_ASSERT(sequence_group->get_context_len() == 0); size_t num_available_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; size_t sequence_len = sequence_group->get_num_available_tokens_for_batching(); @@ -354,11 +366,15 @@ class Scheduler { Sequence::Ptr sequence = (*sequence_group)[0]; uint64_t seq_id = sequence->get_id(); - // allocate KV blocks - 
m_block_manager.allocate(seq_id, num_required_blocks); // and schedule tokens sequence_group->schedule_tokens(sequence_len); + // allocate KV blocks + if (sequence_group->get_num_processed_tokens() == 0) + m_block_manager.allocate(sequence, num_required_blocks, sequence_group->get_prompt_ids()); + else + m_block_manager.append_slots(sequence_group); + // add information to scheduler_output { scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id); diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 3df1820cfb..008a36282e 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "openvino/genai/generation_handle.hpp" #include "openvino/genai/generation_config.hpp" @@ -121,6 +122,21 @@ class Sequence { float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty); return score; } + + // Each KV block can be uniquely identified by + // the tokens within the block and the tokens in the prefix before the block. + // hash(prefix tokens + block tokens) <--> KV Block + size_t get_hash(size_t content_length, const ov::genai::TokenIds& prompt_ids) const { + std::vector content; + OPENVINO_ASSERT(content_length <= prompt_ids.size() + m_generated_ids.size()); + content.insert( content.end(), prompt_ids.begin(), prompt_ids.begin() + std::min(prompt_ids.size(), content_length)); + if (content_length > prompt_ids.size()) { + content.insert(content.end(), m_generated_ids.begin(), m_generated_ids.begin() + content_length - prompt_ids.size()); + } + const char* data = reinterpret_cast(content.data()); + std::size_t size = content.size() * sizeof(content[0]); + return std::hash{}(std::string_view(data, size)); + } }; // contains a list of Sequences in generic case (beam search or parallel sampling) @@ -345,6 +361,11 @@ class SequenceGroup { clear_scheduled_tokens(); } + void update_processed_tokens_num(size_t processed_tokens) { + m_num_processed_tokens = processed_tokens; + m_max_content_len = processed_tokens; + } + void clear_waiting_sequences() { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_waiting()) { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index f8888ba258..6175001c29 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -618,11 +618,11 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) - .def_readwrite("cache_size", &SchedulerConfig::cache_size) .def_readwrite("block_size", &SchedulerConfig::block_size) .def_readwrite("cache_size", &SchedulerConfig::cache_size) .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) - .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); + .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) + .def_readwrite("enable_prefix_caching", &SchedulerConfig::enable_prefix_caching); py::class_(m, "ContinuousBatchingPipeline") .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 025a58a507..083b911416 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ 
-4,6 +4,9 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(googletest) set(TEST_TARGET_NAME "tests_continuous_batching") -add_executable(${TEST_TARGET_NAME} scheduler.cpp block_manager.cpp logit_filtering.cpp cache_manager.cpp generate_config.cpp) +file(GLOB tests_src + "*.cpp" +) +add_executable(${TEST_TARGET_NAME} ${tests_src}) target_link_libraries(${TEST_TARGET_NAME} PUBLIC openvino::genai gtest_main) target_include_directories(${TEST_TARGET_NAME} PRIVATE "${PROJECT_SOURCE_DIR}/src/cpp/src") diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp index b3c89535a6..5a76a7a0ce 100644 --- a/tests/cpp/block_manager.cpp +++ b/tests/cpp/block_manager.cpp @@ -10,30 +10,40 @@ #include "scheduler.hpp" TEST(TestBlockManager, general_test) { - ov::genai::BlockManager bm = ov::genai::BlockManager(6); + ov::genai::BlockManager bm = ov::genai::BlockManager(6, false, 4); + ov::genai::TokenIds prompt_ids; + + ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared( + 0, + ov::Tensor(ov::element::i64, { + prompt_ids.size()}, prompt_ids.data()), + ov::genai::beam_search(), + 4); + auto sequence = sequence_group->get_not_finished_sequences()[0]; + bm.allocate(sequence, 6); + auto seq_id = sequence->get_id(); + EXPECT_TRUE(bm.has_block_table(seq_id)); + EXPECT_EQ(bm.get_block_table(seq_id).size(), 6); - bm.allocate(0, 6); - EXPECT_TRUE(bm.has_block_table(0)); - EXPECT_EQ(bm.get_block_table(0).size(), 6); EXPECT_EQ(bm.num_free_blocks(), 0); - bm.free_sequence_partially_single_runnning_sequence(0, 4); - EXPECT_EQ(bm.get_block_table(0).size(), 2); + bm.free_sequence_partially_single_runnning_sequence(seq_id, 4); + EXPECT_EQ(bm.get_block_table(seq_id).size(), 2); EXPECT_EQ(bm.num_free_blocks(), 4); - bm.free_sequence(0); - EXPECT_FALSE(bm.has_block_table(0)); + bm.free_sequence(seq_id); + EXPECT_FALSE(bm.has_block_table(seq_id)); EXPECT_EQ(bm.num_free_blocks(), 6); - bm.allocate(0, 2); - bm.fork_sequence(0, 1); + bm.allocate(sequence, 2); + bm.fork_sequence(seq_id, 1); EXPECT_TRUE(bm.has_block_table(1)); EXPECT_EQ(bm.get_block_table(1).back()->get_references_count(), 2); } TEST(TestBlockManager, required_blocks_count) { - ov::genai::BlockManager bm = ov::genai::BlockManager(8); + ov::genai::BlockManager bm = ov::genai::BlockManager(8, false, 4); std::vector tokens = {0,1,2,3,4}; ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared( diff --git a/tests/cpp/evictor.cpp b/tests/cpp/evictor.cpp new file mode 100644 index 0000000000..9867dfa2b5 --- /dev/null +++ b/tests/cpp/evictor.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include +#include + +TEST(TestEvictor, general_test) { + ov::genai::Evictor evictor; + auto block0 = std::make_shared(0); + block0->set_hash(77, 1); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block1 = std::make_shared(1); + block1->set_hash(56, 2); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block2 = std::make_shared(2); + block2->set_hash(23, 3); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + evictor.add(block0->get_hash(), block0); + evictor.add(block1->get_hash(), block1); + evictor.add(block2->get_hash(), block2); + EXPECT_EQ(evictor.num_blocks(), 3); + + auto block = evictor.get_block(56); + EXPECT_EQ(block->get_index(), 1); + 
EXPECT_EQ(block->get_hash(), 56); + EXPECT_EQ(block->get_references_count(), 1); + EXPECT_EQ(evictor.num_blocks(), 2); + + EXPECT_EQ(evictor.get_block(44), nullptr); + EXPECT_EQ(evictor.num_blocks(), 2); + + EXPECT_EQ(evictor.get_lru_block()->get_index(), 0); + EXPECT_EQ(evictor.num_blocks(), 1); + + auto block3 = std::make_shared(7); + block3->set_hash(12, 4); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block4 = std::make_shared(10); + block4->set_hash(99, 5); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + evictor.add(block3->get_hash(), block3); + evictor.add(block4->get_hash(), block4); + block2->set_timestamp(std::chrono::system_clock::now()); + + EXPECT_EQ(evictor.get_lru_block()->get_index(), 7); + EXPECT_EQ(evictor.get_lru_block()->get_index(), 10); + EXPECT_EQ(evictor.get_lru_block()->get_index(), 2); + EXPECT_EQ(evictor.get_lru_block(), nullptr); + EXPECT_EQ(evictor.num_blocks(), 0); +} diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index b4114dd1b2..82b104223c 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -366,3 +366,69 @@ TEST(TestScheduler, test_partially_preempted_prompt) { EXPECT_FALSE(scheduler.has_block_table(idx0)); } } + +TEST(TestScheduler, prefix_caching_test) { + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 100; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(0).enable_prefix_caching = true; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 100; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; + configs.at(1).enable_prefix_caching = true; + for (auto scheduler_config: configs) { + std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; + std::vector histrory_tokens = {}; + // schedule prompt + Scheduler scheduler = Scheduler(scheduler_config); + + size_t chat_iterations = 10; + + for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) { + std::vector tokens = histrory_tokens; + tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + SequenceGroup::Ptr sequence_group = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size); + std::vector requests = {sequence_group}; + + auto out1 = scheduler.schedule(requests); + if (chat_iteration == 0) + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size()); + else + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() + 1); + for (auto seq: requests) { + std::vector running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(23, 0.7); + seq->finish_iteration(); + } + + // schedule generate + size_t num_generate_tokens = 10; + for (size_t i = 0; i < num_generate_tokens; i++) { + auto out2 = scheduler.schedule(requests); + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + for (auto seq: requests) { + std::vector running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(16, 0.9); + seq->finish_iteration(); + } + } + + // finish sequence + auto sequence = requests[0]->get_running_sequences()[0]; + sequence->set_status(SequenceStatus::FINISHED); + auto idx0 = sequence->get_id(); + scheduler.free_sequence(idx0); + auto generated_ids = 
sequence->get_generated_ids(); + + histrory_tokens.insert(histrory_tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + histrory_tokens.insert(histrory_tokens.end(), generated_ids.begin(), generated_ids.end()); + } + } + +} \ No newline at end of file From be2fdafb273319084999fe944d02e5653d030de7 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 10:12:55 +0200 Subject: [PATCH 35/54] resolve conflicts --- src/cpp/src/llm_pipeline.cpp | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 40d4377b00..8505daf3b2 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -510,7 +510,10 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { + auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique(request, tokenizer, generation_config); + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( @@ -518,27 +521,35 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config -): m_pimpl{[&]() -> std::unique_ptr { +){ + auto start_time = std::chrono::steady_clock::now(); if ("CB" == device) { - return std::make_unique(model_path, tokenizer, "CPU", plugin_config); - } if ("NPU" == device) { - return std::make_unique(model_path, tokenizer, device, plugin_config); + m_pimpl = std::make_unique(model_path, tokenizer, "CPU", plugin_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + } else { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } - return std::make_unique(model_path, tokenizer, device, plugin_config); -}()} {} + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); +} ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -): m_pimpl{[&]() -> std::unique_ptr { +){ + auto start_time = std::chrono::steady_clock::now(); if ("CB" == device) { - return std::make_unique(path, "CPU", config); - } if ("NPU" == device) { - return std::make_unique(path, device, config); + m_pimpl = std::make_unique(path, "CPU", config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(path, device, config); + } else { + m_pimpl = std::make_unique(path, device, config); } - return std::make_unique(path, device, config); -}()} {} + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); +} ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; From b00bcd8f411e65c7a5d455fec502fcf2639fa022 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 14:05:36 +0200 Subject: [PATCH 36/54] apply comments --- samples/cpp/benchmark_genai/CMakeLists.txt | 1 - .../cpp/benchmark_genai/benchmark_genai.cpp | 9 +- .../python/benchmark_genai/benchmark_genai.py | 2 +- .../benchmark_genai_automatic.py | 62 -------------- src/README.md | 82 ++++++++++++++----- src/cpp/src/greedy_decoding.cpp | 1 - src/cpp/src/perf_metrics.cpp | 2 - 7 files changed, 67 insertions(+), 92 deletions(-) delete mode 
100755 samples/python/benchmark_genai/benchmark_genai_automatic.py diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt index bfa1592f61..5443439de5 100644 --- a/samples/cpp/benchmark_genai/CMakeLists.txt +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -18,7 +18,6 @@ set_target_properties(benchmark_genai PROPERTIES COMPILE_PDB_NAME benchmark_genai # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -# target_compile_features(benchmark_genai PRIVATE cxx_std_11) install(TARGETS benchmark_genai RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 24b9491219..2fd5eafc69 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -50,14 +50,15 @@ int main(int argc, char* argv[]) try { res = pipe.generate(prompt, config); metrics = metrics + res.perf_metrics; } - + + std::cout << std::fixed << std::setprecision(2); std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; std::cout << "Generate time: " << metrics.generate_duration.mean << " ± " << metrics.generate_duration.std << " ms" << std::endl; std::cout << "Tokenization time: " << metrics.tokenization_duration.mean << " ± " << metrics.tokenization_duration.std << " ms" << std::endl; std::cout << "Detokenization time: " << metrics.detokenization_duration.mean << " ± " << metrics.detokenization_duration.std << " ms" << std::endl; - std::cout << "ttft: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; - std::cout << "tpot: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms " << std::endl; - std::cout << "Tokens/s: " << metrics.throughput.mean << " ± " << metrics.throughput.std << std::endl; + std::cout << "TTFT: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.throughput.mean << " ± " << metrics.throughput.std << " tokens/s" << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index c29c508bf4..ef468053d8 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -43,7 +43,7 @@ def main(): print(f"Detokenization time: {perf_metrics.detokenization_duration.mean:.2f} ± {perf_metrics.detokenization_duration.std:.2f} ms") print(f"TTFT: {perf_metrics.ttft.mean:.2f} ± {perf_metrics.ttft.std:.2f} ms") print(f"TPOT: {perf_metrics.tpot.mean:.2f} ± {perf_metrics.tpot.std:.2f} ms") - print(f"Throughput tokens/s: {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f}") + print(f"Throughput : {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f} tokens/s") if __name__ == "__main__": main() diff --git a/samples/python/benchmark_genai/benchmark_genai_automatic.py b/samples/python/benchmark_genai/benchmark_genai_automatic.py deleted file mode 100755 index 98a00a8c99..0000000000 --- a/samples/python/benchmark_genai/benchmark_genai_automatic.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import openvino_genai as ov_genai -import pdb - -def main(): - parser = 
argparse.ArgumentParser(description="Help command") - parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") - parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") - parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") - parser.add_argument("-n", "--num_iter", type=int, default=5, help="Number of iterations") - parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") - parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") - - args = parser.parse_args() - - # Perf metrics is stored in DecodedResults. - # In order to get DecodedResults instead of a string input should be a list. - - model_path = args.model - device = args.device - num_warmup = args.num_warmup - num_iter = args.num_iter - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 20 - # config.num_beam_groups = 3 - # config.num_beams = 15 - - pipe = ov_genai.LLMPipeline(model_path, device) - - import pandas as pd - metrics_df = pd.DataFrame(columns=['batch_size', 'throughput', 'ttft', 'tpot', 'std_throughput', 'std_ttft', 'std_tpot']) - - batch_sizes = [1, 2, 4, 16, 32, 64, 256] - for batch_size in batch_sizes: - prompt = [args.prompt] * batch_size - for _ in range(num_warmup): - pipe.generate(prompt, config) - - res = pipe.generate(prompt, config) - metrics = res.metrics - for _ in range(num_iter - 1): - res = pipe.generate(prompt, config) - metrics += res.metrics - # pdb.set_trace() - metrics_df = metrics_df._append({ - 'batch_size': batch_size, - 'throughput': metrics.mean_throughput, - 'ttft': metrics.mean_ttft, - 'tpot': metrics.mean_tpot, - 'std_throughput': metrics.std_throughput, - 'std_ttft': metrics.std_ttft, - 'std_tpot': metrics.std_tpot, - }, ignore_index=True) - - metrics_df.to_csv('metrics.csv', index=False) - -if __name__ == "__main__": - main() diff --git a/src/README.md b/src/README.md index 3a53e175dd..aa4dc0f301 100644 --- a/src/README.md +++ b/src/README.md @@ -198,29 +198,49 @@ int main(int argc, char* argv[]) { ### Performance Metrics -`ov.genai.PerfMetrics` (referred to as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` hold fields with mean and standard deviations for the following metrics: -- `ttft` -- `tpot` -- `load_time` -- `generate_duration` -- `tokenization_duration` -- `detokenization_duration` -- `throughput` +`openvino_genai.PerfMetrics` (referred as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` holds fields with mean and standard deviations for the following metrics: +- Time To the First Token (TTFT), ms +- Time per Output Token (TPOT), ms/token +- Generate total duration, ms +- Tokenization duration, ms +- Detokenization duration, ms +- Throughput, tokens/s and: -- `num_generated_tokens` -- `num_input_tokens` +- Load time, ms +- Number of generated tokens +- Number of tokens in the input prompt -Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. Additionally to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `ov.genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. 
These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. +Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. Additionally to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `openvino_genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. ```python import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -res = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) -perf_metrics = res.perf_metrics -print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') -print(f'ttft: {perf_metrics.mean_ttft:.2f}') -print(f'tpot: {perf_metrics.mean_tpot:.2f}') +result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +perf_metrics = result.perf_metrics + +print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') +print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') +print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') +``` + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto perf_metrics = result.perf_metrics; + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; + std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; + std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; +} ``` output: ```sh @@ -229,9 +249,28 @@ mean_ttft: 42.58 mean_tpot 3.80 ``` ->**Note**: If the input prompt is just a string, the generate function will return only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. +>**Note**: If the input prompt is just a string, the generate function returns only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. -Several `perf_metrics` can be added with each other. In that case `raw_metrics` will be concatenated and mean/std values will be recalculated. This enhances benchmarking and accumulating statistics from several calls. +Several `perf_metrics` can be added to each other. In that case `raw_metrics` are concatenated and mean/std values are recalculated. 
This accumulates statistics from several `generate()` calls + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + auto result_1 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto result_2 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; + std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; + std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; +} +``` ```python import openvino_genai as ov_genai @@ -240,9 +279,10 @@ res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) perf_metrics = res_1.perf_metrics + res_2.perf_metrics -print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') -print(f'ttft: {perf_metrics.mean_ttft:.2f}') -print(f'tpot: {perf_metrics.mean_tpot:.2f}') +print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') +print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') +print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') ``` ## How It Works diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 8b0cf19c1f..8dc56b4ba8 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "openvino/genai/perf_metrics.hpp" -// #include "perf_counters.hpp" #include "utils.hpp" namespace ov { diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index bc394fae52..92b6315990 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -63,8 +63,6 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token); generate_duration = calc_mean_and_std(raw_metrics.generate_durations); - generate_duration = calc_mean_and_std(raw_metrics.generate_durations); - tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations); detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations); From 60e71881766334a2dfd05e4b17b22e7de740d2d1 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 14:56:13 +0200 Subject: [PATCH 37/54] uset getter and cache evaluate results --- .../cpp/benchmark_genai/benchmark_genai.cpp | 16 +++--- .../python/benchmark_genai/benchmark_genai.py | 14 ++--- src/README.md | 32 ++++++------ .../include/openvino/genai/perf_metrics.hpp | 16 ++++++ src/cpp/src/perf_metrics.cpp | 52 ++++++++++++++++++- src/python/py_generate_pipeline.cpp | 14 ++--- 6 files changed, 104 insertions(+), 40 deletions(-) diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 2fd5eafc69..287d6b379a 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -50,15 +50,15 @@ int main(int argc, char* argv[]) try { res = pipe.generate(prompt, config); metrics = metrics + res.perf_metrics; } - + std::cout << std::fixed << 
std::setprecision(2); - std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; - std::cout << "Generate time: " << metrics.generate_duration.mean << " ± " << metrics.generate_duration.std << " ms" << std::endl; - std::cout << "Tokenization time: " << metrics.tokenization_duration.mean << " ± " << metrics.tokenization_duration.std << " ms" << std::endl; - std::cout << "Detokenization time: " << metrics.detokenization_duration.mean << " ± " << metrics.detokenization_duration.std << " ms" << std::endl; - std::cout << "TTFT: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; - std::cout << "TPOT: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms/token " << std::endl; - std::cout << "Throughput: " << metrics.throughput.mean << " ± " << metrics.throughput.std << " tokens/s" << std::endl; + std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl; + std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl; + std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index ef468053d8..9851483880 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -37,13 +37,13 @@ def main(): res = pipe.generate(prompt, config) perf_metrics += res.perf_metrics - print(f"Load time: {perf_metrics.load_time:.2f} ms") - print(f"Generate time: {perf_metrics.generate_duration.mean:.2f} ± {perf_metrics.generate_duration.std:.2f} ms") - print(f"Tokenization time: {perf_metrics.tokenization_duration.mean:.2f} ± {perf_metrics.tokenization_duration.std:.2f} ms") - print(f"Detokenization time: {perf_metrics.detokenization_duration.mean:.2f} ± {perf_metrics.detokenization_duration.std:.2f} ms") - print(f"TTFT: {perf_metrics.ttft.mean:.2f} ± {perf_metrics.ttft.std:.2f} ms") - print(f"TPOT: {perf_metrics.tpot.mean:.2f} ± {perf_metrics.tpot.std:.2f} ms") - print(f"Throughput : {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f} tokens/s") + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") + print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") + print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") + print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms") + print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms") + print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms") + print(f"Throughput : 
{perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s") if __name__ == "__main__": main() diff --git a/src/README.md b/src/README.md index aa4dc0f301..aefa993d8e 100644 --- a/src/README.md +++ b/src/README.md @@ -219,10 +219,10 @@ pipe = ov_genai.LLMPipeline(model_path, "CPU") result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) perf_metrics = result.perf_metrics -print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') -print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') -print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') -print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') +print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}') +print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') +print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` ```cpp @@ -236,10 +236,10 @@ int main(int argc, char* argv[]) { auto perf_metrics = result.perf_metrics; std::cout << std::fixed << std::setprecision(2); - std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; - std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; - std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; - std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; + std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl; + std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl; + std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl; + std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl; } ``` @@ -265,10 +265,10 @@ int main(int argc, char* argv[]) { auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics std::cout << std::fixed << std::setprecision(2); - std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; - std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; - std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; - std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; + std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl; + std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl; + std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl; + std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl; } ``` @@ -279,10 +279,10 @@ res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) perf_metrics = res_1.perf_metrics + res_2.perf_metrics -print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') -print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') -print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') -print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') +print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}') +print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') +print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` ## How It Works diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 8715761792..ddb9ff581f 100644 ---
a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -57,6 +57,22 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_generated_tokens; size_t num_input_tokens; + + float get_load_time(); // Load time in ms. + float get_num_generated_tokens(); + float get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_throughput(); // Tokens per second. + + MeanStdPair get_generate_duration(); + MeanStdPair get_tokenization_duration(); + MeanStdPair get_detokenization_duration(); + + // Flag indicating if raw metrics were evaluated. + // If false means current mean/std ttft, tpot, etc. are not actual + // and evaluate_statistics() should recalculate them. + bool m_evaluated = false; /** * @brief calculates mean/std values from raw_metrics. diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 92b6315990..2f378ab302 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -32,11 +32,58 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector(duration).count(); } - + void PerfMetrics::evaluate_statistics(std::optional start_time) { + if (m_evaluated){ + return; + } // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. if (start_time.has_value()) { auto start_time_val = *start_time; @@ -68,6 +115,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { // tokens per second throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)}; + m_evaluated = true; } PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { @@ -103,7 +151,7 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens; res.num_input_tokens = num_generated_tokens + right.num_input_tokens; res.load_time = load_time; - res.evaluate_statistics(); + res.m_evaluated = false; return res; } diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index ed687d6f40..9bee185ff7 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -606,13 +606,13 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "PerfMetrics") .def(py::init<>()) - .def_readonly("generate_duration", &PerfMetrics::generate_duration) - .def_readonly("tokenization_duration", &PerfMetrics::tokenization_duration) - .def_readonly("detokenization_duration", &PerfMetrics::detokenization_duration) - .def_readonly("throughput", &PerfMetrics::throughput) - .def_readonly("tpot", &PerfMetrics::tpot) - .def_readonly("ttft", &PerfMetrics::ttft) - .def_readonly("load_time", &PerfMetrics::load_time) + .def("get_generate_duration", &PerfMetrics::get_generate_duration) + .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) + .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) + .def("get_throughput", &PerfMetrics::get_throughput) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_load_time", &PerfMetrics::get_load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); From e553ef5dd78ea6bb11cc32bdfb6fb397cba55a24 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 15:11:41 +0200 Subject: 
[PATCH 38/54] update Readme's --- samples/cpp/benchmark_genai/README.md | 4 ++-- samples/python/benchmark_genai/README.md | 4 ++-- src/README.md | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md index bac16c2f7d..616bb6a36d 100644 --- a/samples/cpp/benchmark_genai/README.md +++ b/samples/cpp/benchmark_genai/README.md @@ -1,6 +1,6 @@ -# Benchmarking Vanilla GenAI +# LLMs benchmarking sample -This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. ## Download and convert the model and tokenizers diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md index fa4fa85576..1ff9ef4305 100644 --- a/samples/python/benchmark_genai/README.md +++ b/samples/python/benchmark_genai/README.md @@ -1,6 +1,6 @@ -# Benchmarking Vanilla GenAI +# LLMs benchmarking sample -This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. ## Download and convert the model and tokenizers diff --git a/src/README.md b/src/README.md index aefa993d8e..e88c2f784f 100644 --- a/src/README.md +++ b/src/README.md @@ -285,6 +285,8 @@ print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/cpp/benchmark_genai/README.md) samples. + ## How It Works For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md).
From 3bfbab55b3d862c9f360a3ba1a58536a328b28fc Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 26 Jul 2024 15:10:11 +0100 Subject: [PATCH 39/54] StaticLLMPipeline dangling models hotfix (#693) --- src/cpp/src/llm_pipeline_static.cpp | 18 +++++++++--------- src/cpp/src/llm_pipeline_static.hpp | 4 ++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 3f50d30ec9..351e10b523 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -144,26 +144,26 @@ StaticLLMPipeline::StaticLLMPipeline( */ ov::Core core; // (1) Read the template model - this will be kvcache model - auto kvcache_model = core.read_model(path / "openvino_model.xml"); + m_kvcache_model = core.read_model(path / "openvino_model.xml"); // (2) Expose KV-cache input and output layers from kvcache model - ov::pass::StatefulToStateless().run_on_model(kvcache_model); + ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); // (3) Clone the model - this will be prefill - auto prefill_model = kvcache_model->clone(); - prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); + m_prefill_model = m_kvcache_model->clone(); + m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); // (4) Reshape both models to static shape m_kvcache_desc = KVCacheDesc { 1024u, 0u }; const uint32_t max_prompt_size = m_kvcache_desc.total_size; const uint32_t max_kvcache_size = m_kvcache_desc.total_size; - reshape_to_static(prefill_model, max_prompt_size, max_kvcache_size); - reshape_to_static(kvcache_model, 1u, max_kvcache_size); + reshape_to_static(m_prefill_model, max_prompt_size, max_kvcache_size); + reshape_to_static(m_kvcache_model, 1u, max_kvcache_size); // (5) Add slices to kvcache model - kvcache_model = add_slices_to_kvcache_inputs(kvcache_model); + m_kvcache_model = add_slices_to_kvcache_inputs(m_kvcache_model); // (6) Compile both model m_prefill_request = core.compile_model( - prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") + m_prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( - kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") + m_kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") ).create_infer_request(); // (7) Initialize tensors prepare_for_new_conversation(); diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 85488e1880..7560b7e336 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -46,6 +46,10 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { uint32_t num_stored_tokens; }; + // FIXME: Ideally, we don't need to keep those + std::shared_ptr m_kvcache_model; + std::shared_ptr m_prefill_model; + KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; From 06c57b70de3093830c7a475ed61ad9a5bbf3cb87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Mon, 29 Jul 2024 15:56:27 +0200 Subject: [PATCH 40/54] Remove Dockerfile (#700) Removing dockerfile from release branch due to process requirements. 
--- Dockerfile | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b73d907b87..0000000000 --- a/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM ubuntu:22.04 - -ARG JOBS -WORKDIR /workspace -RUN apt-get update -y && apt-get install -y python3-pip python3-venv git - -# Install OpenVINO -RUN git clone --branch master https://github.com/openvinotoolkit/openvino.git && \ - cd /workspace/openvino && \ - git submodule update --init -- /workspace/openvino/thirdparty/xbyak /workspace/openvino/thirdparty/pugixml /workspace/openvino/thirdparty/open_model_zoo \ - /workspace/openvino/thirdparty/protobuf /workspace/openvino/thirdparty/snappy /workspace/openvino/thirdparty/telemetry /workspace/openvino/src/plugins/intel_cpu/thirdparty/mlas \ - /workspace/openvino/src/plugins/intel_cpu/thirdparty/onednn /workspace/openvino/src/bindings/python/thirdparty/pybind11 && cd - - -RUN /workspace/openvino/install_build_dependencies.sh -RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt -RUN cmake -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_WHEEL=ON -DENABLE_CPPLINT=OFF -DENABLE_SAMPLES=OFF -DENABLE_INTEL_GPU=OFF \ - -DENABLE_INTEL_NPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF -DENABLE_OV_TF_FRONTEND=ON -DENABLE_OV_ONNX_FRONTEND=OFF \ - -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -S /workspace/openvino -B /workspace/openvino_build -RUN cmake --build /workspace/openvino_build --parallel $JOBS -RUN cmake -P /workspace/openvino_build/cmake_install.cmake -RUN python3 -m pip install /workspace/openvino_build/wheels/openvino-2024* -ENV OpenVINO_DIR=/workspace/openvino_build - -# Download dataset -RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -# Build GenAI library with dependencies -RUN git clone https://github.com/Wovchena/openvino.genai-public.git -b reuse-Tokenizer openvino.genai && \ - cd /workspace/openvino.genai/thirdparty && git submodule update --remote --init && \ - mkdir /workspace/openvino.genai/build && cd /workspace/openvino.genai/build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. 
&& \ - make -j${JOBS} - -# Install test dependencies -RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/continuous_batching/requirements.txt -ENV PYTHONPATH=/workspace/openvino.genai/build/ -ENV LD_LIBRARY_PATH=/workspace/openvino.genai/build/ From e2864696954f8b9e73cee7704dc231e7fed07b10 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Tue, 30 Jul 2024 12:26:04 +0100 Subject: [PATCH 41/54] StaticLLMPipeline - align u4 zero points (#705) --- src/cpp/src/llm_pipeline_static.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 351e10b523..c4ff0a90ab 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -12,6 +12,23 @@ namespace { +void align_u4_zp_constants(const std::shared_ptr& model) { + for (auto op : model->get_ops()) { + if (ov::op::util::is_constant(op)) { + auto cst_op = std::dynamic_pointer_cast(op); + const auto cst_op_out = cst_op->output(0); + if (cst_op_out.get_element_type() == ov::element::u4 && ov::shape_size(cst_op_out.get_shape()) == 1u) { + ov::Tensor cst_tensor(ov::element::u4, cst_op_out.get_shape()); + *static_cast(cst_tensor.data()) = cst_op->get_vector()[0] & 0x0f; + auto new_cst_op = std::make_shared(cst_tensor); + for (auto target_input : cst_op_out.get_target_inputs()) { + target_input.replace_source_output(new_cst_op); + } + } + } + } +} + std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr& model) { const auto kvcache_name_pattern = "past_key_values"; std::vector> new_params; @@ -147,6 +164,7 @@ StaticLLMPipeline::StaticLLMPipeline( m_kvcache_model = core.read_model(path / "openvino_model.xml"); // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); + align_u4_zp_constants(m_kvcache_model); // (3) Clone the model - this will be prefill m_prefill_model = m_kvcache_model->clone(); m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); From 2a8082880bb271e82dd225b49c11e410efcd2170 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 31 Jul 2024 05:43:57 +0400 Subject: [PATCH 42/54] Disable broken test (#707) --- .github/workflows/genai_package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index cf604b4bcc..3df73d437b 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -113,5 +113,6 @@ jobs: && cmake --install "samples build" --config ${{ matrix.build-type }} --component samples_bin --prefix samples_install if: ${{ 'Release' != matrix.build-type }} - run: call ov\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} # Tokenizers don't work in debug - run: call ov\setupvars.bat && python .\ov\samples\python\multinomial_causal_lm\multinomial_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 0 if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only From d89cdcb5c8d236b2caeca8e9e1303d4ba6e729e6 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 31 Jul 2024 12:15:59 +0400 Subject: [PATCH 43/54] update 
optimum commit for releases/2024/3 (#711) --- llm_bench/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index ed80a66deb..e7f7dfcd10 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@439d61f79cf55d5d0b28334f577b6ac3c5ced28f#egg=optimum-intel +git+https://github.com/eaidova/optimum-intel.git@ea/remove_bf16_rotary_emb_patching#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil From 2428a3ab71f8855407e84a70c7efe47c0d2793f6 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 31 Jul 2024 16:42:57 +0400 Subject: [PATCH 44/54] change commit for optimum --- llm_bench/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index ed80a66deb..bbeb5de89e 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@439d61f79cf55d5d0b28334f577b6ac3c5ced28f#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@6388aeb8738b63e28fc594af84df94590e77cb9a#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil From 2f778f3e7aacfe3a2eafed5966494d9d13bd9c2c Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 31 Jul 2024 14:33:48 +0200 Subject: [PATCH 45/54] Add perf metric docstrings (#713) Docstring for generation time metrics Ticket: CVS-132859 --- .../include/openvino/genai/perf_metrics.hpp | 64 ++++++++++++-- src/python/py_generate_pipeline.cpp | 84 ++++++++++++++++++- 2 files changed, 140 insertions(+), 8 deletions(-) diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index ddb9ff581f..ad53d8d941 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -16,8 +16,18 @@ using TimePoint = std::chrono::steady_clock::time_point; using MicroSeconds = std::chrono::duration>; /** -* @brief Structure with raw performance metrics for each generation before any statistics calculated. -*/ + * @brief Structure with raw performance metrics for each generation before any statistics are calculated. + * + * @param generate_durations Durations for each generate call in microseconds. + * @param tokenization_durations Durations for the tokenization process in microseconds. + * @param detokenization_durations Durations for the detokenization process in microseconds. + * @param m_times_to_first_token Times to the first token for each call in microseconds. + * @param m_new_token_times Time points for each new token generated. + * @param m_batch_sizes Batch sizes for each generate call. + * @param m_durations Total durations for each generate call in microseconds. + * @param num_generated_tokens Total number of tokens generated. + * @param num_input_tokens Total number of tokens in the input prompt. 
+ */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector generate_durations; std::vector tokenization_durations; @@ -41,10 +51,52 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair { }; /** -* @brief Structure to store performance metric for each generation. -* -* @param -*/ + * @brief Holds performance metrics for each generate call. + * + * PerfMetrics holds fields with mean and standard deviations for the following metrics: + * - Time To the First Token (TTFT), ms + * - Time per Output Token (TPOT), ms/token + * - Generate total duration, ms + * - Tokenization duration, ms + * - Detokenization duration, ms + * - Throughput, tokens/s + * + * Additional fields include: + * - Load time, ms + * - Number of generated tokens + * - Number of tokens in the input prompt + * + * Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + * If mean and std were already calculated, getters return cached values. + * @param get_load_time Returns the load time in milliseconds. + * @param get_num_generated_tokens Returns the number of generated tokens. + * @param get_num_input_tokens Returns the number of tokens in the input prompt. + * @param get_ttft Returns the mean and standard deviation of TTFT. + * @param get_tpot Returns the mean and standard deviation of TPOT. + * @param get_throughput Returns the mean and standard deviation of throughput. + * @param get_generate_duration Returns the mean and standard deviation of generate duration. + * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration. + * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration. + * @param get_microsec Converts a duration to microseconds. + * @param m_evaluated Flag indicating if raw metrics were evaluated. + * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them. + * @param evaluate_statistics Calculates mean and standard deviation values from raw_metrics. + * Optional start_time can be provided to update durations. + * @param operator+ Adds two PerfMetrics objects. + * @param operator+= Adds and assigns the right-hand PerfMetrics to the current object. + * @param raw_metrics A structure of RawPerfMetrics type that holds raw metrics. + * @param load_time Load time in milliseconds. + * + * Cached mean and standard deviations. + * @param ttft Mean and standard deviation of Time to the First Token (TTFT) in milliseconds. + * @param tpot Mean and standard deviation of Time per Output Token (TPOT) in milliseconds per token. + * @param throughput Mean and standard deviation of tokens per second. + * @param generate_duration Mean and standard deviation of the total duration of generate calls in milliseconds. + * @param tokenization_duration Mean and standard deviation of the tokenization duration in milliseconds. + * @param detokenization_duration Mean and standard deviation of the detokenization duration in milliseconds. + * @param num_generated_tokens Number of generated tokens. + * @param num_input_tokens Number of tokens in the input prompt. + */ struct OPENVINO_GENAI_EXPORTS PerfMetrics { float load_time; // Load time in ms. MeanStdPair ttft; // Time to the first token (in ms) (TTTFT).
diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 031c8fb97b..9518e1ece4 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -102,6 +102,86 @@ auto generation_config_docstring = R"( repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. )"; +auto raw_perf_metrics_docstring = R"( + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Time points for each new token generated. + :type m_new_token_times: List[TimePoint] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. + :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt. + :type num_input_tokens: int +)"; + +auto perf_metrics_docstring = R"( + Holds performance metrics for each generate call. + + PerfMetrics holds fields with mean and standard deviations for the following metrics: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Generate total duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + + Additional fields include: + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + If mean and std were already calculated, getters return cached values. + + :param get_load_time: Returns the load time in milliseconds. + :type get_load_time: float + + :param get_num_generated_tokens: Returns the number of generated tokens. + :type get_num_generated_tokens: int + + :param get_num_input_tokens: Returns the number of tokens in the input prompt. + :type get_num_input_tokens: int + + :param get_ttft: Returns the mean and standard deviation of TTFT. + :type get_ttft: MeanStdPair + + :param get_tpot: Returns the mean and standard deviation of TPOT. + :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. + :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. 
+ :type raw_metrics: RawPerfMetrics +)"; OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { if(!config.has_value() && kwargs.empty()) @@ -580,7 +660,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("perf_metrics", &DecodedResults::perf_metrics) .def("__str__", &DecodedResults::operator std::string); - py::class_(m, "RawPerfMetrics") + py::class_(m, "RawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { @@ -604,7 +684,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("mean", &MeanStdPair::mean) .def_readonly("std", &MeanStdPair::std); - py::class_(m, "PerfMetrics") + py::class_(m, "PerfMetrics", perf_metrics_docstring) .def(py::init<>()) .def("get_generate_duration", &PerfMetrics::get_generate_duration) .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) From 2dc6b644140db07b31e72bc34888fb2e9488cf24 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 31 Jul 2024 18:02:05 +0400 Subject: [PATCH 46/54] rc1->rc2 (#695) --- .github/workflows/causal_lm_cpp.yml | 6 +++--- .github/workflows/genai_package.yml | 6 +++--- .github/workflows/genai_python_lib.yml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 527259f203..79af10734d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -13,9 +13,9 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 3df73d437b..b59f5f1eb3 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz - w_ov_link: 
https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip jobs: ubuntu_genai_package: strategy: diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 257a9c2f57..cf5ce91f01 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240711_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip + l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240719_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip jobs: ubuntu_genai_python_lib: # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. From 3bfdd3faa0e4c37966e435f0a3a0c7f7a20e9579 Mon Sep 17 00:00:00 2001 From: Yaroslav Tarkan Date: Wed, 31 Jul 2024 17:14:16 +0300 Subject: [PATCH 47/54] Docs for version compatibility (#692) Co-authored-by: Zlobin Vladimir --- src/README.md | 40 +++++++--- src/docs/BUILD.md | 200 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 186 insertions(+), 54 deletions(-) diff --git a/src/README.md b/src/README.md index e88c2f784f..198251efa1 100644 --- a/src/README.md +++ b/src/README.md @@ -5,10 +5,24 @@ It hides the complexity of the generation process and minimizes the amount of co ## Install OpenVINO™ GenAI +> **NOTE**: Please make sure that you are following the versions compatibility rules, refer to the [OpenVINO™ GenAI Dependencies](#openvino-genai-dependencies) for more information. + The OpenVINO™ GenAI flavor is available for installation via Archive and PyPI distributions. To install OpenVINO™ GenAI, refer to the [Install Guide](https://docs.openvino.ai/2024/get-started/install-openvino.html). -To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/BUILD.md). 
+To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/BUILD.md). + +### OpenVINO™ GenAI Dependencies + +OpenVINO™ GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). + +When installing OpenVINO™ GenAI from PyPi, the same versions of OpenVINO and OpenVINO Tokenizers are used (e.g. `openvino==2024.3.0` and `openvino-tokenizers==2024.3.0.0` are installed for `openvino-genai==2024.3.0`). +If you update one of the dependency packages (e.g. install `openvino-nightly`), versions might be incompatible due to different ABI and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2440: cannot open shared object file: No such file or directory`). +Having packages version in format `...`, only `` part of the full version can be varied to ensure ABI compatibility, while changing ``, `` or `` parts of the version might break ABI. + +GenAI, Tokenizers, and OpenVINO wheels for Linux on PyPI are compiled with `_GLIBCXX_USE_CXX11_ABI=0` to cover a wider range of platforms. In contrast, C++ archive distributions for Ubuntu are compiled with `_GLIBCXX_USE_CXX11_ABI=1`. It is not possible to mix different Application Binary Interfaces (ABIs) because doing so results in a link error. This incompatibility prevents the use of, for example, OpenVINO from C++ archive distributions alongside GenAI from PyPI. + +If you want to try OpenVINO GenAI with different dependencies versions (**not** prebuilt packages as archives or python wheels), build OpenVINO GenAI library from source. ## Usage @@ -16,16 +30,16 @@ To build OpenVINO™ GenAI library from source, refer to the [Build Instructions 1. Installed OpenVINO™ GenAI - > If OpenVINO GenAI is installed via archive distribution or built from source, you will need to install additional python dependencies (e.g. `optimum-cli` for simplified model downloading and exporting, it's not required to install [./samples/requirements.txt](./samples/requirements.txt) for deployment if the model has already been exported): - > - > ```sh - > # (Optional) Clone OpenVINO GenAI repository if it does not exist - > git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - > cd openvino.genai - > # Install python dependencies - > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - > python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - > ``` + > To use OpenVINO GenAI with models that are already in OpenVINO format, no additional python dependencies are needed. To + > convert models with optimum-cli and to run the examples, install the dependencies in [./samples/requirements.txt](./samples/requirements.txt): + ```sh + # (Optional) Clone OpenVINO GenAI repository if it does not exist + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + # Install python dependencies + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt + ``` 2. 
A model in OpenVINO IR format @@ -289,8 +303,8 @@ For more examples of how metrics are used, please refer to the Python [benchmark ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). +For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/HOW_IT_WORKS.md). ## Supported Models -For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/SUPPORTED_MODELS.md). +For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/SUPPORTED_MODELS.md). diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 3b89995dc2..81cff10b48 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -3,85 +3,203 @@ > **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment. Or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). -## Build for Linux Systems +## Software Requirements -### Software Requirements +### Linux - [CMake](https://cmake.org/download/) 3.23 or higher - GCC 7.5 or higher - Python 3.8 or higher +- Git -### Build Instructions +### Windows + +- [CMake](https://cmake.org/download/) 3.23 or higher +- Microsoft Visual Studio 2019 or higher, version 16.3 or later +- Python 3.8 or higher +- Git for Windows +- [NSIS](https://sourceforge.net/projects/nsis/) + +### macOS + +- [CMake](https://cmake.org/download/) 3.23 or higher +- [brew](https://brew.sh/) package manager to install additional dependencies: + ```sh + brew install coreutils scons + ``` +- Clang compiler and other command line tools from Xcode 10.1 or higher: + ```sh + xcode-select --install + ``` +- Python 3.8 or higher +- Git + + +## Build Instructions + +### Build OpenVINO, OpenVINO Tokenizers, and OpenVINO GenAI From Source 1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). -The path to the openvino install directory is referred as throughout the document. +The path to the OpenVINO install directory is referred as `` throughout the document. 2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -3. Build the project: +3. Set up the environment: + + #### Option 1 - using OpenVINO `setupvars` script: + + Linux and macOS: ```sh source /setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov ``` -## Build for Windows Systems + Windows Command Prompt: + ```cmd + call \setupvars.bat + ``` -### Software Requirements + Windows PowerShell: + ```cmd + . 
/setupvars.ps1 + ``` -- [CMake](https://cmake.org/download/) 3.23 or higher -- Microsoft Visual Studio 2019 or higher, version 16.3 or later -- Python 3.8 or higher -- Git for Windows + #### Option 2 - setting environment variables manually: -### Build Instructions + Linux: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` -1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) -The path to the openvino install directory is referred as throughout the document. -2. Clone OpenVINO GenAI repository and init submodules: + macOS: ```sh - git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - cd openvino.genai + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export DYLD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` + + Windows Command Prompt: + ```cmd + set OpenVINO_DIR=\runtime + set PYTHONPATH=\python;%CD%\build;%PYTHONPATH% + set OPENVINO_LIB_PATHS=\bin\intel64\Release;%OPENVINO_LIB_PATHS% + set PATH=%OPENVINO_LIB_PATHS%;%PATH% ``` -3. Build the project: + + Windows PowerShell: + ```sh + $env:OpenVINO_DIR = "\runtime" + $env:PYTHONPATH = "\python;$PWD\build;$env:PYTHONPATH" + $env:OPENVINO_LIB_PATHS = "\bin\intel64\Release;$env:OPENVINO_LIB_PATHS" + $env:PATH = "$env:OPENVINO_LIB_PATHS;$env:PATH" + ``` + +4. Build the project: ```sh - call \setupvars.bat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + cmake --build ./build/ --config Release -j ``` -## Build for macOS Systems +5. Install OpenVINO GenAI: -### Software Requirements + #### Option 1 - using cmake: + + The following command will store built OpenVINO GenAI artifacts along with OpenVINO in ``: -- [CMake](https://cmake.org/download/) 3.23 or higher -- [brew](https://brew.sh/) package manager to install additional dependencies: ```sh - brew install coreutils scons + cmake --install ./build/ --config Release --prefix ``` -- Clang compiler and other command line tools from Xcode 10.1 or higher: + + #### Option 2 - setting paths to built OpenVINO GenAI artifacts manually: + + The path to the OpenVINO GenAI root directory is referred as `` throughout the document. + + Linux: ```sh - xcode-select --install + export PYTHONPATH=/build/:$PYTHONPATH + export LD_LIBRARY_PATH=/build/openvino_genai/:$LD_LIBRARY_PATH ``` -- Python 3.8 or higher -### Build Instructions + macOS: + ```sh + export PYTHONPATH=/build:$PYTHONPATH + export DYLD_LIBRARY_PATH=/build/openvino_genai:$DYLD_LIBRARY_PATH + ``` -1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) -The path to the openvino install directory is referred as throughout the document. -2. Clone OpenVINO GenAI repository and init submodules: + Windows Command Prompt: + ```cmd + set PYTHONPATH=\build;%PYTHONPATH% + set PATH=\build\openvino_genai;%PATH% + ``` + + Windows PowerShell: + ```sh + $env:PYTHONPATH = "\build;$env:PYTHONPATH" + $env:PATH = "\build\openvino_genai;$env:PATH" + ``` + +To optimize the package size, you can reduce the ICU (International Components for Unicode) data size when OpenVINO Tokenizers are built as a submodule of OpenVINO GenAI. 
+For more information please refer to the [OpenVINO Tokenizers instructions](https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#reducing-the-icu-data-size). + + +### Build OpenVINO GenAI Wheel + +1. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -3. Build the project: +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: ```sh - source /setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + python -m pip install --upgrade pip + ``` +4. Build the wheel in the `dist` directory: + ```sh + python -m pip wheel . -w dist/ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + ``` + +### Install OpenVINO GenAI From Source + +1. Clone OpenVINO GenAI repository and init submodules: + ```sh + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + ``` +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: + ```sh + python -m pip install --upgrade pip + ``` +4. Install the package directly from source: + ```sh + python -m pip install . + ``` +5. 
To verify the installation, run a simple Python script: + ```python + import openvino_genai + print(openvino_genai.__version__) ``` From e76f9f97817496cb74412e2c0339155a64186d95 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 00:13:44 +0400 Subject: [PATCH 48/54] coorect after git merge conflict resolution --- .github/workflows/causal_lm_cpp.yml | 60 +++++++++---------- .github/workflows/genai_package.yml | 12 ++-- .github/workflows/genai_python_lib.yml | 6 +- .github/workflows/lcm_dreamshaper_cpp.yml | 8 +-- .../workflows/stable_diffusion_1_5_cpp.yml | 4 +- llm_bench/python/utils/nncf_utils.py | 4 +- src/cpp/src/block_manager.hpp | 4 -- src/cpp/src/continuous_batching_pipeline.cpp | 19 ------ tests/cpp/scheduler.cpp | 5 -- 9 files changed, 47 insertions(+), 75 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 99b98a9064..2263277b68 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -34,8 +34,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -77,8 +77,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -210,8 +210,8 @@ jobs: - name: Download, convert and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -255,8 +255,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -282,8 +282,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -310,8 +310,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -338,8 +338,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre 
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -366,8 +366,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -403,8 +403,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -447,8 +447,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -495,8 +495,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - 
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -545,8 +545,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -605,8 +605,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -650,8 +650,8 @@ jobs: - name: Install dependencies and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -691,8 +691,8 @@ 
jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index c401926b08..d89ad2097b 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -28,8 +28,8 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace @@ -57,8 +57,8 @@ jobs: - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace @@ -100,8 +100,8 @@ jobs: shell: bash - run: call ov\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: call ov\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: call ov\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: call ov\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: call ov\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 825c9500c6..58e340a5b9 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -29,7 +29,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . 
--verbose - run: python -m pytest ./tests/python_tests/ @@ -52,7 +52,7 @@ jobs: - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . --verbose - run: python -c "from openvino_genai import LLMPipeline" @@ -81,7 +81,7 @@ jobs: shell: bash - run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j - - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/ - run: call ./ov/setupvars.bat && python -m pip install . 
--verbose diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index b59e7014d2..8d6398027b 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -50,8 +50,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -95,8 +95,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 4f323dc218..c947bdb4b0 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -49,8 +49,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index d89f21aaf2..01d0dd95b3 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -38,7 +38,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True}, + "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, 
"ratio": 0.8}, "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, @@ -69,7 +69,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9}, "llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, "opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, - "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True}, + "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index e66299fd8a..d9815610c5 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -222,10 +222,6 @@ class BlockAllocator { return nullptr; } -<<<<<<< HEAD -======= - ->>>>>>> releases/2024/3 float get_used_percentage() const { return static_cast(m_total_num_blocks - num_free_blocks()) / m_total_num_blocks; } diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 02f52cc594..ebfd13c0ed 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -127,24 +127,9 @@ class ContinuousBatchingPipeline::Impl { return m_tokenizer; } -<<<<<<< HEAD - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { - sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); - sampling_params.validate(); - - ov::Tensor input_ids; - { - static ManualTimer timer("tokenize"); - timer.start(); - input_ids = m_tokenizer.encode(prompt).input_ids; - timer.end(); - } - -======= GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); ->>>>>>> releases/2024/3 SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, sampling_params, m_scheduler->get_config().block_size); { @@ -309,12 +294,8 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; -<<<<<<< HEAD - std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids); result.m_generation_ids.push_back(output_text); -======= result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids)); ->>>>>>> releases/2024/3 result.m_scores.push_back(generation_output.score); } result.m_status = generation->get_status(); diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 9de5d0dfeb..0a4b04f880 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -367,11 +367,6 @@ TEST(TestScheduler, test_partially_preempted_prompt) { } } -<<<<<<< HEAD - - -======= ->>>>>>> releases/2024/3 
TEST(TestScheduler, prefix_caching_test) { std::array configs = {SchedulerConfig(), SchedulerConfig()}; configs.at(0).max_num_batched_tokens = 32; From 9a0b7e936f469058bec359ebc138496d3f86ead5 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 00:13:44 +0400 Subject: [PATCH 49/54] coorect after git merge conflict resolution --- src/cpp/src/continuous_batching_pipeline.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index ebfd13c0ed..a66a88cad4 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -294,7 +294,6 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - result.m_generation_ids.push_back(output_text); result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids)); result.m_scores.push_back(generation_output.score); } From a88cfc820a6eeee67f556ec3355637854ab63d4d Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 00:13:44 +0400 Subject: [PATCH 50/54] coorect after git merge conflict resolution --- src/cpp/src/llm_pipeline_static.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index ebe0858ce8..f58a38e34c 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -176,7 +176,7 @@ StaticLLMPipeline::StaticLLMPipeline( m_kvcache_model = add_slices_to_kvcache_inputs(m_kvcache_model); // (6) Compile both model m_prefill_request = core.compile_model( - prefill_model, device, extract_config_or_empty(config, "PREFILL_CONFIG") + m_prefill_model, device, extract_config_or_empty(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( kvcache_model, device, extract_config_or_empty(config, "GENERATE_CONFIG") From 2311f6ed4572eda9685cca8b48f98fd49992520d Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 00:13:44 +0400 Subject: [PATCH 51/54] coorect after git merge conflict resolution --- src/cpp/src/llm_pipeline_static.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index f58a38e34c..d05d928df6 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -176,10 +176,10 @@ StaticLLMPipeline::StaticLLMPipeline( m_kvcache_model = add_slices_to_kvcache_inputs(m_kvcache_model); // (6) Compile both model m_prefill_request = core.compile_model( - m_prefill_model, device, extract_config_or_empty(config, "PREFILL_CONFIG") + m_prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( - kvcache_model, device, extract_config_or_empty(config, "GENERATE_CONFIG") + kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") ).create_infer_request(); // (7) Initialize tensors prepare_for_new_conversation(); From 45937bc24f02cd377a5264610e92993ac8d285e7 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 10:20:03 +0400 Subject: [PATCH 52/54] cache_size --- src/python/py_generate_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) 
diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 128f9c12d1..a429fc4801 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -751,6 +751,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) + .def_readwrite("cache_size", &SchedulerConfig::cache_size) .def_readwrite("block_size", &SchedulerConfig::block_size) .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) From 4962039c803d13b4aacd5fd235d46cac195e7fcf Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 10:46:55 +0400 Subject: [PATCH 53/54] skip --- tests/python_tests/test_chat_generate_api.py | 1 + tests/python_tests/test_generate_api.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index bd1d45d18f..295674e101 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -174,6 +174,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): @pytest.mark.parametrize("generation_config", configs[1:]) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) cb = get_continuous_batching(path) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 278b3d398a..fe306e2a37 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -711,6 +711,7 @@ def test_left_pad(): @pytest.mark.parametrize("prompt", batched_prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config): model_id, path, tokenizer, model, stateful = read_model(( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", @@ -730,6 +731,7 @@ def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. 
Ticket 147793") def test_cb_streamer_vs_return_vs_stateful(model_descr, prompt): model_id, path, tokenizer, model, stateful = read_model(( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", From b5f21dc78359e0c2564c2050bdbe752a39ab42f6 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 1 Aug 2024 14:05:33 +0400 Subject: [PATCH 54/54] Correct links and typos --- samples/python/benchmark_genai/README.md | 4 ++-- src/README.md | 10 +++++----- src/cpp/include/openvino/genai/llm_pipeline.hpp | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md index 1ff9ef4305..9baf17c4d7 100644 --- a/samples/python/benchmark_genai/README.md +++ b/samples/python/benchmark_genai/README.md @@ -16,7 +16,7 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B ## Usage ```sh -python benchmark_vanilla_genai.py [OPTIONS] +python benchmark_genai.py [OPTIONS] ``` ### Options @@ -31,7 +31,7 @@ python benchmark_vanilla_genai.py [OPTIONS] ### Output: ``` -python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 +python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 ``` ``` diff --git a/src/README.md b/src/README.md index 198251efa1..893ffb5ea9 100644 --- a/src/README.md +++ b/src/README.md @@ -10,14 +10,14 @@ It hides the complexity of the generation process and minimizes the amount of co The OpenVINO™ GenAI flavor is available for installation via Archive and PyPI distributions. To install OpenVINO™ GenAI, refer to the [Install Guide](https://docs.openvino.ai/2024/get-started/install-openvino.html). -To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/BUILD.md). +To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](./docs/BUILD.md). ### OpenVINO™ GenAI Dependencies OpenVINO™ GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). When installing OpenVINO™ GenAI from PyPi, the same versions of OpenVINO and OpenVINO Tokenizers are used (e.g. `openvino==2024.3.0` and `openvino-tokenizers==2024.3.0.0` are installed for `openvino-genai==2024.3.0`). -If you update one of the dependency packages (e.g. install `openvino-nightly`), versions might be incompatible due to different ABI and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2440: cannot open shared object file: No such file or directory`). +If you update one of the dependency packages (e.g. `pip install openvino --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly`), versions might be incompatible due to different ABI and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2430: cannot open shared object file: No such file or directory`). Having packages version in format `...`, only `` part of the full version can be varied to ensure ABI compatibility, while changing ``, `` or `` parts of the version might break ABI. GenAI, Tokenizers, and OpenVINO wheels for Linux on PyPI are compiled with `_GLIBCXX_USE_CXX11_ABI=0` to cover a wider range of platforms. In contrast, C++ archive distributions for Ubuntu are compiled with `_GLIBCXX_USE_CXX11_ABI=1`. It is not possible to mix different Application Binary Interfaces (ABIs) because doing so results in a link error. 
This incompatibility prevents the use of, for example, OpenVINO from C++ archive distributions alongside GenAI from PyPI. @@ -299,12 +299,12 @@ print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` -For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/cpp/benchmark_genai/README.md) samples. +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples. ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/HOW_IT_WORKS.md). +For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md). ## Supported Models -For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/SUPPORTED_MODELS.md). +For a list of supported models, refer to the [Supported Models Section](./docs/SUPPORTED_MODELS.md). diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index e79a6e65f0..4be298128e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -16,7 +16,7 @@ namespace ov { namespace genai { -// Return flag correspods whether generation should be stopped: false means continue generation, true means stop. +// Return flag corresponds whether generation should be stopped: false means continue generation, true means stop. using StreamerVariant = std::variant, std::shared_ptr, std::monostate>; using OptionalGenerationConfig = std::optional; using EncodedInputs = std::variant;
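The corrected comment in the final hunk above spells out the streamer contract: the callback's boolean return value decides whether generation continues (false) or stops (true). Below is a minimal, illustrative sketch of that contract from the Python side; it assumes the locally exported `TinyLlama-1.1B-Chat-v1.0` model used throughout the workflows in this patch series, and the model path and prompt are placeholders rather than part of the patches themselves.

```python
import openvino_genai

# Sketch only: assumes a local "TinyLlama-1.1B-Chat-v1.0" directory produced by
# `optimum-cli export openvino ...` as in the CI workflows above.
pipe = openvino_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

def streamer(subword: str) -> bool:
    # Print each generated subword as it arrives.
    print(subword, end="", flush=True)
    # Return flag follows the comment fixed above:
    # False means continue generation, True means stop.
    return False

pipe.generate("Why is the Sun yellow?", max_new_tokens=64, streamer=streamer)
print()
```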