Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add generation time metrics #613

Merged
1 change: 1 addition & 0 deletions samples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm)
add_subdirectory(cpp/multinomial_causal_lm)
add_subdirectory(cpp/prompt_lookup_decoding_lm)
add_subdirectory(cpp/speculative_decoding_lm)
add_subdirectory(cpp/benchmark_vanilla_genai)

install(FILES requirements.txt DESTINATION samples
COMPONENT cpp_samples_genai)
Expand Down
25 changes: 25 additions & 0 deletions samples/cpp/benchmark_vanilla_genai/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


find_package(OpenVINOGenAI REQUIRED PATHS
    "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
    ${OpenVINO_DIR}  # GenAI may be installed alongside OpenVINO.
)

# FetchContent is a CMake module and is not available until explicitly included;
# without this line FetchContent_Declare fails unless a parent scope included it.
include(FetchContent)

FetchContent_Declare(cxxopts
    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
FetchContent_MakeAvailable(cxxopts)

add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp)
target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_vanilla_genai PROPERTIES
    COMPILE_PDB_NAME benchmark_vanilla_genai
    # Ensure out of box LC_RPATH on macOS with SIP
    INSTALL_RPATH_USE_LINK_PATH ON)
# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11)
install(TARGETS benchmark_vanilla_genai
        RUNTIME DESTINATION samples_bin/
        COMPONENT samples_bin
        EXCLUDE_FROM_ALL)
3 changes: 3 additions & 0 deletions samples/cpp/benchmark_vanilla_genai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# benchmark OpenVINO GenAI sample

TODO: adapt from python sample to c++
69 changes: 69 additions & 0 deletions samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <iostream>

#include <cxxopts.hpp>

#include "openvino/genai/llm_pipeline.hpp"

// Benchmark entry point: parses CLI options, runs `num_warmup` untimed
// generations to warm caches, then `num_iter` timed generations whose
// ov::genai::PerfMetrics are accumulated with operator+ and printed.
int main(int argc, char* argv[]) try {
    cxxopts::Options options("benchmark_vanilla_genai", "Help command");

    options.add_options()
    ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
    ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
    ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(5)))
    // BUG FIX: help text previously said "Number of iterations" (copy-paste from -n).
    ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
    ("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
    ("h,help", "Print usage");

    cxxopts::ParseResult result;
    try {
        result = options.parse(argc, argv);
    } catch (const cxxopts::exceptions::exception& e) {
        std::cout << e.what() << "\n\n";
        std::cout << options.help() << std::endl;
        return EXIT_FAILURE;
    }

    if (result.count("help")) {
        std::cout << options.help() << std::endl;
        return EXIT_SUCCESS;
    }

    std::string prompt = result["prompt"].as<std::string>();
    const std::string model_path = result["model"].as<std::string>();
    std::string device = result["device"].as<std::string>();
    size_t num_warmup = result["num_warmup"].as<size_t>();
    size_t num_iter = result["num_iter"].as<size_t>();

    ov::genai::GenerationConfig config;
    config.max_new_tokens = result["max_new_tokens"].as<size_t>();

    ov::genai::LLMPipeline pipe(model_path, device);

    // Warm-up runs are intentionally discarded: they absorb one-time costs
    // (kernel compilation, memory allocation) that would skew the statistics.
    for (size_t i = 0; i < num_warmup; i++)
        pipe.generate(prompt, config);

    // First timed run seeds the accumulator; the remaining num_iter - 1 runs
    // are folded in via PerfMetrics::operator+, which recomputes the means.
    ov::genai::DecodedResults res = pipe.generate(prompt, config);
    ov::genai::PerfMetrics metrics = res.metrics;
    for (size_t i = 0; i < num_iter - 1; i++) {
        res = pipe.generate(prompt, config);
        metrics = metrics + res.metrics;
    }

    std::cout << "Load time: " << metrics.load_time << " ms" << std::endl;
    std::cout << "Generate time: " << metrics.mean_generate_duration << " ± " << metrics.std_generate_duration << " ms" << std::endl;
    std::cout << "Tokenization time: " << metrics.mean_tokenization_duration << " ± " << metrics.std_tokenization_duration << " ms" << std::endl;
    std::cout << "Detokenization time: " << metrics.mean_detokenization_duration << " ± " << metrics.std_detokenization_duration << " ms" << std::endl;
    std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl;
    std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms " << std::endl;
    std::cout << "Tokens/s: " << metrics.mean_throughput << " ± " << metrics.std_throughput << std::endl;

    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
} catch (...) {
    std::cerr << "Non-exception object thrown\n";
    return EXIT_FAILURE;
}
66 changes: 66 additions & 0 deletions samples/python/benchmark_vanilla_genai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Benchmark Vanilla GenAI

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

# ov.genai.PerfMetrics structure
ov.genai.PerfMetrics is a structure that holds performance metrics for each generate call. Each generate call calculates the following metrics:
- mean_ttft
- std_ttft
- mean_tpot
- std_tpot
- load_time
- mean_generate_duration
- std_generate_duration
- mean_tokenization_duration
- std_tokenization_duration
- mean_detokenization_duration
- std_detokenization_duration
- mean_throughput
- std_throughput
- num_generated_tokens
- num_input_tokens

Performance metrics can be accumulated across several generate calls using the `+` or `+=` operators; the accumulated object then reports means and standard deviations computed over all of those calls.


## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```

## Usage

```sh
python benchmark_vanilla_genai.py [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output:

```
python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/
```

```
Load time: 3446 ms
Generate time: 876.2 ± 3.30719 ms
Tokenization time: 0 ± 0 ms
Detokenization time: 0 ± 0 ms
ttft: 168 ± 0 ms
tpot: 174.68 ± 4.08671 ms
Tokens/s: 5.72475 ± 0.133933
```
50 changes: 50 additions & 0 deletions samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import openvino_genai as ov_genai
import pdb

def main():
    """Benchmark an OpenVINO GenAI LLMPipeline: run warm-up generations, then
    accumulate PerfMetrics over `num_iter` timed generate calls and print
    mean/std statistics for latency, TTFT, TPOT and throughput."""
    parser = argparse.ArgumentParser(description="Help command")
    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
    parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
    parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations")
    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

    args = parser.parse_args()

    prompt = [args.prompt]
    model_path = args.model
    device = args.device
    num_warmup = args.num_warmup
    num_iter = args.num_iter

    config = ov_genai.GenerationConfig()
    # BUG FIX: was `args.num_new_tokens`, which does not exist (the argument is
    # declared as --max_new_tokens) and raised AttributeError on every run.
    config.max_new_tokens = args.max_new_tokens

    pipe = ov_genai.LLMPipeline(model_path, device)

    # Warm-up runs are discarded so one-time costs (compilation, allocation)
    # do not skew the reported statistics.
    for _ in range(num_warmup):
        pipe.generate(prompt, config)

    # First timed call seeds the metrics accumulator; the remaining
    # num_iter - 1 calls are folded in via PerfMetrics.__iadd__.
    res = pipe.generate(prompt, config)
    metrics = res.metrics
    for _ in range(num_iter - 1):
        res = pipe.generate(prompt, config)
        metrics += res.metrics

    print(f"Load time: {metrics.load_time} ms")
    print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms")
    print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms")
    print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms")
    print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms")
    print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms")
    print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}")

if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

#include <optional>
#include <variant>
#include <chrono>

#include "openvino/core/any.hpp"
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/perf_metrics.hpp"

namespace ov {
namespace genai {
Expand All @@ -29,11 +31,13 @@ using StringInputs = std::variant<std::string, std::vector<std::string>>;
*
* @param tokens sequence of resulting tokens
* @param scores sum of logarithmic probabilities of all tokens in the sequence
* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics
*/
class EncodedResults {
public:
    // Generated token ids, one inner vector per returned sequence.
    std::vector<std::vector<int64_t>> tokens;
    // Sum of logarithmic probabilities of all tokens, one entry per sequence.
    std::vector<float> scores;
    // Performance metrics (ttft, tpot, etc.) collected for this generate call.
    PerfMetrics metrics;
};

/**
Expand All @@ -42,11 +46,13 @@ class EncodedResults {
*
* @param texts vector of resulting sequences
* @param scores scores for each sequence
* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics
*/
class DecodedResults {
public:
std::vector<std::string> texts;
std::vector<float> scores;
PerfMetrics metrics;

// @brief Convert DecodedResults to a string.
operator std::string() const {
Expand Down
71 changes: 71 additions & 0 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <chrono>
#include "openvino/genai/visibility.hpp"
#include <vector>
#include <memory>
#include <optional>

namespace ov {
namespace genai {

using TimePoint = std::chrono::steady_clock::time_point;

/**
 * @brief Structure with raw performance metrics for each generation before any statistics are calculated.
 */
struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
    // Wall-clock duration of each whole generate call, in ms.
    std::vector<float> generate_durations;
    // Duration of each tokenization call, in ms.
    std::vector<float> tokenization_durations;
    // Duration of each detokenization call, in ms.
    std::vector<float> detokenization_durations;

    // Time to first token for each generate call, in ms.
    std::vector<float> m_times_to_first_token;
    // Timestamp of every new token emission; deltas give per-token latency.
    std::vector<TimePoint> m_new_token_times;
    // Batch size at each token emission, paired index-wise with m_new_token_times.
    std::vector<size_t> m_batch_sizes;
    // Per-token durations, in ms.
    std::vector<float> m_durations;

    // BUG FIX: zero-initialize counters — previously default-initialized
    // scalars, so reading them before the first generate call was UB.
    size_t num_generated_tokens = 0;
    size_t num_input_tokens = 0;
};

/**
 * @brief Structure to store performance metrics for each generation.
 *
 * All statistics are in milliseconds unless noted otherwise. Scalar members
 * are zero-initialized (BUG FIX: they previously held indeterminate values,
 * so printing e.g. load_time before it was assigned was undefined behavior).
 */
struct OPENVINO_GENAI_EXPORTS PerfMetrics {
    // Time to first token (mean and standard deviation).
    float mean_ttft = 0.f;
    float std_ttft = 0.f;

    // Time per output token (mean and standard deviation).
    float mean_tpot = 0.f;
    float std_tpot = 0.f;

    // Pipeline load time; a single measurement, so no deviation is tracked.
    float load_time = 0.f;

    float mean_generate_duration = 0.f;
    float std_generate_duration = 0.f;
    float mean_tokenization_duration = 0.f;
    float std_tokenization_duration = 0.f;
    float mean_detokenization_duration = 0.f;
    float std_detokenization_duration = 0.f;

    // Tokens per second (mean and standard deviation).
    float mean_throughput = 0.f;
    float std_throughput = 0.f;

    size_t num_generated_tokens = 0;
    size_t num_input_tokens = 0;

    // Recomputes the statistics above from raw_counters; start_time, when
    // provided, anchors duration calculations (defined in the .cpp).
    void evaluate_statistics(std::optional<TimePoint> start_time = std::nullopt);
    // Converts a steady_clock duration to milliseconds as float.
    static float get_duration_ms(std::chrono::steady_clock::duration duration);
    // Merges two metric sets; means/stds are re-derived over both raw_counters.
    PerfMetrics operator+(const PerfMetrics& metrics) const;
    PerfMetrics& operator+=(const PerfMetrics& right);

    RawPerfMetrics raw_counters;
};

} // namespace genai
} // namespace ov
Loading
Loading