Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add generation time metrics #613

Merged
1 change: 1 addition & 0 deletions samples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm)
add_subdirectory(cpp/multinomial_causal_lm)
add_subdirectory(cpp/prompt_lookup_decoding_lm)
add_subdirectory(cpp/speculative_decoding_lm)
add_subdirectory(cpp/benchmark_vanilla_genai)

install(FILES requirements.txt DESTINATION samples
COMPONENT cpp_samples_genai)
Expand Down
25 changes: 25 additions & 0 deletions samples/cpp/benchmark_vanilla_genai/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


find_package(OpenVINOGenAI REQUIRED PATHS
    "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
    ${OpenVINO_DIR}  # GenAI may be installed alongside OpenVINO.
)

# FetchContent is a CMake module and is not available until explicitly included;
# without this line FetchContent_Declare fails unless a parent scope included it.
include(FetchContent)

FetchContent_Declare(cxxopts
    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
FetchContent_MakeAvailable(cxxopts)

add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp)
target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_vanilla_genai PROPERTIES
    COMPILE_PDB_NAME benchmark_vanilla_genai
    # Ensure out of box LC_RPATH on macOS with SIP
    INSTALL_RPATH_USE_LINK_PATH ON)
# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11)
install(TARGETS benchmark_vanilla_genai
        RUNTIME DESTINATION samples_bin/
        COMPONENT samples_bin
        EXCLUDE_FROM_ALL)
3 changes: 3 additions & 0 deletions samples/cpp/benchmark_vanilla_genai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# benchmark OpenVINO GenAI sample

TODO: adapt from python sample to c++
69 changes: 69 additions & 0 deletions samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <iostream>

#include <cxxopts.hpp>

#include "openvino/genai/llm_pipeline.hpp"

// Benchmark entry point: parses CLI options, runs `num_warmup` untimed
// generations to warm caches, then `num_iter` timed generations whose
// ov::genai::PerfMetrics are accumulated with operator+ and printed.
int main(int argc, char* argv[]) try {
    cxxopts::Options options("benchmark_vanilla_genai", "Help command");

    options.add_options()
    ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
    ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
    ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(5)))
    // BUG FIX: help text previously said "Number of iterations" (copy-paste from -n).
    ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
    ("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
    ("h,help", "Print usage");

    cxxopts::ParseResult result;
    try {
        result = options.parse(argc, argv);
    } catch (const cxxopts::exceptions::exception& e) {
        std::cout << e.what() << "\n\n";
        std::cout << options.help() << std::endl;
        return EXIT_FAILURE;
    }

    if (result.count("help")) {
        std::cout << options.help() << std::endl;
        return EXIT_SUCCESS;
    }

    std::string prompt = result["prompt"].as<std::string>();
    const std::string model_path = result["model"].as<std::string>();
    std::string device = result["device"].as<std::string>();
    size_t num_warmup = result["num_warmup"].as<size_t>();
    size_t num_iter = result["num_iter"].as<size_t>();

    ov::genai::GenerationConfig config;
    config.max_new_tokens = result["max_new_tokens"].as<size_t>();

    ov::genai::LLMPipeline pipe(model_path, device);

    // Warm-up runs are intentionally discarded: they absorb one-time costs
    // (kernel compilation, memory allocation) that would skew the statistics.
    for (size_t i = 0; i < num_warmup; i++)
        pipe.generate(prompt, config);

    // First timed run seeds the accumulator; the remaining num_iter - 1 runs
    // are folded in via PerfMetrics::operator+, which recomputes the means.
    ov::genai::DecodedResults res = pipe.generate(prompt, config);
    ov::genai::PerfMetrics metrics = res.metrics;
    for (size_t i = 0; i < num_iter - 1; i++) {
        res = pipe.generate(prompt, config);
        metrics = metrics + res.metrics;
    }

    std::cout << "Load time: " << metrics.load_time << " ms" << std::endl;
    std::cout << "Generate time: " << metrics.mean_generate_duration << " ± " << metrics.std_generate_duration << " ms" << std::endl;
    std::cout << "Tokenization time: " << metrics.mean_tokenization_duration << " ± " << metrics.std_tokenization_duration << " ms" << std::endl;
    std::cout << "Detokenization time: " << metrics.mean_detokenization_duration << " ± " << metrics.std_detokenization_duration << " ms" << std::endl;
    std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl;
    std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms " << std::endl;
    std::cout << "Tokens/s: " << metrics.mean_throughput << " ± " << metrics.std_throughput << std::endl;

    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
} catch (...) {
    std::cerr << "Non-exception object thrown\n";
    return EXIT_FAILURE;
}
66 changes: 66 additions & 0 deletions samples/python/benchmark_vanilla_genai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Benchmark Vanilla GenAI

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

# ov.genai.PerfMetrics structure
ov.genai.PerfMetrics is a structure that holds performance metrics for each generate call. Each generate call calculates the following metrics:
- mean_ttft
- std_ttft
- mean_tpot
- std_tpot
- load_time
- mean_generate_duration
- std_generate_duration
- mean_tokenization_duration
- std_tokenization_duration
- mean_detokenization_duration
- std_detokenization_duration
- mean_throughput
- std_throughput
- num_generated_tokens
- num_input_tokens

Performance metrics can be accumulated across several generate calls using the `+` or `+=` operators; the accumulated object then reports means and standard deviations computed over all of those calls.


## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```

## Usage

```sh
python benchmark_vanilla_genai.py [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output:

```
python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/
```

```
Load time: 3446 ms
Generate time: 876.2 ± 3.30719 ms
Tokenization time: 0 ± 0 ms
Detokenization time: 0 ± 0 ms
ttft: 168 ± 0 ms
tpot: 174.68 ± 4.08671 ms
Tokens/s: 5.72475 ± 0.133933
```
50 changes: 50 additions & 0 deletions samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import openvino_genai as ov_genai
import pdb

def main():
    """Benchmark an OpenVINO GenAI LLMPipeline: run warm-up generations, then
    accumulate PerfMetrics over `num_iter` timed generate calls and print
    mean/std statistics for latency, TTFT, TPOT and throughput."""
    parser = argparse.ArgumentParser(description="Help command")
    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
    parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
    parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations")
    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

    args = parser.parse_args()

    prompt = [args.prompt]
    model_path = args.model
    device = args.device
    num_warmup = args.num_warmup
    num_iter = args.num_iter

    config = ov_genai.GenerationConfig()
    # BUG FIX: was `args.num_new_tokens`, which does not exist (the argument is
    # declared as --max_new_tokens) and raised AttributeError on every run.
    config.max_new_tokens = args.max_new_tokens

    pipe = ov_genai.LLMPipeline(model_path, device)

    # Warm-up runs are discarded so one-time costs (compilation, allocation)
    # do not skew the reported statistics.
    for _ in range(num_warmup):
        pipe.generate(prompt, config)

    # First timed call seeds the metrics accumulator; the remaining
    # num_iter - 1 calls are folded in via PerfMetrics.__iadd__.
    res = pipe.generate(prompt, config)
    metrics = res.metrics
    for _ in range(num_iter - 1):
        res = pipe.generate(prompt, config)
        metrics += res.metrics

    print(f"Load time: {metrics.load_time} ms")
    print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms")
    print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms")
    print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms")
    print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms")
    print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms")
    print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}")

if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

#include <optional>
#include <variant>
#include <chrono>

#include "openvino/core/any.hpp"
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/perf_metrics.hpp"

namespace ov {
namespace genai {
Expand All @@ -29,11 +31,13 @@ using StringInputs = std::variant<std::string, std::vector<std::string>>;
*
* @param tokens sequence of resulting tokens
* @param scores sum of logarithmic probabilities of all tokens in the sequence
* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics
*/
class EncodedResults {
public:
    // Generated token ids, one inner vector per returned sequence.
    std::vector<std::vector<int64_t>> tokens;
    // Sum of logarithmic probabilities of all tokens, one entry per sequence.
    std::vector<float> scores;
    // Performance metrics (ttft, tpot, etc.) collected for this generate call.
    PerfMetrics metrics;
};

/**
Expand All @@ -42,11 +46,13 @@ class EncodedResults {
*
* @param texts vector of resulting sequences
* @param scores scores for each sequence
* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics
*/
class DecodedResults {
public:
std::vector<std::string> texts;
std::vector<float> scores;
PerfMetrics metrics;

// @brief Convert DecodedResults to a string.
operator std::string() const {
Expand Down
71 changes: 71 additions & 0 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <chrono>
#include "openvino/genai/visibility.hpp"
#include <vector>
#include <memory>
#include <optional>

namespace ov {
namespace genai {

using TimePoint = std::chrono::steady_clock::time_point;

/**
 * @brief Structure with raw performance metrics for each generation before any statistics are calculated.
 */
struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
    // Wall-clock duration of each whole generate call, in ms.
    std::vector<float> generate_durations;
    // Duration of each tokenization call, in ms.
    std::vector<float> tokenization_durations;
    // Duration of each detokenization call, in ms.
    std::vector<float> detokenization_durations;

    // Time to first token for each generate call, in ms.
    std::vector<float> m_times_to_first_token;
    // Timestamp of every new token emission; deltas give per-token latency.
    std::vector<TimePoint> m_new_token_times;
    // Batch size at each token emission, paired index-wise with m_new_token_times.
    std::vector<size_t> m_batch_sizes;
    // Per-token durations, in ms.
    std::vector<float> m_durations;

    // BUG FIX: zero-initialize counters — previously default-initialized
    // scalars, so reading them before the first generate call was UB.
    size_t num_generated_tokens = 0;
    size_t num_input_tokens = 0;
};

/**
 * @brief Structure to store performance metrics for each generation.
 *
 * All statistics are in milliseconds unless noted otherwise. Scalar members
 * are zero-initialized (BUG FIX: they previously held indeterminate values,
 * so printing e.g. load_time before it was assigned was undefined behavior).
 */
struct OPENVINO_GENAI_EXPORTS PerfMetrics {
    // Time to first token (mean and standard deviation).
    float mean_ttft = 0.f;
    float std_ttft = 0.f;

    // Time per output token (mean and standard deviation).
    float mean_tpot = 0.f;
    float std_tpot = 0.f;

    // Pipeline load time; a single measurement, so no deviation is tracked.
    float load_time = 0.f;

    float mean_generate_duration = 0.f;
    float std_generate_duration = 0.f;
    float mean_tokenization_duration = 0.f;
    float std_tokenization_duration = 0.f;
    float mean_detokenization_duration = 0.f;
    float std_detokenization_duration = 0.f;

    // Tokens per second (mean and standard deviation).
    float mean_throughput = 0.f;
    float std_throughput = 0.f;

    size_t num_generated_tokens = 0;
    size_t num_input_tokens = 0;

    // Recomputes the statistics above from raw_counters; start_time, when
    // provided, anchors duration calculations (defined in the .cpp).
    void evaluate_statistics(std::optional<TimePoint> start_time = std::nullopt);
    // Converts a steady_clock duration to milliseconds as float.
    static float get_duration_ms(std::chrono::steady_clock::duration duration);
    // Merges two metric sets; means/stds are re-derived over both raw_counters.
    PerfMetrics operator+(const PerfMetrics& metrics) const;
    PerfMetrics& operator+=(const PerfMetrics& right);

    RawPerfMetrics raw_counters;
};

} // namespace genai
} // namespace ov
Loading
Loading