
Tests for generation config (#1448)
CVS-159946
ilya-lavrenov authored Dec 28, 2024
1 parent d88dda9 commit 6c56a7b
Showing 24 changed files with 450 additions and 360 deletions.
@@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try {
config.max_new_tokens = 20;
config.num_beam_groups = 3;
config.num_beams = 15;
+config.diversity_penalty = 1.0f;
config.num_return_sequences = config.num_beams;

// Since the streamer is set, the results will
@@ -19,6 +19,7 @@ def main():
config.max_new_tokens = 20
config.num_beam_groups = 3
config.num_beams = 15
+config.diversity_penalty = 1
config.num_return_sequences = config.num_beams

beams = pipe.generate(args.prompts, config)
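Both samples now set diversity_penalty explicitly because this commit changes its default from 1.0f to 0.0f (see generation_config.hpp below); with a zero penalty, grouped beam search no longer pushes the groups apart. A minimal sketch of the updated Python usage, assuming an exported model directory (the path is a placeholder, not from the commit):

import openvino_genai

model_dir = "path/to/exported/model"  # placeholder
pipe = openvino_genai.LLMPipeline(model_dir, "CPU")

config = pipe.get_generation_config()
config.max_new_tokens = 20
config.num_beam_groups = 3
config.num_beams = 15
# Set explicitly: the library default becomes 0.0 in this commit, and
# grouped beam search needs a non-zero penalty to diversify the groups.
config.diversity_penalty = 1.0
config.num_return_sequences = config.num_beams

print(pipe.generate("Hello,", config))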
28 changes: 18 additions & 10 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -93,15 +93,22 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool echo = false;
size_t logprobs = 0;

+// EOS special token
+int64_t eos_token_id = -1;
std::set<std::string> stop_strings;
// Default setting in vLLM (and OpenAI API) is not to include stop string in the output
bool include_stop_str_in_output = false;
std::set<int64_t> stop_token_ids;

+// penalties (not used in beam search)
+float repetition_penalty = 1.0f;
+float presence_penalty = 0.0;
+float frequency_penalty = 0.0f;

// Beam search specific
size_t num_beam_groups = 1;
size_t num_beams = 1;
-float diversity_penalty = 1.0f;
+float diversity_penalty = 0.0f;
float length_penalty = 1.0f;
size_t num_return_sequences = 1;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
@@ -112,19 +112,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
float top_p = 1.0f;
size_t top_k = std::numeric_limits<size_t>::max();
bool do_sample = false;
-float repetition_penalty = 1.0f;
-float presence_penalty = 0.0;
-float frequency_penalty = 0.0f;
size_t rng_seed = 0;

// Assisting generation parameters
float assistant_confidence_threshold = 0.f;
size_t num_assistant_tokens = 0;
size_t max_ngram_size = 0;

-// EOS special token
-int64_t eos_token_id = -1;

std::optional<AdapterConfig> adapters;

/** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
Expand All @@ -136,11 +137,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multinomial() const;
OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release")
bool is_speculative_decoding() const;
bool is_assisting_generation() const;
bool is_prompt_lookup() const;
-void update_generation_config(const ov::AnyMap& config_map);

+OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release")
+bool is_speculative_decoding() const;

+void update_generation_config(const ov::AnyMap& properties);

template <typename... Properties>
util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) {
@@ -187,8 +190,13 @@ static constexpr ov::Property<float> assistant_confidence_threshold{"assistant_c
static constexpr ov::Property<size_t> num_assistant_tokens{"num_assistant_tokens"};

// Predefined Configs

+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
OPENVINO_GENAI_EXPORTS GenerationConfig beam_search();
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
OPENVINO_GENAI_EXPORTS GenerationConfig greedy();
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
OPENVINO_GENAI_EXPORTS GenerationConfig multinomial();

} // namespace genai
} // namespace ov
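The predefined factories beam_search(), greedy() and multinomial() are now deprecated in favor of setting fields individually. A sketch of the replacement pattern from the Python side (the concrete values are illustrative, not what the factories returned):

import openvino_genai

# Populate a GenerationConfig directly instead of calling a deprecated factory.
config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100        # illustrative value
config.num_beam_groups = 3
config.num_beams = 15
config.diversity_penalty = 1.0     # non-zero, since the default is now 0.0
config.num_return_sequences = 3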
240 changes: 154 additions & 86 deletions src/cpp/src/generation_config.cpp

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions src/cpp/src/json_utils.hpp
@@ -4,6 +4,9 @@

#pragma once

+#include <vector>
+#include <set>

#include <nlohmann/json.hpp>

namespace ov {
@@ -40,6 +43,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, std::v
}
}

+template <typename V>
+void read_json_param(const nlohmann::json& data, const std::string& name, std::set<V>& param) {
+if (data.contains(name) && data[name].is_array()) {
+for (const auto elem : data[name]) {
+param.insert(elem.get<V>());
+}
+}
+}

} // namespace utils
} // namespace genai
} // namespace ov
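The new overload mirrors the std::vector reader above and fills set-typed fields from JSON arrays, which is what GenerationConfig needs for stop_token_ids. A sketch of the JSON shape this parses, written from Python (the field values are illustrative):

import json

# Illustrative generation_config.json: the "stop_token_ids" array is what
# the new read_json_param overload reads into a std::set<int64_t>, so any
# duplicates collapse on load.
config = {
    "max_new_tokens": 64,
    "eos_token_id": 2,
    "stop_token_ids": [2, 32000],
}

with open("generation_config.json", "w") as f:
    json.dump(config, f, indent=2)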
5 changes: 2 additions & 3 deletions src/cpp/src/llm_pipeline.cpp
@@ -72,7 +72,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
const ov::AnyMap& config,
const ov::genai::GenerationConfig& generation_config
) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
-ov::Core core = utils::singleton_core();
ov::CompiledModel compiled_model;
auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
utils::slice_matmul_stateful_model(model);
@@ -81,10 +80,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
-compiled_model = core.compile_model(model, device, *filtered_plugin_config);
+compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
m_model_runner = compiled_model.create_infer_request();
} else {
-compiled_model = core.compile_model(model, device, plugin_config);
+compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
m_model_runner = compiled_model.create_infer_request();
}
ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
26 changes: 16 additions & 10 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -367,16 +367,16 @@ class ContinuousBatchingPipeline:
def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
...
@typing.overload
-def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle:
+def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:
...
@typing.overload
-def add_request(self, request_id: int, prompt: str, sampling_params: GenerationConfig) -> GenerationHandle:
+def add_request(self, request_id: int, prompt: str, generation_config: GenerationConfig) -> GenerationHandle:
...
@typing.overload
-def generate(self, input_ids: list[openvino._pyopenvino.Tensor], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]:
+def generate(self, input_ids: list[openvino._pyopenvino.Tensor], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]:
...
@typing.overload
-def generate(self, prompts: list[str], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]:
+def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]:
...
def get_config(self) -> GenerationConfig:
...
@@ -609,11 +609,15 @@ class GenerationConfig:
...
def is_greedy_decoding(self) -> bool:
...
+def is_multinomial(self) -> bool:
+...
def is_prompt_lookup(self) -> bool:
...
def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None:
...
-def update_generation_config(self, config_map: dict[str, openvino._pyopenvino.OVAny]) -> None:
+def update_generation_config(self, **kwargs) -> None:
...
+def validate(self) -> None:
+...
class GenerationFinishReason:
"""
@@ -826,7 +830,7 @@ class Image2ImagePipeline:
...
def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None:
...
-def set_generation_config(self, generation_config: ImageGenerationConfig) -> None:
+def set_generation_config(self, config: ImageGenerationConfig) -> None:
...
def set_scheduler(self, scheduler: Scheduler) -> None:
...
@@ -927,7 +931,7 @@ class InpaintingPipeline:
...
def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None:
...
def set_generation_config(self, generation_config: ImageGenerationConfig) -> None:
def set_generation_config(self, config: ImageGenerationConfig) -> None:
...
def set_scheduler(self, scheduler: Scheduler) -> None:
...
@@ -1615,7 +1619,7 @@ class Text2ImagePipeline:
...
def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None:
...
-def set_generation_config(self, generation_config: ImageGenerationConfig) -> None:
+def set_generation_config(self, config: ImageGenerationConfig) -> None:
...
def set_scheduler(self, scheduler: Scheduler) -> None:
...
@@ -1865,9 +1869,9 @@ class VLMPipeline:
...
def get_tokenizer(self) -> Tokenizer:
...
-def set_chat_template(self, new_template: str) -> None:
+def set_chat_template(self, chat_template: str) -> None:
...
-def set_generation_config(self, new_config: GenerationConfig) -> None:
+def set_generation_config(self, config: GenerationConfig) -> None:
...
def start_chat(self, system_message: str = '') -> None:
...
@@ -2043,6 +2047,8 @@ class WhisperGenerationConfig:
...
def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None:
...
+def update_generation_config(self, **kwargs) -> None:
+...
class WhisperPerfMetrics(PerfMetrics):
"""
8 changes: 4 additions & 4 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -235,22 +235,22 @@ void init_continuous_batching_pipeline(py::module_& m) {
.def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
.def("get_config", &ContinuousBatchingPipeline::get_config)
.def("get_metrics", &ContinuousBatchingPipeline::get_metrics)
.def("add_request", py::overload_cast<uint64_t, const ov::Tensor&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("sampling_params"))
.def("add_request", py::overload_cast<uint64_t, const std::string&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("sampling_params"))
.def("add_request", py::overload_cast<uint64_t, const ov::Tensor&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("generation_config"))
.def("add_request", py::overload_cast<uint64_t, const std::string&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("generation_config"))
.def("step", &ContinuousBatchingPipeline::step)
.def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests)
.def(
"generate",
py::overload_cast<const std::vector<ov::Tensor>&, const std::vector<ov::genai::GenerationConfig>&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate),
py::arg("input_ids"),
py::arg("sampling_params"),
py::arg("generation_config"),
py::arg("streamer") = std::monostate{}
)
.def(
"generate",
py::overload_cast<const std::vector<std::string>&, const std::vector<ov::genai::GenerationConfig>&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate),
py::arg("prompts"),
py::arg("sampling_params"),
py::arg("generation_config"),
py::arg("streamer") = std::monostate{}
);
}
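The keyword rename above means callers that passed sampling_params by name must switch to generation_config. A sketch, assuming the models_path constructor overload exists in this form and using placeholder paths:

import openvino_genai

scheduler_config = openvino_genai.SchedulerConfig()
pipe = openvino_genai.ContinuousBatchingPipeline("path/to/model", scheduler_config, "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 16

# The old keyword no longer matches the binding:
#   pipe.add_request(0, "Hello", sampling_params=config)   # TypeError now
handle = pipe.add_request(0, "Hello", generation_config=config)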
8 changes: 7 additions & 1 deletion src/python/py_generation_config.cpp
@@ -118,7 +118,13 @@ void init_generation_config(py::module_& m) {
.def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
.def("is_beam_search", &GenerationConfig::is_beam_search)
.def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding)
.def("is_multinomial", &GenerationConfig::is_multinomial)
.def("is_assisting_generation", &GenerationConfig::is_assisting_generation)
.def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup)
.def("update_generation_config", static_cast<void (GenerationConfig::*)(const ov::AnyMap&)>(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map"));
.def("validate", &GenerationConfig::validate)
.def("update_generation_config", [](
ov::genai::GenerationConfig& config,
const py::kwargs& kwargs) {
config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
});
}
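With these bindings, a Python GenerationConfig can be adjusted through keyword arguments and checked before use; update_generation_config(**kwargs) replaces the earlier dict-of-OVAny config_map form. A short sketch:

import openvino_genai

config = openvino_genai.GenerationConfig()
# kwargs form replaces the old config_map dict of OVAny values
config.update_generation_config(do_sample=True, top_p=0.9, max_new_tokens=32)
config.validate()  # raises if the combination of fields is inconsistent

print(config.is_multinomial())   # True: do_sample with num_beams == 1
print(config.is_beam_search())   # False: num_beams is still 1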
14 changes: 7 additions & 7 deletions src/python/py_image_generation_pipelines.cpp
@@ -224,7 +224,7 @@ void init_image_generation_pipelines(py::module_& m) {
.def_readwrite("max_sequence_length", &ov::genai::ImageGenerationConfig::max_sequence_length)
.def("validate", &ov::genai::ImageGenerationConfig::validate)
.def("update_generation_config", [](
-ov::genai::ImageGenerationConfig config,
+ov::genai::ImageGenerationConfig& config,
const py::kwargs& kwargs) {
config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
});
@@ -255,8 +255,8 @@
device (str): Device to run the model on (e.g., CPU, GPU).
kwargs: Text2ImagePipeline properties
)")
.def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config)
.def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("generation_config"))
.def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config, py::return_value_policy::copy)
.def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("config"))
.def("set_scheduler", &ov::genai::Text2ImagePipeline::set_scheduler, py::arg("scheduler"))
.def("reshape", &ov::genai::Text2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale"))
.def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
@@ -323,8 +323,8 @@
device (str): Device to run the model on (e.g., CPU, GPU).
kwargs: Image2ImagePipeline properties
)")
.def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config)
.def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("generation_config"))
.def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config, py::return_value_policy::copy)
.def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("config"))
.def("set_scheduler", &ov::genai::Image2ImagePipeline::set_scheduler, py::arg("scheduler"))
.def("reshape", &ov::genai::Image2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale"))
.def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
@@ -386,8 +386,8 @@
device (str): Device to run the model on (e.g., CPU, GPU).
kwargs: InpaintingPipeline properties
)")
.def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config)
.def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("generation_config"))
.def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config, py::return_value_policy::copy)
.def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("config"))
.def("set_scheduler", &ov::genai::InpaintingPipeline::set_scheduler, py::arg("scheduler"))
.def("reshape", &ov::genai::InpaintingPipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale"))
.def_static("stable_diffusion", &ov::genai::InpaintingPipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
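Two fixes land in these image-pipeline bindings: update_generation_config now takes the config by reference (the kwargs previously updated a temporary copy and were lost), and get_generation_config explicitly returns a copy, so edits must be written back with set_generation_config. A sketch of the intended round-trip, with a placeholder model path:

import openvino_genai

pipe = openvino_genai.Text2ImagePipeline("path/to/image/model", "CPU")

config = pipe.get_generation_config()  # a copy, not a live reference
config.update_generation_config(num_images_per_prompt=2, guidance_scale=7.5)
pipe.set_generation_config(config)     # write the edited copy back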
9 changes: 2 additions & 7 deletions src/python/py_llm_pipeline.cpp
@@ -53,15 +53,10 @@ py::object call_common_generate(
const pyutils::PyBindStreamerVariant& py_streamer,
const py::kwargs& kwargs
) {
-ov::genai::GenerationConfig default_config;
-if (config.has_value()) {
-default_config = *config;
-} else {
-default_config = pipe.get_generation_config();
-}
+ov::genai::GenerationConfig default_config = config.has_value() ? *config : pipe.get_generation_config();
auto updated_config = pyutils::update_config_from_kwargs(default_config, kwargs);

py::object results;
EncodedInputs tensor_data;
StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer);

// Call suitable generate overload for each type of input.
5 changes: 4 additions & 1 deletion src/python/py_utils.cpp
@@ -358,7 +358,10 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O
ov::genai::GenerationConfig res_config;
if(config.has_value())
res_config = *config;
-res_config.update_generation_config(kwargs_to_any_map(kwargs));

+if (!kwargs.empty())
+res_config.update_generation_config(kwargs_to_any_map(kwargs));

return res_config;
}

6 changes: 3 additions & 3 deletions src/python/py_vlm_pipeline.cpp
@@ -150,10 +150,10 @@ void init_vlm_pipeline(py::module_& m) {

.def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "")
.def("finish_chat", &ov::genai::VLMPipeline::finish_chat)
.def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("new_template"))
.def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("chat_template"))
.def("get_tokenizer", &ov::genai::VLMPipeline::get_tokenizer)
.def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config)
.def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("new_config"))
.def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config, py::return_value_policy::copy)
.def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("config"))
.def(
"generate",
[](ov::genai::VLMPipeline& pipe,
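The VLM bindings get the same treatment: keyword names now match the pyi stubs, and get_generation_config returns a copy. A sketch with placeholder paths (the chat template string is a stand-in, not a real template):

import openvino_genai

pipe = openvino_genai.VLMPipeline("path/to/vlm_model", "CPU")

# Keyword spellings follow the new py::arg names:
pipe.set_chat_template(chat_template="{% for message in messages %}...{% endfor %}")

config = pipe.get_generation_config()  # a copy
config.max_new_tokens = 64
pipe.set_generation_config(config)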
