diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp index 236b31b351..fc18fa8e0c 100644 --- a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp +++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp @@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try { config.max_new_tokens = 20; config.num_beam_groups = 3; config.num_beams = 15; + config.diversity_penalty = 1.0f; config.num_return_sequences = config.num_beams; // Since the streamer is set, the results will diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py index 16b8b76175..4e2430a47f 100755 --- a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py +++ b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py @@ -19,6 +19,7 @@ def main(): config.max_new_tokens = 20 config.num_beam_groups = 3 config.num_beams = 15 + config.diversity_penalty = 1 config.num_return_sequences = config.num_beams beams = pipe.generate(args.prompts, config) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 4ea75e94c5..164ff29131 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -93,15 +93,22 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool echo = false; size_t logprobs = 0; + // EOS special token + int64_t eos_token_id = -1; std::set stop_strings; // Default setting in vLLM (and OpenAI API) is not to include stop string in the output bool include_stop_str_in_output = false; std::set stop_token_ids; + // penalties (not used in beam search) + float repetition_penalty = 1.0f; + float presence_penalty = 0.0; + float frequency_penalty = 0.0f; + // Beam search specific size_t num_beam_groups = 1; size_t num_beams = 1; - float diversity_penalty = 1.0f; + float diversity_penalty = 
0.0f; float length_penalty = 1.0f; size_t num_return_sequences = 1; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -112,9 +119,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { float top_p = 1.0f; size_t top_k = std::numeric_limits::max(); bool do_sample = false; - float repetition_penalty = 1.0f; - float presence_penalty = 0.0; - float frequency_penalty = 0.0f; size_t rng_seed = 0; // Assisting generation parameters @@ -122,9 +126,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t num_assistant_tokens = 0; size_t max_ngram_size = 0; - // EOS special token - int64_t eos_token_id = -1; - std::optional adapters; /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. @@ -136,11 +137,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool is_greedy_decoding() const; bool is_beam_search() const; bool is_multinomial() const; - OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release") - bool is_speculative_decoding() const; bool is_assisting_generation() const; bool is_prompt_lookup() const; - void update_generation_config(const ov::AnyMap& config_map); + + OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release") + bool is_speculative_decoding() const; + + void update_generation_config(const ov::AnyMap& properties); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -187,8 +190,13 @@ static constexpr ov::Property assistant_confidence_threshold{"assistant_c static constexpr ov::Property num_assistant_tokens{"num_assistant_tokens"}; // Predefined Configs + +OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. 
This method will be removed in 2026.0.0 release") OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); +OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release") OPENVINO_GENAI_EXPORTS GenerationConfig greedy(); +OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release") OPENVINO_GENAI_EXPORTS GenerationConfig multinomial(); + } // namespace genai } // namespace ov diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 4ff184547e..59be603fd9 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -24,6 +24,7 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { nlohmann::json data = nlohmann::json::parse(f); + read_json_param(data, "eos_token_id", eos_token_id); read_json_param(data, "max_new_tokens", max_new_tokens); read_json_param(data, "max_length", max_length); // note that ignore_eos is not present in HF GenerationConfig @@ -32,28 +33,40 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { read_json_param(data, "stop_strings", stop_strings); // note that include_stop_str_in_output is not present in HF GenerationConfig read_json_param(data, "include_stop_str_in_output", include_stop_str_in_output); - // note that stop_token_ids is not present in HF GenerationConfig - read_json_param(data, "stop_token_ids", stop_token_ids); + // note that stop_token_ids is not present in HF GenerationConfig, but some generation_config.json define + // multiple eos_token_id (e.g. 
https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/generation_config.json) + // so, we need to read them as 'stop_token_ids' + std::vector ordered_stop_token_ids; + read_json_param(data, "eos_token_id", ordered_stop_token_ids); + + if (!ordered_stop_token_ids.empty()) { + for (int64_t stop_token_id : ordered_stop_token_ids) + stop_token_ids.insert(stop_token_id); + + if (eos_token_id == -1) { + eos_token_id = ordered_stop_token_ids[0]; + } + } + + // note that echo is not present in HF GenerationConfig + read_json_param(data, "echo", echo); + // note that logprobs is not present in HF GenerationConfig + read_json_param(data, "logprobs", logprobs); + + // penalties + read_json_param(data, "repetition_penalty", repetition_penalty); + // note that frequency_penalty is not present in HF GenerationConfig + read_json_param(data, "frequency_penalty", frequency_penalty); + // note that presence_penalty is not present in HF GenerationConfig + read_json_param(data, "presence_penalty", presence_penalty); + + // beam search read_json_param(data, "num_beam_groups", num_beam_groups); read_json_param(data, "num_beams", num_beams); read_json_param(data, "diversity_penalty", diversity_penalty); read_json_param(data, "length_penalty", length_penalty); read_json_param(data, "num_return_sequences", num_return_sequences); read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size); - read_json_param(data, "temperature", temperature); - read_json_param(data, "top_p", top_p); - read_json_param(data, "top_k", top_k); - read_json_param(data, "do_sample", do_sample); - read_json_param(data, "repetition_penalty", repetition_penalty); - read_json_param(data, "eos_token_id", eos_token_id); - // note that echo is not present in HF GenerationConfig - read_json_param(data, "echo", echo); - // note that logprobs is not present in HF GenerationConfig - read_json_param(data, "logprobs", logprobs); - - // append EOS to stop_token_ids - if (eos_token_id != -1) - 
set_eos_token_id(eos_token_id); if (data.contains("early_stopping")) { auto field_type = data["early_stopping"].type(); @@ -65,6 +78,21 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { stop_criteria = StopCriteria::HEURISTIC; } } + + // multinomial + read_json_param(data, "do_sample", do_sample); + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + + // assistant generation + read_json_param(data, "assistant_confidence_threshold", assistant_confidence_threshold); + read_json_param(data, "num_assistant_tokens", num_assistant_tokens); + read_json_param(data, "max_ngram_size", max_ngram_size); + + // append EOS to stop_token_ids + if (eos_token_id != -1) + set_eos_token_id(eos_token_id); } void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { @@ -79,35 +107,50 @@ void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { stop_token_ids.insert(eos_token_id); } -void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { +void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { using utils::read_anymap_param; - read_anymap_param(config_map, "max_new_tokens", max_new_tokens); - read_anymap_param(config_map, "max_length", max_length); - read_anymap_param(config_map, "ignore_eos", ignore_eos); - read_anymap_param(config_map, "min_new_tokens", min_new_tokens); - read_anymap_param(config_map, "stop_strings", stop_strings); - read_anymap_param(config_map, "include_stop_str_in_output", include_stop_str_in_output); - read_anymap_param(config_map, "stop_token_ids", stop_token_ids); - read_anymap_param(config_map, "num_beam_groups", num_beam_groups); - read_anymap_param(config_map, "num_beams", num_beams); - read_anymap_param(config_map, "diversity_penalty", diversity_penalty); - read_anymap_param(config_map, "length_penalty", length_penalty); - read_anymap_param(config_map, 
"num_return_sequences", num_return_sequences); - read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size); - read_anymap_param(config_map, "stop_criteria", stop_criteria); - read_anymap_param(config_map, "temperature", temperature); - read_anymap_param(config_map, "top_p", top_p); - read_anymap_param(config_map, "top_k", top_k); - read_anymap_param(config_map, "do_sample", do_sample); - read_anymap_param(config_map, "repetition_penalty", repetition_penalty); - read_anymap_param(config_map, "eos_token_id", eos_token_id); - read_anymap_param(config_map, "echo", echo); - read_anymap_param(config_map, "logprobs", logprobs); - read_anymap_param(config_map, "adapters", adapters); + // stop conditions + read_anymap_param(properties, "eos_token_id", eos_token_id); + read_anymap_param(properties, "max_new_tokens", max_new_tokens); + read_anymap_param(properties, "max_length", max_length); + read_anymap_param(properties, "ignore_eos", ignore_eos); + read_anymap_param(properties, "min_new_tokens", min_new_tokens); + read_anymap_param(properties, "stop_strings", stop_strings); + read_anymap_param(properties, "include_stop_str_in_output", include_stop_str_in_output); + read_anymap_param(properties, "stop_token_ids", stop_token_ids); + + // generic + read_anymap_param(properties, "echo", echo); + read_anymap_param(properties, "logprobs", logprobs); + read_anymap_param(properties, "num_return_sequences", num_return_sequences); + read_anymap_param(properties, "adapters", adapters); + // penalties + read_anymap_param(properties, "frequency_penalty", frequency_penalty); + read_anymap_param(properties, "presence_penalty", presence_penalty); + read_anymap_param(properties, "repetition_penalty", repetition_penalty); + + // beam search + read_anymap_param(properties, "num_beam_groups", num_beam_groups); + read_anymap_param(properties, "num_beams", num_beams); + read_anymap_param(properties, "diversity_penalty", diversity_penalty); + read_anymap_param(properties, 
"length_penalty", length_penalty); + read_anymap_param(properties, "stop_criteria", stop_criteria); + read_anymap_param(properties, "no_repeat_ngram_size", no_repeat_ngram_size); + + // multinomial + read_anymap_param(properties, "do_sample", do_sample); + read_anymap_param(properties, "temperature", temperature); + read_anymap_param(properties, "top_p", top_p); + read_anymap_param(properties, "top_k", top_k); // TODO: add support of 'generator' property similar to Image generation - read_anymap_param(config_map, "rng_seed", rng_seed); + read_anymap_param(properties, "rng_seed", rng_seed); + + // assistant generation + read_anymap_param(properties, "assistant_confidence_threshold", assistant_confidence_threshold); + read_anymap_param(properties, "num_assistant_tokens", num_assistant_tokens); + read_anymap_param(properties, "max_ngram_size", max_ngram_size); } size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { @@ -136,69 +179,94 @@ bool GenerationConfig::is_speculative_decoding() const { } bool GenerationConfig::is_assisting_generation() const { - return (assistant_confidence_threshold > 0 || num_assistant_tokens > 0); + return assistant_confidence_threshold > 0 || num_assistant_tokens > 0; } bool GenerationConfig::is_prompt_lookup() const { - return (max_ngram_size > 0 && num_assistant_tokens > 0); + return max_ngram_size > 0 && num_assistant_tokens > 0; } void GenerationConfig::validate() const { + OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); + + // Stop conditions + OPENVINO_ASSERT(eos_token_id == -1 || stop_token_ids.find(eos_token_id) != stop_token_ids.end(), "'stop_token_ids' must contain 'eos_token_id'. Please, call 'set_eos_token_id' with 'eos_token_id' value"); - OPENVINO_ASSERT(!do_sample || num_beams == 1, - "Beam search with sampling is not supported yet. 
" - "Please either set do_sample=false to use beam search " - "or set num_beams=1 if you with to use multinomial sampling."); - OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); + auto stop_token_ids_it = std::find_if(stop_token_ids.begin(), stop_token_ids.end(), [] (int64_t stop_token_id) -> bool { + return stop_token_id < 0; + }); + OPENVINO_ASSERT(stop_token_ids_it == stop_token_ids.end(), "'stop_token_ids' must be non-negative, but it contains a value ", *stop_token_ids_it); + + OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "ignore_eos is true, in this case either 'max_new_tokens', or 'max_length' should be defined."); + + OPENVINO_ASSERT(eos_token_id != -1 || !stop_token_ids.empty() || !stop_strings.empty() || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "Either 'eos_token_id', or 'stop_token_ids', or 'stop_strings', or 'max_new_tokens', or 'max_length' should be defined."); + OPENVINO_ASSERT(max_new_tokens > 0 || (max_new_tokens == 0 && echo), "'max_new_tokens' must be greater than 0, if `echo` is set, 0 is also accepted"); OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); - OPENVINO_ASSERT( - num_beams % num_beam_groups == 0, - "number of beams should be divisible by number of groups" - ); - - // max_new_tokens has priority over max_length - // if max_new_tokens is defined no need to check max_length - OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, - "'max_length' must be greater than 0 or 'max_new_tokens' should be defined"); - - OPENVINO_ASSERT(!do_sample || top_k > 0, - "top_k must be a strictly positive, but got ", - top_k); - OPENVINO_ASSERT(!do_sample || (top_p > 0 && top_p <= 1.0f), - "top_p must be a positive float > 0 and < 1, but got ", - top_p); - OPENVINO_ASSERT(!do_sample || temperature > 0, - "Temperature must be a strictly positive float, but got ", - temperature); - - 
OPENVINO_ASSERT(repetition_penalty > 0, - "Repetition penalty must be a strictly positive float, but got ", - repetition_penalty); - - OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, - "ignore_eos == true, in this case either 'max_new_tokens', or 'max_length' should be defined."); - OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, - "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + // Sampling strategies + + OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), + "'num_return_sequences' can be more than 1 only in case of beam search or multinomial sampling, but got ", num_return_sequences); + + // generic penalties, but not supported by beam search currently + if (!is_beam_search()) { + OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "'frequence_penalty' penalty must be within [-2.0; 2.0], but got ", frequency_penalty); + OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "'presence_penalty' penalty must be within [-2.0; 2.0], but got ", presence_penalty); + OPENVINO_ASSERT(repetition_penalty > 0.0f, "'repetition_penalty' must be a strictly positive float, but got ", repetition_penalty); + } else { + OPENVINO_ASSERT(frequency_penalty == 0.0f, "'frequency_penalty' is not currently supported by beam search and should be 0.0f, but got ", frequency_penalty); + OPENVINO_ASSERT(presence_penalty == 0.0f, "'presence_penalty' is not currently supported by beam search and should be 0.0f, but got ", presence_penalty); + OPENVINO_ASSERT(repetition_penalty == 1.0f, "'repetition_penalty' is not currently supported by beam search and should be 1.0f, but got ", repetition_penalty); + } + + if (is_multinomial()) { + OPENVINO_ASSERT(top_k >= 0, "When 'do_sample' is true, top_k must be a non-negative, but got ", top_k); + OPENVINO_ASSERT(top_p > 0 && top_p <= 1.0f, "When 'do_sample' is true, 
top_p must be a positive float > 0.0 and <= 1.0, but got ", top_p); + OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature); + } else { + // parameters requiring multinomial + OPENVINO_ASSERT(top_k == std::numeric_limits::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k); + OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p); + OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature); + } + if (is_beam_search()) { - OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + OPENVINO_ASSERT(num_beams % num_beam_groups == 0, "'num_beams' (", num_beams, ") should be divisible by 'num_beam_groups' (", num_beam_groups, ")"); + OPENVINO_ASSERT(num_beams >= num_return_sequences, "'num_beams' (", num_beams, ") must be greater equal than 'num_return_sequences' (", num_return_sequences, ")"); + + OPENVINO_ASSERT(!do_sample, + "Beam search with sampling is not supported yet. 
" + "Please either set do_sample=false to use beam search " + "or set num_beams=1 if you with to use multinomial sampling."); + + OPENVINO_ASSERT(no_repeat_ngram_size > 0, "'no_repeat_ngram_size' must be positive"); if (num_beam_groups > 1) { - OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search"); + OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, otherwise it fallbacks to non-grouped beam search"); + } else { + OPENVINO_ASSERT(diversity_penalty == 0.0f, "For beam search 'diversity_penalty' is applicable only when grouped beam search is used, but got 'num_beam_groups' == 1"); } } else { - OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); - OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); + // parameters requiring beam search + OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups); + OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size); + OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling"); + OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling"); } + + // assistant generation + if (is_assisting_generation()) { - if (assistant_confidence_threshold != 0.f) { - OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); - 
OPENVINO_ASSERT(!is_prompt_lookup(), "Parameters `assistant_confidence_threshold` cannot be used while Prompt Lookup decoding"); - } else { - OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); - }; + OPENVINO_ASSERT(!is_beam_search() && num_return_sequences == 1, "Beam search and parallel sampling are not compatible with assistant generation"); + OPENVINO_ASSERT(assistant_confidence_threshold == 0.0f || num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); + } + + if (num_assistant_tokens == 0) { + OPENVINO_ASSERT(max_ngram_size == 0, "'max_ngram_size' should be set to default value 0 when prompt lookup is disabled"); } } diff --git a/src/cpp/src/json_utils.hpp b/src/cpp/src/json_utils.hpp index 13d792e9db..4a4bb001df 100644 --- a/src/cpp/src/json_utils.hpp +++ b/src/cpp/src/json_utils.hpp @@ -4,6 +4,9 @@ #pragma once +#include +#include + #include namespace ov { @@ -40,6 +43,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, std::v } } +template +void read_json_param(const nlohmann::json& data, const std::string& name, std::set& param) { + if (data.contains(name) && data[name].is_array()) { + for (const auto elem : data[name]) { + param.insert(elem.get()); + } + } +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 81f411020e..3e378e78cf 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -72,7 +72,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const ov::AnyMap& config, const ov::genai::GenerationConfig& generation_config ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) { - ov::Core core = utils::singleton_core(); ov::CompiledModel compiled_model; auto 
[core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); utils::slice_matmul_stateful_model(model); @@ -81,10 +80,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable - compiled_model = core.compile_model(model, device, *filtered_plugin_config); + compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config); m_model_runner = compiled_model.create_infer_request(); } else { - compiled_model = core.compile_model(model, device, plugin_config); + compiled_model = utils::singleton_core().compile_model(model, device, plugin_config); m_model_runner = compiled_model.create_infer_request(); } ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model"); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 8510a8389f..5d82fa89a3 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -367,16 +367,16 @@ class ContinuousBatchingPipeline: def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: ... @typing.overload - def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle: + def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle: ... 
@typing.overload - def add_request(self, request_id: int, prompt: str, sampling_params: GenerationConfig) -> GenerationHandle: + def add_request(self, request_id: int, prompt: str, generation_config: GenerationConfig) -> GenerationHandle: ... @typing.overload - def generate(self, input_ids: list[openvino._pyopenvino.Tensor], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]: + def generate(self, input_ids: list[openvino._pyopenvino.Tensor], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]: ... @typing.overload - def generate(self, prompts: list[str], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]: + def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]: ... def get_config(self) -> GenerationConfig: ... @@ -609,11 +609,15 @@ class GenerationConfig: ... def is_greedy_decoding(self) -> bool: ... + def is_multinomial(self) -> bool: + ... def is_prompt_lookup(self) -> bool: ... def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: ... - def update_generation_config(self, config_map: dict[str, openvino._pyopenvino.OVAny]) -> None: + def update_generation_config(self, **kwargs) -> None: + ... + def validate(self) -> None: ... class GenerationFinishReason: """ @@ -826,7 +830,7 @@ class Image2ImagePipeline: ... def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None: ... - def set_generation_config(self, generation_config: ImageGenerationConfig) -> None: + def set_generation_config(self, config: ImageGenerationConfig) -> None: ... def set_scheduler(self, scheduler: Scheduler) -> None: ... 
@@ -927,7 +931,7 @@ class InpaintingPipeline: ... def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None: ... - def set_generation_config(self, generation_config: ImageGenerationConfig) -> None: + def set_generation_config(self, config: ImageGenerationConfig) -> None: ... def set_scheduler(self, scheduler: Scheduler) -> None: ... @@ -1615,7 +1619,7 @@ class Text2ImagePipeline: ... def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None: ... - def set_generation_config(self, generation_config: ImageGenerationConfig) -> None: + def set_generation_config(self, config: ImageGenerationConfig) -> None: ... def set_scheduler(self, scheduler: Scheduler) -> None: ... @@ -1865,9 +1869,9 @@ class VLMPipeline: ... def get_tokenizer(self) -> Tokenizer: ... - def set_chat_template(self, new_template: str) -> None: + def set_chat_template(self, chat_template: str) -> None: ... - def set_generation_config(self, new_config: GenerationConfig) -> None: + def set_generation_config(self, config: GenerationConfig) -> None: ... def start_chat(self, system_message: str = '') -> None: ... @@ -2043,6 +2047,8 @@ class WhisperGenerationConfig: ... def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: ... + def update_generation_config(self, **kwargs) -> None: + ... 
class WhisperPerfMetrics(PerfMetrics): """ diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index be7a72481f..2b48e4d44d 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -235,22 +235,22 @@ void init_continuous_batching_pipeline(py::module_& m) { .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("sampling_params")) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("sampling_params")) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("generation_config")) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("generation_config")) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) .def( "generate", py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), py::arg("input_ids"), - py::arg("sampling_params"), + py::arg("generation_config"), py::arg("streamer") = std::monostate{} ) .def( "generate", py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), py::arg("prompts"), - py::arg("sampling_params"), + py::arg("generation_config"), py::arg("streamer") = std::monostate{} ); } diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index f49bcf29bd..a97a43fc5c 100644 --- 
a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -118,7 +118,13 @@ void init_generation_config(py::module_& m) { .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) + .def("is_multinomial", &GenerationConfig::is_multinomial) .def("is_assisting_generation", &GenerationConfig::is_assisting_generation) .def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup) - .def("update_generation_config", static_cast(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map")); + .def("validate", &GenerationConfig::validate) + .def("update_generation_config", []( + ov::genai::GenerationConfig& config, + const py::kwargs& kwargs) { + config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + }); } diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index 311f3f3760..c246557a97 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -224,7 +224,7 @@ void init_image_generation_pipelines(py::module_& m) { .def_readwrite("max_sequence_length", &ov::genai::ImageGenerationConfig::max_sequence_length) .def("validate", &ov::genai::ImageGenerationConfig::validate) .def("update_generation_config", []( - ov::genai::ImageGenerationConfig config, + ov::genai::ImageGenerationConfig& config, const py::kwargs& kwargs) { config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); }); @@ -255,8 +255,8 @@ void init_image_generation_pipelines(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). 
kwargs: Text2ImagePipeline properties )") - .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config) - .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("generation_config")) + .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("config")) .def("set_scheduler", &ov::genai::Text2ImagePipeline::set_scheduler, py::arg("scheduler")) .def("reshape", &ov::genai::Text2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) .def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) @@ -323,8 +323,8 @@ void init_image_generation_pipelines(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). 
kwargs: Image2ImagePipeline properties )") - .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config) - .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("generation_config")) + .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("config")) .def("set_scheduler", &ov::genai::Image2ImagePipeline::set_scheduler, py::arg("scheduler")) .def("reshape", &ov::genai::Image2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) @@ -386,8 +386,8 @@ void init_image_generation_pipelines(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). 
kwargs: InpaintingPipeline properties )") - .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config) - .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("generation_config")) + .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("config")) .def("set_scheduler", &ov::genai::InpaintingPipeline::set_scheduler, py::arg("scheduler")) .def("reshape", &ov::genai::InpaintingPipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) .def_static("stable_diffusion", &ov::genai::InpaintingPipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index b1d5136253..7360975a0b 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -53,15 +53,10 @@ py::object call_common_generate( const pyutils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { - ov::genai::GenerationConfig default_config; - if (config.has_value()) { - default_config = *config; - } else { - default_config = pipe.get_generation_config(); - } + ov::genai::GenerationConfig default_config = config.has_value() ? *config : pipe.get_generation_config(); auto updated_config = pyutils::update_config_from_kwargs(default_config, kwargs); + py::object results; - EncodedInputs tensor_data; StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); // Call suitable generate overload for each type of input. 
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 45a0c46174..34522409ea 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -358,7 +358,10 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O ov::genai::GenerationConfig res_config; if(config.has_value()) res_config = *config; - res_config.update_generation_config(kwargs_to_any_map(kwargs)); + + if (!kwargs.empty()) + res_config.update_generation_config(kwargs_to_any_map(kwargs)); + return res_config; } diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 340cb3da62..b0cfa0a42a 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -150,10 +150,10 @@ void init_vlm_pipeline(py::module_& m) { .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) - .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("new_template")) + .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("chat_template")) .def("get_tokenizer", &ov::genai::VLMPipeline::get_tokenizer) - .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) - .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("new_config")) + .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("config")) .def( "generate", [](ov::genai::VLMPipeline& pipe, diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index cd42dcf58d..d290612ed6 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -187,7 +187,10 @@ OptionalWhisperGenerationConfig update_whisper_config_from_kwargs(const Optional WhisperGenerationConfig res_config; if (config.has_value()) res_config 
= *config; - res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + + if (!kwargs.empty()) + res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + return res_config; } @@ -295,7 +298,12 @@ void init_whisper_pipeline(py::module_& m) { .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt) .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords) - .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")); + .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) + .def("update_generation_config", []( + ov::genai::WhisperGenerationConfig& config, + const py::kwargs& kwargs) { + config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + });; py::class_(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 093cd993de..b8c2e625c5 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -25,8 +25,8 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/text_callback_streamer.cpp") -add_executable(${TEST_TARGET_NAME} ${tests_src} - block_allocator.cpp) +add_executable(${TEST_TARGET_NAME} ${tests_src}) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE openvino::genai gtest_main) target_include_directories(${TEST_TARGET_NAME} PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src") target_sources(${TEST_TARGET_NAME} PRIVATE ${src_files}) diff --git a/tests/cpp/generate_config.cpp b/tests/cpp/generate_config.cpp deleted file mode 100644 index 974fd499f8..0000000000 --- a/tests/cpp/generate_config.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// 
SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include "openvino/genai/generation_config.hpp" - - -using namespace ov::genai; - -TEST(GenerationConfigTest, invalid_temperature) { - GenerationConfig config; - config.max_new_tokens = 20; - config.temperature = -0.1; - config.do_sample = true; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_temperature) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.temperature = 0.1; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_top_p) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.top_p = -0.5; - EXPECT_THROW(config.validate(), ov::Exception); - config.top_p = 1.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_top_p) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.top_p = 0.1; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_repeatition_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.repetition_penalty = -3.0; - EXPECT_THROW(config.validate(), ov::Exception); - config.repetition_penalty = -0.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_repeatition_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.repetition_penalty = 1.8; - EXPECT_NO_THROW(config.validate()); - config.repetition_penalty = 0.1; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_presence_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.presence_penalty = 3.0; - EXPECT_THROW(config.validate(), ov::Exception); - config.presence_penalty = -3.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, 
valid_presence_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.presence_penalty = 1.8; - EXPECT_NO_THROW(config.validate()); - config.presence_penalty = -2.0; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_frequency_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.frequency_penalty = 3.0; - EXPECT_THROW(config.validate(), ov::Exception); - config.frequency_penalty = -3.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_frequency_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.frequency_penalty = 1.8; - EXPECT_NO_THROW(config.validate()); - config.frequency_penalty = -2.0; - EXPECT_NO_THROW(config.validate()); -} - -ov::genai::GenerationConfig speculative_decoding_multinomial() { - auto speculative_decoding_multinomial_config = ov::genai::multinomial(); - speculative_decoding_multinomial_config.num_assistant_tokens = 5; - return speculative_decoding_multinomial_config; -} - -ov::genai::GenerationConfig speculative_decoding_greedy() { - auto speculative_decoding_greedy_config = ov::genai::greedy(); - speculative_decoding_greedy_config.assistant_confidence_threshold = 0.4f; - return speculative_decoding_greedy_config; -} - -TEST(GenerationConfigTest, invalid_static_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.num_assistant_tokens = 5; - config.assistant_confidence_threshold = 0.2; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_static_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.num_assistant_tokens = 5; - config.assistant_confidence_threshold = 0; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_dynamic_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - 
config.num_assistant_tokens = 5; - config.assistant_confidence_threshold = 0.5; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_dynamic_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.assistant_confidence_threshold = 0.5; - config.num_assistant_tokens = 0; - EXPECT_NO_THROW(config.validate()); -} diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index f940d272ed..9040fa435f 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -73,6 +73,7 @@ def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 generation_config.num_return_sequences = generation_config.num_beams @@ -82,6 +83,7 @@ def get_beam_search_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.min_new_tokens = 15 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 @@ -92,6 +94,7 @@ def get_beam_search_with_single_stop_string() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 50 generation_config.num_return_sequences = generation_config.num_beams generation_config.stop_strings = {"open sour"} # expected match on "open source" @@ -102,6 +105,7 @@ def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 50 
generation_config.num_return_sequences = generation_config.num_beams generation_config.stop_strings = {".", "software", "Intel"} @@ -112,6 +116,7 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = generation_config.num_beams generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} @@ -299,7 +304,7 @@ def convert_to_hf( kwargs['pad_token_id'] = default_generation_config.pad_token_id kwargs['repetition_penalty'] = generation_config.repetition_penalty - if generation_config.num_beams > 1: + if generation_config.is_beam_search(): # beam search case kwargs['num_beam_groups'] = generation_config.num_beam_groups kwargs['num_beams'] = generation_config.num_beams @@ -309,7 +314,7 @@ def convert_to_hf( kwargs['output_scores'] = True if generation_config.num_beam_groups > 1: kwargs['diversity_penalty'] = generation_config.diversity_penalty - elif generation_config.do_sample: + elif generation_config.is_multinomial(): # mulitinomial kwargs['temperature'] = generation_config.temperature kwargs['top_k'] = generation_config.top_k diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 3fc89cb8a7..9e8e4681f9 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -111,7 +111,7 @@ def read_model(params, **tokenizer_kwargs): path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(path, 'CPU', **{'ENABLE_MMAP': False}), + ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False), ) @@ -139,7 +139,7 @@ def model_tmp_path(tmpdir_factory): @pytest.fixture(scope="module") -def model_tokenizers_path_tmp_path(tmpdir_factory): +def model_tokenizers_tmp_path(tmpdir_factory): model_id, path, _, _, _ = read_model(get_models_list()[0]) 
temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) @@ -180,10 +180,15 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return ov_genai.LLMPipeline(temp_path, 'CPU') + + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU') + + for _, config_name in configs: + os.remove(temp_path / config_name) + + return ov_pipe @functools.lru_cache(1) def get_continuous_batching(path): - scheduler_config = ov_genai.SchedulerConfig() - return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config}) + return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig()) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 3a1e9fa092..01762bf9e3 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -105,7 +105,7 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): generation_configs = [ dict(do_sample=False, max_new_tokens=20), - dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) + dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0, repetition_penalty=1.0) ] questions = [ '1+1=', @@ -113,19 +113,22 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): 'Why is the Sun yellow?', 'What was my first question?' 
] -@pytest.mark.parametrize("generation_config", generation_configs[1:]) +@pytest.mark.parametrize("generation_config_kwargs", generation_configs[1:]) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit -def test_chat_scenario_vs_stateful(model_descr, generation_config: Dict): +def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict): model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) cb_pipe = get_continuous_batching(path) ov_pipe.start_chat() cb_pipe.start_chat() + generation_config = GenerationConfig(**generation_config_kwargs) + ov_pipe.set_generation_config(generation_config) + for question in questions: - generated = cb_pipe.generate(question, **generation_config) - reference = ov_pipe.generate(question, **generation_config) + generated = cb_pipe.generate(question, generation_config=generation_config) + reference = ov_pipe.generate(question) assert generated == reference # Test that finish_chat() doesn't fail just in case. 
@@ -168,9 +171,13 @@ def test_post_oom_health(tmp_path, sampling_config): # Pre-emption # -def get_greedy_seq_len_300() -> GenerationConfig: +def get_parallel_sampling_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_return_sequences = 3 + # TODO: add generation_config.generator and return parameters below + # generation_config.num_return_sequences = 3 + # generation_config.do_sample = True + # generation_config.top_k = 10 + # generation_config.top_p = 0.5 generation_config.max_new_tokens = 300 return generation_config @@ -178,14 +185,15 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 300 generation_config.num_return_sequences = generation_config.num_beams return generation_config scheduler_params_list = [({"num_kv_blocks": 2, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), ({"num_kv_blocks": 2, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_parallel_sampling_seq_len_300()), + ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_parallel_sampling_seq_len_300()), ({"num_kv_blocks": 34, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), ({"num_kv_blocks": 34, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), ({"num_kv_blocks": 100, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py new file mode 
100644 index 0000000000..110caaf0e5 --- /dev/null +++ b/tests/python_tests/test_generation_config.py @@ -0,0 +1,142 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino_genai import GenerationConfig +from typing import Tuple, List +import json +import os +import pytest + +configs = [ + # stop conditions + dict(max_new_tokens=12), + dict(max_length=12), + dict(stop_token_ids={2}), + dict(eos_token_id=1, stop_token_ids={1}), + dict(stop_strings={"a", "b"}), + dict(ignore_eos=True, max_new_tokens=10), + dict(ignore_eos=True, max_length=10), + dict(max_new_tokens=0, echo=True), + dict(min_new_tokens=1, max_new_tokens=1), + # multinomial + dict(max_new_tokens=1, do_sample=True, num_return_sequences=2), + dict(max_new_tokens=1, do_sample=True, top_k=1), + dict(max_new_tokens=1, do_sample=True, top_p=0.5), + dict(max_new_tokens=1, do_sample=True, temperature=0.5), + # beam search + dict(max_new_tokens=1, num_beams=2), + dict(max_new_tokens=1, num_beams=2, num_return_sequences=1), + dict(max_new_tokens=1, num_beams=2, num_return_sequences=2), + dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=1.0), + dict(max_new_tokens=1, num_beams=4, length_penalty=1.0), + dict(max_new_tokens=1, num_beams=4, no_repeat_ngram_size=2), + # assistant generation + dict(max_new_tokens=1, assistant_confidence_threshold=0.5), + dict(max_new_tokens=1, num_assistant_tokens=2), + dict(max_new_tokens=1, num_assistant_tokens=2, max_ngram_size=2), # prompt lookup +] +@pytest.mark.parametrize("generation_config_kwargs", configs) +@pytest.mark.precommit +@pytest.mark.nightly +def test_valid_configs(generation_config_kwargs): + config = GenerationConfig(**generation_config_kwargs) + config.validate() + + config = GenerationConfig() + config.update_generation_config(**generation_config_kwargs) + config.validate() + + +invalid_configs = [ + dict(num_return_sequences=0), # no reason to run with empty output + 
dict(num_return_sequences=2), # beam search or multimonial is required + # stop conditions + dict(), # no stop conditions at all + dict(eos_token_id=1), # 'stop_token_ids' does not contain 'eos_token_id' + dict(eos_token_id=1, stop_token_ids={2}), # 'stop_token_ids' is not empty, but does not contain 'eos_token_id' + dict(ignore_eos=True), # no 'max_new_tokens', no 'max_length' with 'ignore_eos' + dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative + dict(max_new_tokens=0), # max new tokens cannot be empty (only when 'echo' is True) + dict(max_new_tokens=10, min_new_tokens=20), # 'max_new_tokens' must be >= 'min_new_tokens' + # penalties + dict(max_new_tokens=1, repetition_penalty=-1.0), # invalid repetition_penalty + dict(max_new_tokens=1, presence_penalty=-3.0), # invalid presence_penalty + dict(max_new_tokens=1, frequency_penalty=3.0), # invalid frequency_penalty + # multinomial sampling + dict(max_new_tokens=1, do_sample=True, top_p=1.1), # 'top_p' must be within (0, 1] when 'do_sample' is True + dict(max_new_tokens=1, do_sample=True, top_p=0), # 'top_p' must be within (0, 1] when 'do_sample' is True + dict(max_new_tokens=1, do_sample=True, temperature=-1.0), # invalid temp + # parameters requiring multimonial + dict(max_new_tokens=1, top_k=1), # requires do_sample=True + dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True + dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True + # beam search + dict(max_new_tokens=1, num_beams=2, num_return_sequences=3), # 'num_beams' must be >= 'num_return_sequences' + dict(max_new_tokens=1, num_beams=3, num_beam_groups=2), # 'num_beams' must be divisible by 'num_beam_groups' + dict(max_new_tokens=1, num_beams=3, do_sample=True), # 'beam sample is not supported + dict(max_new_tokens=1, num_beams=3, no_repeat_ngram_size=0), # invalid 'no_repeat_ngram_size' + dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=0.0), # 'diversity_penalty' should not be a default 
value + dict(max_new_tokens=1, num_beams=4, diversity_penalty=1.0), # 'diversity_penalty' is used only for grouped beam search + dict(max_new_tokens=1, num_beams=2, frequency_penalty=1.0), # 'frequency_penalty' is not supported by beam search + dict(max_new_tokens=1, num_beams=2, presence_penalty=1.0), # 'presence_penalty' is not supported by beam search + dict(max_new_tokens=1, num_beams=2, repetition_penalty=0.0), # 'repetition_penalty' is not supported by beam search + # parameters requiring beam search + dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search + dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search + dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search + dict(max_new_tokens=1, length_penalty=2), # requiring beam search + # assistant generation + dict(max_new_tokens=1, num_assistant_tokens=2, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group + dict(max_new_tokens=1, assistant_confidence_threshold=1.0, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group + dict(max_new_tokens=1, num_assistant_tokens=2, num_beams=2), # beam search is not compatible with assistant generation + dict(max_new_tokens=1, assistant_confidence_threshold=1.0, num_assistant_tokens=2), # 'assistant_confidence_threshold' and 'num_assistant_tokens' are mutually exclusive + dict(max_new_tokens=1, max_ngram_size=1), # 'max_ngram_size' is for prompt lookup, but assistant generation is turned off ('num_assistant_tokens' is 0) + # TODO: add tests for invalid properties +] +@pytest.mark.parametrize("generation_config_kwargs", invalid_configs) +@pytest.mark.precommit +@pytest.mark.nightly +def test_invalid_generation_configs_throws(generation_config_kwargs): + config = GenerationConfig(**generation_config_kwargs) + with 
pytest.raises(RuntimeError): + config.validate() + + config = GenerationConfig() + config.update_generation_config(**generation_config_kwargs) + with pytest.raises(RuntimeError): + config.validate() + + +def load_genai_generation_config_from_file(configs: List[Tuple], temp_path): + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + + ov_generation_config = GenerationConfig(temp_path / "generation_config.json") + + for _, config_name in configs: + os.remove(temp_path / config_name) + + return ov_generation_config + +@pytest.mark.precommit +@pytest.mark.nightly +def test_multiple_eos_are_read_as_stop_token_ids(tmp_path): + generation_config_json = { + "eos_token_id": [ + 2, + 32000, + 32007 + ] + } + configs = [ + (generation_config_json, "generation_config.json"), + ] + + generation_config = load_genai_generation_config_from_file(configs, tmp_path) + + assert generation_config.eos_token_id == 2 + assert generation_config.stop_token_ids == { 2, 32000, 32007 } diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index bbd0da6bb2..6228f53dd1 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -147,7 +147,6 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t def get_greedy_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_return_sequences = 3 generation_config.max_new_tokens = 300 return generation_config @@ -155,6 +154,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 300 generation_config.num_return_sequences = generation_config.num_beams return 
generation_config diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 9f00996a58..6e3cce06d0 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino_genai import StopCriteria +from openvino_genai import StopCriteria, GenerationConfig import pytest from typing import Union, List, Dict, Optional import numpy as np @@ -18,7 +18,6 @@ get_chat_models_list, model_tmp_path, STOP_CRITERIA_MAP, - get_continuous_batching, ) @@ -299,11 +298,10 @@ def test_batch_size_switch(): # generation_configs = [ - dict(do_sample=False, max_new_tokens=20), - dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) + dict(max_new_tokens=20), + dict(max_new_tokens=10, num_beam_groups=3, num_beams=15, num_return_sequences=1, diversity_penalty=1.0) ] - questions = [ '1+1=', 'What is the previous answer?', @@ -311,12 +309,11 @@ def test_batch_size_switch(): 'What was my first question?' ] - -@pytest.mark.parametrize("generation_config", generation_configs) +@pytest.mark.parametrize("generation_config_kwargs", generation_configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_chat_compare_with_HF(model_descr, generation_config: Dict): +def test_chat_compare_with_HF(model_descr, generation_config_kwargs: Dict): chat_history_hf = [] chat_history_ov = [] chat_prompt = '' @@ -324,6 +321,10 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict): # Will set add_special_tokens=False inside pipeline when start_chat() is called. 
model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + from transformers import GenerationConfig as HFGenerationConfig + hf_generation_config = HFGenerationConfig(**generation_config_kwargs) + ov_generation_config = GenerationConfig(**generation_config_kwargs) + ov_pipe.start_chat() for prompt in questions: chat_history_hf.append({'role': 'user', 'content': prompt}) @@ -332,11 +333,11 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict): chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - answer = opt_model.generate(**tokenized, **generation_config) + answer = opt_model.generate(**tokenized, generation_config=hf_generation_config) answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - answer_ov = ov_pipe.generate(prompt, **generation_config) + answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config) chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) ov_pipe.finish_chat() @@ -492,30 +493,9 @@ def test_operator_with_streamer_kwargs_batch_throws(): ov_pipe('', num_beams=2, streamer=printer) # -# Tests on generation configs (invalid cases and handling within LLMPipeline) +# Tests on generation configs handling # -invalid_configs = [ - dict(num_beam_groups=3, num_beams=15, do_sample=True), - # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests - # dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, 
max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k -] -@pytest.mark.parametrize("generation_config", invalid_configs) -@pytest.mark.precommit -@pytest.mark.nightly -def test_invalid_generation_configs_throws(model_tmp_path, generation_config): - model_id, temp_path = model_tmp_path - config_json = {} - ov_pipe = load_genai_pipe_with_configs([(config_json, "config.json")], temp_path) - with pytest.raises(RuntimeError): - ov_pipe.generate('blah blah', **generation_config) - - @pytest.mark.precommit @pytest.mark.nightly def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path): @@ -529,28 +509,14 @@ def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path): assert 37 == ov_pipe.get_generation_config().eos_token_id -invalid_py_configs = [ - dict(num_beam_groups=3, num_beams=15, do_sample=True), - # TODO: Currently unexpected params do not cause exceptions. 
Need to implement it in c++ and return this test - # dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k -] @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.parametrize("generation_config", invalid_py_configs) -def test_python_generation_config_validation_throws(model_tmp_path, generation_config): - model_id, temp_path = model_tmp_path - ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path) - - # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned - # instead of RuntimeError, which is returned when GenerationConfig values are validated - return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError - with pytest.raises(return_exception_type): - ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) +def test_pipeline_validates_generation_config(): + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + ov_pipe = read_model((model_id, path))[4] + invalid_generation_config = dict(num_beam_groups=3, num_beams=15, do_sample=True) # beam sample is not supported + with pytest.raises(RuntimeError): + ov_pipe.generate("dummy prompt", **invalid_generation_config) # # Work with Unicode in Python API @@ -699,7 +665,7 @@ def test_stop_token_ids(): res = ov_pipe.generate( ov.Tensor([(1,)]), max_new_tokens=3, - stop_token_ids={-1, 9935, ov_pipe.get_tokenizer().get_eos_token_id()}, + stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()}, 
include_stop_str_in_output=False ) assert 2 == len(res.tokens[0]) diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index 0c2a106d50..8129298763 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -1,6 +1,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import os import pytest import numpy as np from transformers import AutoTokenizer @@ -17,15 +18,19 @@ def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path): - # load Tokenizer where all configs are cleared. - # remove existing jsons from previous tests for json_file in temp_path.glob("*.json"): json_file.unlink() for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return openvino_genai.Tokenizer(temp_path) + + ov_tokenizer = openvino_genai.Tokenizer(temp_path) + + for _, config_name in configs: + os.remove(temp_path / config_name) + + return ov_tokenizer def get_chat_templates(): @@ -181,7 +186,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): @pytest.mark.nightly def test_set_chat_template(): model_descr = get_chat_models_list()[0] - model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) prompt = "how are you?" 
dummy_conversation = [ @@ -265,7 +270,7 @@ def test_load_special_tokens_from_special_tokens_map_json(model_tmp_path): @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons") -def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tmp_path): +def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_tmp_path): # special_tokens_map is not available # but tokenizer_config.json exists # will load both string and integer representations @@ -280,7 +285,7 @@ def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tm "eos_token": "", } - tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1]) + tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_tmp_path[1]) assert tok.get_pad_token() == tok_config_json['pad_token'] assert tok.get_bos_token() == tok_config_json['bos_token'] assert tok.get_eos_token() == tok_config_json['eos_token'] diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp index 6cf462fdf8..e0c50cda02 100644 --- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp +++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp @@ -123,11 +123,6 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data ov::genai::GenerationConfig greedy_search = ov::genai::greedy(); greedy_search.max_new_tokens = std::min(max_output_len, output_len); greedy_search.ignore_eos = true; - greedy_search.repetition_penalty = 1.0; - greedy_search.frequency_penalty = 0.0; - greedy_search.presence_penalty = 0.0; - greedy_search.diversity_penalty = 0.0; - greedy_search.length_penalty = 0.0; dataset.push_data(human_question, greedy_search); dataset.push_lens(input_len, output_len);