diff --git a/Dockerfile b/Dockerfile
index b73d907b87..665b23427d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:22.04
 ARG JOBS
 WORKDIR /workspace
 
-RUN apt-get update -y && apt-get install -y python3-pip python3-venv git
+RUN apt-get update -y && apt-get install -y --no-install-recommends python3-pip python3-venv git
 
 # Install OpenVINO
 RUN git clone --branch master https://github.com/openvinotoolkit/openvino.git && \
@@ -25,7 +25,7 @@ ENV OpenVINO_DIR=/workspace/openvino_build
 RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
 # Build GenAI library with dependencies
-RUN git clone https://github.com/Wovchena/openvino.genai-public.git -b reuse-Tokenizer openvino.genai && \
+RUN git clone https://github.com/openvinotoolkit/openvino.genai.git && \
     cd /workspace/openvino.genai/thirdparty && git submodule update --remote --init && \
     mkdir /workspace/openvino.genai/build && cd /workspace/openvino.genai/build && \
     cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -33,6 +33,6 @@ RUN git clone https://github.com/Wovchena/openvino.genai-public.git -b reuse-Tok
 
 # Install test dependencies
 RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/continuous_batching/requirements.txt
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/requirements.txt
 ENV PYTHONPATH=/workspace/openvino.genai/build/
 ENV LD_LIBRARY_PATH=/workspace/openvino.genai/build/
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index f5f8c53309..6dd2b2f910 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -56,13 +56,13 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
 
     PipelineMetrics get_metrics() const;
 
-    GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params);
+    GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params);
 
     void step();
 
     bool has_non_finished_requests();
 
     // more high level interface, which can process multiple prompts in continuous batching manner
-    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, std::vector<ov::genai::GenerationConfig> sampling_params);
+    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params);
 };
 }
diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp
index d0ddbc3a32..5fd0fa5fe2 100644
--- a/src/cpp/include/openvino/genai/generation_handle.hpp
+++ b/src/cpp/include/openvino/genai/generation_handle.hpp
@@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {
 public:
     GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) :
-        m_generation_stream(generation_stream),
+        m_generation_stream(std::move(generation_stream)),
        m_sampling_params(sampling_params) {};
 
     ~GenerationHandleImpl();
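For context, a rough usage sketch of the batched generate() overload declared above (illustrative only, not part of the diff). It assumes the pipeline lives in the ov::genai namespace like the rest of this library and that GenerationConfig exposes a max_new_tokens field; the prompts and settings are arbitrary.

```cpp
#include <string>
#include <vector>
#include "openvino/genai/continuous_batching_pipeline.hpp"

// Hypothetical helper: drives an already-constructed pipeline with a small batch.
void run_batch(ov::genai::ContinuousBatchingPipeline& pipe) {
    std::vector<std::string> prompts = {"What is OpenVINO?", "Write a haiku about spring."};

    ov::genai::GenerationConfig config;   // default-constructed; max_new_tokens is assumed to exist
    config.max_new_tokens = 32;
    std::vector<ov::genai::GenerationConfig> sampling_params(prompts.size(), config);

    // Both arguments are now taken by const reference, so no vector copies are made here.
    auto results = pipe.generate(prompts, sampling_params);
    (void)results;
}
```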
diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp
index ab60b7f5ff..489f4e6159 100644
--- a/src/cpp/src/block_manager.hpp
+++ b/src/cpp/src/block_manager.hpp
@@ -257,7 +257,7 @@ class BlockManager {
     }
 
     bool can_append_slots(SequenceGroup::CPtr seq_group) {
-        return required_blocks_count(seq_group) <= m_allocator.num_free_blocks();
+        return required_blocks_count(std::move(seq_group)) <= m_allocator.num_free_blocks();
     }
 
     size_t required_blocks_count(SequenceGroup::CPtr seq_group) {
@@ -336,7 +336,7 @@ class BlockManager {
                 // write information about block forking for later usage in CacheManager
                 copy_blocks_map[last_block->get_index()].push_back(new_block->get_index());
                 // release `last_block` usage
-                m_allocator.free(last_block);
+                m_allocator.free(std::move(last_block));
             } else {
                 // nothing to do, because we are the only users of this block
             }
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 55100f3cb4..5418a14242 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -238,7 +238,7 @@ class ContinuousBatchingPipeline::Impl {
         return !m_awaiting_requests.empty() || !m_requests.empty();
     }
 
-    std::vector<GenerationResult> generate(const std::vector<std::string> prompts, std::vector<ov::genai::GenerationConfig> sampling_params) {
+    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params) {
         OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
         OPENVINO_ASSERT(prompts.size() == sampling_params.size());
@@ -307,8 +307,8 @@ PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{
     return m_impl->get_metrics();
 }
 
-GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) {
-    return m_impl->add_request(request_id, prompt, sampling_params);
+GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) {
+    return m_impl->add_request(request_id, prompt, std::move(sampling_params));
 }
 
 void ContinuousBatchingPipeline::step() {
@@ -319,6 +319,6 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() {
     return m_impl->has_non_finished_requests();
 }
 
-std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, std::vector<ov::genai::GenerationConfig> sampling_params) {
+std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params) {
     return m_impl->generate(prompts, sampling_params);
 }
\ No newline at end of file
diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp
index 0d51897e82..57cb7253c9 100644
--- a/src/cpp/src/generation_stream.hpp
+++ b/src/cpp/src/generation_stream.hpp
@@ -27,7 +27,7 @@ class GenerationStream {
     }
 
     void push(GenerationOutputs outputs) {
-        m_output_queue.push(outputs);
+        m_output_queue.push(std::move(outputs));
     }
 
     // Retriving vector of pairs as we can generate multiple outputs for a single prompt
diff --git a/src/cpp/src/logit_processor.hpp b/src/cpp/src/logit_processor.hpp
index cb3ffb37c0..06ba819b9d 100644
--- a/src/cpp/src/logit_processor.hpp
+++ b/src/cpp/src/logit_processor.hpp
@@ -16,12 +16,38 @@ struct Token {
     Token() = default;
 };
 
+struct Logits {
+    float * m_data = nullptr;
+    size_t m_size;
+    // Late initialized for top_p or top_k transforms
+    std::vector<Token> m_vector;
+
+    Logits(float* data, size_t size): m_data(data), m_size(size) {}
+
+    void initialize_vector() {
+        OPENVINO_ASSERT(m_vector.size() == 0, "Logits vector already initialized");
+        m_vector.reserve(m_size);
+        for (size_t i = 0; i < m_size; i++)
+            m_vector.emplace_back(m_data[i], i);
+    }
+
+    bool is_vector_initialized() const {
+        return m_vector.size() > 0;
+    }
+
+    void resize(size_t new_size) {
+        m_size = new_size;
+        m_vector.resize(new_size);
+    }
+};
+
 namespace LogitTransformers {
 using TokenIds = std::vector<int64_t>;
 
 class ILogitTransformer {
 public:
-    virtual void apply(std::vector<Token>& logits) = 0;
+    virtual void apply(Logits& logits) = 0;
 
     virtual bool is_applicable(size_t generated_tokens_cnt = 0) {
         return true;
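A small sketch of the contract the new Logits wrapper is meant to provide (illustration only, assuming logit_processor.hpp is reachable on the include path): value-only transforms write straight through m_data, while sorting transforms materialize the Token vector lazily and then shrink the logical view.

```cpp
#include "logit_processor.hpp"

void logits_wrapper_sketch() {
    float raw[3] = {1.0f, 2.0f, 3.0f};   // e.g. a slice of the model's output tensor
    Logits logits(raw, 3);               // non-owning view over the raw buffer

    // In-place transforms (temperature, penalties) touch the buffer directly:
    logits.m_data[0] *= 0.5f;

    // Sorting transforms (top_p / top_k) build Token{value, index} pairs on demand
    // and then shrink the logical size; the raw buffer itself is left untouched:
    if (!logits.is_vector_initialized())
        logits.initialize_vector();
    logits.resize(2);                    // m_size and m_vector now describe 2 entries
}
```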
@@ -32,11 +58,15 @@ class TopPFilter : public ILogitTransformer {
 public:
     TopPFilter(double top_p) : m_top_p(top_p) {}
 
-    void apply(std::vector<Token>& logits) override {
-        std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+    void apply(Logits& logits) override {
+        if (!logits.is_vector_initialized()) {
+            // Initialize and sort vector
+            logits.initialize_vector();
+            std::sort(logits.m_vector.begin(), logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+        }
         float probability_sum = 0.0f;
         size_t nucleus_size = 0;
-        for (const auto& probability : logits) {
+        for (const auto& probability : logits.m_vector) {
             probability_sum += probability.m_log_prob;
             nucleus_size += 1;
             if (probability_sum > m_top_p) break;
@@ -52,10 +82,26 @@ class TopKFilter : public ILogitTransformer {
 public:
     TopKFilter(size_t top_k) : m_top_k(top_k) {}
 
-    void apply(std::vector<Token>& logits) override {
-        std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
-        size_t top_k = logits.size() >= m_top_k ? m_top_k : logits.size();
-        logits.resize(top_k);
+    // If this transform is used along with top_p, it should be applied after it since top_p sorts entire vector and top_k does it only partially
+    void apply(Logits& logits) override {
+
+        /*
+        TODO: Uncommenting this section requires changes in reference texts in tests
+
+        if (m_top_k >= logits.m_size)
+            return;
+        */
+
+        if (!logits.is_vector_initialized()) {
+            // Initialize and partially sort vector
+            logits.initialize_vector();
+            // TODO: Uncommenting below requires uncommenting section above
+            // std::partial_sort(logits.m_vector.begin(), logits.m_vector.begin() + m_top_k, logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+
+            std::sort(logits.m_vector.begin(), logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+        }
+        if (m_top_k < logits.m_size)
+            logits.resize(m_top_k);
     }
 
 protected:
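The comment on TopKFilter above encodes an ordering constraint: top_p always sorts the whole vector while top_k could get away with a partial sort, so when both are enabled top_k has to run after top_p, which is also the registration order in LogitProcessor further down in this patch. A rough sketch of that composition, reusing probabilities that appear in the tests below (illustration only, assuming the header is on the include path):

```cpp
#include "logit_processor.hpp"

void top_p_then_top_k_sketch() {
    float raw[3] = {0.090031f, 0.244728f, 0.665241f};   // already normalized probabilities
    Logits logits(raw, 3);

    LogitTransformers::TopPFilter top_p(0.9);   // nucleus: 0.665 <= 0.9, 0.665 + 0.245 > 0.9 -> keep 2
    LogitTransformers::TopKFilter top_k(1);     // then keep only the single best entry

    top_p.apply(logits);   // initializes and fully sorts m_vector, shrinks m_size to 2
    top_k.apply(logits);   // vector already initialized, so it only resizes to min(top_k, m_size)
    // expected: logits.m_size == 1 and logits.m_vector[0] is Token{0.665241, 2}
}
```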
@@ -66,18 +112,23 @@ class TemperatureLogitTransform : public ILogitTransformer {
 public:
     TemperatureLogitTransform(double temperature) : m_temperature(temperature) {};
 
-    void apply(std::vector<Token>& logits) override {
-        auto max_prob_token = std::max_element(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; });
-        float max_logit = max_prob_token->m_log_prob;
-
-        std::for_each(logits.begin(), logits.end(), [max_logit, this](Token& val) {val.m_log_prob = expf((val.m_log_prob - max_logit) / this->m_temperature);});
+    void apply(Logits& logits) override {
+        float max_logit = -std::numeric_limits<float>::infinity();
+        for (size_t i = 0; i < logits.m_size; i++) {
+            if (logits.m_data[i] > max_logit) {
+                max_logit = logits.m_data[i];
+            }
+        }
 
         float norm_sum = 0.0;
-        for (const auto& val : logits) {
-            norm_sum += val.m_log_prob;
+        for (size_t i = 0; i < logits.m_size; i++) {
+            logits.m_data[i] = expf((logits.m_data[i] - max_logit) / this->m_temperature);
+            norm_sum += logits.m_data[i];
         }
 
-        std::for_each(logits.begin(), logits.end(), [norm_sum](Token& val) {val.m_log_prob /= norm_sum;});
+        for (size_t i = 0; i < logits.m_size; i++) {
+            logits.m_data[i] /= norm_sum;
+        }
     }
 
 protected:
@@ -118,32 +169,28 @@ class RepetitionPenaltyTransform : public IPenaltyTransformer {
         m_penalty = repetition_penalty;
     };
 
-    void apply(std::vector<Token>& logits) override {
-        size_t vocab_size = logits.size();
+    void apply(Logits& logits) override {
+        size_t vocab_size = logits.m_size;
         for (const auto& prompt_id : *m_unique_prompt_token_ids) {
             OPENVINO_ASSERT((prompt_id >= 0) && (prompt_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[prompt_id].m_index == prompt_id, "input_logits must have original index order");
-            auto logit_value = logits[prompt_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[prompt_id].m_log_prob /= m_penalty;
+            if (logits.m_data[prompt_id] >= 0) {
+                logits.m_data[prompt_id] /= m_penalty;
             } else {
-                logits[prompt_id].m_log_prob *= m_penalty;
+                logits.m_data[prompt_id] *= m_penalty;
             };
         }
         for (const auto& input_id_pair : *m_unique_generated_token_ids) {
             const auto& input_id = input_id_pair.first;
             OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order");
-            auto logit_value = logits[input_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[input_id].m_log_prob /= m_penalty;
+            if (logits.m_data[input_id] >= 0) {
+                logits.m_data[input_id] /= m_penalty;
             } else {
-                logits[input_id].m_log_prob *= m_penalty;
+                logits.m_data[input_id] *= m_penalty;
             };
         }
     }
 
-    void apply(std::vector<Token>& logits, const TokenIds& input_ids) {
+    void apply(Logits& logits, const TokenIds& input_ids) {
         set_unique_prompt_token_ids(nullptr);
         extract_generated_tokens(input_ids);
         apply(logits);
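As a sanity check on the rewritten TemperatureLogitTransform: it computes a numerically stable softmax with temperature over the raw buffer, p_i = exp((l_i - max_l) / T) / sum_j exp((l_j - max_l) / T). A standalone re-computation with the values used by the first temperature test case below (T = 1, logits {1, 2, 3}), for illustration only:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>

void temperature_softmax_check() {
    float logits[3] = {1.0f, 2.0f, 3.0f};
    const float temperature = 1.0f;

    float max_logit = *std::max_element(logits, logits + 3);

    float norm_sum = 0.0f;
    for (size_t i = 0; i < 3; ++i) {
        logits[i] = std::exp((logits[i] - max_logit) / temperature);  // exp(-2), exp(-1), exp(0)
        norm_sum += logits[i];
    }
    for (size_t i = 0; i < 3; ++i)
        logits[i] /= norm_sum;

    assert(std::abs(logits[0] - 0.090031f) < 1e-5f);
    assert(std::abs(logits[1] - 0.244728f) < 1e-5f);
    assert(std::abs(logits[2] - 0.665241f) < 1e-5f);
}
```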
@@ -166,10 +213,10 @@ class EOSPenaltyTransform : public ILogitTransformer {
     EOSPenaltyTransform(size_t eos_token_id, size_t min_generated_tokens) :
         m_eos_token_id(eos_token_id), m_applicable_tensor_len(min_generated_tokens) {}
 
-    void apply(std::vector<Token>& logits) override {
-        // Since EOS penalty is applied early, the token vector is not sorted
+    void apply(Logits& logits) override {
+        // Since EOS penalty is applied early, the token vector is not initialized yet
         // and we can assume element order match token ids.
-        logits[m_eos_token_id].m_log_prob = 0.f;
+        logits.m_data[m_eos_token_id] = 0.f;
     }
 
@@ -188,22 +235,20 @@ class FrequencyPenaltyTransform : public IPenaltyTransformer {
         m_penalty = value;
     };
 
-    void apply(std::vector<Token>& logits) override {
-        size_t vocab_size = logits.size();
+    void apply(Logits& logits) override {
+        size_t vocab_size = logits.m_size;
         for (const auto& input_id_pair : *m_unique_generated_token_ids) {
             const auto& input_id = input_id_pair.first;
             OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order");
-            auto logit_value = logits[input_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[input_id].m_log_prob -= m_penalty * input_id_pair.second;
+            if (logits.m_data[input_id] >= 0) {
+                logits.m_data[input_id] -= m_penalty * input_id_pair.second;
             } else {
-                logits[input_id].m_log_prob += m_penalty * input_id_pair.second;
+                logits.m_data[input_id] += m_penalty * input_id_pair.second;
             };
         }
     }
 
-    void apply(std::vector<Token>& logits, const TokenIds& input_ids) {
+    void apply(Logits& logits, const TokenIds& input_ids) {
         extract_generated_tokens(input_ids);
         apply(logits);
     }
@@ -215,22 +260,20 @@ class PresencePenaltyTransform : public IPenaltyTransformer {
         m_penalty = value;
     };
 
-    void apply(std::vector<Token>& logits) override {
-        size_t vocab_size = logits.size();
+    void apply(Logits& logits) override {
+        size_t vocab_size = logits.m_size;
         for (const auto& input_id_pair : *m_unique_generated_token_ids) {
             const auto& input_id = input_id_pair.first;
             OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order");
-            auto logit_value = logits[input_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[input_id].m_log_prob -= m_penalty;
+            if (logits.m_data[input_id] >= 0) {
+                logits.m_data[input_id] -= m_penalty;
             } else {
-                logits[input_id].m_log_prob += m_penalty;
+                logits.m_data[input_id] += m_penalty;
             };
         }
     }
 
-    void apply(std::vector<Token>& logits, const TokenIds& input_ids) {
+    void apply(Logits& logits, const TokenIds& input_ids) {
         extract_generated_tokens(input_ids);
         apply(logits);
     }
@@ -286,14 +329,15 @@ class LogitProcessor {
                 if (sampling_params.top_p != 1.0f) {
                     m_logit_transformers.emplace_back(new LogitTransformers::TopPFilter(sampling_params.top_p));
                 }
-                if (sampling_params.top_k > 0) {
+                // TODO: Uncommenting below condition requires changes in reference texts in tests
+                if (sampling_params.top_k > 0 /* && sampling_params.top_k < std::numeric_limits<size_t>::max() */) {
                     m_logit_transformers.emplace_back(new LogitTransformers::TopKFilter(sampling_params.top_k));
                 }
             }
         }
     }
 
-    void apply(std::vector<Token>& logits) {
+    void apply(Logits& logits) {
         for (const auto& transformer : m_logit_transformers) {
             if (transformer->is_applicable(m_generated_tokens)) {
                 transformer->apply(logits);
diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp
index 5fb2e0f524..e2a4bc1aa7 100644
--- a/src/cpp/src/model_runner.hpp
+++ b/src/cpp/src/model_runner.hpp
@@ -19,7 +19,7 @@ class ModelRunner {
     SchedulerConfig m_scheduler_config;
 public:
     ModelRunner(ov::InferRequest request, const SchedulerConfig& scheduler_config) :
-        m_request(request),
+        m_request(std::move(request)),
         m_scheduler_config(scheduler_config) { }
 
     ov::InferRequest get_infer_request() const {
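The penalty transforms above keep the original sign-dependent update rule; only the indexing changed from Token entries to the raw buffer. A standalone check of that rule against the first repetition-penalty test case added later in this patch (illustration only):

```cpp
#include <cassert>
#include <cmath>

void repetition_penalty_check() {
    float logits[3] = {1.0f, 2.0f, 3.0f};
    const float penalty = 1.2f;
    const int seen_ids[2] = {2, 0};      // token ids present in the prompt or generated so far

    for (int id : seen_ids) {
        if (logits[id] >= 0)
            logits[id] /= penalty;       // positive logits are scaled down
        else
            logits[id] *= penalty;       // negative logits are pushed further down
    }

    assert(std::abs(logits[0] - 0.8333333f) < 1e-6f);  // 1.0 / 1.2
    assert(logits[1] == 2.0f);                         // untouched token
    assert(std::abs(logits[2] - 2.5f) < 1e-6f);        // 3.0 / 1.2
}
```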
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 6390fc8725..ab8f81ab1c 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -95,7 +95,7 @@ struct Beam {
     float m_score = -std::numeric_limits<float>::infinity();
 
     Beam(Sequence::Ptr sequence)
-        : m_sequence(sequence) { }
+        : m_sequence(std::move(sequence)) { }
 
     size_t get_generated_len() const {
         return m_sequence->get_generated_len();
@@ -203,40 +203,49 @@ class GroupBeamSearcher {
 
 class Sampler {
 
-    std::vector<Token> _get_logit_vector(ov::Tensor logits, size_t batch_idx = 1) {
+    Logits _get_logit_vector(ov::Tensor logits, size_t batch_idx = 1) {
         ov::Shape logits_shape = logits.get_shape();
         size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2];
         OPENVINO_ASSERT(batch_idx <= batch_size);
         size_t batch_offset = batch_idx * seq_len * vocab_size;
         size_t sequence_offset = (seq_len - 1) * vocab_size;
-        const float* logits_data = logits.data<float>() + batch_offset + sequence_offset;
+        float* logits_data = logits.data<float>() + batch_offset + sequence_offset;
 
-        std::vector<Token> logit_vector(vocab_size);
-        for (size_t i = 0; i < logit_vector.size(); i++) {
-            logit_vector[i] = Token(logits_data[i], i);
-        }
-        return logit_vector;
+        return Logits{logits_data, vocab_size};
     }
 
-    Token _greedy_sample(const std::vector<Token>& logit_vector) const {
-        Token max_token{-std::numeric_limits<float>::infinity() , 0};
-        for (const auto& logit : logit_vector) {
-            if (logit.m_log_prob > max_token.m_log_prob) {
-                max_token = logit;
+    Token _greedy_sample(const Logits& logits) const {
+        // For greedy sampling we do not expect sorting or shrinking considered tokens
+        // so we can operate directly on the data buffer
+        float max_value = -std::numeric_limits<float>::infinity();
+        size_t max_index = 0;
+        for (size_t i = 0; i < logits.m_size; ++i) {
+            if (logits.m_data[i] > max_value) {
+                max_value = logits.m_data[i];
+                max_index = i;
             }
         }
-        return max_token;
+        return Token(logits.m_data[max_index], max_index);
     }
 
-    std::vector<Token> _multinomial_sample(const std::vector<Token>& logit_vector, size_t num_tokens_per_sequence) {
-        std::vector<float> multinomial_weights(logit_vector.size());
-        for (size_t i = 0; i < logit_vector.size(); i++) multinomial_weights[i] = logit_vector[i].m_log_prob;
+    std::vector<Token> _multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) {
+        // If top_p or top_k was applied we use sorted vector, if not we go with original buffer.
+        std::vector<float> multinomial_weights;
+        multinomial_weights.reserve(logits.m_size);
+        if (logits.is_vector_initialized())
+            for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob);
+        else
+            multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size);
 
         auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1
+
         std::vector<Token> out_tokens;
         for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) {
             size_t element_to_pick = dist(rng_engine);
-            out_tokens.push_back(logit_vector[element_to_pick]);
+            if (logits.is_vector_initialized())
+                out_tokens.push_back(logits.m_vector[element_to_pick]);
+            else
+                out_tokens.emplace_back(logits.m_data[element_to_pick], element_to_pick);
         }
         return out_tokens;
     }
@@ -296,7 +305,6 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
         for (size_t running_sequence_id = 0; running_sequence_id < num_running_sequences; ++running_sequence_id) {
             auto logit_vector = _get_logit_vector(sequence_group_logits, running_sequence_id);
             logit_processor.apply(logit_vector);
-
             Token sampled_token_id;
             if (sampling_params.is_greedy_decoding()) {
                 sampled_token_id = _greedy_sample(logit_vector);
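For illustration, the shape of the multinomial path above: weights come either from the sorted Token vector (when top_p or top_k ran) or straight from the raw buffer, and std::discrete_distribution normalizes them internally, which is what makes a single draw equivalent to a multinomial with one trial. A minimal standalone sketch (the RNG seed and probabilities are arbitrary):

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Draws one token index from per-token weights, mirroring the flow above (illustration only).
size_t sample_token(const std::vector<float>& weights, std::mt19937_64& rng) {
    std::discrete_distribution<size_t> dist(weights.begin(), weights.end());
    return dist(rng);
}

void multinomial_sketch() {
    std::mt19937_64 rng(0);   // arbitrary fixed seed
    std::vector<float> probs = {0.090031f, 0.244728f, 0.665241f};
    size_t token_id = sample_token(probs, rng);   // index 2 comes up roughly 66.5% of the time
    (void)token_id;
}
```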
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 88b86b4484..f5f1bb1db5 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -371,7 +371,7 @@ class SequenceGroup {
     }
 
     Sequence::Ptr fork_sequence(Sequence::CPtr sequence) {
-        m_sequences.emplace_back(Sequence::fork(sequence, m_next_sequence_id++));
+        m_sequences.emplace_back(Sequence::fork(std::move(sequence), m_next_sequence_id++));
         return m_sequences.back();
     }
 
@@ -433,7 +433,7 @@ class SequenceGroup {
             output.score = sequence->get_beam_search_score(m_sampling_params);
             outputs.emplace(sequence->get_grouped_id(), output);
         }
-        m_generation_stream->push(outputs);
+        m_generation_stream->push(std::move(outputs));
     }
 
     void push_partial_outputs() {
@@ -445,7 +445,7 @@ class SequenceGroup {
             const auto last_gen_token = sequence->get_last_generation_output();
             outputs.emplace(sequence->get_grouped_id(), last_gen_token);
         }
-        m_generation_stream->push(outputs);
+        m_generation_stream->push(std::move(outputs));
     }
 
     void notify_handle() {
diff --git a/tests/cpp/logit_filtering.cpp b/tests/cpp/logit_filtering.cpp
index afedfe6685..a848683cf3 100644
--- a/tests/cpp/logit_filtering.cpp
+++ b/tests/cpp/logit_filtering.cpp
@@ -9,31 +9,32 @@ using namespace LogitTransformers;
 
 struct TemperatureTransformTestStruct {
+    static inline const size_t size = 3;
+
     float temperature;
-    std::vector<Token> input;
-    std::vector<Token> expected_output;
+    float input[size];
+    float expected_output[size];
 };
 
 using TemperatureTransformTest = testing::TestWithParam<TemperatureTransformTestStruct>;
 
 TEST_P(TemperatureTransformTest, TransformResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, TemperatureTransformTestStruct::size);
     auto transform = TemperatureLogitTransform(test_struct.temperature);
     transform.apply(logits);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_FALSE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, TemperatureTransformTestStruct::size); // temperature transform should not change buffer size
+    for (size_t i = 0; i < logits.m_size; i++) {
+        EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6);
     }
 }
 
 
 const std::vector<TemperatureTransformTestStruct> TEMPERATURE_TRANSFORM_TEST_CASES = {
-    {1.0f, { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, { {0.665241, 2}, {0.244728, 1}, {0.090031, 0} } },
-    {2.0f, { {1.0f, 2}, {2.0f, 1}, {3.0f, 0} }, { {0.506480, 0}, {0.307195, 1}, {0.186323, 2} } },
-    {1.0f, { {3.0f, 0}, {1.0f, 1}, {2.0f, 2} }, { {0.665241, 0}, {0.244728, 2}, {0.090031, 1} } },
+    {1.0f, { 1.0f, 2.0f, 3.0f }, { 0.090031, 0.244728, 0.665241 } },
+    {2.0f, { 3.0f, 2.0f, 1.0f }, { 0.506480, 0.307195, 0.186323 } },
+    {1.0f, { 3.0f, 1.0f, 2.0f }, { 0.665241, 0.090031, 0.244728 } },
 };
 
 INSTANTIATE_TEST_SUITE_P(VariousInputs,
@@ -43,8 +44,10 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs,
 
 
 struct TopPTestStruct {
+    static inline const size_t size = 3;
+
     float top_p;
-    std::vector<Token> input;
+    float input[size];
     std::vector<Token> expected_output;
 };
 
@@ -52,21 +55,22 @@ using TopPFilteringTest = testing::TestWithParam<TopPTestStruct>;
 
 TEST_P(TopPFilteringTest, FilterResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, TopPTestStruct::size);
     auto transform = TopPFilter(test_struct.top_p);
     transform.apply(logits);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_TRUE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, logits.m_vector.size());
+    ASSERT_EQ(logits.m_size, test_struct.expected_output.size());
+    for (size_t i = 0; i < logits.m_vector.size(); i++) {
+        EXPECT_NEAR(logits.m_vector[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
+        EXPECT_EQ(logits.m_vector[i].m_index, test_struct.expected_output[i].m_index);
     }
 }
 
 
 const std::vector<TopPTestStruct> TOP_P_TRANSFORM_TEST_CASES = {
-    {0.2f, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} }, { {0.665241, 2} } },
-    {0.9f, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} }, { {0.665241, 2}, {0.244728, 1} } },
-    {1.0f, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} }, { {0.665241, 2}, {0.244728, 1}, {0.090031, 0} } },
+    {0.2f, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2} } },
+    {0.9f, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2}, {0.244728, 1} } },
 };
 
 INSTANTIATE_TEST_SUITE_P(VariousInputs,
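The new top_p expectations can be checked by hand: the filter walks the sorted probabilities and keeps adding entries until the running sum exceeds top_p, so for {0.665241, 0.244728, 0.090031} and top_p = 0.9 the nucleus is two tokens (0.665 is below 0.9, and 0.665 + 0.245 = 0.910 crosses it). A small standalone mirror of that loop, for illustration only:

```cpp
#include <cassert>
#include <cstddef>

// Mirrors the nucleus-size loop of TopPFilter on an already-sorted probability array.
size_t nucleus_size(const float* sorted_probs, size_t size, float top_p) {
    float sum = 0.0f;
    size_t kept = 0;
    for (size_t i = 0; i < size; ++i) {
        sum += sorted_probs[i];
        ++kept;
        if (sum > top_p) break;   // the entry that crosses top_p is still kept
    }
    return kept;
}

void top_p_expectation_check() {
    const float sorted[3] = {0.665241f, 0.244728f, 0.090031f};  // descending, as after the sort
    assert(nucleus_size(sorted, 3, 0.2f) == 1);   // matches the first test case
    assert(nucleus_size(sorted, 3, 0.9f) == 2);   // matches the second test case
}
```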
@@ -76,8 +80,10 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs,
 
 
 struct TopKTestStruct {
+    static inline const size_t size = 3;
+
     size_t top_k;
-    std::vector<Token> input;
+    float input[size];
     std::vector<Token> expected_output;
 };
 
@@ -85,45 +91,66 @@ using TopKFilteringTest = testing::TestWithParam<TopKTestStruct>;
 
 TEST_P(TopKFilteringTest, FilterResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, TopKTestStruct::size);
     auto transform = TopKFilter(test_struct.top_k);
     transform.apply(logits);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_TRUE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, logits.m_vector.size());
+    ASSERT_EQ(logits.m_size, test_struct.expected_output.size());
+    for (size_t i = 0; i < logits.m_vector.size(); i++) {
+        EXPECT_NEAR(logits.m_vector[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
+        EXPECT_EQ(logits.m_vector[i].m_index, test_struct.expected_output[i].m_index);
     }
 }
 
 
 const std::vector<TopKTestStruct> TOP_K_TRANSFORM_TEST_CASES = {
-    {1, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} }, { {0.665241, 2} } },
-    {2, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} }, { {0.665241, 2}, {0.244728, 1} } },
-    {5, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} }, { {0.665241, 2}, {0.244728, 1}, {0.090031, 0} } },
+    {1, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2} } },
+    {2, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2}, {0.244728, 1} } },
 };
 
 INSTANTIATE_TEST_SUITE_P(VariousInputs,
                          TopKFilteringTest,
                          testing::ValuesIn(TOP_K_TRANSFORM_TEST_CASES));
 
+/*
+TODO: Uncomment when top_k transform condition is fixed
+
+TEST(TopKFilteringTest, FilterNotAppliedTopKGreaterThanInputSize) {
+    float input[]{0.090031, 0.244728, 0.665241};
+    float expected_output[]{0.090031, 0.244728, 0.665241}; // no change expected
+    size_t top_k = 5;
+    auto logits = Logits(input, 3);
+    auto transform = TopKFilter(top_k);
+    transform.apply(logits);
+    ASSERT_FALSE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, 3);
+    for (size_t i = 0; i < logits.m_size; i++) {
+        EXPECT_EQ(logits.m_data[i], expected_output[i]);
+    }
+}
+*/
+
 struct RepetitionPenaltyTransformTestStruct {
+    static inline const size_t size = 3;
+
     float penalty;
-    std::vector<Token> input;
+    float input[size];
     TokenIds input_ids;
-    std::vector<Token> expected_output;
+    float expected_output[size];
 };
 
 using RepetitionPenaltyTransformTest = testing::TestWithParam<RepetitionPenaltyTransformTestStruct>;
 
 TEST_P(RepetitionPenaltyTransformTest, TransformResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, RepetitionPenaltyTransformTestStruct::size);
     auto transform = RepetitionPenaltyTransform(test_struct.penalty);
     transform.apply(logits, test_struct.input_ids);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_FALSE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, RepetitionPenaltyTransformTestStruct::size); // penalty transform should not change buffer size
+    for (size_t i = 0; i < logits.m_size; i++) {
+        EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6);
     }
 }
 
 
@@ -131,21 +158,21 @@ TEST_P(RepetitionPenaltyTransformTest, TransformResultEqualToReference) {
 const std::vector<RepetitionPenaltyTransformTestStruct> REPETITION_PENALTY_TRANSFORM_TEST_CASES = {
     { // basic case, indices are applied, order is left as-is
         1.2f,
-        { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { 1.0f, 2.0f, 3.0f },
         { 2, 0 },
-        { {0.8333333f, 0}, {2.0f, 1}, {2.5f, 2} }
+        { 0.8333333f, 2.0f, 2.5f }
     },
     { // negative scores case
         2.0f,
-        { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { -1.0f, 2.0f, 3.0f },
         { 0, 1 },
-        { {-2.0f, 0}, {1.0f, 1}, {3.0f, 2} }
+        { -2.0f, 1.0f, 3.0f }
     },
     { // repeated tokens in prompt, check that the penalty is only applied once
         0.5f,
-        { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { -1.0f, 2.0f, 3.0f },
         { 1, 1 },
-        { {-1.0f, 0}, {4.0f, 1}, {3.0f, 2} }
+        { -1.0f, 4.0f, 3.0f }
     },
 };
@@ -155,30 +182,34 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs,
 
 TEST(RepetitionPenaltyTransformInitializationTest, ThrowsForInvalidInputIds) {
     auto transform = RepetitionPenaltyTransform(1.5);
-    std::vector<Token> input {{43.0f, 0}};
-    EXPECT_THROW(transform.apply(input, {1337}), ov::Exception);
-    input = {{18.0f, 0}};
-    EXPECT_THROW(transform.apply(input, {0, -1}), ov::Exception);
+    float input[]{43.0f};
+    Logits logits(input, 1);
+    EXPECT_THROW(transform.apply(logits, {1337}), ov::Exception);
+    input[0] = {18.0f};
+    EXPECT_THROW(transform.apply(logits, {0, -1}), ov::Exception);
 }
 
+
 struct FrequencyPenaltyTransformTestStruct {
+    static inline const size_t size = 3;
+
     float penalty;
-    std::vector<Token> input;
+    float input[size];
     TokenIds input_ids;
-    std::vector<Token> expected_output;
+    float expected_output[size];
 };
 
 using FrequencyPenaltyTransformTest = testing::TestWithParam<FrequencyPenaltyTransformTestStruct>;
 
 TEST_P(FrequencyPenaltyTransformTest, TransformResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, FrequencyPenaltyTransformTestStruct::size);
     auto transform = FrequencyPenaltyTransform(test_struct.penalty);
     transform.apply(logits, test_struct.input_ids);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_FALSE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, FrequencyPenaltyTransformTestStruct::size); // penalty transform should not change buffer size
+    for (size_t i = 0; i < logits.m_size; i++) {
+        EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6);
     }
 };
 
 
@@ -186,21 +217,21 @@ TEST_P(FrequencyPenaltyTransformTest, TransformResultEqualToReference) {
 const std::vector<FrequencyPenaltyTransformTestStruct> FREQUENCY_PENALTY_TRANSFORM_TEST_CASES = {
     { // basic case, indices are applied, order is left as-is
         0.5f,
-        { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { -1.0f, 2.0f, 3.0f },
         { 1, 0 },
-        { {-0.5f, 0}, {1.5f, 1}, {3.0f, 2} }
+        { -0.5f, 1.5f, 3.0f }
     },
     { // negative scores case
        -0.6f,
-        { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { -1.0f, 2.0f, 3.0f },
         { 0, 1, 1 },
-        { {-1.6f, 0}, {3.2f, 1}, {3.0f, 2} }
+        { -1.6f, 3.2f, 3.0f }
     },
     { // repeated tokens in prompt, check that the penalty is only applied once
         0.2f,
-        { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { 1.0f, 2.0f, 3.0f },
         { 2, 0, 2 },
-        { {0.8f, 0}, {2.0f, 1}, {2.6f, 2} }
+        { 0.8f, 2.0f, 2.6f }
     },
 };
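The frequency and presence fixtures differ only in whether the occurrence count matters: with penalty -0.6 and generated ids {0, 1, 1}, token 1 gets 2 * 0.6 added under the frequency penalty (2.0 becomes 3.2) but only 0.6 under the presence penalty (2.0 becomes 2.6), which is exactly the difference between the two expected-output arrays. A small standalone check of that rule, for illustration only:

```cpp
#include <cassert>
#include <cmath>
#include <map>

// Frequency scales the penalty by how often a token was generated; presence applies it once.
void frequency_vs_presence_check() {
    const float penalty = -0.6f;
    const std::map<int, int> counts = {{0, 1}, {1, 2}};   // generated ids {0, 1, 1}

    float freq[3] = {-1.0f, 2.0f, 3.0f};
    float pres[3] = {-1.0f, 2.0f, 3.0f};

    for (const auto& [id, count] : counts) {
        if (freq[id] >= 0) freq[id] -= penalty * count; else freq[id] += penalty * count;
        if (pres[id] >= 0) pres[id] -= penalty;         else pres[id] += penalty;
    }

    assert(std::abs(freq[0] + 1.6f) < 1e-6f && std::abs(pres[0] + 1.6f) < 1e-6f);
    assert(std::abs(freq[1] - 3.2f) < 1e-6f && std::abs(pres[1] - 2.6f) < 1e-6f);
    assert(freq[2] == 3.0f && pres[2] == 3.0f);   // token 2 was never generated
}
```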
@@ -210,31 +241,34 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs,
 
 TEST(FrequencyPenaltyTransformInitializationTest, ThrowsForInvalidInputIds) {
     auto transform = FrequencyPenaltyTransform(1.5);
-    std::vector<Token> input {{43.0f, 0}};
-    EXPECT_THROW(transform.apply(input, {1337}), ov::Exception);
-    input = {{18.0f, 0}};
-    EXPECT_THROW(transform.apply(input, {0, -1}), ov::Exception);
+    float input[]{43.0f};
+    Logits logits(input, 1);
+    EXPECT_THROW(transform.apply(logits, {1337}), ov::Exception);
+    input[0] = {18.0f};
+    EXPECT_THROW(transform.apply(logits, {0, -1}), ov::Exception);
 }
 
 struct PresencePenaltyTransformTestStruct {
+    static inline const size_t size = 3;
+
     float penalty;
-    std::vector<Token> input;
+    float input[size];
     TokenIds input_ids;
-    std::vector<Token> expected_output;
+    float expected_output[size];
 };
 
 using PresencePenaltyTransformTest = testing::TestWithParam<PresencePenaltyTransformTestStruct>;
 
 TEST_P(PresencePenaltyTransformTest, TransformResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, PresencePenaltyTransformTestStruct::size);
     auto transform = PresencePenaltyTransform(test_struct.penalty);
     transform.apply(logits, test_struct.input_ids);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_FALSE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, PresencePenaltyTransformTestStruct::size); // penalty transform should not change buffer size
+    for (size_t i = 0; i < logits.m_size; i++) {
+        EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6);
     }
 };
 
 
@@ -242,21 +276,21 @@ TEST_P(PresencePenaltyTransformTest, TransformResultEqualToReference) {
 const std::vector<PresencePenaltyTransformTestStruct> PRESENCE_PENALTY_TRANSFORM_TEST_CASES = {
     { // basic case, indices are applied, order is left as-is
         0.5f,
-        { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { -1.0f, 2.0f, 3.0f },
         { 1, 0 },
-        { {-0.5f, 0}, {1.5f, 1}, {3.0f, 2} }
+        { -0.5f, 1.5f, 3.0f }
     },
     { // negative scores case
         -0.6f,
-        { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { -1.0f, 2.0f, 3.0f },
         { 0, 1, 1 },
-        { {-1.6f, 0}, {2.6f, 1}, {3.0f, 2} }
+        { -1.6f, 2.6f, 3.0f }
     },
     { // repeated tokens in prompt, check that the penalty is only applied once
         0.2f,
-        { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
+        { 1.0f, 2.0f, 3.0f },
         { 2, 0, 2 },
-        { {0.8f, 0}, {2.0f, 1}, {2.8f, 2} }
+        { 0.8f, 2.0f, 2.8f }
     },
 };
 
 INSTANTIATE_TEST_SUITE_P(VariousInputs,
@@ -266,29 +300,32 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs,
 
 TEST(PresencePenaltyTransformInitializationTest, ThrowsForInvalidInputIds) {
     auto transform = PresencePenaltyTransform(1.5);
-    std::vector<Token> input {{43.0f, 0}};
-    EXPECT_THROW(transform.apply(input, {1337}), ov::Exception);
-    input = {{18.0f, 0}};
-    EXPECT_THROW(transform.apply(input, {0, -1}), ov::Exception);
+    float input[]{43.0f};
+    Logits logits(input, 1);
+    EXPECT_THROW(transform.apply(logits, {1337}), ov::Exception);
+    input[0] = {18.0f};
+    EXPECT_THROW(transform.apply(logits, {0, -1}), ov::Exception);
 }
 
 struct EOSPenaltyTransformTestStruct {
+    static inline const size_t size = 3;
+
     size_t eos_token_id;
-    std::vector<Token> input;
-    std::vector<Token> expected_output;
+    float input[size];
+    float expected_output[size];
 };
 
 using EOSPenaltyTransformTest = testing::TestWithParam<EOSPenaltyTransformTestStruct>;
 
 TEST_P(EOSPenaltyTransformTest, TransformResultEqualToReference) {
     auto test_struct = GetParam();
-    auto logits = test_struct.input;
+    auto logits = Logits(test_struct.input, EOSPenaltyTransformTestStruct::size);
     auto transform = EOSPenaltyTransform(test_struct.eos_token_id, std::numeric_limits<size_t>::max());
     transform.apply(logits);
-    ASSERT_EQ(logits.size(), test_struct.expected_output.size());
-    for (size_t i = 0; i < logits.size(); i++) {
-        EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6);
-        EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index);
+    ASSERT_FALSE(logits.is_vector_initialized());
+    ASSERT_EQ(logits.m_size, EOSPenaltyTransformTestStruct::size); // penalty transform should not change buffer size
+    for (size_t i = 0; i < logits.m_size; i++) {
+        EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6);
     }
 }
 
 
@@ -296,11 +333,12 @@ TEST_P(EOSPenaltyTransformTest, TransformResultEqualToReference) {
 const std::vector<EOSPenaltyTransformTestStruct> EOS_PENALTY_TRANSFORM_TEST_CASES = {
     { // basic case, indices are applied, order is left as-is
         1,
-        { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} },
-        { {1.0f, 0}, {0.0f, 1}, {3.0f, 2} },
+        { 1.0f, 2.0f, 3.0f },
+        { 1.0f, 0.0f, 3.0f },
     },
 };
 
 INSTANTIATE_TEST_SUITE_P(VariousInputs,
                          EOSPenaltyTransformTest,
-                         testing::ValuesIn(EOS_PENALTY_TRANSFORM_TEST_CASES));
\ No newline at end of file
+                         testing::ValuesIn(EOS_PENALTY_TRANSFORM_TEST_CASES));
+