Commit

init
gtest adjustment

coverity

revert top_k fix

remove additional timers

is_vector_initialized
mzegla committed Jul 24, 2024
1 parent 42dd049 commit 5c463c4
Showing 11 changed files with 266 additions and 176 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:22.04

ARG JOBS
WORKDIR /workspace
-RUN apt-get update -y && apt-get install -y python3-pip python3-venv git
+RUN apt-get update -y && apt-get install -y --no-install-recommends python3-pip python3-venv git

# Install OpenVINO
RUN git clone --branch master https://github.com/openvinotoolkit/openvino.git && \
@@ -25,14 +25,14 @@ ENV OpenVINO_DIR=/workspace/openvino_build
RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Build GenAI library with dependencies
-RUN git clone https://github.com/Wovchena/openvino.genai-public.git -b reuse-Tokenizer openvino.genai && \
+RUN git clone https://github.com/openvinotoolkit/openvino.genai.git && \
cd /workspace/openvino.genai/thirdparty && git submodule update --remote --init && \
mkdir /workspace/openvino.genai/build && cd /workspace/openvino.genai/build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j${JOBS}

# Install test dependencies
RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/continuous_batching/requirements.txt
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/requirements.txt
ENV PYTHONPATH=/workspace/openvino.genai/build/
ENV LD_LIBRARY_PATH=/workspace/openvino.genai/build/
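For reference, building this image typically looks like the following; the tag is illustrative, and JOBS feeds the make -j${JOBS} invocation above:

docker build --build-arg JOBS=8 -t openvino-genai-cb .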
src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -56,13 +56,13 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {

    PipelineMetrics get_metrics() const;

-    GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params);
+    GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params);

    void step();

    bool has_non_finished_requests();

    // more high level interface, which can process multiple prompts in continuous batching manner
-    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, std::vector<ov::genai::GenerationConfig> sampling_params);
+    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params);
};
}
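For orientation, a minimal driver of this interface might look as follows. This is a hedged sketch: the constructor arguments, model path, and scheduler_config are assumptions, not part of this commit.

// Sketch only: pipeline construction is assumed, not shown in this diff.
ov::genai::ContinuousBatchingPipeline pipe("/path/to/model", scheduler_config);

ov::genai::GenerationConfig params;  // default sampling parameters assumed
GenerationHandle handle = pipe.add_request(/*request_id=*/0, "What is OpenVINO?", params);

// Drive the scheduler until every queued request has finished.
while (pipe.has_non_finished_requests()) {
    pipe.step();
}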
2 changes: 1 addition & 1 deletion src/cpp/include/openvino/genai/generation_handle.hpp
@@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {

public:
    GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) :
-        m_generation_stream(generation_stream),
+        m_generation_stream(std::move(generation_stream)),
        m_sampling_params(sampling_params) {};

    ~GenerationHandleImpl();
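The constructor change above is the standard sink-argument idiom: a std::shared_ptr taken by value is moved into the member, saving one atomic reference-count increment. A minimal standalone sketch of the pattern, independent of this codebase:

#include <memory>
#include <utility>

struct Holder {
    std::shared_ptr<int> m_ptr;
    // Moving the by-value parameter transfers ownership without
    // bumping the reference count a second time.
    explicit Holder(std::shared_ptr<int> ptr) : m_ptr(std::move(ptr)) {}
};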
4 changes: 2 additions & 2 deletions src/cpp/src/block_manager.hpp
@@ -257,7 +257,7 @@ class BlockManager {
    }

    bool can_append_slots(SequenceGroup::CPtr seq_group) {
-        return required_blocks_count(seq_group) <= m_allocator.num_free_blocks();
+        return required_blocks_count(std::move(seq_group)) <= m_allocator.num_free_blocks();
    }

    size_t required_blocks_count(SequenceGroup::CPtr seq_group) {
@@ -336,7 +336,7 @@
                // write information about block forking for later usage in CacheManager
                copy_blocks_map[last_block->get_index()].push_back(new_block->get_index());
                // release `last_block` usage
-                m_allocator.free(last_block);
+                m_allocator.free(std::move(last_block));
            } else {
                // nothing to do, because we are the only users of this block
            }
8 changes: 4 additions & 4 deletions src/cpp/src/continuous_batching_pipeline.cpp
@@ -238,7 +238,7 @@ class ContinuousBatchingPipeline::Impl {
        return !m_awaiting_requests.empty() || !m_requests.empty();
    }

-    std::vector<GenerationResult> generate(const std::vector<std::string> prompts, std::vector<ov::genai::GenerationConfig> sampling_params) {
+    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params) {
        OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
        OPENVINO_ASSERT(prompts.size() == sampling_params.size());

@@ -307,8 +307,8 @@ PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{
    return m_impl->get_metrics();
}

-GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) {
-    return m_impl->add_request(request_id, prompt, sampling_params);
+GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) {
+    return m_impl->add_request(request_id, prompt, std::move(sampling_params));
}

void ContinuousBatchingPipeline::step() {
@@ -319,6 +319,6 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() {
    return m_impl->has_non_finished_requests();
}

-std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, std::vector<ov::genai::GenerationConfig> sampling_params) {
+std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params) {
    return m_impl->generate(prompts, sampling_params);
}
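A hedged example of the batch entry point; the construction of pipe is again assumed. Note that generate() asserts both that the pipeline is idle and that the two vectors have equal length:

std::vector<std::string> prompts = {"Hello", "What is your name?"};
std::vector<ov::genai::GenerationConfig> configs(prompts.size());  // one config per prompt, defaults assumed
std::vector<GenerationResult> results = pipe.generate(prompts, configs);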
2 changes: 1 addition & 1 deletion src/cpp/src/generation_stream.hpp
@@ -27,7 +27,7 @@ class GenerationStream {
    }

    void push(GenerationOutputs outputs) {
-        m_output_queue.push(outputs);
+        m_output_queue.push(std::move(outputs));
    }

    // Retriving vector of pairs <sequence_id, token_id> as we can generate multiple outputs for a single prompt
144 changes: 94 additions & 50 deletions src/cpp/src/logit_processor.hpp
@@ -16,12 +16,38 @@ struct Token {
    Token() = default;
};

+struct Logits {
+    float * m_data = nullptr;
+    size_t m_size;
+    // Late initialized for top_p or top_k transforms
+    std::vector<Token> m_vector;
+
+    Logits(float* data, size_t size): m_data(data), m_size(size) {}
+
+    void initialize_vector() {
+        OPENVINO_ASSERT(m_vector.size() == 0, "Logits vector already initialized");
+        m_vector.reserve(m_size);
+        for (size_t i = 0; i < m_size; i++)
+            m_vector.emplace_back(m_data[i], i);
+    }
+
+    bool is_vector_initialized() const {
+        return m_vector.size() > 0;
+    }
+
+    void resize(size_t new_size) {
+        m_size = new_size;
+        m_vector.resize(new_size);
+    }
+};
+
namespace LogitTransformers {
using TokenIds = std::vector<int64_t>;

class ILogitTransformer {
public:
-    virtual void apply(std::vector<Token>& logits) = 0;
+    virtual void apply(Logits& logits) = 0;

    virtual bool is_applicable(size_t generated_tokens_cnt = 0) {
        return true;
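The new Logits wrapper is the core of this commit: penalty and temperature transforms now mutate the raw float buffer in place, and only top_p/top_k pay for materializing the sorted (log_prob, index) vector. A hedged usage sketch with an invented buffer:

// Toy stand-in for one row of a real logits tensor.
float scores[4] = {1.5f, 0.2f, -0.7f, 3.1f};
Logits logits(scores, 4);

// In-place transforms read and write logits.m_data directly;
// vector-based transforms lazily build the (value, index) pairs:
if (!logits.is_vector_initialized())
    logits.initialize_vector();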
@@ -32,11 +58,15 @@ class TopPFilter : public ILogitTransformer {
public:
    TopPFilter(double top_p) : m_top_p(top_p) {}

-    void apply(std::vector<Token>& logits) override {
-        std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+    void apply(Logits& logits) override {
+        if (!logits.is_vector_initialized()) {
+            // Initialize and sort vector
+            logits.initialize_vector();
+            std::sort(logits.m_vector.begin(), logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+        }
        float probability_sum = 0.0f;
        size_t nucleus_size = 0;
-        for (const auto& probability : logits) {
+        for (const auto& probability : logits.m_vector) {
            probability_sum += probability.m_log_prob;
            nucleus_size += 1;
            if (probability_sum > m_top_p) break;
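To make the nucleus (top-p) cutoff concrete: with probabilities sorted in descending order as {0.5, 0.3, 0.1, 0.1} and top_p = 0.7, the running sum first exceeds 0.7 at the second token, so the nucleus keeps {0.5, 0.3}. A standalone sketch of the same cutoff rule:

#include <cstdio>

int main() {
    float probs[4] = {0.5f, 0.3f, 0.1f, 0.1f};  // already sorted descending
    float top_p = 0.7f;
    float sum = 0.0f;
    int nucleus_size = 0;
    for (int i = 0; i < 4; i++) {
        sum += probs[i];
        nucleus_size++;
        if (sum > top_p) break;  // the token that crosses the threshold stays in
    }
    std::printf("nucleus keeps %d tokens\n", nucleus_size);  // prints 2
    return 0;
}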
@@ -52,10 +82,26 @@ class TopKFilter : public ILogitTransformer {
public:
    TopKFilter(size_t top_k) : m_top_k(top_k) {}

-    void apply(std::vector<Token>& logits) override {
-        std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
-        size_t top_k = logits.size() >= m_top_k ? m_top_k : logits.size();
-        logits.resize(top_k);
+    // If this transform is used along with top_p, it should be applied after it since top_p sorts entire vector and top_k does it only partially
+    void apply(Logits& logits) override {
+
+        /*
+        TODO: Uncommenting this section requires changes in reference texts in tests
+        if (m_top_k >= logits.m_size)
+            return;
+        */
+
+        if (!logits.is_vector_initialized()) {
+            // Initialize and partially sort vector
+            logits.initialize_vector();
+            // TODO: Uncommenting below requires uncommenting section above
+            // std::partial_sort(logits.m_vector.begin(), logits.m_vector.begin() + m_top_k, logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+
+            std::sort(logits.m_vector.begin(), logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+        }
+        if (m_top_k < logits.m_size)
+            logits.resize(m_top_k);
    }

protected:
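As the TODOs indicate, once the early return for m_top_k >= m_size is enabled, std::partial_sort suffices: only the k largest entries ever need to be ordered. A self-contained sketch of that cheaper variant (the function name is invented here):

#include <algorithm>
#include <vector>

// Keep only the k highest scores; this mirrors the intended
// partial_sort path rather than the full std::sort currently used.
std::vector<float> top_k_scores(std::vector<float> scores, size_t k) {
    if (k >= scores.size())
        return scores;  // nothing to cut
    std::partial_sort(scores.begin(), scores.begin() + k, scores.end(),
                      [](float lhs, float rhs) { return lhs > rhs; });
    scores.resize(k);
    return scores;
}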
@@ -66,18 +112,23 @@ class TemperatureLogitTransform : public ILogitTransformer {
public:
    TemperatureLogitTransform(double temperature) : m_temperature(temperature) {};

-    void apply(std::vector<Token>& logits) override {
-        auto max_prob_token = std::max_element(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; });
-        float max_logit = max_prob_token->m_log_prob;
-
-        std::for_each(logits.begin(), logits.end(), [max_logit, this](Token& val) {val.m_log_prob = expf((val.m_log_prob - max_logit) / this->m_temperature);});
+    void apply(Logits& logits) override {
+        float max_logit = -std::numeric_limits<float>::infinity();
+        for (size_t i = 0; i < logits.m_size; i++) {
+            if (logits.m_data[i] > max_logit) {
+                max_logit = logits.m_data[i];
+            }
+        }

        float norm_sum = 0.0;
-        for (const auto& val : logits) {
-            norm_sum += val.m_log_prob;
+        for (size_t i = 0; i < logits.m_size; i++) {
+            logits.m_data[i] = expf((logits.m_data[i] - max_logit) / this->m_temperature);
+            norm_sum += logits.m_data[i];
        }

-        std::for_each(logits.begin(), logits.end(), [norm_sum](Token& val) {val.m_log_prob /= norm_sum;});
+        for (size_t i = 0; i < logits.m_size; i++) {
+            logits.m_data[i] /= norm_sum;
+        }
    }

protected:
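The rewrite above is a numerically stable softmax with temperature applied directly to the raw buffer: p_i = expf((l_i - max_l) / T) / norm_sum. Subtracting the maximum logit first keeps expf from overflowing; T < 1 sharpens the distribution, T > 1 flattens it. A tiny standalone check of the same arithmetic:

#include <cmath>
#include <cstdio>

int main() {
    float logits[3] = {2.0f, 1.0f, 0.0f};
    const float temperature = 0.5f;  // illustrative value
    const float max_logit = 2.0f;    // max of the toy logits above
    float norm_sum = 0.0f;
    for (int i = 0; i < 3; i++) {
        logits[i] = std::exp((logits[i] - max_logit) / temperature);
        norm_sum += logits[i];
    }
    for (int i = 0; i < 3; i++)
        std::printf("p[%d] = %.4f\n", i, logits[i] / norm_sum);  // probabilities sum to 1
    return 0;
}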
@@ -118,32 +169,28 @@ class RepetitionPenaltyTransform : public IPenaltyTransformer {
        m_penalty = repetition_penalty;
    };

-    void apply(std::vector<Token>& logits) override {
-        size_t vocab_size = logits.size();
+    void apply(Logits& logits) override {
+        size_t vocab_size = logits.m_size;
        for (const auto& prompt_id : *m_unique_prompt_token_ids) {
            OPENVINO_ASSERT((prompt_id >= 0) && (prompt_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[prompt_id].m_index == prompt_id, "input_logits must have original index order");
-            auto logit_value = logits[prompt_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[prompt_id].m_log_prob /= m_penalty;
+            if (logits.m_data[prompt_id] >= 0) {
+                logits.m_data[prompt_id] /= m_penalty;
            } else {
-                logits[prompt_id].m_log_prob *= m_penalty;
+                logits.m_data[prompt_id] *= m_penalty;
            };
        }
        for (const auto& input_id_pair : *m_unique_generated_token_ids) {
            const auto& input_id = input_id_pair.first;
            OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order");
-            auto logit_value = logits[input_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[input_id].m_log_prob /= m_penalty;
+            if (logits.m_data[input_id] >= 0) {
+                logits.m_data[input_id] /= m_penalty;
            } else {
-                logits[input_id].m_log_prob *= m_penalty;
+                logits.m_data[input_id] *= m_penalty;
            };
        }
    }

-    void apply(std::vector<Token>& logits, const TokenIds& input_ids) {
+    void apply(Logits& logits, const TokenIds& input_ids) {
        set_unique_prompt_token_ids(nullptr);
        extract_generated_tokens(input_ids);
        apply(logits);
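The sign-dependent update above makes every token seen in the prompt or in the generated text less likely for penalty > 1: a positive logit is divided by the penalty, a negative one is multiplied by it. For example, with penalty 2.0 a logit of 3.0 becomes 1.5 and a logit of -3.0 becomes -6.0, both of which lower the token's probability after softmax.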
@@ -166,10 +213,10 @@ class EOSPenaltyTransform : public ILogitTransformer {
    EOSPenaltyTransform(size_t eos_token_id, size_t min_generated_tokens) :
        m_eos_token_id(eos_token_id), m_applicable_tensor_len(min_generated_tokens) {}

-    void apply(std::vector<Token>& logits) override {
-        // Since EOS penalty is applied early, the token vector is not sorted
+    void apply(Logits& logits) override {
+        // Since EOS penalty is applied early, the token vector is not initialized yet
        // and we can assume element order match token ids.
-        logits[m_eos_token_id].m_log_prob = 0.f;
+        logits.m_data[m_eos_token_id] = 0.f;
    }


@@ -188,22 +235,20 @@ class FrequencyPenaltyTransform : public IPenaltyTransformer {
        m_penalty = value;
    };

-    void apply(std::vector<Token>& logits) override {
-        size_t vocab_size = logits.size();
+    void apply(Logits& logits) override {
+        size_t vocab_size = logits.m_size;
        for (const auto& input_id_pair : *m_unique_generated_token_ids) {
            const auto& input_id = input_id_pair.first;
            OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order");
-            auto logit_value = logits[input_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[input_id].m_log_prob -= m_penalty * input_id_pair.second;
+            if (logits.m_data[input_id] >= 0) {
+                logits.m_data[input_id] -= m_penalty * input_id_pair.second;
            } else {
-                logits[input_id].m_log_prob += m_penalty * input_id_pair.second;
+                logits.m_data[input_id] += m_penalty * input_id_pair.second;
            };
        }
    }

-    void apply(std::vector<Token>& logits, const TokenIds& input_ids) {
+    void apply(Logits& logits, const TokenIds& input_ids) {
        extract_generated_tokens(input_ids);
        apply(logits);
    }
@@ -215,22 +260,20 @@ class PresencePenaltyTransform : public IPenaltyTransformer {
        m_penalty = value;
    };

-    void apply(std::vector<Token>& logits) override {
-        size_t vocab_size = logits.size();
+    void apply(Logits& logits) override {
+        size_t vocab_size = logits.m_size;
        for (const auto& input_id_pair : *m_unique_generated_token_ids) {
            const auto& input_id = input_id_pair.first;
            OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds");
-            OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order");
-            auto logit_value = logits[input_id].m_log_prob;
-            if (logit_value >= 0) {
-                logits[input_id].m_log_prob -= m_penalty;
+            if (logits.m_data[input_id] >= 0) {
+                logits.m_data[input_id] -= m_penalty;
            } else {
-                logits[input_id].m_log_prob += m_penalty;
+                logits.m_data[input_id] += m_penalty;
            };
        }
    }

-    void apply(std::vector<Token>& logits, const TokenIds& input_ids) {
+    void apply(Logits& logits, const TokenIds& input_ids) {
        extract_generated_tokens(input_ids);
        apply(logits);
    }
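For reference on the two penalties above, which follow the usual OpenAI-style definitions: the frequency penalty scales with the generation count (m_penalty * input_id_pair.second), while the presence penalty is a flat m_penalty applied once to any token that has appeared at all. With a value of 0.5 and positive logits, a token generated five times loses 2.5 under the frequency penalty but only 0.5 under the presence penalty.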
@@ -286,14 +329,15 @@ class LogitProcessor {
                if (sampling_params.top_p != 1.0f) {
                    m_logit_transformers.emplace_back(new LogitTransformers::TopPFilter(sampling_params.top_p));
                }
-                if (sampling_params.top_k > 0) {
+                // TODO: Uncommenting below condition requires changes in reference texts in tests
+                if (sampling_params.top_k > 0 /* && sampling_params.top_k < std::numeric_limits<size_t>::max() */) {
                    m_logit_transformers.emplace_back(new LogitTransformers::TopKFilter(sampling_params.top_k));
                }
            }
        }
    }

-    void apply(std::vector<Token>& logits) {
+    void apply(Logits& logits) {
        for (const auto& transformer : m_logit_transformers) {
            if (transformer->is_applicable(m_generated_tokens)) {
                transformer->apply(logits);
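Putting the pieces together: the processor walks its registered transformers in order, and each one sees the Logits view left by its predecessor, with in-place penalties and temperature first, then the vector-based top_p/top_k. A hedged end-to-end sketch; the buffer, vocabulary size, and processor instance are invented:

#include <vector>

std::vector<float> row(32000);          // stands in for one row of the model's output tensor
Logits logits(row.data(), row.size());  // vocab-sized view over the raw scores
logit_processor.apply(logits);          // penalties/temperature touch m_data; top_p/top_k build m_vector
// Sampling then reads logits.m_vector if it was initialized, or m_data otherwise.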
2 changes: 1 addition & 1 deletion src/cpp/src/model_runner.hpp
@@ -19,7 +19,7 @@ class ModelRunner {
    SchedulerConfig m_scheduler_config;
public:
    ModelRunner(ov::InferRequest request, const SchedulerConfig& scheduler_config) :
-        m_request(request),
+        m_request(std::move(request)),
        m_scheduler_config(scheduler_config) { }

    ov::InferRequest get_infer_request() const {