diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 7c34ca7f66..23d9006d07 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -74,7 +74,7 @@ jobs:
           tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -90,7 +90,7 @@ jobs:
           tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           tokenized = tokenizer('69', return_tensors='pt')
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -106,7 +106,7 @@ jobs:
           tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           tokenized = tokenizer('Hi', return_tensors='pt')
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -122,7 +122,7 @@ jobs:
           tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           tokenized = tokenizer('return 0', return_tensors='pt')
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -138,7 +138,7 @@ jobs:
           tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -160,7 +160,7 @@ jobs:
           for prompt in prompts:
               tokenized = tokenizer(prompt, return_tensors='pt')
               for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-                  ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+                  ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
                   idx = predictions.find(ref)
                   if -1 == idx:
                       raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -201,7 +201,7 @@ jobs:
           echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py
           echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
           echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py
-          echo     ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py
+          echo     ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
           echo     idx = predictions.find(ref) >> ref.py
           echo     if -1 == idx: >> ref.py
           echo         raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
@@ -441,7 +441,7 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
           tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
           for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
-              ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -486,7 +486,7 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
           tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
           for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
-              ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
               if -1 == idx:
                   raise RuntimeError(f'Missing "{ref}" from predictions')
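
Note on the workflow hunks above: every one of them drops the trailing '\n' from the reference string, so each transformers-generated continuation only has to appear as a bare substring of the sample's output instead of ending exactly at a line break. For illustration only, a minimal C++ sketch of the same containment check; the helper name and signature are hypothetical, not code from this repo:

    #include <stdexcept>
    #include <string>
    #include <vector>

    // Fail if any reference continuation is absent from the sample's stdout,
    // mirroring the Python `predictions.find(ref)` check in the hunks above.
    void expect_all_found(const std::string& predictions, const std::vector<std::string>& refs) {
        for (const std::string& ref : refs) {
            if (predictions.find(ref) == std::string::npos) {
                throw std::runtime_error("Missing \"" + ref + "\" from predictions");
            }
        }
    }
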
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 68fa0c86ab..b392e44b3b 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -36,7 +36,12 @@ GenerationConfig::GenerationConfig(std::string json_path) {
     if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"];
     if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"];
     if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"];
-    if (data.contains("eos_token_id")) eos_token_id = data["eos_token_id"];
+
+    if (data.contains("eos_token_id") && data["eos_token_id"].type() == nlohmann::json::value_t::number_integer) {
+        // todo: qwen contains several eos_token_id
+        eos_token_id = data["eos_token_id"];
+    }
+
     if (data.contains("bos_token")) bos_token = data["bos_token"];
     if (data.contains("eos_token")) eos_token = data["eos_token"];
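
The new guard only reads "eos_token_id" when it is a plain JSON integer: models such as Qwen publish an array of ids in generation_config.json, which the old unconditional assignment to an integer field could not handle. A possible follow-up sketch that also accepts the array form by keeping its first id; the helper is hypothetical and not part of this patch:

    #include <cstdint>
    #include <nlohmann/json.hpp>

    // generation_config.json may store "eos_token_id" as a single integer or,
    // e.g. for Qwen, as an array of ids. Fall back when neither form applies.
    int64_t read_first_eos_token_id(const nlohmann::json& data, int64_t fallback) {
        if (!data.contains("eos_token_id"))
            return fallback;
        const nlohmann::json& eos = data["eos_token_id"];
        if (eos.is_number_integer())
            return eos.get<int64_t>();
        if (eos.is_array() && !eos.empty() && eos.front().is_number_integer())
            return eos.front().get<int64_t>();  // keep the first id, ignore the rest
        return fallback;
    }

The patch itself deliberately leaves the array case for later, as its todo comment notes.
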
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 09d64460a2..75c18734d3 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -80,7 +80,7 @@ class Tokenizer::TokenizerImpl {
             m_bos_token_id = rt_info["bos_token_id"].as<int64_t>();
         if (rt_info.count("pad_token_id") > 0)
             m_pad_token_id = rt_info["pad_token_id"].as<int64_t>();
-    }
+    }

    std::pair<ov::Tensor, ov::Tensor> encode(std::string prompt) {
        size_t batch_size = 1;
@@ -94,6 +94,13 @@ class Tokenizer::TokenizerImpl {
         auto size_ = m_tokenize_request.get_input_tensor().get_shape();
         m_tokenize_request.infer();
         pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
+
+        // todo: fix mask filled with '2' instead of '0'
+        // https://github.com/openvinotoolkit/openvino_tokenizers/pull/90 should've fixed this
+        ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
+        int64_t* attention_mask_data = attention_mask.data<int64_t>();
+        std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
+
         return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
     }
diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
index 3b40529f38..1afc5f93ed 100644
--- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -3,6 +3,10 @@

 #include

+namespace {
+    enum SPECIAL_TOKEN { PAD_TOKEN = 2 };
+}
+
 int main(int argc, char* argv[]) try {
     if (argc < 3) {
         throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT 1>' ['<PROMPT 2>' ...]");
@@ -14,11 +18,14 @@ int main(int argc, char* argv[]) try {
     ov::LLMPipeline pipe(model_path, device);

     ov::GenerationConfig config = pipe.get_generation_config();
-    config.max_new_tokens = 100;
+    config.max_new_tokens = 20;
     config.num_beam_groups = 3;
     config.num_beams = 15;
     config.num_return_sequences = config.num_beams * prompts.size();
+
+    // workaround until pad_token_id is written into the IR
+    pipe.get_tokenizer().set_pad_token_id(PAD_TOKEN);

     auto beams = pipe.generate(prompts, config);
     for (int i = 0; i < beams.scores.size(); i++)
         std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n';
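
Both remaining workarounds revolve around the same pad token value: the converted tokenizer currently fills padded attention-mask positions with '2' rather than '0' (see the linked openvino_tokenizers PR), and the tokenizer IR does not yet carry pad_token_id, so the beam search sample pins it to PAD_TOKEN = 2 by hand. A minimal standalone sketch of the mask normalization, with a plain std::vector standing in for the ov::Tensor buffer:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        // Left-padded mask as currently produced: '2' marks the padded positions.
        std::vector<int64_t> attention_mask{2, 2, 1, 1, 1};
        // Same std::replace call as in Tokenizer::TokenizerImpl::encode above.
        std::replace(attention_mask.begin(), attention_mask.end(), int64_t{2}, int64_t{0});
        for (int64_t v : attention_mask) std::cout << v << ' ';  // prints: 0 0 1 1 1
        std::cout << '\n';
    }

Once openvino_tokenizers emits conventional 0/1 masks and pad_token_id is serialized into the IR, both the std::replace call and the set_pad_token_id(PAD_TOKEN) line should become unnecessary, as the todo comments in the diff indicate.
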