Skip to content

Commit

Permalink
Enable prefix caching
Browse files · Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov committed Jan 2, 2025
1 parent bf462f5 commit 2f99472
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ ov::genai::LLMPipeline::LLMPipeline(
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
Expand Down Expand Up @@ -162,6 +163,7 @@ ov::genai::LLMPipeline::LLMPipeline(
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
Expand Down Expand Up @@ -228,6 +230,7 @@ ov::genai::LLMPipeline::LLMPipeline(
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
Expand Down

0 comments on commit 2f99472

Please sign in to comment.