Skip to content

Commit

Permalink
Enable prefix caching
Browse files · Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov committed Jan 2, 2025
1 parent bf462f5 commit 2f99472
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ ov::genai::LLMPipeline::LLMPipeline(
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
Expand Down Expand Up @@ -162,6 +163,7 @@ ov::genai::LLMPipeline::LLMPipeline(
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
Expand Down Expand Up @@ -228,6 +230,7 @@ ov::genai::LLMPipeline::LLMPipeline(
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
Expand Down

0 comments on commit 2f99472

Please sign in to comment.