Fixing requests and concurrency (prefill_profiler)
anmolagarwalcp810 committed Jul 17, 2024
1 parent 65f379c commit 077a661
Showing 3 changed files with 9 additions and 15 deletions.
6 changes: 0 additions & 6 deletions README.md
@@ -115,10 +115,7 @@ Launch any open source system and setup API keys and URL as shown for [vLLM](#ru
```bash
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
- --max-num-completed-requests 1 \
--timeout 600 \
- --num-ray-clients 1 \
- --num-concurrent-requests-per-client 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
--should-use-given-dir true
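
With those flags removed, the README example presumably reduces to the following. This is a sketch assembled from the unchanged lines of the hunk above, not a verbatim copy of the post-commit README:

```bash
python -m metron.prefill_profiler \
  --model "meta-llama/Meta-Llama-3-8B-Instruct" \
  --timeout 600 \
  --fixed-request-generator-decode-tokens 16 \
  --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
  --should-use-given-dir true
```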
@@ -128,10 +125,7 @@ To modify range of prompt tokens for which prefill times get profiled, use the f
```bash
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
- --max-num-completed-requests 1 \
--timeout 600 \
- --num-ray-clients 1 \
- --num-concurrent-requests-per-client 1 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
--should-use-given-dir true \
--prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
6 changes: 0 additions & 6 deletions docs/tutorials/prefill_profiler.rst
@@ -17,10 +17,7 @@ And, then run the following command:
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
- --max-num-completed-requests 1 \
--timeout 600 \
- --num-ray-clients 1 \
- --num-concurrent-requests-per-client 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b"
@@ -39,10 +36,7 @@ To profile a custom range of prompt lengths, use the flag ``--prefill-lengths``
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
- --max-num-completed-requests 1 \
--timeout 600 \
- --num-ray-clients 1 \
- --num-concurrent-requests-per-client 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
--prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
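
Likewise, the tutorial's custom-range example presumably becomes the following after this commit (again a sketch from the unchanged lines above; the concurrency flags are now fixed inside the profiler):

```bash
python -m metron.prefill_profiler \
  --model "meta-llama/Meta-Llama-3-8B-Instruct" \
  --timeout 600 \
  --fixed-request-generator-decode-tokens 16 \
  --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
  --prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
```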
12 changes: 9 additions & 3 deletions metron/prefill_profiler.py
@@ -31,6 +31,12 @@
PREFILL_POLYNOMIAL_DEGREE = 2
# RMSE threshold for the prefill time predictor
PREFILL_RMSE_THRESHOLD = 0.05
+ # Number of Ray clients to use for prefill profiling
+ PREFILL_NUM_RAY_CLIENTS = 1
+ # Number of concurrent requests per client for prefill profiling
+ PREFILL_NUM_CONCURRENT_REQUESTS_PER_CLIENT = 1
+ # Number of completed requests to wait for before stopping the prefill profiling for a prompt length
+ PREFILL_MAX_NUM_COMPLETED_REQUESTS = 1


class PrefillProfiler:
@@ -77,9 +83,9 @@ def run(self):
model=self.args.model,
output_dir=run_dir,
additional_sampling_params=self.args.additional_sampling_params,
- num_ray_clients=self.args.num_ray_clients,
- num_concurrent_requests_per_client=self.args.num_concurrent_requests_per_client,
- max_num_completed_requests=self.args.max_num_completed_requests,
+ num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
+ num_concurrent_requests_per_client=PREFILL_NUM_CONCURRENT_REQUESTS_PER_CLIENT,
+ max_num_completed_requests=PREFILL_MAX_NUM_COMPLETED_REQUESTS,
timeout=self.args.timeout,
llm_api=self.args.llm_api,
request_generator_config=request_generator_config,
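
For context, the Python change pins the profiler's concurrency: one Ray client, one concurrent request per client, and one completed request per prompt length, while `--timeout` stays a CLI argument. Below is a self-contained sketch of that pattern, not Metron's actual code: `launch_benchmark` and the argument parser here are placeholders for the real entry point, which this hunk does not show.

```python
import argparse

# Hardcoded profiling concurrency (values copied from the constants added above).
PREFILL_NUM_RAY_CLIENTS = 1
PREFILL_NUM_CONCURRENT_REQUESTS_PER_CLIENT = 1
PREFILL_MAX_NUM_COMPLETED_REQUESTS = 1


def launch_benchmark(**kwargs):
    # Stand-in for the real benchmark entry point, which this hunk does not show.
    for key, value in kwargs.items():
        print(f"{key} = {value}")


def main():
    parser = argparse.ArgumentParser(description="Toy model of the prefill profiler CLI")
    parser.add_argument("--model", required=True)
    parser.add_argument("--timeout", type=int, default=600)  # still user-configurable
    args = parser.parse_args()

    # Concurrency is no longer read from the CLI; the module-level constants are used instead.
    launch_benchmark(
        model=args.model,
        num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
        num_concurrent_requests_per_client=PREFILL_NUM_CONCURRENT_REQUESTS_PER_CLIENT,
        max_num_completed_requests=PREFILL_MAX_NUM_COMPLETED_REQUESTS,
        timeout=args.timeout,
    )


if __name__ == "__main__":
    main()
```

The upshot matches the README and docs edits above: the removed flags no longer appear in the examples, presumably because prefill times are meant to be measured with a single in-flight request per prompt length.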
