[Core] Introduce asyncio within Ray Actors handling LLMClient (#8)
* initial async io implementation

* timeout

* single client, multiple concurrent requests

* asyncio with ray actors

* Simplifying logic

* Remove print statements

* Use num-ray-clients and requests-per-client

* Minor bug fixes and cleanup

* Removed send_llm_request_ in between

* Initial fixes with print logs

* Removed print and test asyncio.sleep

* make format

* Error handling with streaming OpenAI client

* make format

* Removing timeout for large requests

* Update stop logic to use completed requests

* make format

* Update pbar once more

* Instantiate client only once and close after run.

* Fixing requests and concurrency (prefill_profiler)

* make format

* make format
anmolagarwalcp810 authored Jul 17, 2024
1 parent 852be91 commit c009611
Showing 19 changed files with 396 additions and 162 deletions.
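
Taken together, the changes replace the single `--num-concurrent-requests` knob with a two-level scheme: `--num-ray-clients` Ray actors are spawned, and each actor keeps `--num-concurrent-requests-per-client` requests in flight with asyncio against a now-async `send_llm_request`. A minimal sketch of that pattern, with actor and helper names that are illustrative rather than the ones used in the repo:

```python
import asyncio

import ray


@ray.remote
class RequestsLauncher:
    """Sketch of one benchmark client: a Ray actor whose async method keeps
    up to `num_concurrent_requests_per_client` requests in flight."""

    def __init__(self, llm_client, num_concurrent_requests_per_client: int):
        # In practice the LLM client might be built inside the actor instead.
        self.llm_client = llm_client
        self.num_concurrent_requests_per_client = num_concurrent_requests_per_client

    async def run(self, request_configs):
        semaphore = asyncio.Semaphore(self.num_concurrent_requests_per_client)

        async def run_one(request_config):
            async with semaphore:
                # send_llm_request is now a coroutine (see base_llm_client.py below)
                return await self.llm_client.send_llm_request(request_config)

        return await asyncio.gather(*(run_one(rc) for rc in request_configs))


# Driver side (also hypothetical): --num-ray-clients actors, each bounded by
# --num-concurrent-requests-per-client, e.g. 2 clients x 5 requests each.
# clients = [RequestsLauncher.remote(make_llm_client(), 5) for _ in range(2)]
# results = ray.get([c.run.remote(batch) for c, batch in zip(clients, batches)])
```
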
8 changes: 2 additions & 6 deletions README.md
@@ -63,7 +63,8 @@ python -m metron.run_benchmark \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
--max-num-completed-requests 150 \
--timeout 600 \
--num-concurrent-requests 10 \
--num-ray-clients 2 \
--num-concurrent-requests-per-client 5 \
--output-dir "result_outputs" \
--request-interval-generator-provider "poisson" \
--poisson-request-interval-generator-qps 0.5 \
@@ -114,9 +115,7 @@ Launch any open source system and setup API keys and URL as shown for [vLLM](#ru
```bash
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
--max-num-completed-requests 1 \
--timeout 600 \
--num-concurrent-requests 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
--should-use-given-dir true
@@ -126,10 +125,7 @@ To modify range of prompt tokens for which prefill times get profiled, use the f
```bash
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
--max-num-completed-requests 1 \
--timeout 600 \
--num-concurrent-requests 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
--should-use-given-dir true \
--prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
4 changes: 0 additions & 4 deletions docs/tutorials/prefill_profiler.rst
@@ -17,9 +17,7 @@ And, then run the following command:
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
--max-num-completed-requests 1 \
--timeout 600 \
--num-concurrent-requests 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b"
@@ -38,9 +36,7 @@ To profile a custom range of prompt lengths, use the flag ``--prefill-lengths``
python -m metron.prefill_profiler \
--model "meta-llama/Meta-Llama-3-8B-Instruct" \
--max-num-completed-requests 1 \
--timeout 600 \
--num-concurrent-requests 1 \
--fixed-request-generator-decode-tokens 16 \
--output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
--prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
10 changes: 6 additions & 4 deletions metron/capacity_search/config/config.py
@@ -179,7 +179,8 @@ def to_args(self):

@dataclass
class RequestConfig:
num_concurrent_requests: Optional[int] = None
num_ray_clients: Optional[int] = None
num_concurrent_requests_per_client: Optional[int] = None
timeout: Optional[int] = None
max_num_completed_requests: Optional[int] = None
additional_sampling_params: Optional[Dict[str, Any]] = None
@@ -188,7 +189,8 @@ class RequestConfig:

def to_config_dict(self):
return {
"num-concurrent-requests": self.num_concurrent_requests,
"num-ray-clients": self.num_ray_clients,
"num-concurrent-requests-per-client": self.num_concurrent_requests_per_client,
"timeout": self.timeout,
"max-num-completed-requests": self.max_num_completed_requests,
"additional-sampling-params": self.additional_sampling_params,
@@ -208,10 +210,10 @@ def to_args(self):
return " ".join(args)

def get_key(self):
return f"{self.num_concurrent_requests}_{self.timeout}_{self.max_num_completed_requests}_{self.llm_api}"
return f"{self.num_ray_clients}_{self.timeout}_{self.max_num_completed_requests}_{self.llm_api}"

def to_human_readable_name(self):
return f"Num concurrent requests: {self.num_concurrent_requests}, Timeout: {self.timeout}, Max num completed requests: {self.max_num_completed_requests}, LLM API: {self.llm_api}"
return f"Num ray clients: {self.num_ray_clients}, Num concurrent requests per client: {self.num_concurrent_requests_per_client}, Timeout: {self.timeout}, Max num completed requests: {self.max_num_completed_requests}, LLM API: {self.llm_api}"


@dataclass
6 changes: 4 additions & 2 deletions metron/capacity_search/config/default.yml
@@ -42,13 +42,15 @@ request_generator_configs:
trace_file_name: "sharegpt"

request_configs:
- num_concurrent_requests: 100
- num_ray_clients: 10
num_concurrent_requests_per_client: 10
timeout: 1200
max_num_completed_requests: 1000
additional_sampling_params: {}
llm_api: "openai"
request_generator_max_tokens: 8192
- num_concurrent_requests: 100
- num_ray_clients: 10
num_concurrent_requests_per_client: 10
timeout: 1200
max_num_completed_requests: 1000
additional_sampling_params: {}
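
In these configs the old single setting appears to be factored into the two new ones, so the effective number of in-flight requests stays the same if each Ray client independently keeps its own batch of asyncio tasks running. A quick check against the previous `num_concurrent_requests: 100` default (the README example works out the same way: 2 × 5 = 10, matching the removed `--num-concurrent-requests 10`):

```python
num_ray_clients = 10
num_concurrent_requests_per_client = 10

# Assumption: effective concurrency is the product of the two new settings.
effective_concurrency = num_ray_clients * num_concurrent_requests_per_client
assert effective_concurrency == 100  # matches the previous num_concurrent_requests
```
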
3 changes: 2 additions & 1 deletion metron/capacity_search/config/llama_70b.yml
@@ -31,7 +31,8 @@ request_generator_configs:
trace_file_name: "arxiv"

request_configs:
- num_concurrent_requests: 100
- num_ray_clients: 10
num_concurrent_requests_per_client: 10
timeout: 1200
max_num_completed_requests: 1000
additional_sampling_params: {}
6 changes: 4 additions & 2 deletions metron/capacity_search/config/llama_8b.yml
@@ -33,13 +33,15 @@ request_generator_configs:
trace_file_name: "arxiv"

request_configs:
- num_concurrent_requests: 100
- num_ray_clients: 10
num_concurrent_requests_per_client: 10
timeout: 1200
max_num_completed_requests: 1000
additional_sampling_params: {}
llm_api: "openai"
request_generator_max_tokens: 8192
- num_concurrent_requests: 100
- num_ray_clients: 10
num_concurrent_requests_per_client: 10
timeout: 1200
max_num_completed_requests: 1000
additional_sampling_params: {}
3 changes: 2 additions & 1 deletion metron/capacity_search/config/mixtral.yml
@@ -34,7 +34,8 @@ request_generator_configs:
trace_file_name: "arxiv"

request_configs:
- num_concurrent_requests: 100
- num_ray_clients: 10
num_concurrent_requests_per_client: 10
timeout: 1200
max_num_completed_requests: 1000
additional_sampling_params: {}
2 changes: 1 addition & 1 deletion metron/core/llm_clients/base_llm_client.py
@@ -19,7 +19,7 @@ def get_token_length(self, text: str) -> int:
return len(self.tokenizer.encode(text))

@abc.abstractmethod
def send_llm_request(
async def send_llm_request(
self, request_config: RequestConfig
) -> Tuple[RequestMetrics, str]:
"""Make a single completion request to a LLM API
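
With the abstract method declared `async`, every concrete client now defines `send_llm_request` as a coroutine and callers must `await` it, typically gathering many at once, instead of blocking per request. A toy illustration of the contract, with a made-up client and dict-based metrics standing in for the repo's `RequestMetrics`:

```python
import asyncio
import time
from types import SimpleNamespace
from typing import Tuple


class EchoLLMClient:
    """Hypothetical client that satisfies the new async contract."""

    async def send_llm_request(self, request_config) -> Tuple[dict, str]:
        start = time.monotonic()
        await asyncio.sleep(0.01)  # stand-in for the network round trip
        generated_text = f"echo: {request_config.prompt}"
        metrics = {"e2e_latency_s": time.monotonic() - start}
        return metrics, generated_text


async def main():
    client = EchoLLMClient()
    request_configs = [SimpleNamespace(prompt=f"prompt {i}") for i in range(5)]
    # Many requests can share one client; no threads or extra processes needed.
    return await asyncio.gather(*(client.send_llm_request(rc) for rc in request_configs))


if __name__ == "__main__":
    print(asyncio.run(main()))
```
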
42 changes: 0 additions & 42 deletions metron/core/llm_clients/common.py

This file was deleted.

3 changes: 1 addition & 2 deletions metron/core/llm_clients/litellm_client.py
@@ -11,11 +11,10 @@
logger = init_logger(__name__)


@ray.remote
class LiteLLMClient(BaseLLMClient):
"""Client for LiteLLM Completions API."""

def send_llm_request(
async def send_llm_request(
self, request_config: RequestConfig
) -> Tuple[RequestMetrics, str]:
# litellm package isn't serializable, so we import it within the function
40 changes: 25 additions & 15 deletions metron/core/llm_clients/openai_chat_completions_client.py
@@ -3,8 +3,7 @@
import time
from typing import List, Tuple

import ray
import requests
import httpx

from metron.core.llm_clients.base_llm_client import BaseLLMClient
from metron.core.request_config import RequestConfig
@@ -17,10 +16,13 @@
MAX_RESPONSES_ALLOWED_TO_STORE = 5


@ray.remote
class OpenAIChatCompletionsClient(BaseLLMClient):
"""Client for OpenAI Chat Completions API."""

def __init__(self, model_name: str) -> None:
super().__init__(model_name)
self.client = httpx.AsyncClient()

def total_tokens(self, response_list: List[str]) -> int:
merged_content = "".join(response_list)
return self.get_token_length(merged_content)
@@ -40,7 +42,11 @@ def get_current_tokens_received(
previous_token_count = self.total_tokens(previous_responses)
return current_tokens_received, previous_token_count

def send_llm_request(
async def close_client(self):
# Close the client
await self.client.aclose()

async def send_llm_request(
self, request_config: RequestConfig
) -> Tuple[RequestMetrics, str]:
prompt = request_config.prompt
@@ -82,29 +88,33 @@ def send_llm_request(
most_recent_received_token_time = time.monotonic()

try:
with requests.post(
address,
json=body,
stream=True,
timeout=180,
headers=headers,
async with self.client.stream(
"POST", address, json=body, timeout=None, headers=headers
) as response:
if response.status_code != 200:
error_msg = response.text
error_response_code = response.status_code
logger.error(f"Request Error: {response.content}")
error_content = []
async for error_line in response.aiter_lines():
error_content.append(error_line)
error_msg = "".join(error_content)
logger.error(f"Request Error: {error_msg}")
response.raise_for_status()

for chunk in response.iter_lines(chunk_size=None):
async for chunk in response.aiter_lines():
chunk = chunk.strip()

if not chunk:
continue
stem = "data: "
chunk = chunk[len(stem) :]
if chunk == b"[DONE]":
if chunk in [b"[DONE]", "[DONE]"]:
continue
data = json.loads(chunk)

try:
data = json.loads(chunk)
except json.JSONDecodeError:
logger.error(f"JSON decode error with chunk: {chunk}")
continue # Skip malformed JSON

if "error" in data:
error_msg = data["error"]["message"]
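
The OpenAI client drops `requests` in favor of a single long-lived `httpx.AsyncClient`, streams the response with `aiter_lines`, strips the SSE `data: ` prefix, skips `[DONE]` markers, and tolerates malformed JSON chunks instead of failing the whole request. A standalone sketch of the same pattern, assuming a standard OpenAI-compatible chat-completions delta payload (not the repo's exact client):

```python
import json

import httpx


async def stream_chat_completion(address: str, body: dict, headers: dict) -> str:
    """Stream one chat completion and return the concatenated text."""
    pieces = []
    client = httpx.AsyncClient()
    try:
        async with client.stream(
            "POST", address, json=body, timeout=None, headers=headers
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: "):]
                if line == "[DONE]":
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip malformed chunks rather than aborting
                delta = data["choices"][0].get("delta", {})
                if delta.get("content"):
                    pieces.append(delta["content"])
    finally:
        # Mirrors close_client(): release the connection pool when done.
        await client.aclose()
    return "".join(pieces)
```

One design note visible in the diff: `timeout=None` removes the earlier 180-second cap, so very long prompts or slow decodes are no longer cut off mid-stream ("Removing timeout for large requests").
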
3 changes: 1 addition & 2 deletions metron/core/llm_clients/sagemaker_client.py
@@ -15,11 +15,10 @@
logger = init_logger(__name__)


@ray.remote
class SageMakerClient(BaseLLMClient):
"""Client for OpenAI Chat Completions API."""

def send_llm_request(
async def send_llm_request(
self, request_config: RequestConfig
) -> Tuple[RequestMetrics, str]:
if not os.environ.get("AWS_ACCESS_KEY_ID"):
3 changes: 1 addition & 2 deletions metron/core/llm_clients/vertexai_client.py
@@ -14,11 +14,10 @@
logger = init_logger(__name__)


@ray.remote
class VertexAIClient(BaseLLMClient):
"""Client for VertexAI API."""

def send_llm_request(
async def send_llm_request(
self, request_config: RequestConfig
) -> Tuple[RequestMetrics, str]:
project_id = os.environ.get("GCLOUD_PROJECT_ID")
1 change: 1 addition & 0 deletions metron/core/request_config.py
@@ -21,3 +21,4 @@ class RequestConfig(BaseModel):
llm_api: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
address_append_value: Optional[str] = None
id: Optional[int] = None
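
The new optional `id` field presumably lets each result be matched back to the request that produced it once many requests are in flight at the same time. A hypothetical bookkeeping sketch (nothing beyond the `id` field itself comes from the diff):

```python
# Hypothetical: stamp an id onto each prepared RequestConfig before dispatch,
# then pair results, which may complete out of order, with their requests.
for i, request_config in enumerate(request_configs):
    request_config.id = i

# Later, when a (metrics, text) pair comes back tagged with its request id:
# results_by_id[request_id] = (metrics, text)
```
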