feat: allow any supported payload on /invocations (#2683)
* feat: allow any supported payload on /invocations

* update OpenAPI

* update doc
OlivierDehaene authored Oct 23, 2024
1 parent 27ff187 commit 41c2623
Showing 13 changed files with 237 additions and 789 deletions.
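
As a quick illustration of the change (not part of the commit), the sketch below posts each of the supported payload shapes to the same `/invocations` route. It assumes a TGI server built from this commit listening on `localhost:8080`; the field names follow the `CompatGenerateRequest`, `ChatRequest`, and `CompletionRequest` schemas referenced in the OpenAPI diff further down.

```python
# Illustrative sketch: assumes a TGI server from this commit on localhost:8080.
import requests

BASE = "http://localhost:8080"

# Generate-style payload (CompatGenerateRequest), previously the only shape accepted.
r = requests.post(f"{BASE}/invocations", json={
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 20},
})
print(r.json())

# Chat-style payload (ChatRequest), answered like /v1/chat/completions.
r = requests.post(f"{BASE}/invocations", json={
    "model": "tgi",
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 20,
})
print(r.json())

# Completion-style payload (CompletionRequest), answered like /v1/completions.
r = requests.post(f"{BASE}/invocations", json={
    "model": "tgi",
    "prompt": "What is Deep Learning?",
    "max_tokens": 20,
})
print(r.json())
```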
2 changes: 1 addition & 1 deletion README.md
@@ -98,7 +98,7 @@ curl 127.0.0.1:8080/generate_stream \
You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain Open AI Chat Completion API compatible responses.

```bash
-curl localhost:3000/v1/chat/completions \
+curl localhost:8080/v1/chat/completions \
-X POST \
-d '{
"model": "tgi",
12 changes: 5 additions & 7 deletions backends/trtllm/src/main.rs
@@ -3,7 +3,7 @@ use std::collections::HashMap;
use std::path::PathBuf;
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackend;
-use text_generation_router::server;
+use text_generation_router::{server, usage_stats};
use tokenizers::{FromPretrainedParameters, Tokenizer};

/// App Configuration
@@ -48,14 +48,14 @@ struct Args {
otlp_service_name: String,
#[clap(long, env)]
cors_allow_origin: Option<Vec<String>>,
-#[clap(long, env, default_value_t = false)]
-messages_api_enabled: bool,
#[clap(default_value = "4", long, env)]
max_client_batch_size: usize,
#[clap(long, env)]
auth_token: Option<String>,
#[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
executor_worker: PathBuf,
#[clap(default_value = "on", long, env)]
usage_stats: usage_stats::UsageStatsLevel,
}

#[tokio::main]
@@ -83,10 +83,10 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
otlp_endpoint,
otlp_service_name,
cors_allow_origin,
-messages_api_enabled,
max_client_batch_size,
auth_token,
executor_worker,
+usage_stats,
} = args;

// Launch Tokio runtime
@@ -155,11 +155,9 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
false,
None,
None,
-messages_api_enabled,
true,
max_client_batch_size,
-false,
-false,
+usage_stats,
)
.await?;
Ok(())
4 changes: 0 additions & 4 deletions backends/v2/src/main.rs
@@ -63,8 +63,6 @@ struct Args {
#[clap(long, env)]
ngrok_edge: Option<String>,
-#[clap(long, env, default_value_t = false)]
-messages_api_enabled: bool,
#[clap(long, env, default_value_t = false)]
disable_grammar_support: bool,
#[clap(default_value = "4", long, env)]
max_client_batch_size: usize,
@@ -110,7 +108,6 @@ async fn main() -> Result<(), RouterError> {
ngrok,
ngrok_authtoken,
ngrok_edge,
-messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
usage_stats,
@@ -190,7 +187,6 @@ async fn main() -> Result<(), RouterError> {
ngrok,
ngrok_authtoken,
ngrok_edge,
-messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
usage_stats,
4 changes: 0 additions & 4 deletions backends/v3/src/main.rs
@@ -63,8 +63,6 @@ struct Args {
#[clap(long, env)]
ngrok_edge: Option<String>,
-#[clap(long, env, default_value_t = false)]
-messages_api_enabled: bool,
#[clap(long, env, default_value_t = false)]
disable_grammar_support: bool,
#[clap(default_value = "4", long, env)]
max_client_batch_size: usize,
@@ -110,7 +108,6 @@ async fn main() -> Result<(), RouterError> {
ngrok,
ngrok_authtoken,
ngrok_edge,
-messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
usage_stats,
@@ -190,7 +187,6 @@ async fn main() -> Result<(), RouterError> {
ngrok,
ngrok_authtoken,
ngrok_edge,
-messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
usage_stats,
131 changes: 131 additions & 0 deletions docs/openapi.json
@@ -316,6 +316,98 @@
        }
      }
    },
+    "/invocations": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Generate tokens from Sagemaker request",
+        "operationId": "sagemaker_compatibility",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/SagemakerRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Generated Chat Completion",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/SagemakerResponse"
+                }
+              },
+              "text/event-stream": {
+                "schema": {
+                  "$ref": "#/components/schemas/SagemakerStreamResponse"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Input validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Input validation error",
+                  "error_type": "validation"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Generation Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Request failed during generation",
+                  "error_type": "generation"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded",
+                  "error_type": "overloaded"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Incomplete generation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Incomplete generation",
+                  "error_type": "incomplete_generation"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
    "/metrics": {
      "get": {
        "tags": [
@@ -1865,6 +1957,45 @@
"type": "string"
}
},
"SagemakerRequest": {
"oneOf": [
{
"$ref": "#/components/schemas/CompatGenerateRequest"
},
{
"$ref": "#/components/schemas/ChatRequest"
},
{
"$ref": "#/components/schemas/CompletionRequest"
}
]
},
"SagemakerResponse": {
"oneOf": [
{
"$ref": "#/components/schemas/GenerateResponse"
},
{
"$ref": "#/components/schemas/ChatCompletion"
},
{
"$ref": "#/components/schemas/CompletionFinal"
}
]
},
"SagemakerStreamResponse": {
"oneOf": [
{
"$ref": "#/components/schemas/StreamResponse"
},
{
"$ref": "#/components/schemas/ChatCompletionChunk"
},
{
"$ref": "#/components/schemas/Chunk"
}
]
},
"SimpleToken": {
"type": "object",
"required": [
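
Because `SagemakerRequest` is a plain `oneOf` with no discriminator field, the three request variants can only be told apart by their shape. The helper below is a hypothetical client-side mirror of that dispatch; `classify_invocations_payload` is not a TGI function, and the router's actual deserialization logic may differ.

```python
# Hypothetical helper mirroring the SagemakerRequest oneOf above.
def classify_invocations_payload(payload: dict) -> str:
    """Return the schema variant a /invocations payload appears to match."""
    if "messages" in payload:
        return "ChatRequest"            # answered like /v1/chat/completions
    if "prompt" in payload:
        return "CompletionRequest"      # answered like /v1/completions
    if "inputs" in payload:
        return "CompatGenerateRequest"  # answered like /generate(_stream)
    raise ValueError("payload matches no SagemakerRequest variant")

assert classify_invocations_payload({"messages": []}) == "ChatRequest"
assert classify_invocations_payload({"inputs": "hi"}) == "CompatGenerateRequest"
```

In a Rust router this kind of `oneOf` is typically handled by trying each variant in order (for example with an untagged serde enum), which is why each variant needs at least one distinguishing required field.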
7 changes: 2 additions & 5 deletions docs/source/reference/api_reference.md
@@ -141,9 +141,7 @@ TGI can be deployed on various cloud providers for scalable and robust text gene

## Amazon SageMaker

-To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`.
-
-This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API.
+Amazon Sagemaker natively supports the message API:

```python
import json
@@ -161,12 +159,11 @@ except ValueError:
hub = {
'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
'SM_NUM_GPUS': json.dumps(1),
-'MESSAGES_API_ENABLED': True
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
-image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
+image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.2"),
env=hub,
role=role,
)
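
For completeness, a sketch of calling the updated endpoint once the model is deployed. It continues the (truncated) example above; the instance type and timeout values are assumptions for illustration, not values taken from this diff.

```python
# Sketch continuing the example above; instance settings are illustrative assumptions.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    container_startup_health_check_timeout=300,
)

# Chat-style payload on /invocations; no MESSAGES_API_ENABLED required anymore.
print(predictor.predict({
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 64,
}))

# The legacy generate-style payload keeps working on the same endpoint.
print(predictor.predict({
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 64},
}))
```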
1 change: 0 additions & 1 deletion docs/source/usage_statistics.md
@@ -26,7 +26,6 @@ As of release 2.1.2 this is an example of the data collected:
"max_top_n_tokens": 5,
"max_total_tokens": 2048,
"max_waiting_tokens": 20,
"messages_api_enabled": false,
"model_config": {
"model_type": "Bloom"
},
1 change: 1 addition & 0 deletions router/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pub mod validation;
mod kserve;
pub mod logging;

+mod sagemaker;
pub mod usage_stats;
mod vertex;
