huggingface · danieldk · Oct 25, 2024 · Oct 24, 2024 · Oct 25, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -161,15 +161,6 @@ COPY server/custom_kernels/ .
 # Build specific version of transformers
 RUN python setup.py build
 
-# Build FBGEMM CUDA kernels
-FROM kernel-builder AS fbgemm-builder
-
-WORKDIR /usr/src
-
-COPY server/Makefile-fbgemm Makefile
-
-RUN make build-fbgemm
-
 # Build vllm CUDA kernels
 FROM kernel-builder AS vllm-builder
 
@@ -239,8 +230,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from fbgemm builder
-COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder

diff --git a/flake.lock b/flake.lock
diff --git a/flake.nix b/flake.nix
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.1";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {

diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json
@@ -1,8 +1,8 @@
 {
   "details": {
     "best_of_sequences": null,
-    "finish_reason": "stop_sequence",
-    "generated_tokens": 5,
+    "finish_reason": "length",
+    "generated_tokens": 10,
     "prefill": [
       {
         "id": 128000,
@@ -11,49 +11,79 @@
       },
       {
         "id": 2323,
-        "logprob": -9.5625,
+        "logprob": -9.5234375,
         "text": "Test"
       },
       {
         "id": 1715,
-        "logprob": -10.4375,
+        "logprob": -10.421875,
         "text": " request"
       }
     ],
     "seed": 0,
     "tokens": [
       {
         "id": 25,
-        "logprob": -0.8984375,
+        "logprob": -0.88183594,
         "special": false,
         "text": ":"
       },
       {
-        "id": 923,
-        "logprob": -2.84375,
+        "id": 2209,
+        "logprob": -2.6699219,
         "special": false,
-        "text": " add"
+        "text": " Is"
       },
       {
-        "id": 264,
-        "logprob": 0.0,
+        "id": 279,
+        "logprob": -0.61083984,
         "special": false,
-        "text": " a"
+        "text": " the"
+      },
+      {
+        "id": 734,
+        "logprob": -2.6660156,
+        "special": false,
+        "text": " function"
       },
       {
         "id": 330,
-        "logprob": -0.31640625,
+        "logprob": -0.35498047,
         "special": false,
         "text": " \""
       },
       {
-        "id": 1985,
-        "logprob": 0.0,
+        "id": 4110,
+        "logprob": -2.4101562,
+        "special": false,
+        "text": "Create"
+      },
+      {
+        "id": 7575,
+        "logprob": -2.2304688,
+        "special": false,
+        "text": "Process"
+      },
+      {
+        "id": 1,
+        "logprob": -0.080078125,
+        "special": false,
+        "text": "\""
+      },
+      {
+        "id": 304,
+        "logprob": -0.75439453,
+        "special": false,
+        "text": " in"
+      },
+      {
+        "id": 12468,
+        "logprob": -1.8769531,
         "special": false,
-        "text": "test"
+        "text": " Win"
       }
     ],
     "top_tokens": null
   },
-  "generated_text": "Test request: add a \"test"
+  "generated_text": "Test request: Is the function \"CreateProcess\" in Win"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json
@@ -16,17 +16,17 @@
       },
       {
         "id": 5655,
-        "logprob": -11.75,
+        "logprob": -11.8359375,
         "text": " deep"
       },
       {
         "id": 6975,
-        "logprob": -2.0625,
+        "logprob": -2.0703125,
         "text": " learning"
       },
       {
         "id": 30,
-        "logprob": -6.0,
+        "logprob": -5.9765625,
         "text": "?"
       }
     ],
@@ -40,25 +40,25 @@
       },
       {
         "id": 34564,
-        "logprob": -0.11279297,
+        "logprob": -0.12512207,
         "special": false,
         "text": "Deep"
       },
       {
         "id": 6975,
-        "logprob": -0.16015625,
+        "logprob": 0.0,
         "special": false,
         "text": " learning"
       },
       {
         "id": 320,
-        "logprob": -0.25195312,
+        "logprob": -0.23840332,
         "special": false,
         "text": " ("
       },
       {
         "id": 16931,
-        "logprob": -1.703125,
+        "logprob": -2.0175781,
         "special": false,
         "text": "DL"
       },
@@ -70,7 +70,7 @@
       },
       {
         "id": 374,
-        "logprob": -1.140625,
+        "logprob": -0.8613281,
         "special": false,
         "text": " is"
       },
@@ -82,7 +82,7 @@
       },
       {
         "id": 1207,
-        "logprob": -1.3125,
+        "logprob": -1.2451172,
         "special": false,
         "text": " sub"
       },

diff --git a/nix/server.nix b/nix/server.nix
@@ -8,7 +8,6 @@
   eetq,
   einops,
   exllamav2,
-  fbgemm-gpu,
   flashinfer,
   flash-attn,
   flash-attn-layer-norm,
@@ -77,7 +76,6 @@ buildPythonPackage {
     causal-conv1d
     einops
     exllamav2
-    fbgemm-gpu
     flashinfer
     flash-attn
     flash-attn-layer-norm

diff --git a/server/Makefile b/server/Makefile
@@ -5,7 +5,6 @@ include Makefile-awq
 include Makefile-eetq
 include Makefile-selective-scan
 include Makefile-lorax-punica
-include Makefile-fbgemm
 include Makefile-exllamav2
 include Makefile-flashinfer
 
@@ -30,7 +29,7 @@ install-server: gen-server
 install: install-cuda
 	echo "Installed server"
 
-install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm
+install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
 	pip install -e ".[bnb,marlin,moe]"
 	pip install nvidia-nccl-cu12==2.22.3
 

diff --git a/server/Makefile-fbgemm b/server/Makefile-fbgemm
diff --git a/server/poetry.lock b/server/poetry.lock
diff --git a/server/pyproject.toml b/server/pyproject.toml
@@ -41,10 +41,10 @@ py-cpuinfo = "^9.0.0"
 numpy = "^1.26"
 
 marlin-kernels = [
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
 ]
 moe-kernels = [
   { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },