Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch from fbgemm-gpu w8a8 scaled matmul to vLLM/marlin-kernels #2688

Merged
merged 2 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 0 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,6 @@ COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

Expand Down Expand Up @@ -239,8 +230,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
Expand Down
8 changes: 4 additions & 4 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.1";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "stop_sequence",
"generated_tokens": 5,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
Expand All @@ -11,49 +11,79 @@
},
{
"id": 2323,
"logprob": -9.5625,
"logprob": -9.5234375,
"text": "Test"
},
{
"id": 1715,
"logprob": -10.4375,
"logprob": -10.421875,
"text": " request"
}
],
"seed": 0,
"tokens": [
{
"id": 25,
"logprob": -0.8984375,
"logprob": -0.88183594,
"special": false,
"text": ":"
},
{
"id": 923,
"logprob": -2.84375,
"id": 2209,
"logprob": -2.6699219,
"special": false,
"text": " add"
"text": " Is"
},
{
"id": 264,
"logprob": 0.0,
"id": 279,
"logprob": -0.61083984,
"special": false,
"text": " a"
"text": " the"
},
{
"id": 734,
"logprob": -2.6660156,
"special": false,
"text": " function"
},
{
"id": 330,
"logprob": -0.31640625,
"logprob": -0.35498047,
"special": false,
"text": " \""
},
{
"id": 1985,
"logprob": 0.0,
"id": 4110,
"logprob": -2.4101562,
"special": false,
"text": "Create"
},
{
"id": 7575,
"logprob": -2.2304688,
"special": false,
"text": "Process"
},
{
"id": 1,
"logprob": -0.080078125,
"special": false,
"text": "\""
},
{
"id": 304,
"logprob": -0.75439453,
"special": false,
"text": " in"
},
{
"id": 12468,
"logprob": -1.8769531,
"special": false,
"text": "test"
"text": " Win"
}
],
"top_tokens": null
},
"generated_text": "Test request: add a \"test"
"generated_text": "Test request: Is the function \"CreateProcess\" in Win"
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@
},
{
"id": 5655,
"logprob": -11.75,
"logprob": -11.8359375,
"text": " deep"
},
{
"id": 6975,
"logprob": -2.0625,
"logprob": -2.0703125,
"text": " learning"
},
{
"id": 30,
"logprob": -6.0,
"logprob": -5.9765625,
"text": "?"
}
],
Expand All @@ -40,25 +40,25 @@
},
{
"id": 34564,
"logprob": -0.11279297,
"logprob": -0.12512207,
"special": false,
"text": "Deep"
},
{
"id": 6975,
"logprob": -0.16015625,
"logprob": 0.0,
"special": false,
"text": " learning"
},
{
"id": 320,
"logprob": -0.25195312,
"logprob": -0.23840332,
"special": false,
"text": " ("
},
{
"id": 16931,
"logprob": -1.703125,
"logprob": -2.0175781,
"special": false,
"text": "DL"
},
Expand All @@ -70,7 +70,7 @@
},
{
"id": 374,
"logprob": -1.140625,
"logprob": -0.8613281,
"special": false,
"text": " is"
},
Expand All @@ -82,7 +82,7 @@
},
{
"id": 1207,
"logprob": -1.3125,
"logprob": -1.2451172,
"special": false,
"text": " sub"
},
Expand Down
2 changes: 0 additions & 2 deletions nix/server.nix
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
eetq,
einops,
exllamav2,
fbgemm-gpu,
flashinfer,
flash-attn,
flash-attn-layer-norm,
Expand Down Expand Up @@ -77,7 +76,6 @@ buildPythonPackage {
causal-conv1d
einops
exllamav2
fbgemm-gpu
flashinfer
flash-attn
flash-attn-layer-norm
Expand Down
3 changes: 1 addition & 2 deletions server/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ include Makefile-awq
include Makefile-eetq
include Makefile-selective-scan
include Makefile-lorax-punica
include Makefile-fbgemm
include Makefile-exllamav2
include Makefile-flashinfer

Expand All @@ -30,7 +29,7 @@ install-server: gen-server
install: install-cuda
echo "Installed server"

install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm
install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
pip install -e ".[bnb,marlin,moe]"
pip install nvidia-nccl-cu12==2.22.3

Expand Down
15 changes: 0 additions & 15 deletions server/Makefile-fbgemm

This file was deleted.

29 changes: 15 additions & 14 deletions server/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions server/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ py-cpuinfo = "^9.0.0"
numpy = "^1.26"

marlin-kernels = [
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
moe-kernels = [
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
Expand Down
Loading
Loading