Update nightly llama benchmarking tests #754

Merged · 17 commits · Jan 7, 2025
2 changes: 1 addition & 1 deletion .github/workflows/ci_eval.yaml
@@ -70,7 +70,7 @@ jobs:
       - name: Run perplexity test with IREE
         run: |
           source ${VENV_DIR}/bin/activate
-          pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --run-nightly-llama-tests --bs=100 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html
+          pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --run-nightly-llama-tests --bs=100 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-device=hip --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html

       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
2 changes: 1 addition & 1 deletion .github/workflows/ci_eval_short.yaml
@@ -69,4 +69,4 @@ jobs:
       - name: Run perplexity test with vmfb
         run: |
           source ${VENV_DIR}/bin/activate
-          pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --bs=5 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+          pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --bs=5 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-device=hip --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
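Both workflow files make the same one-token substitution: the retired compiler-backend flag --iree-hal-target-backends=rocm becomes the device-oriented --iree-hal-target-device=hip. A minimal sketch of that substitution, assuming the rocm-to-hip correspondence read off the diff (the helper and its mapping table are illustrative, not part of this PR):

    # Illustrative helper showing the flag migration applied in both workflows.
    # The rocm -> hip mapping is an assumption inferred from the diff, not an
    # exhaustive backend-to-device table.
    BACKEND_TO_DEVICE = {"rocm": "hip"}

    def migrate_flag(arg: str) -> str:
        prefix = "--iree-hal-target-backends="
        if arg.startswith(prefix):
            backend = arg[len(prefix):]
            return f"--iree-hal-target-device={BACKEND_TO_DEVICE.get(backend, backend)}"
        return arg

    assert migrate_flag("--iree-hal-target-backends=rocm") == "--iree-hal-target-device=hip"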
8 changes: 4 additions & 4 deletions sharktank/conftest.py
@@ -226,9 +226,9 @@ def pytest_addoption(parser):
     )

     parser.addoption(
-        "--iree-hal-target-backends",
+        "--iree-hal-target-device",
         action="store",
-        help="Specify the iree-hal target backend (e.g., rocm)",
+        help="Specify the iree-hal target device (e.g., hip)",
     )

     parser.addoption(
@@ -354,8 +354,8 @@ def get_iree_flags(request: FixtureRequest):
     model_path["iree_hip_target"] = set_fixture_from_cli_option(
         request, "--iree-hip-target", "iree_hip_target"
     )
-    model_path["iree_hal_target_backends"] = set_fixture_from_cli_option(
-        request, "--iree-hal-target-backends", "iree_hal_target_backends"
+    model_path["iree_hal_target_device"] = set_fixture_from_cli_option(
+        request, "--iree-hal-target-device", "iree_hal_target_device"
     )
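For reference, the renamed option flows through pytest's standard option plumbing; a minimal sketch assuming only the option name from the hunk above (the fixture itself is illustrative, not code from this PR):

    import pytest

    @pytest.fixture
    def iree_hal_target_device(request: pytest.FixtureRequest) -> str:
        # Returns the value passed as --iree-hal-target-device (e.g. "hip"),
        # or None when the flag is omitted on the command line.
        return request.config.getoption("--iree-hal-target-device")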
18 changes: 9 additions & 9 deletions sharktank/sharktank/evaluate/perplexity_iree.py
@@ -64,7 +64,7 @@ def __init__(
         torch_device,
         iree_device,
         iree_hip_target,
-        iree_hal_target_backends,
+        iree_hal_target_device,
         kv_cache_type,
         tensor_parallelism_size,
         attention_kernel,
@@ -73,7 +73,7 @@
         self.torch_device = torch_device
         self.iree_device = iree_device
         self.iree_hip_target = iree_hip_target
-        self.iree_hal_target_backends = iree_hal_target_backends
+        self.iree_hal_target_device = iree_hal_target_device
         self.kv_cache_type = kv_cache_type
         self.block_seq_stride = block_seq_stride
         self.activation_dtype = torch.float16
@@ -135,7 +135,7 @@ def compile_model(self, weight_path_str):
             irpa_path=self.weight_path_str,
             batch_size=self.bs,
             iree_hip_target=self.iree_hip_target,
-            iree_hal_target_backends=self.iree_hal_target_backends,
+            iree_hal_target_device=self.iree_hal_target_device,
             attention_kernel=self.attention_kernel,
             tensor_parallelism_size=self.tensor_parallelism_size,
             block_seq_stride=self.block_seq_stride,
@@ -392,7 +392,7 @@ def run_perplexity(
     torch_device,
     iree_device,
     iree_hip_target,
-    iree_hal_target_backends,
+    iree_hal_target_device,
     kv_cache_type,
     tensor_parallelism_size,
     attention_kernel,
@@ -404,7 +404,7 @@
         torch_device=torch_device,
         iree_device=iree_device,
         iree_hip_target=iree_hip_target,
-        iree_hal_target_backends=iree_hal_target_backends,
+        iree_hal_target_device=iree_hal_target_device,
         kv_cache_type=kv_cache_type,
         tensor_parallelism_size=tensor_parallelism_size,
         attention_kernel=attention_kernel,
@@ -450,10 +450,10 @@ def main(argv):
         help="Specify the iree-hip target version (e.g., gfx942)",
     )
     parser.add_argument(
-        "--iree-hal-target-backends",
+        "--iree-hal-target-device",
         action="store",
-        default="rocm",
-        help="Specify the iree-hal target backends (e.g., rocm)",
+        default="hip",
+        help="Specify the iree-hal target device (e.g., hip, cpu)",
     )
     parser.add_argument("--kv-cache-type", default="paged", help="KV cache type")
     parser.add_argument(
@@ -485,7 +485,7 @@
         torch_device=torch_device,
         iree_device=args.iree_device,
         iree_hip_target=args.iree_hip_target,
-        iree_hal_target_backends=args.iree_hal_target_backends,
+        iree_hal_target_device=args.iree_hal_target_device,
         kv_cache_type=args.kv_cache_type,
         tensor_parallelism_size=args.tensor_parallelism_size,
         attention_kernel=args.attention_kernel,
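The argparse change is self-contained and easy to exercise in isolation; a minimal sketch using only the option definition from the hunk above (the standalone parser is illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--iree-hal-target-device",
        action="store",
        default="hip",
        help="Specify the iree-hal target device (e.g., hip, cpu)",
    )

    # Omitting the flag now selects the hip default rather than the old rocm one.
    assert parser.parse_args([]).iree_hal_target_device == "hip"
    assert parser.parse_args(["--iree-hal-target-device=cpu"]).iree_hal_target_device == "cpu"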
14 changes: 8 additions & 6 deletions sharktank/sharktank/utils/export_artifacts.py
@@ -89,18 +89,18 @@ def __init__(
         irpa_path: str,
         batch_size: int,
         iree_hip_target: str,
-        iree_hal_target_backends: str,
         attention_kernel: str,
         tensor_parallelism_size: int,
         block_seq_stride: int,
+        iree_hal_target_device: str,
     ):
         self.sharktank_dir = str(
             Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent
         )
         self.irpa_path = irpa_path
         self.batch_size = batch_size
         self.iree_hip_target = iree_hip_target
-        self.iree_hal_target_backends = iree_hal_target_backends
+        self.iree_hal_target_device = iree_hal_target_device
         self.attention_kernel = attention_kernel
         self.tensor_parallelism_size = tensor_parallelism_size
         self.block_seq_stride = block_seq_stride
@@ -216,15 +216,18 @@ def compile_to_vmfb(
             f"iree-compile",
             f"{mlir_path}",
             f"--iree-hip-target={self.iree_hip_target}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
             f"-o={vmfb_path}",
         ]
         if self.tensor_parallelism_size > 1:
             iree_hal_target_devices = [
-                f"--iree-hal-target-device=hip[{i}]"
+                f"--iree-hal-target-device={self.iree_hal_target_device}[{i}]"
                 for i in range(self.tensor_parallelism_size)
             ]
             compile_args += iree_hal_target_devices
+        else:
+            iree_hal_target_devices = [
+                f"--iree-hal-target-device={self.iree_hal_target_device}"
+            ]
+            compile_args += iree_hal_target_devices
         if hal_dump_path:
             compile_args += [
                 f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"
@@ -283,7 +286,6 @@ def iree_benchmark_vmfb(
         benchmark_args += [
             "iree-benchmark-module",
             "--hip_use_streams=true",
-            "--device_allocator=caching",
             f"--module={vmfb_name}",
         ]
         benchmark_args += params
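After this hunk the benchmark invocation no longer requests the caching device allocator; the other arguments are unchanged. Reconstructed shape of the list this code builds (vmfb_name and params stand in for values supplied elsewhere in the function):

    vmfb_name = "model.vmfb"  # placeholder; supplied by the caller in the real code
    params = []               # extra benchmark parameters, elided in the diff
    benchmark_args = [
        "iree-benchmark-module",
        "--hip_use_streams=true",  # --device_allocator=caching was removed here
        f"--module={vmfb_name}",
    ]
    benchmark_args += params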
16 changes: 8 additions & 8 deletions sharktank/tests/evaluate/perplexity_iree_test.py
@@ -46,7 +46,7 @@ def test_llama3_8B_f16_decomposed(self):
             f"--irpa-file={self.llama3_8b_f16_model}",
             f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size=1",
             f"--attention-kernel=decomposed",
@@ -82,7 +82,7 @@ def test_llama3_8B_f16(self):
             f"--irpa-file={self.llama3_8b_f16_model}",
             f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size=1",
             f"--attention-kernel=torch_sdpa",
@@ -118,7 +118,7 @@ def test_llama3_8B_fp8_decomposed(self):
             f"--irpa-file={self.llama3_8b_fp8_model}",
             f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size=1",
             f"--attention-kernel=decomposed",
@@ -154,7 +154,7 @@ def test_llama3_8B_fp8(self):
             f"--irpa-file={self.llama3_8b_fp8_model}",
             f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size=1",
             f"--attention-kernel=torch_sdpa",
@@ -192,7 +192,7 @@ def test_llama3_405B_f16_decomposed(self):
             f"--irpa-file={self.llama3_405b_f16_model}",
             f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size={self.tensor_parallelism_size}",
             f"--attention-kernel=decomposed",
@@ -228,7 +228,7 @@ def test_llama3_405B_f16(self):
             f"--irpa-file={self.llama3_405b_f16_model}",
             f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size={self.tensor_parallelism_size}",
             f"--attention-kernel=torch_sdpa",
@@ -264,7 +264,7 @@ def test_llama3_405B_fp8_decomposed(self):
             f"--irpa-file={self.llama3_405b_fp8_model}",
             f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size={self.tensor_parallelism_size}",
             f"--attention-kernel=decomposed",
@@ -300,7 +300,7 @@ def test_llama3_405B_fp8(self):
             f"--irpa-file={self.llama3_405b_fp8_model}",
             f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
             f"--iree-device={self.iree_device}",
-            f"--iree-hal-target-backends={self.iree_hal_target_backends}",
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
             f"--iree-hip-target={self.iree_hip_target}",
             f"--tensor-parallelism-size={self.tensor_parallelism_size}",
             f"--attention-kernel=torch_sdpa",