From 6a861df595d793726ffc3455dffd01348883de1b Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Tue, 3 Dec 2024 05:38:26 -0800 Subject: [PATCH 01/10] bundled passes for vector-to-kernel pipeline and new json files for benchmarking --- CMakeLists.txt | 3 + benchmarks/config/base/vector-to-kernel.json | 53 +++++++++++++++ benchmarks/config/omp/mlir-fp32-vector.json | 64 +++++++++++++++++++ .../omp/torch-dynamo-vector-to-kernel.json | 64 +++++++++++++++++++ lib/TPP/DefaultPipeline.cpp | 5 ++ lib/TPP/DefaultTppPasses.cpp | 1 + lib/TPP/PassBundles/VectorToKernel.cpp | 3 +- scripts/benchmarks/build_and_run.sh | 8 ++- scripts/buildkite/benchmark.sh | 3 + 9 files changed, 201 insertions(+), 3 deletions(-) create mode 100644 benchmarks/config/base/vector-to-kernel.json create mode 100644 benchmarks/config/omp/mlir-fp32-vector.json create mode 100644 benchmarks/config/omp/torch-dynamo-vector-to-kernel.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fae95179..93d8670a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,7 @@ set(CONFIG_DIR "${BENCHMARK_DIR}/config") # Run baseline benchmarks with default iterations to track simple performance set(BENCH_CFGS ${CONFIG_DIR}/base/base.json + ${CONFIG_DIR}/base/vector-to-kernel.json ${CONFIG_DIR}/base/pack.json ${CONFIG_DIR}/base/mha.json ${CONFIG_DIR}/base/named-ops.json @@ -100,7 +101,9 @@ set(BENCH_OMP_CFGS ${CONFIG_DIR}/omp/dnn-bf16.json ${CONFIG_DIR}/omp/mlir-fp32.json ${CONFIG_DIR}/omp/mlir-bf16.json + ${CONFIG_DIR}/omp/mlir-fp32-vector.json ${CONFIG_DIR}/omp/torch-dynamo.json + ${CONFIG_DIR}/omp/torch-dynamo-vector-to-kernel.json ) string(JOIN ',' BENCH_OMP_CFGS_STR ${BENCH_OMP_CFGS}) add_custom_target(benchmarks-omp ${BENCHMARK_DIR}/driver.py -v --build ${PROJECT_BINARY_DIR} -n 10 diff --git a/benchmarks/config/base/vector-to-kernel.json b/benchmarks/config/base/vector-to-kernel.json new file mode 100644 index 000000000..36965adcc --- /dev/null +++ b/benchmarks/config/base/vector-to-kernel.json @@ 
-0,0 +1,53 @@ +[ + { + "prepacked_targets_vector_kernel": { + "gemm_fp32_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [] + }, + "mlp_fp32_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [] + } + }}, + { + "gemm_models_vector_kernel": { + "fp32_3x1024_const_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_args_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }}, + { + "mlp_models_vector_kernel": { + "fp32_3x1024_const_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_args_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + 
"extensions": [ "(avx2|asimd)" ] + } + }} +] diff --git a/benchmarks/config/omp/mlir-fp32-vector.json b/benchmarks/config/omp/mlir-fp32-vector.json new file mode 100644 index 000000000..62e6ca6df --- /dev/null +++ b/benchmarks/config/omp/mlir-fp32-vector.json @@ -0,0 +1,64 @@ +[ + { + "gemm_fp32_mlir_vector_kernel": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel 
--parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }}, + { + "mlp_fp32_mlir_vector_kernel": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ 
"(avx2|asimd)" ] + } + }} +] diff --git a/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json new file mode 100644 index 000000000..74a3e0c7c --- /dev/null +++ b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json @@ -0,0 +1,64 @@ +[ + { + "gemm_fp32_torch_vector_kernel" : { + "fp32_3x1024_omp_2_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + } + }}, + { + "mlp_fp32_torch_vector_kernel" : { + "fp32_3x1024_omp_2_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { 
"OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + } + }} +] diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp index 3f94f5846..73ad07f47 100644 --- a/lib/TPP/DefaultPipeline.cpp +++ b/lib/TPP/DefaultPipeline.cpp @@ -57,6 +57,10 @@ llvm::cl::opt linalgToVector("linalg-to-vector", llvm::cl::desc("Lower linalg to vector"), llvm::cl::init(false)); +llvm::cl::opt vectorToKernel("vector-to-kernel", + llvm::cl::desc("Lower vector to micro-kernels"), + llvm::cl::init(false)); + llvm::cl::opt lowerPackUnpackWithoutTranspose( "lower-pack-unpack-without-transpose", llvm::cl::desc("Lower packs and unpacks reverting any dim permutations"), @@ -154,6 +158,7 @@ struct DefaultPipeline : public 
tpp::impl::DefaultPipelineBase, tppDefaultOptions.lowerPackUnpackWithoutTranspose = lowerPackUnpackWithoutTranspose; tppDefaultOptions.lhsTile = lhsTile; tppDefaultOptions.rhsTile = rhsTile; + tppDefaultOptions.vectorToKernel = vectorToKernel; pm.addPass(createDefaultTppPasses(tppDefaultOptions)); } diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp index cb84f1048..14f98414c 100644 --- a/lib/TPP/DefaultTppPasses.cpp +++ b/lib/TPP/DefaultTppPasses.cpp @@ -134,6 +134,7 @@ struct DefaultTppPasses pm.addNestedPass(createBrgemmLinalgTiling(BrgemmLinalgTilingOptions{lhsTile, rhsTile})); pm.addNestedPass(createLoopInvariantCodeMotionPass()); pm.addNestedPass(createVectorizationPass()); + pm.addNestedPass(createHoistVectorTransfers()); pm.addNestedPass(createCanonicalizerPass()); if (vectorToXSMM) { diff --git a/lib/TPP/PassBundles/VectorToKernel.cpp b/lib/TPP/PassBundles/VectorToKernel.cpp index cf0f8c634..aacdb2004 100644 --- a/lib/TPP/PassBundles/VectorToKernel.cpp +++ b/lib/TPP/PassBundles/VectorToKernel.cpp @@ -48,7 +48,6 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase, private: void constructPipeline() override { - LLVM_DEBUG(llvm::dbgs() << "Adding vector-to-kernel passes\n"); - // Not Implemented Yet. 
+ pm.addNestedPass(createVectorContractToFMA()); } }; diff --git a/scripts/benchmarks/build_and_run.sh b/scripts/benchmarks/build_and_run.sh index 3d4474ed1..7c6477911 100755 --- a/scripts/benchmarks/build_and_run.sh +++ b/scripts/benchmarks/build_and_run.sh @@ -57,6 +57,12 @@ echo_run ./driver.py -vv \ -c "${CONFIG_DIR}/base/base.json" \ --build "${BUILD_DIR}" +echo " ========= Vector-to-kernel Base Benchmarks ===========" +echo_run ./driver.py -vv \ + -n ${NUM_ITER} \ + -c "${CONFIG_DIR}/base/vector-to-kernel.json" \ + --build "${BUILD_DIR}" + echo " ========= PyTorch Benchmarks ===========" echo_run ./driver.py -vv \ -n ${NUM_ITER} \ @@ -64,7 +70,7 @@ echo_run ./driver.py -vv \ --build "${BUILD_DIR}" echo " ========= OpenMP Benchmarks ===========" -for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16; do +for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16 mlir-fp32-vector; do echo_run ./driver.py -vv \ -n ${NUM_ITER} \ -c "${CONFIG_DIR}/omp/${cfg}.json" \ diff --git a/scripts/buildkite/benchmark.sh b/scripts/buildkite/benchmark.sh index 0ea2e70c9..045c74d12 100755 --- a/scripts/buildkite/benchmark.sh +++ b/scripts/buildkite/benchmark.sh @@ -96,6 +96,7 @@ benchmark () { # Base Benchmarks if [ "$BENCH_BASE" ]; then benchmark base/base.json "Base Benchmarks" + benchmark base/vector-to-kernel.json "Base Vector-to-kernel Benchmarks" benchmark base/pack.json "Pack Benchmarks" benchmark base/mha.json "MHA Benchmarks" benchmark base/named-ops.json "Named Ops Benchmarks" @@ -111,8 +112,10 @@ if [ "$BENCH_OMP" ]; then benchmark omp/dnn-fp32.json "OpenMP XSMM-DNN FP32" benchmark omp/dnn-bf16.json "OpenMP XSMM-DNN BF16" benchmark omp/mlir-fp32.json "OpenMP TPP-MLIR FP32" + benchmark omp/mlir-fp32-vector.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL FP32" benchmark omp/mlir-bf16.json "OpenMP TPP-MLIR BF16" benchmark omp/torch-dynamo.json "OpenMP TPP-MLIR PyTorch" + benchmark omp/torch-dynamo-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL PyTorch" fi # Matmul Benchmarks From 
7487c31f91082e01cffb0e5e1c6e45ca76505b8a Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Tue, 3 Dec 2024 06:49:44 -0800 Subject: [PATCH 02/10] with Kavitha Madhu changes in DefaultTppPasses.cpp to fix the segfault because of skipoperations. This commit is just to check CI and will be reverted back. --- lib/TPP/DefaultTppPasses.cpp | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp index 14f98414c..8be48aa6c 100644 --- a/lib/TPP/DefaultTppPasses.cpp +++ b/lib/TPP/DefaultTppPasses.cpp @@ -77,20 +77,25 @@ struct DefaultTppPasses // * Vector-to-Kernel: Enable with `vector-to-kernel` flag, forces // `linalg-to-vector` and lowers vector patterns to libxsmm-like // micro-kernels via specialized lowering of certain vector patterns. - assert(!(vectorToXSMM && vectorToKernel) && "XSMM and Kernel lowering are mutually exclusive"); + assert(!(vectorToXSMM && vectorToKernel) && + "XSMM and Kernel lowering are mutually exclusive"); bool forceLinalgToVector = (vectorToXSMM || vectorToKernel); // List of operations to skip when lowering Linalg to XSMM / Kernel. // This allows further passes to lower to vector, function, codegen // Default is to not skip anything. Only enable when needed. - ArrayRef skipOperations; - // General "linalg-to-vector" choice needs to skip all XSMM matching at linalg level. - if (linalgToVector) - skipOperations = { "all" }; + SmallVector skipOperations; + // General "linalg-to-vector" choice needs to skip all XSMM matching at + // linalg level. + if (linalgToVector) { + skipOperations.push_back("all"); + } if (vectorToXSMM) - skipOperations = { }; - if (vectorToKernel) - skipOperations = { }; + skipOperations.clear(); + if (vectorToKernel && !linalgToVector) { + skipOperations.push_back("all"); + } + //skipOperations.clear(); // Pipeline building starts here. 
pm.addPass(createFoldAddIntoDest()); @@ -110,8 +115,7 @@ struct DefaultTppPasses pm.addPass(createRewriteBatchMatmulToMatmul()); // Applies a set of passes at the linalg level to fuse and pack. - TppMappingOptions tppMappingOptions{ - lowerPackUnpackWithoutTranspose}; + TppMappingOptions tppMappingOptions{lowerPackUnpackWithoutTranspose}; pm.addPass(createTppMapping(tppMappingOptions)); // Generalize tensor.pack and tensor.unpack. @@ -127,11 +131,13 @@ struct DefaultTppPasses pm.addPass(createBufferize()); // Lower Linalg to XSMM. - pm.addNestedPass(createLinalgLowering(LinalgLoweringOptions{skipOperations})); + pm.addNestedPass( + createLinalgLowering(LinalgLoweringOptions{skipOperations})); if (linalgToVector || forceLinalgToVector) { // Vectorizes the remaining Linalg operations - pm.addNestedPass(createBrgemmLinalgTiling(BrgemmLinalgTilingOptions{lhsTile, rhsTile})); + pm.addNestedPass(createBrgemmLinalgTiling( + BrgemmLinalgTilingOptions{lhsTile, rhsTile})); pm.addNestedPass(createLoopInvariantCodeMotionPass()); pm.addNestedPass(createVectorizationPass()); pm.addNestedPass(createHoistVectorTransfers()); @@ -182,3 +188,4 @@ struct DefaultTppPasses }; } // namespace + From c75d4284f17e88cdbc3c9ce92259d4b86a3be7cb Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Tue, 3 Dec 2024 07:58:12 -0800 Subject: [PATCH 03/10] fixing the name mismatch vector-to-kernel --- benchmarks/config/base/vector-to-kernel.json | 12 ++++++------ benchmarks/config/omp/mlir-fp32-vector.json | 16 ++++++++-------- .../omp/torch-dynamo-vector-to-kernel.json | 16 ++++++++-------- lib/TPP/DefaultPipeline.cpp | 2 +- lib/TPP/DefaultTppPasses.cpp | 2 +- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/benchmarks/config/base/vector-to-kernel.json b/benchmarks/config/base/vector-to-kernel.json index 36965adcc..a35138aaa 100644 --- a/benchmarks/config/base/vector-to-kernel.json +++ b/benchmarks/config/base/vector-to-kernel.json @@ -5,14 +5,14 @@ "type": "IR-GEN", "benchmark": [ 
"mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [] }, "mlp_fp32_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [] } }}, @@ -22,14 +22,14 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } }}, @@ -39,14 +39,14 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --bias 
--relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } }} diff --git a/benchmarks/config/omp/mlir-fp32-vector.json b/benchmarks/config/omp/mlir-fp32-vector.json index 62e6ca6df..52a12e34d 100644 --- a/benchmarks/config/omp/mlir-fp32-vector.json +++ b/benchmarks/config/omp/mlir-fp32-vector.json @@ -5,28 +5,28 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_4_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel 
--parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } }}, @@ -36,28 +36,28 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_4_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, 
"fp32_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } }} diff --git a/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json index 74a3e0c7c..74e2b5fa9 100644 --- a/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json +++ b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json @@ -5,28 +5,28 @@ "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] }, "fp32_3x1024_omp_4_mlir": { "type": "MLIR", "benchmark": 
"pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] }, "fp32_3x1024_omp_8_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] }, "fp32_3x1024_omp_16_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] } }}, @@ -36,28 +36,28 @@ "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] }, "fp32_3x1024_omp_4_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", 
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] }, "fp32_3x1024_omp_8_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] }, "fp32_3x1024_omp_16_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernel --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ ] } }} diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp index 73ad07f47..1a24fa51c 100644 --- a/lib/TPP/DefaultPipeline.cpp +++ b/lib/TPP/DefaultPipeline.cpp @@ -57,7 +57,7 @@ llvm::cl::opt linalgToVector("linalg-to-vector", llvm::cl::desc("Lower linalg to vector"), llvm::cl::init(false)); -llvm::cl::opt vectorToKernel("vector-to-kernel", +llvm::cl::opt vectorToKernel("vector-to-kernels", llvm::cl::desc("Lower vector to micro-kernels"), llvm::cl::init(false)); diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp index 8be48aa6c..6152433de 100644 --- a/lib/TPP/DefaultTppPasses.cpp +++ 
b/lib/TPP/DefaultTppPasses.cpp @@ -93,7 +93,7 @@ struct DefaultTppPasses if (vectorToXSMM) skipOperations.clear(); if (vectorToKernel && !linalgToVector) { - skipOperations.push_back("all"); + skipOperations.push_back("all"); } //skipOperations.clear(); From 8a22638748530cbbce9bc98fd0cd543a0e373c45 Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Tue, 3 Dec 2024 17:55:50 -0800 Subject: [PATCH 04/10] renaming json file mlir-fp32-vector.json to mlir-fp32-vector-to-kernel.json --- CMakeLists.txt | 2 +- .../{mlir-fp32-vector.json => mlir-fp32-vector-to-kernel.json} | 0 scripts/benchmarks/build_and_run.sh | 2 +- scripts/buildkite/benchmark.sh | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename benchmarks/config/omp/{mlir-fp32-vector.json => mlir-fp32-vector-to-kernel.json} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93d8670a1..b5f710d46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ set(BENCH_OMP_CFGS ${CONFIG_DIR}/omp/dnn-bf16.json ${CONFIG_DIR}/omp/mlir-fp32.json ${CONFIG_DIR}/omp/mlir-bf16.json - ${CONFIG_DIR}/omp/mlir-fp32-vector.json + ${CONFIG_DIR}/omp/mlir-fp32-vector-to-kernel.json ${CONFIG_DIR}/omp/torch-dynamo.json ${CONFIG_DIR}/omp/torch-dynamo-vector-to-kernel.json ) diff --git a/benchmarks/config/omp/mlir-fp32-vector.json b/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json similarity index 100% rename from benchmarks/config/omp/mlir-fp32-vector.json rename to benchmarks/config/omp/mlir-fp32-vector-to-kernel.json diff --git a/scripts/benchmarks/build_and_run.sh b/scripts/benchmarks/build_and_run.sh index 7c6477911..72240669a 100755 --- a/scripts/benchmarks/build_and_run.sh +++ b/scripts/benchmarks/build_and_run.sh @@ -70,7 +70,7 @@ echo_run ./driver.py -vv \ --build "${BUILD_DIR}" echo " ========= OpenMP Benchmarks ===========" -for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16 mlir-fp32-vector; do +for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16 mlir-fp32-vector-to-kernel; do echo_run ./driver.py -vv \ 
-n ${NUM_ITER} \ -c "${CONFIG_DIR}/omp/${cfg}.json" \ diff --git a/scripts/buildkite/benchmark.sh b/scripts/buildkite/benchmark.sh index 045c74d12..fc0437c02 100755 --- a/scripts/buildkite/benchmark.sh +++ b/scripts/buildkite/benchmark.sh @@ -112,7 +112,7 @@ if [ "$BENCH_OMP" ]; then benchmark omp/dnn-fp32.json "OpenMP XSMM-DNN FP32" benchmark omp/dnn-bf16.json "OpenMP XSMM-DNN BF16" benchmark omp/mlir-fp32.json "OpenMP TPP-MLIR FP32" - benchmark omp/mlir-fp32-vector.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL FP32" + benchmark omp/mlir-fp32-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL FP32" benchmark omp/mlir-bf16.json "OpenMP TPP-MLIR BF16" benchmark omp/torch-dynamo.json "OpenMP TPP-MLIR PyTorch" benchmark omp/torch-dynamo-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL PyTorch" From 35f63de6f2e7c6cd147f6445bf67345695d729a8 Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Tue, 3 Dec 2024 23:48:21 -0800 Subject: [PATCH 05/10] Changing brgemm tile size to 8,32 and 32,1 as it gives better gflops --- lib/TPP/DefaultPipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp index 1a24fa51c..82d3f1fae 100644 --- a/lib/TPP/DefaultPipeline.cpp +++ b/lib/TPP/DefaultPipeline.cpp @@ -70,14 +70,14 @@ llvm::cl::opt lowerPackUnpackWithoutTranspose( llvm::cl::list lhsTile("lhsTile", llvm::cl::desc("Lhs tile size for brgemm operation"), - llvm::cl::list_init(SmallVector{8, 8}), + llvm::cl::list_init(SmallVector{4, 32}), llvm::cl::CommaSeparated); // Rhs tile sizes for linalg-to-vector llvm::cl::list rhsTile("rhsTile", llvm::cl::desc("Rhs tile size for brgemm operation"), - llvm::cl::list_init(SmallVector{8, 16}), + llvm::cl::list_init(SmallVector{32, 1}), llvm::cl::CommaSeparated); namespace mlir { From 1b389f36c51d3f702bcaef94c854cfe61f50e7be Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Wed, 4 Dec 2024 08:03:49 -0800 Subject: [PATCH 06/10] issues in 
base/vector-to-kernel.json and fix --- benchmarks/config/base/vector-to-kernel.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/config/base/vector-to-kernel.json b/benchmarks/config/base/vector-to-kernel.json index a35138aaa..a8de7d457 100644 --- a/benchmarks/config/base/vector-to-kernel.json +++ b/benchmarks/config/base/vector-to-kernel.json @@ -12,7 +12,7 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100"], "extensions": [] } }}, From 8aff31f84b26bf998b90cd611941f2ec1bd35738 Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Wed, 4 Dec 2024 09:23:20 -0800 Subject: [PATCH 07/10] base/vector.json mlp working in local repo but not in CI --- benchmarks/config/base/vector-to-kernel.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/config/base/vector-to-kernel.json b/benchmarks/config/base/vector-to-kernel.json index a8de7d457..5e598178f 100644 --- a/benchmarks/config/base/vector-to-kernel.json +++ b/benchmarks/config/base/vector-to-kernel.json @@ -12,7 +12,7 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": {}, - "flags": [ "-n", "100"], + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [] } }}, @@ -39,14 +39,14 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], 
"extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } }} From 72dee68184996fac1fe5088e74890db0cd783a95 Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Wed, 4 Dec 2024 19:42:10 -0800 Subject: [PATCH 08/10] moving hoisting and canonicalizer pass into VectorToKernal.cpp --- lib/TPP/DefaultTppPasses.cpp | 2 -- lib/TPP/PassBundles/VectorToKernel.cpp | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp index 70f2cd8b3..ffd44e88c 100644 --- a/lib/TPP/DefaultTppPasses.cpp +++ b/lib/TPP/DefaultTppPasses.cpp @@ -140,8 +140,6 @@ struct DefaultTppPasses BrgemmLinalgTilingOptions{lhsTile, rhsTile})); pm.addNestedPass(createLoopInvariantCodeMotionPass()); pm.addNestedPass(createVectorizationPass()); - pm.addNestedPass(createHoistVectorTransfers()); - pm.addNestedPass(createCanonicalizerPass()); if (vectorToXSMM) { pm.addPass(createVectorToXSMM()); diff --git a/lib/TPP/PassBundles/VectorToKernel.cpp b/lib/TPP/PassBundles/VectorToKernel.cpp index aacdb2004..8335b3c46 100644 --- a/lib/TPP/PassBundles/VectorToKernel.cpp +++ b/lib/TPP/PassBundles/VectorToKernel.cpp @@ -13,6 +13,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "llvm/Support/Debug.h" +#include "mlir/Transforms/Passes.h" #include "TPP/PassBundles.h" #include "TPP/PassUtils.h" @@ -48,6 +49,8 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase, private: void constructPipeline() override { + pm.addNestedPass(createHoistVectorTransfers()); + pm.addNestedPass(createCanonicalizerPass()); 
pm.addNestedPass(createVectorContractToFMA()); } }; From 119bab997a666eb39b960695a7ab3781ca55fd7c Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Thu, 5 Dec 2024 01:12:03 -0800 Subject: [PATCH 09/10] rebase from main: spr-all to emr partition in CI --- scripts/buildkite/tpp-mlir.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/buildkite/tpp-mlir.yml b/scripts/buildkite/tpp-mlir.yml index 1503885e7..bbff30753 100644 --- a/scripts/buildkite/tpp-mlir.yml +++ b/scripts/buildkite/tpp-mlir.yml @@ -9,21 +9,21 @@ steps: - wait - label: "TPP-MLIR-gcc-rel" - command: "${SRUN} --partition=spr-all --time=0:30:00 -- \ + command: "${SRUN} --partition=emr --time=0:30:00 -- \ 'KIND=Release COMPILER=gcc CHECK=1 ONEDNN=1 \ scripts/buildkite/build_tpp.sh'" - label: "TPP-MLIR-gcc-deb" - command: "${SRUN} --partition=spr-all --time=0:30:00 -- \ + command: "${SRUN} --partition=emr --time=0:30:00 -- \ 'KIND=Debug COMPILER=gcc CHECK=1 ONEDNN=1 \ scripts/buildkite/build_tpp.sh'" - label: "TPP-MLIR-clang-rel" - command: "${SRUN} --partition=spr-all --time=0:30:00 -- \ + command: "${SRUN} --partition=emr --time=0:30:00 -- \ 'KIND=Release COMPILER=clang LINKER=lld CHECK=1 ONEDNN=1 \ scripts/buildkite/build_tpp.sh'" - label: "TPP-MLIR-clang-deb" - command: "${SRUN} --partition=spr-all --time=0:30:00 -- \ + command: "${SRUN} --partition=emr --time=0:30:00 -- \ 'KIND=Debug COMPILER=clang LINKER=lld SANITIZERS=1 CHECK=1 ONEDNN=1 \ scripts/buildkite/build_tpp.sh'" From 839f8f4b3a73df2c7ccf3df50a7a9be5e8cbb03f Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Fri, 6 Dec 2024 05:52:40 -0800 Subject: [PATCH 10/10] merging base/vector-to-kernel.json into base/base.json --- CMakeLists.txt | 1 - benchmarks/config/base/base.json | 76 ++++++++++-------- benchmarks/config/base/vector-to-kernel.json | 19 ----- .../omp/mlir-fp32-vector-to-kernel.json | 78 +++++++++++++++++-- scripts/benchmarks/build_and_run.sh | 6 -- scripts/github/benchmark.sh | 1 - 6 files 
changed, 113 insertions(+), 68 deletions(-) delete mode 100644 benchmarks/config/base/vector-to-kernel.json diff --git a/CMakeLists.txt b/CMakeLists.txt index b5f710d46..642f427c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,7 +82,6 @@ set(CONFIG_DIR "${BENCHMARK_DIR}/config") # Run baseline benchmarks with default iterations to track simple performance set(BENCH_CFGS ${CONFIG_DIR}/base/base.json - ${CONFIG_DIR}/base/vector-to-kernel.json ${CONFIG_DIR}/base/pack.json ${CONFIG_DIR}/base/mha.json ${CONFIG_DIR}/base/named-ops.json diff --git a/benchmarks/config/base/base.json b/benchmarks/config/base/base.json index d1ff69cea..0251bbf78 100644 --- a/benchmarks/config/base/base.json +++ b/benchmarks/config/base/base.json @@ -36,6 +36,13 @@ "flags": [ "-n", "100" ], "extensions": [] }, + "gemm_fp32_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [] + }, "gemm_bf16_dp2_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], @@ -57,6 +64,13 @@ "flags": [ "-n", "100" ], "extensions": [] }, + "mlp_fp32_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [] + }, "mlp_bf16_dp2_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], @@ -81,12 +95,26 @@ "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] }, + "fp32_3x1024_const_mlir_vector": { + 
"type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_args_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] }, "bf16_3x1024_const_mlir": { "type": "IR-GEN", @@ -112,6 +140,13 @@ "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] }, + "fp32_3x1024_const_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], @@ -119,6 +154,13 @@ "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] }, + "fp32_3x1024_args_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, "bf16_3x1024_const_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu 
--float-type=bf16 --batch=256 --layers=1024,1024,1024,1024" ], @@ -133,39 +175,5 @@ "flags": [ "-n", "100"], "extensions": [ "(avx2|asimd)" ] } - }}, - { - "gemm_models_vector_kernel": { - "fp32_3x1024_const_mlir": { - "type": "IR-GEN", - "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], - "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], - "extensions": [ "(avx2|asimd)" ] - }, - "fp32_3x1024_args_mlir": { - "type": "IR-GEN", - "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], - "environment": {}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], - "extensions": [ "(avx2|asimd)" ] - } - }}, - { - "mlp_models_vector_kernel": { - "fp32_3x1024_const_mlir": { - "type": "IR-GEN", - "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], - "environment": {}, - "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], - "extensions": [ "(avx2|asimd)" ] - }, - "fp32_3x1024_args_mlir": { - "type": "IR-GEN", - "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], - "environment": {}, - "flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], - "extensions": [ "(avx2|asimd)" ] - } }} ] diff --git a/benchmarks/config/base/vector-to-kernel.json b/benchmarks/config/base/vector-to-kernel.json deleted file mode 100644 index bfaaa1de9..000000000 --- a/benchmarks/config/base/vector-to-kernel.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "prepacked_targets_vector_kernel": { - "gemm_fp32_mlir": { - "type": "IR-GEN", - "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], - "environment": 
{}, - "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], - "extensions": [] - }, - "mlp_fp32_mlir": { - "type": "IR-GEN", - "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], - "environment": {}, - "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], - "extensions": [] - } - }} -] diff --git a/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json b/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json index 52a12e34d..6bed81a47 100644 --- a/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json +++ b/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json @@ -1,6 +1,7 @@ + [ - { - "gemm_fp32_mlir_vector_kernel": { + { + "gemm_fp32_mlir_vector_kernel_32": { "fp32_3x1024_omp_2_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], @@ -18,20 +19,20 @@ "fp32_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], - "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], - "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, "flags": [ "-n", "100", "-run-args='--def-parallel 
--parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } }}, { - "mlp_fp32_mlir_vector_kernel": { + "mlp_fp32_mlir_vector_kernel_32": { "fp32_3x1024_omp_2_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], @@ -49,16 +50,79 @@ "fp32_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], - "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], - "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], "extensions": [ "(avx2|asimd)" ] } + }}, + { + "gemm_fp32_mlir_vector_kernel_64": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + 
"fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=1,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }}, + { + "mlp_fp32_mlir_vector_kernel_64": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 
--layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=1,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + } }} ] + diff --git a/scripts/benchmarks/build_and_run.sh b/scripts/benchmarks/build_and_run.sh index accc770b9..02eba0e57 100755 --- a/scripts/benchmarks/build_and_run.sh +++ b/scripts/benchmarks/build_and_run.sh @@ -57,12 +57,6 @@ echo_run ./driver.py -vv \ -c "${CONFIG_DIR}/base/base.json" \ --build "${BUILD_DIR}" -echo " ========= Vector-to-kernel Base Benchmarks ===========" -echo_run ./driver.py -vv \ - -n ${NUM_ITER} \ - -c "${CONFIG_DIR}/base/vector-to-kernel.json" \ - --build "${BUILD_DIR}" - echo " ========= PyTorch Benchmarks ===========" echo_run ./driver.py -vv \ -n ${NUM_ITER} \ diff --git a/scripts/github/benchmark.sh b/scripts/github/benchmark.sh index 0053cd529..207831042 100755 --- a/scripts/github/benchmark.sh +++ 
b/scripts/github/benchmark.sh @@ -96,7 +96,6 @@ benchmark () { # Base Benchmarks if [ "$BENCH_BASE" ]; then benchmark base/base.json "Base Benchmarks" - benchmark base/vector-to-kernel.json "Base Vector-to-kernel Benchmarks" benchmark base/pack.json "Pack Benchmarks" benchmark base/mha.json "MHA Benchmarks" benchmark base/named-ops.json "Named Ops Benchmarks"