Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector-to-kernel pipeline bundle #987

Merged
merged 14 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ set(CONFIG_DIR "${BENCHMARK_DIR}/config")
# Run baseline benchmarks with default iterations to track simple performance
set(BENCH_CFGS
${CONFIG_DIR}/base/base.json
${CONFIG_DIR}/base/vector-to-kernel.json
${CONFIG_DIR}/base/pack.json
${CONFIG_DIR}/base/mha.json
${CONFIG_DIR}/base/named-ops.json
Expand All @@ -100,7 +101,9 @@ set(BENCH_OMP_CFGS
${CONFIG_DIR}/omp/dnn-bf16.json
${CONFIG_DIR}/omp/mlir-fp32.json
${CONFIG_DIR}/omp/mlir-bf16.json
${CONFIG_DIR}/omp/mlir-fp32-vector-to-kernel.json
${CONFIG_DIR}/omp/torch-dynamo.json
${CONFIG_DIR}/omp/torch-dynamo-vector-to-kernel.json
)
string(JOIN ',' BENCH_OMP_CFGS_STR ${BENCH_OMP_CFGS})
add_custom_target(benchmarks-omp ${BENCHMARK_DIR}/driver.py -v --build ${PROJECT_BINARY_DIR} -n 10
Expand Down
53 changes: 53 additions & 0 deletions benchmarks/config/base/vector-to-kernel.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
[
{
"prepacked_targets_vector_kernel": {
"gemm_fp32_mlir": {
rengolin marked this conversation as resolved.
Show resolved Hide resolved
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": []
},
"mlp_fp32_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": []
}
}},
{
"gemm_models_vector_kernel": {
rengolin marked this conversation as resolved.
Show resolved Hide resolved
"fp32_3x1024_const_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
rengolin marked this conversation as resolved.
Show resolved Hide resolved
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_args_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
"mlp_models_vector_kernel": {
"fp32_3x1024_const_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_args_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}}
]
64 changes: 64 additions & 0 deletions benchmarks/config/omp/mlir-fp32-vector-to-kernel.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
[
{
"gemm_fp32_mlir_vector_kernel": {
"fp32_3x1024_omp_2_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
"mlp_fp32_mlir_vector_kernel": {
"fp32_3x1024_omp_2_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}}
]
64 changes: 64 additions & 0 deletions benchmarks/config/omp/torch-dynamo-vector-to-kernel.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
[
{
"gemm_fp32_torch_vector_kernel" : {
"fp32_3x1024_omp_2_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
}
}},
{
"mlp_fp32_torch_vector_kernel" : {
"fp32_3x1024_omp_2_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
}
}}
]
9 changes: 7 additions & 2 deletions lib/TPP/DefaultPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ llvm::cl::opt<bool> linalgToVector("linalg-to-vector",
llvm::cl::desc("Lower linalg to vector"),
llvm::cl::init(false));

// Command-line switch `vector-to-kernels`: requests lowering of vector to
// micro-kernels. Disabled (false) unless given on the command line.
llvm::cl::opt<bool> vectorToKernel("vector-to-kernels",
llvm::cl::desc("Lower vector to micro-kernels"),
llvm::cl::init(false));

llvm::cl::opt<bool> lowerPackUnpackWithoutTranspose(
"lower-pack-unpack-without-transpose",
llvm::cl::desc("Lower packs and unpacks reverting any dim permutations"),
Expand All @@ -66,14 +70,14 @@ llvm::cl::opt<bool> lowerPackUnpackWithoutTranspose(
llvm::cl::list<unsigned>
lhsTile("lhsTile",
llvm::cl::desc("Lhs tile size for brgemm operation"),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 8}),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{4, 32}),
llvm::cl::CommaSeparated);

// Rhs tile sizes for linalg-to-vector
llvm::cl::list<unsigned>
rhsTile("rhsTile",
llvm::cl::desc("Rhs tile size for brgemm operation"),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 16}),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{32, 1}),
llvm::cl::CommaSeparated);

namespace mlir {
Expand Down Expand Up @@ -154,6 +158,7 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
tppDefaultOptions.lowerPackUnpackWithoutTranspose = lowerPackUnpackWithoutTranspose;
tppDefaultOptions.lhsTile = lhsTile;
tppDefaultOptions.rhsTile = rhsTile;
tppDefaultOptions.vectorToKernel = vectorToKernel;

pm.addPass(createDefaultTppPasses(tppDefaultOptions));
}
Expand Down
8 changes: 5 additions & 3 deletions lib/TPP/DefaultTppPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ struct DefaultTppPasses
}
if (vectorToXSMM)
skipOperations.clear();
if (vectorToKernel)
skipOperations.clear();

if (vectorToKernel && !linalgToVector) {
skipOperations.push_back("all");
rengolin marked this conversation as resolved.
Show resolved Hide resolved
}

// Pipeline building starts here.
pm.addPass(createFoldAddIntoDest());
Expand Down Expand Up @@ -138,7 +140,6 @@ struct DefaultTppPasses
BrgemmLinalgTilingOptions{lhsTile, rhsTile}));
pm.addNestedPass<func::FuncOp>(createLoopInvariantCodeMotionPass());
pm.addNestedPass<func::FuncOp>(createVectorizationPass());
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
arun-thmn marked this conversation as resolved.
Show resolved Hide resolved

if (vectorToXSMM) {
pm.addPass(createVectorToXSMM());
Expand Down Expand Up @@ -185,3 +186,4 @@ struct DefaultTppPasses
};

} // namespace

6 changes: 4 additions & 2 deletions lib/TPP/PassBundles/VectorToKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "llvm/Support/Debug.h"
#include "mlir/Transforms/Passes.h"

#include "TPP/PassBundles.h"
#include "TPP/PassUtils.h"
Expand Down Expand Up @@ -48,7 +49,8 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,

private:
// Assembles the vector-to-kernel bundle. Every pass is nested on
// func::FuncOp and appended to `pm` in this order:
//   1. createHoistVectorTransfers()
//   2. createCanonicalizerPass()
//   3. createVectorContractToFMA()
void constructPipeline() override {
LLVM_DEBUG(llvm::dbgs() << "Adding vector-to-kernel passes\n");
pm.addNestedPass<func::FuncOp>(createHoistVectorTransfers());
// Canonicalizer runs between the hoisting pass and the contract-to-FMA pass.
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
pm.addNestedPass<func::FuncOp>(createVectorContractToFMA());
}
};
8 changes: 7 additions & 1 deletion scripts/benchmarks/build_and_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,20 @@ echo_run ./driver.py -vv \
-c "${CONFIG_DIR}/base/base.json" \
--build "${BUILD_DIR}"

echo " ========= Vector-to-kernel Base Benchmarks ==========="
echo_run ./driver.py -vv \
-n ${NUM_ITER} \
-c "${CONFIG_DIR}/base/vector-to-kernel.json" \
--build "${BUILD_DIR}"

echo " ========= PyTorch Benchmarks ==========="
echo_run ./driver.py -vv \
-n ${NUM_ITER} \
-c "${CONFIG_DIR}/pytorch/torch_dynamo.json" \
--build "${BUILD_DIR}"

echo " ========= OpenMP Benchmarks ==========="
for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16; do
for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16 mlir-fp32-vector-to-kernel; do
echo_run ./driver.py -vv \
-n ${NUM_ITER} \
-c "${CONFIG_DIR}/omp/${cfg}.json" \
Expand Down
3 changes: 3 additions & 0 deletions scripts/buildkite/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ benchmark () {
# Base Benchmarks
if [ "$BENCH_BASE" ]; then
benchmark base/base.json "Base Benchmarks"
benchmark base/vector-to-kernel.json "Base Vector-to-kernel Benchmarks"
benchmark base/pack.json "Pack Benchmarks"
benchmark base/mha.json "MHA Benchmarks"
benchmark base/named-ops.json "Named Ops Benchmarks"
Expand All @@ -111,8 +112,10 @@ if [ "$BENCH_OMP" ]; then
benchmark omp/dnn-fp32.json "OpenMP XSMM-DNN FP32"
benchmark omp/dnn-bf16.json "OpenMP XSMM-DNN BF16"
benchmark omp/mlir-fp32.json "OpenMP TPP-MLIR FP32"
benchmark omp/mlir-fp32-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL FP32"
benchmark omp/mlir-bf16.json "OpenMP TPP-MLIR BF16"
benchmark omp/torch-dynamo.json "OpenMP TPP-MLIR PyTorch"
benchmark omp/torch-dynamo-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL PyTorch"
fi

# Matmul Benchmarks
Expand Down
8 changes: 4 additions & 4 deletions scripts/buildkite/tpp-mlir.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@ steps:
- wait

- label: "TPP-MLIR-gcc-rel"
command: "${SRUN} --partition=spr-all --time=0:30:00 -- \
command: "${SRUN} --partition=emr --time=0:30:00 -- \
arun-thmn marked this conversation as resolved.
Show resolved Hide resolved
'KIND=Release COMPILER=gcc CHECK=1 ONEDNN=1 \
scripts/buildkite/build_tpp.sh'"

- label: "TPP-MLIR-gcc-deb"
command: "${SRUN} --partition=spr-all --time=0:30:00 -- \
command: "${SRUN} --partition=emr --time=0:30:00 -- \
'KIND=Debug COMPILER=gcc CHECK=1 ONEDNN=1 \
scripts/buildkite/build_tpp.sh'"

- label: "TPP-MLIR-clang-rel"
command: "${SRUN} --partition=spr-all --time=0:30:00 -- \
command: "${SRUN} --partition=emr --time=0:30:00 -- \
'KIND=Release COMPILER=clang LINKER=lld CHECK=1 ONEDNN=1 \
scripts/buildkite/build_tpp.sh'"

- label: "TPP-MLIR-clang-deb"
command: "${SRUN} --partition=spr-all --time=0:30:00 -- \
command: "${SRUN} --partition=emr --time=0:30:00 -- \
'KIND=Debug COMPILER=clang LINKER=lld SANITIZERS=1 CHECK=1 ONEDNN=1 \
scripts/buildkite/build_tpp.sh'"