GPU pipeline reorganization (#968)
Cleans up and simplifies the GPU pipeline in preparation for unified
vector-based lowering.

The main goal is to retire old experimental paths and to prepare shared,
vendor-agnostic lowering infrastructure.
This is another step toward GPU codegen through vectorization.

Summary of changes:
- moves to tiling-based kernel outlining; retires the naive outlining based
on Linalg-to-parallel-loops conversion
- retires packed GEMM GPU kernels, which are currently irrelevant for GPU
kernel creation
- retires the custom Linalg-to-WMMA lowering, to be replaced with a generic
vectorization scheme in the future
- cleans up tests and adjusts existing ones to the pipeline changes
- allows overriding the default GPU tiling sizes and using tile sizes
provided by DLTI (for now the CPU tile sizes are reused); see the sketch below
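
For reference, a minimal sketch of the reworked tiling-based outlining flow in GpuPipeline.cpp, as visible in this diff. The pass and option names come from the diff itself; the concrete tile values, the `--gpu-thread-tile` flag name, and the wrapping helper function are illustrative assumptions, not the exact implementation:

```cpp
// Sketch only: composes the new tiling-based GPU outlining as a pass pipeline.
// Pass and option names follow this diff; values and the helper are hypothetical.
void buildTilingBasedOutlining(mlir::OpPassManager &pm) {
  using namespace mlir::tpp;

  // 1. Tile to split the computation into a grid of blocks.
  TileConsumerAndFuseProducersOptions blockTileOptions;
  blockTileOptions.tileSizes = {128, 128}; // overridable via --gpu-block-tile
  blockTileOptions.minTileFactor = 1;
  pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

  // 2. Tile again to split each block into per-thread/workitem subtiles.
  TileConsumerAndFuseProducersOptions threadTileOptions;
  threadTileOptions.tileSizes = {32, 32}; // illustrative; assumed --gpu-thread-tile
  threadTileOptions.minTileFactor = 1;
  pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
  pm.addPass(createCleanup());

  // 3. After bufferization, GpuConversion outlines the tiled loops into GPU
  //    kernels (XeGPU path for Intel, linalg-to-loops plus
  //    gpu-map-parallel-loops otherwise). Option order follows the diff's
  //    GpuConversionOptions{isIntel, kTile, stages, dpasTile}; values assumed.
  pm.addPass(createGpuConversion(
      GpuConversionOptions{/*isIntel=*/false, /*kTile=*/32, /*stages=*/1,
                           /*dpasTile=*/{8, 16, 16}}));
}
```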
adam-smnk authored Sep 13, 2024
1 parent 8018f3f commit 7b521f2
Showing 37 changed files with 136 additions and 2,073 deletions.
28 changes: 0 additions & 28 deletions benchmarks/config/GPU/cuda.json
@@ -15,40 +15,12 @@
"flags": [ "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_manual_kernel_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-manual-kernel.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_base_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-base.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_packed_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp16_1024_packed_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp16-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp16_1024_packed_wmma_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp16-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda", "-run-args=-gpu-wmma" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
19 changes: 0 additions & 19 deletions benchmarks/mlir/GPU/gemm-fp16-1024-packed.mlir

This file was deleted.

63 changes: 0 additions & 63 deletions benchmarks/mlir/GPU/gemm-fp32-1024-manual-kernel.mlir

This file was deleted.

19 changes: 0 additions & 19 deletions benchmarks/mlir/GPU/gemm-fp32-1024-packed.mlir

This file was deleted.

10 changes: 0 additions & 10 deletions include/TPP/PassBundles.td
@@ -119,22 +119,12 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
let description = [{
Convert all eligble operations into generic GPU operations.
}];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"xegpu::XeGPUDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"isIntel", "intel",
"bool", /*default=*/"false",
"Convert for Intel GPU">,
21 changes: 0 additions & 21 deletions include/TPP/Passes.td
@@ -323,27 +323,6 @@ def DecomposeAggregatedOps : Pass<"decompose-aggregated-ops", "func::FuncOp"> {
}];
}

def LinalgToGpu : Pass<"linalg-to-gpu", "func::FuncOp"> {
let summary = "Convert linalg ops to be GPU compatible.";
let description = [{
Lower linalg to ops optimized for computation on GPU.
}];
let dependentDialects = ["linalg::LinalgDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"gpu::GPUDialect",
"arith::ArithDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
];
}

def GpuDataTransfer : Pass<"gpu-data-transfer", "func::FuncOp"> {
let summary = "Transfer data to and from GPU.";
let description = [{
1 change: 0 additions & 1 deletion lib/TPP/GPU/CMakeLists.txt
@@ -5,7 +5,6 @@ add_mlir_library(TPPGPU
GpuToCuda.cpp
SetSPIRVCapabilities.cpp
SetSPIRVAbiAttribute.cpp
LinalgToGpu.cpp
GpuDataTransfer.cpp
GpuInlineConstants.cpp
LinalgToXeGPU.cpp
5 changes: 1 addition & 4 deletions lib/TPP/GPU/GpuConversion.cpp
@@ -62,11 +62,8 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
if (isIntel) {
pm.addNestedPass<func::FuncOp>(
createLinalgToXeGPU(LinalgToXeGPUOptions{kTile, stages, dpasTile}));
} else {
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile, kTile}));
}
pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());

// Map loops into GPU kernels.
pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
54 changes: 23 additions & 31 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -41,15 +41,6 @@
using namespace mlir;
using namespace mlir::tpp;

llvm::cl::opt<bool> gpuWmma("gpu-wmma",
llvm::cl::desc("Enable GPU WMMA support"),
llvm::cl::init(false));

llvm::cl::list<int64_t> wmmaTileSizes(
"wmma-tile-sizes", llvm::cl::desc("GPU WMMA tile sizes MxNxK"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuBlockTile("gpu-block-tile", llvm::cl::desc("GPU block tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{128, 128}),
@@ -165,29 +156,30 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
GpuType gpuType = parseGpuOption(this->gpuBackend);
GpuOptions gpuOptions = getGpuOptions(gpuType);

// Input preprocessing.
pm.addPass(createCleanup());
pm.addPass(createFoldIntoEltwise());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToInplace());

// Tile to split the kernel into threads and blocks.
// Use default tiling to handle both packed and unpacked ops.
pm.addPass(createCleanup());
if (gpuType == GpuType::Intel) {
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
if (!llvm::any_of(gpuBlockTile, [](int64_t tile) { return tile == -1; }))
blockTileOptions.tileSizes = gpuBlockTile;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
if (!llvm::any_of(gpuThreadTile, [](int64_t tile) { return tile == -1; }))
threadTileOptions.tileSizes = gpuThreadTile;
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
} else {
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
}
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
pm.addPass(createCleanup());

// Preprocess and bufferize as further conversion requires memref
@@ -198,9 +190,8 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
pm.addPass(createCleanup());

// Convert to generic GPU ops.
pm.addPass(createGpuConversion(
GpuConversionOptions{gpuWmma, wmmaTileSizes, gpuType == GpuType::Intel,
kTile, stages, gpuDpasTile}));
pm.addPass(createGpuConversion(GpuConversionOptions{
gpuType == GpuType::Intel, kTile, stages, gpuDpasTile}));

// Lower GPU ops to the chosen GPU backend.
switch (gpuType) {
@@ -212,7 +203,7 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
gpuOptions.triple, gpuOptions.chip, gpuOptions.features}));
break;
}
case GpuType::Intel:
case GpuType::Intel: {
pm.addPass(xegpu::createXeGPUFoldAliasOps());

std::string clientApi = "intel";
@@ -223,6 +214,7 @@

break;
}
}

// Covert all local dialects like perf.
pm.addPass(createLocalDialectsLowering());