GPU pipeline reorganization #968

Merged
merged 5 commits on Sep 13, 2024
Changes from all commits
28 changes: 0 additions & 28 deletions benchmarks/config/GPU/cuda.json
@@ -15,40 +15,12 @@
"flags": [ "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_manual_kernel_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-manual-kernel.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_base_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-base.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_packed_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp16_1024_packed_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp16-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp16_1024_packed_wmma_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp16-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda", "-run-args=-gpu-wmma" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
19 changes: 0 additions & 19 deletions benchmarks/mlir/GPU/gemm-fp16-1024-packed.mlir

This file was deleted.

63 changes: 0 additions & 63 deletions benchmarks/mlir/GPU/gemm-fp32-1024-manual-kernel.mlir

This file was deleted.

19 changes: 0 additions & 19 deletions benchmarks/mlir/GPU/gemm-fp32-1024-packed.mlir

This file was deleted.

10 changes: 0 additions & 10 deletions include/TPP/PassBundles.td
@@ -119,22 +119,12 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
let description = [{
Convert all eligible operations into generic GPU operations.
}];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"xegpu::XeGPUDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"isIntel", "intel",
"bool", /*default=*/"false",
"Convert for Intel GPU">,
21 changes: 0 additions & 21 deletions include/TPP/Passes.td
@@ -323,27 +323,6 @@ def DecomposeAggregatedOps : Pass<"decompose-aggregated-ops", "func::FuncOp"> {
}];
}

def LinalgToGpu : Pass<"linalg-to-gpu", "func::FuncOp"> {
let summary = "Convert linalg ops to be GPU compatible.";
let description = [{
Lower linalg to ops optimized for computation on GPU.
}];
let dependentDialects = ["linalg::LinalgDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"gpu::GPUDialect",
"arith::ArithDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
];
}

def GpuDataTransfer : Pass<"gpu-data-transfer", "func::FuncOp"> {
let summary = "Transfer data to and from GPU.";
let description = [{
1 change: 0 additions & 1 deletion lib/TPP/GPU/CMakeLists.txt
@@ -5,7 +5,6 @@ add_mlir_library(TPPGPU
GpuToCuda.cpp
SetSPIRVCapabilities.cpp
SetSPIRVAbiAttribute.cpp
LinalgToGpu.cpp
GpuDataTransfer.cpp
GpuInlineConstants.cpp
LinalgToXeGPU.cpp
5 changes: 1 addition & 4 deletions lib/TPP/GPU/GpuConversion.cpp
@@ -62,11 +62,8 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
if (isIntel) {
pm.addNestedPass<func::FuncOp>(
createLinalgToXeGPU(LinalgToXeGPUOptions{kTile, stages, dpasTile}));
} else {
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile, kTile}));
}
pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());

// Map loops into GPU kernels.
pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
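With the CUDA-specific LinalgToGpu path gone, the kernel lowering inside GpuConversion collapses into a single straight-line sequence: Intel targets still go through createLinalgToXeGPU, and whatever linalg ops remain are lowered to plain loops before the generic GPU mapping passes run. A minimal sketch of that sequence is shown below, assuming the TPP pass headers are available; buildKernelLowering and its parameters are hypothetical names for illustration, while the pass constructors are the ones visible in this diff.

// Sketch only: condensed view of the reorganized kernel lowering.
// buildKernelLowering is a hypothetical helper, not part of this patch.
#include "TPP/Passes.h"                          // assumed TPP pass declarations
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Pass/PassManager.h"

void buildKernelLowering(mlir::OpPassManager &pm, bool isIntel,
                         const mlir::tpp::LinalgToXeGPUOptions &xeOptions) {
  using namespace mlir;
  if (isIntel) {
    // Intel GPUs keep the dedicated XeGPU lowering.
    pm.addNestedPass<func::FuncOp>(tpp::createLinalgToXeGPU(xeOptions));
  }
  // No CUDA-specific LinalgToGpu step anymore: remaining linalg ops are
  // lowered to loops and picked up by the generic GPU mapping below.
  pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());

  // Map loops into GPU kernels.
  pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
}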
54 changes: 23 additions & 31 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -41,15 +41,6 @@
using namespace mlir;
using namespace mlir::tpp;

llvm::cl::opt<bool> gpuWmma("gpu-wmma",
llvm::cl::desc("Enable GPU WMMA support"),
llvm::cl::init(false));

llvm::cl::list<int64_t> wmmaTileSizes(
"wmma-tile-sizes", llvm::cl::desc("GPU WMMA tile sizes MxNxK"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuBlockTile("gpu-block-tile", llvm::cl::desc("GPU block tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{128, 128}),
@@ -165,29 +156,30 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
GpuType gpuType = parseGpuOption(this->gpuBackend);
GpuOptions gpuOptions = getGpuOptions(gpuType);

// Input preprocessing.
pm.addPass(createCleanup());
pm.addPass(createFoldIntoEltwise());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToInplace());

// Tile to split the kernel into threads and blocks.
// Use default tiling to handle both packed and unpacked ops.
pm.addPass(createCleanup());
if (gpuType == GpuType::Intel) {
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
if (!llvm::any_of(gpuBlockTile, [](int64_t tile) { return tile == -1; }))
blockTileOptions.tileSizes = gpuBlockTile;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
if (!llvm::any_of(gpuThreadTile, [](int64_t tile) { return tile == -1; }))
threadTileOptions.tileSizes = gpuThreadTile;
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
} else {
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
}
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
pm.addPass(createCleanup());

// Preprocess and bufferize as further conversion requires memref
@@ -198,9 +190,8 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
pm.addPass(createCleanup());

// Convert to generic GPU ops.
pm.addPass(createGpuConversion(
GpuConversionOptions{gpuWmma, wmmaTileSizes, gpuType == GpuType::Intel,
kTile, stages, gpuDpasTile}));
pm.addPass(createGpuConversion(GpuConversionOptions{
gpuType == GpuType::Intel, kTile, stages, gpuDpasTile}));

// Lower GPU ops to the chosen GPU backend.
switch (gpuType) {
@@ -212,7 +203,7 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
gpuOptions.triple, gpuOptions.chip, gpuOptions.features}));
break;
}
case GpuType::Intel:
case GpuType::Intel: {
pm.addPass(xegpu::createXeGPUFoldAliasOps());

std::string clientApi = "intel";
@@ -223,6 +214,7 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,

break;
}
}

// Convert all local dialects like perf.
pm.addPass(createLocalDialectsLowering());
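The net effect on the tiling stage is that the Intel-only branch disappears: both the CUDA and Intel paths now run the same block-tile followed by thread-tile sequence driven by the gpuBlockTile and gpuThreadTile options, while the gpu-wmma and wmma-tile-sizes flags are removed outright. The sketch below restates that unified sequence in isolation, assuming the TPP pass headers are available; buildGpuTiling and the llvm::to_vector copies are illustrative assumptions, whereas the option fields and pass names come from the diff.

// Sketch only: the unified block/thread tiling sequence after this patch.
// buildGpuTiling is a hypothetical helper, not part of this patch.
#include "TPP/Passes.h"                          // assumed TPP pass declarations
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/Pass/PassManager.h"

void buildGpuTiling(mlir::OpPassManager &pm,
                    llvm::ArrayRef<int64_t> gpuBlockTile,
                    llvm::ArrayRef<int64_t> gpuThreadTile) {
  using namespace mlir;
  using namespace mlir::tpp;

  // First split the computation into a grid of blocks of the requested size.
  TileConsumerAndFuseProducersOptions blockTileOptions;
  if (!llvm::any_of(gpuBlockTile, [](int64_t tile) { return tile == -1; }))
    blockTileOptions.tileSizes = llvm::to_vector(gpuBlockTile);
  blockTileOptions.minTileFactor = 1;
  pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

  // Then split each block into per-thread/workitem subtiles; for smaller
  // workloads this gives another chance for kernel outlining.
  TileConsumerAndFuseProducersOptions threadTileOptions;
  if (!llvm::any_of(gpuThreadTile, [](int64_t tile) { return tile == -1; }))
    threadTileOptions.tileSizes = llvm::to_vector(gpuThreadTile);
  threadTileOptions.minTileFactor = 1;
  pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));

  pm.addPass(createCleanup());
}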