GPU pipeline reorganization (#968)
Cleans up and simplifies the GPU pipeline in preparation for unified
vector-based lowering.

The main goal is to retire old experimental paths and to prepare shared,
vendor-agnostic lowering infrastructure.
This is another step toward GPU codegen through vectorization.

Summary of changes:
- moves to tiling-based kernel outlining; retires the naive outlining based
on Linalg-to-parallel-loops conversion
- retires packed GEMM GPU kernels, which are currently irrelevant for GPU
kernel creation
- retires the custom Linalg-to-WMMA lowering, to be replaced with a generic
vectorization scheme in the future
- cleans up tests and adjusts existing ones to the pipeline changes
- allows overriding the default GPU tiling sizes and using tile sizes
provided by DLTI (for now the CPU tile sizes are reused); see the sketch below
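
For reference, a minimal sketch of the reworked tiling-based outlining flow in GpuPipeline.cpp, as visible in this diff. The pass and option names come from the diff itself; the concrete tile values, the `--gpu-thread-tile` flag name, and the wrapping helper function are illustrative assumptions, not the exact implementation:

```cpp
// Sketch only: composes the new tiling-based GPU outlining as a pass pipeline.
// Pass and option names follow this diff; values and the helper are hypothetical.
void buildTilingBasedOutlining(mlir::OpPassManager &pm) {
  using namespace mlir::tpp;

  // 1. Tile to split the computation into a grid of blocks.
  TileConsumerAndFuseProducersOptions blockTileOptions;
  blockTileOptions.tileSizes = {128, 128}; // overridable via --gpu-block-tile
  blockTileOptions.minTileFactor = 1;
  pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

  // 2. Tile again to split each block into per-thread/workitem subtiles.
  TileConsumerAndFuseProducersOptions threadTileOptions;
  threadTileOptions.tileSizes = {32, 32}; // illustrative; assumed --gpu-thread-tile
  threadTileOptions.minTileFactor = 1;
  pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
  pm.addPass(createCleanup());

  // 3. After bufferization, GpuConversion outlines the tiled loops into GPU
  //    kernels (XeGPU path for Intel, linalg-to-loops plus
  //    gpu-map-parallel-loops otherwise). Option order follows the diff's
  //    GpuConversionOptions{isIntel, kTile, stages, dpasTile}; values assumed.
  pm.addPass(createGpuConversion(
      GpuConversionOptions{/*isIntel=*/false, /*kTile=*/32, /*stages=*/1,
                           /*dpasTile=*/{8, 16, 16}}));
}
```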
adam-smnk authored Sep 13, 2024
1 parent 8018f3f commit 7b521f2
Showing 37 changed files with 136 additions and 2,073 deletions.
28 changes: 0 additions & 28 deletions benchmarks/config/GPU/cuda.json
@@ -15,40 +15,12 @@
"flags": [ "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_manual_kernel_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-manual-kernel.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_base_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-base.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_1024_packed_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp32-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp16_1024_packed_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp16-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp16_1024_packed_wmma_mlir": {
"type": "MLIR",
"benchmark": "GPU/gemm-fp16-1024-packed.mlir",
"environment": {},
"flags": [ "-n", "100", "--gpu=cuda", "-run-args=-gpu-wmma" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
19 changes: 0 additions & 19 deletions benchmarks/mlir/GPU/gemm-fp16-1024-packed.mlir

This file was deleted.

63 changes: 0 additions & 63 deletions benchmarks/mlir/GPU/gemm-fp32-1024-manual-kernel.mlir

This file was deleted.

19 changes: 0 additions & 19 deletions benchmarks/mlir/GPU/gemm-fp32-1024-packed.mlir

This file was deleted.

10 changes: 0 additions & 10 deletions include/TPP/PassBundles.td
@@ -119,22 +119,12 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
let description = [{
Convert all eligble operations into generic GPU operations.
}];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"xegpu::XeGPUDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"isIntel", "intel",
"bool", /*default=*/"false",
"Convert for Intel GPU">,
21 changes: 0 additions & 21 deletions include/TPP/Passes.td
@@ -323,27 +323,6 @@ def DecomposeAggregatedOps : Pass<"decompose-aggregated-ops", "func::FuncOp"> {
}];
}

def LinalgToGpu : Pass<"linalg-to-gpu", "func::FuncOp"> {
let summary = "Convert linalg ops to be GPU compatible.";
let description = [{
Lower linalg to ops optimized for computation on GPU.
}];
let dependentDialects = ["linalg::LinalgDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"gpu::GPUDialect",
"arith::ArithDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
];
}

def GpuDataTransfer : Pass<"gpu-data-transfer", "func::FuncOp"> {
let summary = "Transfer data to and from GPU.";
let description = [{
1 change: 0 additions & 1 deletion lib/TPP/GPU/CMakeLists.txt
@@ -5,7 +5,6 @@ add_mlir_library(TPPGPU
GpuToCuda.cpp
SetSPIRVCapabilities.cpp
SetSPIRVAbiAttribute.cpp
LinalgToGpu.cpp
GpuDataTransfer.cpp
GpuInlineConstants.cpp
LinalgToXeGPU.cpp
5 changes: 1 addition & 4 deletions lib/TPP/GPU/GpuConversion.cpp
@@ -62,11 +62,8 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
if (isIntel) {
pm.addNestedPass<func::FuncOp>(
createLinalgToXeGPU(LinalgToXeGPUOptions{kTile, stages, dpasTile}));
} else {
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile, kTile}));
}
pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());

// Map loops into GPU kernels.
pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
54 changes: 23 additions & 31 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -41,15 +41,6 @@
using namespace mlir;
using namespace mlir::tpp;

llvm::cl::opt<bool> gpuWmma("gpu-wmma",
llvm::cl::desc("Enable GPU WMMA support"),
llvm::cl::init(false));

llvm::cl::list<int64_t> wmmaTileSizes(
"wmma-tile-sizes", llvm::cl::desc("GPU WMMA tile sizes MxNxK"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuBlockTile("gpu-block-tile", llvm::cl::desc("GPU block tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{128, 128}),
@@ -165,29 +156,30 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
GpuType gpuType = parseGpuOption(this->gpuBackend);
GpuOptions gpuOptions = getGpuOptions(gpuType);

// Input preprocessing.
pm.addPass(createCleanup());
pm.addPass(createFoldIntoEltwise());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToInplace());

// Tile to split the kernel into threads and blocks.
// Use default tiling to handle both packed and unpacked ops.
pm.addPass(createCleanup());
if (gpuType == GpuType::Intel) {
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
if (!llvm::any_of(gpuBlockTile, [](int64_t tile) { return tile == -1; }))
blockTileOptions.tileSizes = gpuBlockTile;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
if (!llvm::any_of(gpuThreadTile, [](int64_t tile) { return tile == -1; }))
threadTileOptions.tileSizes = gpuThreadTile;
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
} else {
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
}
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
pm.addPass(createCleanup());

// Preprocess and bufferize as further conversion requires memref
@@ -198,9 +190,8 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
pm.addPass(createCleanup());

// Convert to generic GPU ops.
pm.addPass(createGpuConversion(
GpuConversionOptions{gpuWmma, wmmaTileSizes, gpuType == GpuType::Intel,
kTile, stages, gpuDpasTile}));
pm.addPass(createGpuConversion(GpuConversionOptions{
gpuType == GpuType::Intel, kTile, stages, gpuDpasTile}));

// Lower GPU ops to the chosen GPU backend.
switch (gpuType) {
@@ -212,7 +203,7 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
gpuOptions.triple, gpuOptions.chip, gpuOptions.features}));
break;
}
case GpuType::Intel:
case GpuType::Intel: {
pm.addPass(xegpu::createXeGPUFoldAliasOps());

std::string clientApi = "intel";
@@ -223,6 +214,7 @@

break;
}
}

// Covert all local dialects like perf.
pm.addPass(createLocalDialectsLowering());