Linalg to XeGPU lowering (#915)
Adds direct lowering from Linalg to XeGPU and extends GPU runner support
with an 'intel' target.
The Intel GPU pipeline is designed with IMEX and IGC compatibility in
mind.

The lowering targets tiled operations and assumes that the input shapes
are evenly divisible by hardware-supported sizes, e.g., 32x32 or 16x16
tiles.
This is the first step toward bridging XeGPU with higher-abstraction
dialects. Common patterns used in this conversion can later be split into
a more progressive lowering through other dialects such as vector and memref.

Supported conversion:
- targets the Vector Compute mode of XeGPU (subgroup-level kernels)
- eltwise operations of any type, split into SIMD-sized computations
- DPAS implementation for F16 matmul with output precision conversion
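
For illustration only (not part of the commit), a minimal sketch of a matmul kernel that fits the stated assumptions: f16 operands with shapes evenly divisible by the default DPAS tile (8x16x16) and K tile (32). The function name and shapes are hypothetical.

// Hypothetical input kernel for the DPAS path; shapes are chosen to be
// divisible by the default DPAS tile (8x16x16) and K tile (32).
func.func @matmul_f16(%A: memref<128x256xf16>, %B: memref<256x128xf16>,
                      %C: memref<128x128xf16>) {
  // Per the commit description, the lowering handles the conversion from
  // the DPAS accumulator precision back to the f16 output type.
  linalg.matmul ins(%A, %B : memref<128x256xf16>, memref<256x128xf16>)
                outs(%C : memref<128x128xf16>)
  return
}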
adam-smnk authored Jun 24, 2024
1 parent f7ea8fd commit 9aab33a
Showing 13 changed files with 1,956 additions and 20 deletions.
20 changes: 19 additions & 1 deletion include/TPP/PassBundles.td
@@ -124,9 +124,27 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"gpu::GPUDialect"];
"xegpu::XeGPUDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"isIntel", "intel",
"bool", /*default=*/"false",
"Convert for Intel GPU">,
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
Option<"stages", "stages", "int64_t",
/*default=*/"1",
"Number of cooperative prefetch stages.">,
ListOption<"dpasTile", "dpas-tile", "int64_t",
"DPAS register block sizes MxNxK">,
];
}

def GpuToCuda : Pass<"gpu-to-cuda", "ModuleOp"> {
4 changes: 4 additions & 0 deletions include/TPP/Passes.h
@@ -88,6 +88,10 @@ namespace xsmm {
class XsmmDialect;
} // namespace xsmm

namespace xegpu {
class XeGPUDialect;
} // namespace xegpu

} // namespace mlir

namespace mlir {
25 changes: 25 additions & 0 deletions include/TPP/Passes.td
@@ -479,4 +479,29 @@ def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
];
}

def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
let summary = "Convert linalg dialect to XeGPU dialect.";
let description = [{
Lower linalg ops to XeGPU dialect.
}];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"xegpu::XeGPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"arith::ArithDialect",
"math::MathDialect",
"vector::VectorDialect"];
let options = [
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
Option<"stages", "stages", "int64_t",
/*default=*/"1",
"Number of cooperative prefetch stages.">,
ListOption<"dpasTile", "dpas-tile", "int64_t",
"DPAS register block sizes MxNxK">,
];
}

#endif // TPP_DIALECT_TPP_PASSES
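
The linalg-to-xegpu pass defined above also covers the elementwise path described in the commit message. A minimal sketch, with a hypothetical kernel name and shapes, of an op it could split into SIMD-sized computations:

// Hypothetical elementwise kernel; per the commit description, eltwise
// linalg ops are split into SIMD-sized chunks by the conversion.
func.func @add_f16(%a: memref<64x64xf16>, %b: memref<64x64xf16>,
                   %c: memref<64x64xf16>) {
  linalg.add ins(%a, %b : memref<64x64xf16>, memref<64x64xf16>)
             outs(%c : memref<64x64xf16>)
  return
}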
7 changes: 7 additions & 0 deletions lib/TPP/DefaultPipeline.cpp
@@ -132,6 +132,13 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
if (print == PrintStage::Mid)
pm.addPass(createPrintIRPass());

// Bail out early for Intel GPU.
// The rest of the lowering is performed by IMEX.
if (gpuBackend == "intel") {
pm.addPass(createPrintIRPass());
return;
}

// Partial Lowering
pm.addPass(memref::createExpandStridedMetadataPass());
pm.addPass(createConvertTensorToLinalgPass());
2 changes: 2 additions & 0 deletions lib/TPP/GPU/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_library(TPPGPU
LinalgToGpu.cpp
GpuDataTransfer.cpp
GpuInlineConstants.cpp
LinalgToXeGPU.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/TPP
@@ -22,6 +23,7 @@ add_mlir_library(TPPGPU

LINK_LIBS PUBLIC
MLIRGPUDialect
MLIRXeGPUDialect
MLIRGPUTransforms
MLIRGPUToSPIRV
MLIRSCFToGPU
12 changes: 9 additions & 3 deletions lib/TPP/GPU/GpuConversion.cpp
@@ -15,7 +15,8 @@
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
@@ -58,8 +59,13 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
// First lower linalg using custom patterns then fall back to
// the default lowering for any remaining ops.
pm.addNestedPass<func::FuncOp>(createLinalgDeGeneralize());
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile}));
if (isIntel) {
pm.addNestedPass<func::FuncOp>(
createLinalgToXeGPU(LinalgToXeGPUOptions{kTile, stages, dpasTile}));
} else {
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile, kTile}));
}
pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());

// Map loops into GPU kernels.
68 changes: 61 additions & 7 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -50,6 +50,29 @@ llvm::cl::list<int64_t> wmmaTileSizes(
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuBlockTile("gpu-block-tile", llvm::cl::desc("GPU block tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{128, 128}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuThreadTile("gpu-thread-tile", llvm::cl::desc("GPU thread tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{32, 32}),
llvm::cl::CommaSeparated);

llvm::cl::opt<int64_t> kTile("k-tile", llvm::cl::desc("GEMM K dim tiling size"),
llvm::cl::init(32));

llvm::cl::opt<int64_t> stages("stages",
llvm::cl::desc("GEMM coop prefetch stages"),
llvm::cl::init(1));

// DPAS size defaults to PVC.
llvm::cl::list<int64_t>
gpuDpasTile("dpas-tile", llvm::cl::desc("DPAS register block sizes MxNxK"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{8, 16, 16}),
llvm::cl::CommaSeparated);

namespace mlir {
namespace tpp {
#define GEN_PASS_DEF_GPUPIPELINE
@@ -62,12 +85,14 @@ namespace {
enum class GpuType {
Cuda,
Vulkan,
Intel,
};

GpuType parseGpuOption(StringRef gpuStr) {
auto type = llvm::StringSwitch<std::optional<GpuType>>(gpuStr)
.CaseLower("cuda", GpuType::Cuda)
.CaseLower("vulkan", GpuType::Vulkan)
.CaseLower("intel", GpuType::Intel)
.Default(std::nullopt);
assert(type && "Unsupported GPU backend");

@@ -90,7 +115,8 @@ GpuOptions getGpuOptions(GpuType gpuType) {
options.features = "+ptx60";
break;
}
case GpuType::Vulkan: {
case GpuType::Vulkan:
case GpuType::Intel: {
// No options needed at the moment.
break;
}
@@ -145,22 +171,40 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
// Tile to split the kernel into threads and blocks.
// Use default tiling to handle both packed and unpacked ops.
pm.addPass(createCleanup());
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
if (gpuType == GpuType::Intel) {
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
blockTileOptions.tileSizes = gpuBlockTile;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
threadTileOptions.tileSizes = gpuThreadTile;
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
} else {
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
}
pm.addPass(createCleanup());

// Preprocess and bufferize as further conversion requires memref
// abstraction.
pm.addPass(createLowerPacksAndUnPacks());
bool dealloc = gpuType != GpuType::Cuda;
bool dealloc = gpuType == GpuType::Vulkan;
pm.addPass(createBufferize(BufferizeOptions{dealloc}));
pm.addPass(createConvertForAllToParallelOp());
pm.addPass(createCleanup());

// Convert to generic GPU ops.
pm.addPass(
createGpuConversion(GpuConversionOptions{gpuWmma, wmmaTileSizes}));
pm.addPass(createGpuConversion(
GpuConversionOptions{gpuWmma, wmmaTileSizes, gpuType == GpuType::Intel,
kTile, stages, gpuDpasTile}));

// Lower GPU ops to the chosen GPU backend.
switch (gpuType) {
@@ -177,6 +221,16 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
pm.addPass(createGpuToVulkan());
break;
}
case GpuType::Intel:
pm.addPass(xegpu::createXeGPUFoldAliasOps());

std::string clientApi = "intel";
SetSPIRVCapabilitiesOptions capabilitiesOptions{clientApi};
pm.addPass(tpp::createSetSPIRVCapabilities(capabilitiesOptions));
SetSPIRVAbiAttributeOptions abiAttrOptions{clientApi};
pm.addPass(tpp::createSetSPIRVAbiAttribute(abiAttrOptions));

break;
}

// Covert all local dialects like perf.
