Linalg to XeGPU lowering (#915)
Adds direct lowering from Linalg to XeGPU and extends GPU runner support
with an 'intel' target.
The Intel GPU pipeline is designed with IMEX and IGC compatibility in
mind.

The lowering targets tiled operations and assumes that the input shapes
are evenly divisible by hardware-supported sizes, e.g., 32x32 or 16x16
tiles.
This is the first step toward bridging XeGPU with higher-abstraction
dialects. Common patterns used in this conversion can later be split into
a more progressive lowering through other dialects such as vector and memref.

Supported conversion:
- targets the Vector Compute mode of XeGPU (subgroup-level kernels)
- eltwise operations of any type, split into SIMD-sized computations
- DPAS implementation for F16 matmul with output precision conversion
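
For illustration only (not part of the commit), a minimal sketch of a matmul kernel that fits the stated assumptions: f16 operands with shapes evenly divisible by the default DPAS tile (8x16x16) and K tile (32). The function name and shapes are hypothetical.

// Hypothetical input kernel for the DPAS path; shapes are chosen to be
// divisible by the default DPAS tile (8x16x16) and K tile (32).
func.func @matmul_f16(%A: memref<128x256xf16>, %B: memref<256x128xf16>,
                      %C: memref<128x128xf16>) {
  // Per the commit description, the lowering handles the conversion from
  // the DPAS accumulator precision back to the f16 output type.
  linalg.matmul ins(%A, %B : memref<128x256xf16>, memref<256x128xf16>)
                outs(%C : memref<128x128xf16>)
  return
}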
adam-smnk authored Jun 24, 2024
1 parent f7ea8fd commit 9aab33a
Showing 13 changed files with 1,956 additions and 20 deletions.
20 changes: 19 additions & 1 deletion include/TPP/PassBundles.td
@@ -124,9 +124,27 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"gpu::GPUDialect"];
"xegpu::XeGPUDialect"];
let options = [
Option<"useWmma", "wmma",
"bool", /*default=*/"false",
"Use WMMA operations">,
ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
Option<"isIntel", "intel",
"bool", /*default=*/"false",
"Convert for Intel GPU">,
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
Option<"stages", "stages", "int64_t",
/*default=*/"1",
"Number of cooperative prefetch stages.">,
ListOption<"dpasTile", "dpas-tile", "int64_t",
"DPAS register block sizes MxNxK">,
];
}

def GpuToCuda : Pass<"gpu-to-cuda", "ModuleOp"> {
4 changes: 4 additions & 0 deletions include/TPP/Passes.h
@@ -88,6 +88,10 @@ namespace xsmm {
class XsmmDialect;
} // namespace xsmm

namespace xegpu {
class XeGPUDialect;
} // namespace xegpu

} // namespace mlir

namespace mlir {
25 changes: 25 additions & 0 deletions include/TPP/Passes.td
@@ -479,4 +479,29 @@ def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
];
}

def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
let summary = "Convert linalg dialect to XeGPU dialect.";
let description = [{
Lower linalg ops to XeGPU dialect.
}];
let dependentDialects = ["linalg::LinalgDialect",
"gpu::GPUDialect",
"xegpu::XeGPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"arith::ArithDialect",
"math::MathDialect",
"vector::VectorDialect"];
let options = [
Option<"kTile", "k-tile", "int64_t",
/*default=*/"32",
"GEMM tile size for reduction dimension.">,
Option<"stages", "stages", "int64_t",
/*default=*/"1",
"Number of cooperative prefetch stages.">,
ListOption<"dpasTile", "dpas-tile", "int64_t",
"DPAS register block sizes MxNxK">,
];
}

#endif // TPP_DIALECT_TPP_PASSES
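
The linalg-to-xegpu pass defined above also covers the elementwise path described in the commit message. A minimal sketch, with a hypothetical kernel name and shapes, of an op it could split into SIMD-sized computations:

// Hypothetical elementwise kernel; per the commit description, eltwise
// linalg ops are split into SIMD-sized chunks by the conversion.
func.func @add_f16(%a: memref<64x64xf16>, %b: memref<64x64xf16>,
                   %c: memref<64x64xf16>) {
  linalg.add ins(%a, %b : memref<64x64xf16>, memref<64x64xf16>)
             outs(%c : memref<64x64xf16>)
  return
}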
7 changes: 7 additions & 0 deletions lib/TPP/DefaultPipeline.cpp
@@ -132,6 +132,13 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
if (print == PrintStage::Mid)
pm.addPass(createPrintIRPass());

// Bail out early for Intel GPU.
// The rest of the lowering is performed by IMEX.
if (gpuBackend == "intel") {
pm.addPass(createPrintIRPass());
return;
}

// Partial Lowering
pm.addPass(memref::createExpandStridedMetadataPass());
pm.addPass(createConvertTensorToLinalgPass());
2 changes: 2 additions & 0 deletions lib/TPP/GPU/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_library(TPPGPU
LinalgToGpu.cpp
GpuDataTransfer.cpp
GpuInlineConstants.cpp
LinalgToXeGPU.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/TPP
@@ -22,6 +23,7 @@ add_mlir_library(TPPGPU

LINK_LIBS PUBLIC
MLIRGPUDialect
MLIRXeGPUDialect
MLIRGPUTransforms
MLIRGPUToSPIRV
MLIRSCFToGPU
12 changes: 9 additions & 3 deletions lib/TPP/GPU/GpuConversion.cpp
@@ -15,7 +15,8 @@
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
@@ -58,8 +59,13 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
// First lower linalg using custom patterns then fall back to
// the default lowering for any remaining ops.
pm.addNestedPass<func::FuncOp>(createLinalgDeGeneralize());
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile}));
if (isIntel) {
pm.addNestedPass<func::FuncOp>(
createLinalgToXeGPU(LinalgToXeGPUOptions{kTile, stages, dpasTile}));
} else {
pm.addNestedPass<func::FuncOp>(
createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile, kTile}));
}
pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());

// Map loops into GPU kernels.
68 changes: 61 additions & 7 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -50,6 +50,29 @@ llvm::cl::list<int64_t> wmmaTileSizes(
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuBlockTile("gpu-block-tile", llvm::cl::desc("GPU block tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{128, 128}),
llvm::cl::CommaSeparated);

llvm::cl::list<int64_t>
gpuThreadTile("gpu-thread-tile", llvm::cl::desc("GPU thread tile size"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{32, 32}),
llvm::cl::CommaSeparated);

llvm::cl::opt<int64_t> kTile("k-tile", llvm::cl::desc("GEMM K dim tiling size"),
llvm::cl::init(32));

llvm::cl::opt<int64_t> stages("stages",
llvm::cl::desc("GEMM coop prefetch stages"),
llvm::cl::init(1));

// DPAS size defaults to PVC.
llvm::cl::list<int64_t>
gpuDpasTile("dpas-tile", llvm::cl::desc("DPAS register block sizes MxNxK"),
llvm::cl::list_init<int64_t>(SmallVector<int64_t>{8, 16, 16}),
llvm::cl::CommaSeparated);

namespace mlir {
namespace tpp {
#define GEN_PASS_DEF_GPUPIPELINE
@@ -62,12 +85,14 @@ namespace {
enum class GpuType {
Cuda,
Vulkan,
Intel,
};

GpuType parseGpuOption(StringRef gpuStr) {
auto type = llvm::StringSwitch<std::optional<GpuType>>(gpuStr)
.CaseLower("cuda", GpuType::Cuda)
.CaseLower("vulkan", GpuType::Vulkan)
.CaseLower("intel", GpuType::Intel)
.Default(std::nullopt);
assert(type && "Unsupported GPU backend");

@@ -90,7 +115,8 @@ GpuOptions getGpuOptions(GpuType gpuType) {
options.features = "+ptx60";
break;
}
case GpuType::Vulkan: {
case GpuType::Vulkan:
case GpuType::Intel: {
// No options needed at the moment.
break;
}
@@ -145,22 +171,40 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
// Tile to split the kernel into threads and blocks.
// Use default tiling to handle both packed and unpacked ops.
pm.addPass(createCleanup());
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
if (gpuType == GpuType::Intel) {
// First split computation into grid with blocks of specified size.
TileConsumerAndFuseProducersOptions blockTileOptions;
blockTileOptions.tileSizes = gpuBlockTile;
blockTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(blockTileOptions));

// Then try to further split computation into subtiles.
// This allows to split larger computations across multiple
// threads/workitems. For smaller workloads, it provides another
// chance for outlining.
TileConsumerAndFuseProducersOptions threadTileOptions;
threadTileOptions.tileSizes = gpuThreadTile;
threadTileOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
} else {
TileConsumerAndFuseProducersOptions tilingOptions;
tilingOptions.minTileFactor = 1;
pm.addPass(createTileConsumerAndFuseProducers(tilingOptions));
}
pm.addPass(createCleanup());

// Preprocess and bufferize as further conversion requires memref
// abstraction.
pm.addPass(createLowerPacksAndUnPacks());
bool dealloc = gpuType != GpuType::Cuda;
bool dealloc = gpuType == GpuType::Vulkan;
pm.addPass(createBufferize(BufferizeOptions{dealloc}));
pm.addPass(createConvertForAllToParallelOp());
pm.addPass(createCleanup());

// Convert to generic GPU ops.
pm.addPass(
createGpuConversion(GpuConversionOptions{gpuWmma, wmmaTileSizes}));
pm.addPass(createGpuConversion(
GpuConversionOptions{gpuWmma, wmmaTileSizes, gpuType == GpuType::Intel,
kTile, stages, gpuDpasTile}));

// Lower GPU ops to the chosen GPU backend.
switch (gpuType) {
@@ -177,6 +221,16 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
pm.addPass(createGpuToVulkan());
break;
}
case GpuType::Intel:
pm.addPass(xegpu::createXeGPUFoldAliasOps());

std::string clientApi = "intel";
SetSPIRVCapabilitiesOptions capabilitiesOptions{clientApi};
pm.addPass(tpp::createSetSPIRVCapabilities(capabilitiesOptions));
SetSPIRVAbiAttributeOptions abiAttrOptions{clientApi};
pm.addPass(tpp::createSetSPIRVAbiAttribute(abiAttrOptions));

break;
}

// Covert all local dialects like perf.
