Adds a GPU kernel vectorization pass and extends the CUDA lowering pass to support vector operations. The new GPU-specific vectorization pass guides the upstream Linalg vectorizer to process operations that are either inside a GPU kernel or prepared for outlining. The CUDA-specific pass is extended to allow lowering of vector ops within a GPU kernel. Vectorization is disabled within the GPU pipeline for now due to the lack of vector operation unrolling: when vector sizes exceed hardware-supported lengths, the pipeline gets stuck at the GPU binary compilation step. This will be addressed by a separate transformation pass in the future.
Showing 7 changed files with 392 additions and 7 deletions.
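In practice, the new pass simply drives the upstream Linalg vectorizer over statically sized linalg ops that either live inside a gpu.launch region or sit under an scf.forall loop that is about to be outlined into a kernel. Below is a minimal sketch of the kind of input the pass targets; the IR is illustrative only (function name, value names, and tile sizes are assumptions, not taken from this commit), and after vectorization the loop body corresponds to the new CUDA integration test at the bottom of the diff.

func.func @tiled_matmul(%A: tensor<8x8xf32>, %B: tensor<8x8xf32>, %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
  %0 = scf.forall (%i, %j) = (0, 0) to (8, 8) step (4, 4) shared_outs(%out = %C) -> (tensor<8x8xf32>) {
    %a = tensor.extract_slice %A[%i, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
    %b = tensor.extract_slice %B[0, %j] [8, 4] [1, 1] : tensor<8x8xf32> to tensor<8x4xf32>
    %c = tensor.extract_slice %out[%i, %j] [4, 4] [1, 1] : tensor<8x8xf32> to tensor<4x4xf32>
    // A statically shaped, tensor-semantics op inside a parallel loop:
    // the GpuVectorizeLinalg pattern hands it to the upstream Linalg vectorizer.
    %mm = linalg.matmul ins(%a, %b : tensor<4x8xf32>, tensor<8x4xf32>)
                        outs(%c : tensor<4x4xf32>) -> tensor<4x4xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %mm into %out[%i, %j] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32>
    }
  }
  return %0 : tensor<8x8xf32>
}

After the pass runs, the matmul body is rewritten into vector.transfer_read / vector.contract / vector.transfer_write ops, exactly the shape of IR exercised by the test file below.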
@@ -0,0 +1,116 @@
//===- GpuVectorize.cpp ------------------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "TPP/Passes.h"

#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/TransformOps/Utils.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

namespace mlir {
namespace tpp {
#define GEN_PASS_DEF_GPUVECTORIZE
#include "TPP/Passes.h.inc"
} // namespace tpp
} // namespace mlir

namespace {

// Vectorize ops within GPU kernel.
struct VectorizeGpuLaunch : public OpRewritePattern<gpu::LaunchOp> {
  using OpRewritePattern<gpu::LaunchOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::LaunchOp launchOp,
                                PatternRewriter &rewriter) const override {
    // Vectorize all linalg ops within GPU kernel.
    // It is expected that the ops operate on statically sized tiles.
    auto walkResult = launchOp->walk([&](linalg::LinalgOp linalgOp) {
      if (linalgOp.hasDynamicShape())
        return WalkResult::interrupt();

      if (failed(vectorize(rewriter, linalgOp, /*inputVectorSizes=*/{},
                           /*scalableVecDims=*/{})))
        return WalkResult::interrupt();
      return WalkResult::advance();
    });

    if (walkResult.wasInterrupted())
      return rewriter.notifyMatchFailure(
          launchOp, "Failed to vectorize ops within GPU launch");

    return success();
  }
};

// Vectorize linalg ops targeting GPU.
struct GpuVectorizeLinalg : public OpInterfaceRewritePattern<linalg::LinalgOp> {
  using OpInterfaceRewritePattern<linalg::LinalgOp>::OpInterfaceRewritePattern;

  LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp,
                                PatternRewriter &rewriter) const override {
    // Vectorize all Linalg ops within parallelized loops.
    if (!linalgOp.hasPureTensorSemantics())
      return rewriter.notifyMatchFailure(linalgOp, "Expects tensor semantics");

    if (linalgOp.hasDynamicShape())
      return rewriter.notifyMatchFailure(linalgOp,
                                         "Expects static shapes only");

    // Only process operations within parallelized loops.
    // TODO: Use some different mechanism like annotations to determine which
    // ops target GPU.
    if (!linalgOp->getParentOfType<scf::ForallOp>())
      return rewriter.notifyMatchFailure(linalgOp,
                                         "Expects parallel loop parent");

    return vectorize(rewriter, linalgOp, /*inputVectorSizes=*/{},
                     /*scalableVecDims=*/{});
  }
};

// Vectorize operations targeting GPU.
struct GpuVectorize : public tpp::impl::GpuVectorizeBase<GpuVectorize> {
  using GpuVectorizeBase::GpuVectorizeBase;

  void runOnOperation() override {
    MLIRContext *ctx = getOperation().getContext();
    RewritePatternSet patterns(ctx);

    // Vectorize core computation ops within kernel launch.
    patterns.add<VectorizeGpuLaunch, GpuVectorizeLinalg>(ctx);

    // Vector postprocessing patterns.
    vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
    vector::populateVectorReductionToContractPatterns(patterns);
    vector::populateSinkVectorOpsPatterns(patterns);
    vector::TransferReadOp::getCanonicalizationPatterns(patterns, ctx);
    vector::TransferWriteOp::getCanonicalizationPatterns(patterns, ctx);

    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
  }
};

} // namespace
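The pass thus has two entry points: GpuVectorizeLinalg handles statically shaped, tensor-semantics linalg ops under an scf.forall (before kernel outlining), while VectorizeGpuLaunch walks an already-formed gpu.launch region. A minimal sketch of the second case follows; it assumes the pass is exposed to tpp-opt under a gpu-vectorize flag (the TableGen registration is not part of this excerpt), so both the flag and the IR are illustrative rather than taken from the commit.

// Hypothetical standalone invocation (flag name assumed from GEN_PASS_DEF_GPUVECTORIZE):
//   tpp-opt input.mlir -gpu-vectorize
module {
  func.func @kernel(%A: memref<4x8xf32>, %B: memref<8x4xf32>, %C: memref<4x4xf32>) {
    %c1 = arith.constant 1 : index
    gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
               threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
      // VectorizeGpuLaunch vectorizes every statically shaped linalg op found
      // inside the launch region; a single dynamic shape aborts the whole match.
      linalg.matmul ins(%A, %B : memref<4x8xf32>, memref<8x4xf32>)
                    outs(%C : memref<4x4xf32>)
      gpu.terminator
    }
    return
  }
}

As the commit message notes, this is not yet wired into the default GPU pipeline: vectors wider than the hardware supports still need an unrolling step before GPU binary compilation.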
@@ -0,0 +1,28 @@
// RUN: ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0:${ASAN_OPTIONS} \
// RUN: tpp-run %s -gpu=cuda -print \
// RUN:  -entry-point-result=void -e entry 2>&1 | \
// RUN: FileCheck %s

#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @entry(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<8x8xf32>) -> tensor<8x8xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = scf.forall (%arg3, %arg4) = (0, 0) to (8, 8) step (4, 4) shared_outs(%arg5 = %arg2) -> (tensor<8x8xf32>) {
    %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
    %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg4] [8, 4] [1, 1] : tensor<8x8xf32> to tensor<8x4xf32>
    %extracted_slice_1 = tensor.extract_slice %arg5[%arg3, %arg4] [4, 4] [1, 1] : tensor<8x8xf32> to tensor<4x4xf32>
    %1 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x8xf32>, vector<4x8xf32>
    %2 = vector.transfer_read %extracted_slice_0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<8x4xf32>, vector<8x4xf32>
    %3 = vector.transfer_read %extracted_slice_1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
    %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<4x8xf32>, vector<8x4xf32> into vector<4x4xf32>
    %5 = vector.transfer_write %4, %extracted_slice_1[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %5 into %arg5[%arg3, %arg4] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32>
    }
  }
  return %0 : tensor<8x8xf32>
}

// CHECK-COUNT-8: 9, 9, 9, 9, 9, 9, 9, 9
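The expected output is a quick sanity check on the lowered vector ops: assuming tpp-run's default initialization fills the three tensor arguments with ones (an assumption about the runner's defaults, not stated in this diff), each element of the 8x8 result is 8 * 1 * 1 + 1 = 9, so printing the result yields eight rows of nines, matching the CHECK-COUNT-8 line above.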