-
Notifications
You must be signed in to change notification settings - Fork 174
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
238 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
|
||
shfl.sync | ||
^^^^^^^^^ | ||
|
||
.. code:: cuda | ||
// PTX ISA 6.0 | ||
// shfl.sync.mode.b32 d[|p], a, b, c, membermask; | ||
// .mode = { .up, .down, .bfly, .idx }; | ||
struct shfl_return_values { | ||
uint32_t data; | ||
bool pred; | ||
}; | ||
[[nodiscard]] __device__ static inline | ||
shfl_return_values shfl_sync(shfl_mode_t shfl_mode, | ||
uint32_t data, | ||
uint32_t lane_idx_offset, | ||
uint32_t clamp_segmask, | ||
uint32_t lane_mask) noexcept; | ||
- ``shfl_mode`` is ``shfl_mode_up`` or ``shfl_mode_down`` or ``shfl_mode_bfly`` or ``shfl_mode_idx`` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
|
||
.. _libcudacxx-ptx-instructions-shfl_sync: | ||
|
||
shfl.sync | ||
========= | ||
|
||
- PTX ISA: | ||
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__ | ||
|
||
.. include:: manual/shfl_sync.rst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
// UNSUPPORTED: clang && !nvcc | ||
|
||
// <cuda/ptx> | ||
|
||
// Exercises cuda::ptx::shfl_sync in all four modes (idx, up, down, bfly) with a
// member mask that names every lane of the warp.
//
// Each thread contributes its own threadIdx.x, so the value it receives back
// identifies the lane it read from. The body is only compiled for device passes
// with PTX ISA >= 6.0; otherwise the function is empty.
//
// NOTE(review): the assertions compare against threadIdx.x directly, which
// presumably means the test is launched with a single 32-thread warp - confirm
// against the launcher.
__host__ __device__ void test_shfl_full_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned FullMask = 0xFFFFFFFF;
  auto data                   = threadIdx.x;

  // idx: every lane reads from lane 2.
  auto [res1, pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, FullMask);
  _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");

  // up: read from the lane two below; lanes 0 and 1 have no such source and are
  // expected to keep their own value with a false predicate.
  auto [res2, pred2] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, 0 /*clamp*/, FullMask);
  if (threadIdx.x <= 1)
  {
    _CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
  }
  else
  {
    _CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
  }

  // down: read from the lane two above; lanes 30 and 31 have no such source and
  // are expected to keep their own value with a false predicate.
  auto [res3, pred3] =
    cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
  if (threadIdx.x >= 30)
  {
    _CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
  }
  else
  {
    _CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
  }

  // bfly: read from the lane whose index is this lane's index XOR 2.
  auto [res4, pred4] =
    cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
  // The XOR must be parenthesized: `==` binds tighter than `^`, so without the
  // parentheses the boolean comparison result would be XOR-ed with 2 (always
  // non-zero) and the received value would never actually be checked.
  _CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600 && __CUDA_ARCH__
}
|
||
// Checks shfl_mode_idx when the member mask names only lanes 0-3 and only the
// threads with threadIdx.x in [0, 3] execute the shuffle.
//
// The body is only compiled for device passes with PTX ISA >= 6.0; otherwise
// the function is empty.
__host__ __device__ void test_shfl_partial_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  // Threads that are not named in the member mask must not take part.
  if (threadIdx.x > 3)
  {
    return;
  }

  constexpr unsigned PartialMask = 0b1111;
  auto lane_value                = threadIdx.x;

  // Every participating thread reads the value contributed by lane 2.
  auto [received, in_range] =
    cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, lane_value, 2 /*idx*/, 0b11111 /*clamp*/, PartialMask);
  _CCCL_ASSERT(received == 2 && in_range, "shfl_mode_idx failed");
#endif // __cccl_ptx_isa >= 600 && __CUDA_ARCH__
}
|
||
// Exercises cuda::ptx::shfl_sync in all four modes with a full member mask but
// a non-zero segment mask, so the assertions expect the warp to behave as two
// independent 16-lane halves (lanes 0-15 and lanes 16-31).
//
// The fourth shfl_sync operand is built as (segment mask << 8) | clamp.
// The body is only compiled for device passes with PTX ISA >= 6.0; otherwise
// the function is empty.
//
// NOTE(review): the assertions compare against threadIdx.x directly, which
// presumably means the test is launched with a single 32-thread warp - confirm
// against the launcher.
__host__ __device__ void test_shfl_partial_warp()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned FullMask = 0xFFFFFFFF;
  unsigned max_lane_mask      = 16;
  unsigned clamp              = 0b11111;
  unsigned clamp_segmask      = (max_lane_mask << 8) | clamp;
  auto data                   = threadIdx.x;

  // idx: index 2 is resolved inside each half, i.e. lane 2 for the lower half
  // and lane 16 + 2 for the upper half.
  auto [res1, pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, clamp_segmask, FullMask);
  if (threadIdx.x < 16)
  {
    _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
  }
  else
  {
    _CCCL_ASSERT(res1 == 16 + 2 && pred1, "shfl_mode_idx failed");
  }

  // up: uses the segment mask with a zero clamp. The first two lanes of each
  // half have no source two lanes below and are expected to keep their own
  // value with a false predicate.
  auto [res2, pred2] =
    cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, (max_lane_mask << 8), FullMask);
  if (threadIdx.x <= 1 || threadIdx.x == 16 || threadIdx.x == 17)
  {
    _CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
  }
  else
  {
    _CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
  }

  // down: the last two lanes of each half have no source two lanes above and
  // are expected to keep their own value with a false predicate.
  auto [res3, pred3] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, clamp_segmask, FullMask);
  if (threadIdx.x == 14 || threadIdx.x == 15 || threadIdx.x >= 30)
  {
    _CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
  }
  else
  {
    _CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
  }

  // bfly: XOR with 2 never leaves the 16-lane half, so every lane has a source.
  auto [res4, pred4] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, clamp_segmask, FullMask);
  // The XOR must be parenthesized: `==` binds tighter than `^`, so without the
  // parentheses the boolean comparison result would be XOR-ed with 2 (always
  // non-zero) and the received value would never actually be checked.
  _CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600 && __CUDA_ARCH__
}
Oops, something went wrong.