add documentation and tests
fbusato committed Jan 7, 2025
1 parent fee650c commit 24b8e28
Showing 7 changed files with 238 additions and 61 deletions.
13 changes: 7 additions & 6 deletions docs/libcudacxx/ptx/instructions.rst
@@ -9,23 +9,24 @@ PTX Instructions
instructions/barrier_cluster
instructions/cp_async_bulk
instructions/cp_async_bulk_commit_group
instructions/cp_async_bulk_wait_group
instructions/cp_async_bulk_tensor
instructions/cp_async_bulk_wait_group
instructions/cp_reduce_async_bulk
instructions/cp_reduce_async_bulk_tensor
instructions/fence
instructions/getctarank
instructions/mapa
instructions/mbarrier_init
instructions/mbarrier_arrive
instructions/mbarrier_expect_tx
instructions/mbarrier_init
instructions/mbarrier_test_wait
instructions/mbarrier_try_wait
instructions/red_async
instructions/shfl_sync
instructions/special_registers
instructions/st_async
instructions/tensormap_replace
instructions/tensormap_cp_fenceproxy
instructions/special_registers
instructions/tensormap_replace


Instructions by section
@@ -232,8 +233,8 @@ Instructions by section
- No
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- Yes, CCCL 2.9.0 / CUDA 12.9
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
- No
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
23 changes: 23 additions & 0 deletions docs/libcudacxx/ptx/instructions/manual/shfl_sync.rst
@@ -0,0 +1,23 @@

shfl.sync
^^^^^^^^^

.. code:: cuda

   // PTX ISA 6.0
   // shfl.sync.mode.b32 d[|p], a, b, c, membermask;
   // .mode = { .up, .down, .bfly, .idx };
   struct shfl_return_values {
       uint32_t data;
       bool     pred;
   };
   [[nodiscard]] __device__ static inline
   shfl_return_values shfl_sync(shfl_mode_t shfl_mode,
                                uint32_t    data,
                                uint32_t    lane_idx_offset,
                                uint32_t    clamp_segmask,
                                uint32_t    lane_mask) noexcept;

- ``shfl_mode`` is one of ``shfl_mode_up``, ``shfl_mode_down``, ``shfl_mode_bfly``, or ``shfl_mode_idx``
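
A minimal usage sketch (illustrative only; the kernel name and launch shape are assumptions, and the kernel is expected to run with a full warp of 32 active threads):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void shfl_sync_example() // e.g. launched as shfl_sync_example<<<1, 32>>>()
   {
       unsigned value = threadIdx.x;
       // Rotate values down by one lane across the whole warp.
       auto [result, valid] = cuda::ptx::shfl_sync(
           cuda::ptx::shfl_mode_down, value, 1 /*offset*/, 0b11111 /*clamp*/, 0xFFFFFFFF /*lane_mask*/);
       // Lanes 0..30 read from lane (laneid + 1) and observe valid == true;
       // lane 31 keeps its own value and observes valid == false.
   }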
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/shfl_sync.rst
@@ -0,0 +1,10 @@

.. _libcudacxx-ptx-instructions-shfl_sync:

shfl.sync
=========

- PTX ISA:
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__

.. include:: manual/shfl_sync.rst
@@ -8,37 +8,70 @@
//
//===----------------------------------------------------------------------===//

#if !defined(_CUDA_PTX_SHFL_SYNC_H)
# define _CUDA_PTX_SHFL_SYNC_H
#ifndef _CUDA_PTX_SHFL_SYNC_H
#define _CUDA_PTX_SHFL_SYNC_H

# include <cuda/std/detail/__config>
#include <cuda/std/detail/__config>

# if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
# endif // no system header
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

# if _CCCL_STD_VER >= 2017
#if _CCCL_STD_VER >= 2017

# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/cstdint>
# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/cstdint>

# include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
# include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

# if __cccl_ptx_isa >= 600
# if __cccl_ptx_isa >= 600

template <dot_shfl_mode _ShuffleMode>
_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
shfl_mode_t<_ShuffleMode> __shfl_mode,
_CUDA_VSTD::uint32_t __lane_idx_offset,
_CUDA_VSTD::uint32_t __clamp_segmask,
_CUDA_VSTD::uint32_t __lane_mask)
{
auto __lane = get_sreg_laneid();
auto __clamp = __clamp_segmask & 0b11111;
auto __segmask = __clamp_segmask >> 8;
auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
auto __j = 0;
if constexpr (__shfl_mode == shfl_mode_idx)
{
auto __min_lane = (__lane & __clamp);
__j = __min_lane | (__lane_idx_offset & ~__segmask);
}
else if constexpr (__shfl_mode == shfl_mode_up)
{
__j = __lane - __lane_idx_offset;
}
else if constexpr (__shfl_mode == shfl_mode_down)
{
__j = __lane + __lane_idx_offset;
}
else
{
__j = __lane ^ __lane_idx_offset;
}
auto __dst = (__shfl_mode == shfl_mode_up) ? (__j >= __max_lane ? __j : __lane) : (__j <= __max_lane ? __j : __lane);
return (1 << __dst);
}
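
// Worked example of the destination-lane computation above: for shfl_mode_down with
// __lane = 5, __lane_idx_offset = 2, and __clamp_segmask = 0b11111 (__clamp = 31,
// __segmask = 0), __max_lane = (5 & 0) | (31 & ~0) = 31 and __j = 5 + 2 = 7;
// since 7 <= 31, the destination lane is 7 and the helper returns 1 << 7.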

struct shfl_return_values
{
_CUDA_VSTD::uint32_t __data;
_CUDA_VSTD::int32_t __pred;
_CUDA_VSTD::uint32_t data;
bool pred;
};
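// In shfl_return_values above, `data` is the 32-bit value read from the selected source
// lane (the calling thread's own `data` when the source lane is out of range), and
// `pred` reports whether the source lane was in range.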

template <dot_shfl_mode _ShuffleMode>
@@ -50,87 +83,63 @@ _CCCL_NODISCARD _CCCL_DEVICE static inline shfl_return_values shfl_sync(
_CUDA_VSTD::uint32_t __lane_mask) noexcept
{
_CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
_CCCL_ASSERT(__clamp_segmask <= 0b111111111111, "clamp value + segmentation mask must be less or equal than 12 bits");
_CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
"clamp value + segmentation mask must be less than or equal to 12 bits");
_CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __lane = get_sreg_laneid();
auto __clamp = __clamp_segmask & 0b11111;
auto __segmask = __clamp_segmask >> 8;
auto __max_lane = (__lane & __segmask) | (__clamp & __segmask);
_CUDA_VSTD::uint32_t __dst = 0;
# endif
_CCCL_ASSERT(__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
"the destination lane must be a member of the lane mask");
_CUDA_VSTD::int32_t __pred;
_CUDA_VSTD::uint32_t __ret;
if constexpr (__shfl_mode == shfl_mode_idx)
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __min_lane = (__lane & __segmask);
auto __j = __min_lane | (__lane_idx_offset & ~__segmask);
__dst = __j <= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
else if constexpr (__shfl_mode == shfl_mode_up)
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __j = __lane - __lane_idx_offset;
__dst = __j >= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
else if constexpr (__shfl_mode == shfl_mode_down)
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __j = __lane + __lane_idx_offset;
__dst = __j <= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
else
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __j = __lane ^ __lane_idx_offset;
__dst = __j <= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
# if defined(_CCCL_ENABLE_DEBUG_MODE)
_CCCL_ASSERT((1 << __dst) & __lane_mask, "the destination lane must be a member of the lane mask");
# endif
return shfl_return_values{__ret, __pred};
return shfl_return_values{__ret, static_cast<bool>(__pred)};
}

# endif // __cccl_ptx_isa >= 600
# endif // __cccl_ptx_isa >= 600

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

# endif // _CCCL_STD_VER >= 2017
#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA_PTX_SHFL_SYNC_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
@@ -83,6 +83,7 @@
#include <cuda/__ptx/instructions/mbarrier_init.h>
#include <cuda/__ptx/instructions/mbarrier_wait.h>
#include <cuda/__ptx/instructions/red_async.h>
#include <cuda/__ptx/instructions/shfl.h>
#include <cuda/__ptx/instructions/st_async.h>
#include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
#include <cuda/__ptx/instructions/tensormap_replace.h>
105 changes: 105 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/manual/shfl_test.h
@@ -0,0 +1,105 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: clang && !nvcc

// <cuda/ptx>

__host__ __device__ void test_shfl_full_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
constexpr unsigned FullMask = 0xFFFFFFFF;
auto data = threadIdx.x;
auto [res1, pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, FullMask);
_CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");

auto [res2, pred2] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, 0 /*clamp*/, FullMask);
if (threadIdx.x <= 1)
{
_CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
}
else
{
_CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
}

auto [res3, pred3] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
if (threadIdx.x >= 30)
{
_CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
}
else
{
_CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
}

auto [res4, pred4] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
_CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}

__host__ __device__ void test_shfl_partial_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
constexpr unsigned PartialMask = 0b1111;
auto data = threadIdx.x;
if (threadIdx.x <= 3)
{
auto [res1,
pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, PartialMask);
_CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
}
#endif // __cccl_ptx_isa >= 600
}

__host__ __device__ void test_shfl_partial_warp()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
constexpr unsigned FullMask = 0xFFFFFFFF;
unsigned max_lane_mask = 16;
unsigned clamp = 0b11111;
unsigned clamp_segmask = (max_lane_mask << 8) | clamp;
auto data = threadIdx.x;
auto [res1, pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, clamp_segmask, FullMask);
if (threadIdx.x < 16)
{
_CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
}
else
{
_CCCL_ASSERT(res1 == 16 + 2 && pred1, "shfl_mode_idx failed");
}

auto [res2,
pred2] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, (max_lane_mask << 8), FullMask);
printf("%u: res2 = %u, pred2 = %d\n", threadIdx.x, res2, pred2);
if (threadIdx.x <= 1 || threadIdx.x == 16 || threadIdx.x == 17)
{
_CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
}
else
{
_CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
}

auto [res3, pred3] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, clamp_segmask, FullMask);
if (threadIdx.x == 14 || threadIdx.x == 15 || threadIdx.x >= 30)
{
_CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
}
else
{
_CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
}

auto [res4, pred4] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, clamp_segmask, FullMask);
_CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}
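
The driver that includes this header is not part of the diff; a minimal sketch of one (the kernel, main function, and include path below are assumptions, not the actual test harness) would launch exactly one warp so that every lane participates in the full-mask shuffles:

#include "shfl_test.h" // assumed relative path of the header above

__global__ void shfl_test_kernel()
{
  test_shfl_full_mask();
  test_shfl_partial_mask();
  test_shfl_partial_warp();
}

int main()
{
  shfl_test_kernel<<<1, 32>>>(); // one block of exactly one warp
  return cudaDeviceSynchronize() != cudaSuccess;
}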