add documentation and tests
fbusato committed Jan 7, 2025
1 parent fee650c commit 24b8e28
Showing 7 changed files with 238 additions and 61 deletions.
13 changes: 7 additions & 6 deletions docs/libcudacxx/ptx/instructions.rst
@@ -9,23 +9,24 @@ PTX Instructions
instructions/barrier_cluster
instructions/cp_async_bulk
instructions/cp_async_bulk_commit_group
instructions/cp_async_bulk_wait_group
instructions/cp_async_bulk_tensor
instructions/cp_async_bulk_wait_group
instructions/cp_reduce_async_bulk
instructions/cp_reduce_async_bulk_tensor
instructions/fence
instructions/getctarank
instructions/mapa
instructions/mbarrier_init
instructions/mbarrier_arrive
instructions/mbarrier_expect_tx
instructions/mbarrier_init
instructions/mbarrier_test_wait
instructions/mbarrier_try_wait
instructions/red_async
instructions/shfl_sync
instructions/special_registers
instructions/st_async
instructions/tensormap_replace
instructions/tensormap_cp_fenceproxy
instructions/special_registers
instructions/tensormap_replace


Instructions by section
@@ -232,8 +233,8 @@ Instructions by section
- No
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- Yes, CCCL 2.9.0 / CUDA 12.9
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
- No
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
23 changes: 23 additions & 0 deletions docs/libcudacxx/ptx/instructions/manual/shfl_sync.rst
@@ -0,0 +1,23 @@

shfl.sync
^^^^^^^^^

.. code:: cuda

   // PTX ISA 6.0
   // shfl.sync.mode.b32 d[|p], a, b, c, membermask;
   // .mode = { .up, .down, .bfly, .idx };
   struct shfl_return_values {
       uint32_t data;
       bool     pred;
   };
   [[nodiscard]] __device__ static inline
   shfl_return_values shfl_sync(shfl_mode_t shfl_mode,
                                uint32_t    data,
                                uint32_t    lane_idx_offset,
                                uint32_t    clamp_segmask,
                                uint32_t    lane_mask) noexcept;

- ``shfl_mode`` is one of ``shfl_mode_up``, ``shfl_mode_down``, ``shfl_mode_bfly``, or ``shfl_mode_idx``
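
A minimal usage sketch (illustrative only; the kernel name and launch shape are assumptions, and the kernel is expected to run with a full warp of 32 active threads):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void shfl_sync_example() // e.g. launched as shfl_sync_example<<<1, 32>>>()
   {
       unsigned value = threadIdx.x;
       // Rotate values down by one lane across the whole warp.
       auto [result, valid] = cuda::ptx::shfl_sync(
           cuda::ptx::shfl_mode_down, value, 1 /*offset*/, 0b11111 /*clamp*/, 0xFFFFFFFF /*lane_mask*/);
       // Lanes 0..30 read from lane (laneid + 1) and observe valid == true;
       // lane 31 keeps its own value and observes valid == false.
   }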
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/shfl_sync.rst
@@ -0,0 +1,10 @@

.. _libcudacxx-ptx-instructions-shfl_sync:

shfl.sync
=========

- PTX ISA:
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__

.. include:: manual/shfl_sync.rst
@@ -8,37 +8,70 @@
//
//===----------------------------------------------------------------------===//

#if !defined(_CUDA_PTX_SHFL_SYNC_H)
# define _CUDA_PTX_SHFL_SYNC_H
#ifndef _CUDA_PTX_SHFL_SYNC_H
#define _CUDA_PTX_SHFL_SYNC_H

# include <cuda/std/detail/__config>
#include <cuda/std/detail/__config>

# if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
# endif // no system header
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

# if _CCCL_STD_VER >= 2017
#if _CCCL_STD_VER >= 2017

# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/cstdint>
# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/cstdint>

# include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
# include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

# if __cccl_ptx_isa >= 600
# if __cccl_ptx_isa >= 600

template <dot_shfl_mode _ShuffleMode>
_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
shfl_mode_t<_ShuffleMode> __shfl_mode,
_CUDA_VSTD::uint32_t __lane_idx_offset,
_CUDA_VSTD::uint32_t __clamp_segmask,
_CUDA_VSTD::uint32_t __lane_mask)
{
auto __lane = get_sreg_laneid();
auto __clamp = __clamp_segmask & 0b11111;
auto __segmask = __clamp_segmask >> 8;
auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
auto __j = 0;
if constexpr (__shfl_mode == shfl_mode_idx)
{
auto __min_lane = (__lane & __clamp);
__j = __min_lane | (__lane_idx_offset & ~__segmask);
}
else if constexpr (__shfl_mode == shfl_mode_up)
{
__j = __lane - __lane_idx_offset;
}
else if constexpr (__shfl_mode == shfl_mode_down)
{
__j = __lane + __lane_idx_offset;
}
else
{
__j = __lane ^ __lane_idx_offset;
}
auto __dst = (__shfl_mode == shfl_mode_up) ? (__j >= __max_lane ? __j : __lane) : (__j <= __max_lane ? __j : __lane);
return (1 << __dst);
}
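
// Worked example of the destination-lane computation above: for shfl_mode_down with
// __lane = 5, __lane_idx_offset = 2, and __clamp_segmask = 0b11111 (__clamp = 31,
// __segmask = 0), __max_lane = (5 & 0) | (31 & ~0) = 31 and __j = 5 + 2 = 7;
// since 7 <= 31, the destination lane is 7 and the helper returns 1 << 7.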

struct shfl_return_values
{
_CUDA_VSTD::uint32_t __data;
_CUDA_VSTD::int32_t __pred;
_CUDA_VSTD::uint32_t data;
bool pred;
};
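// In shfl_return_values above, `data` is the 32-bit value read from the selected source
// lane (the calling thread's own `data` when the source lane is out of range), and
// `pred` reports whether the source lane was in range.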

template <dot_shfl_mode _ShuffleMode>
@@ -50,87 +83,63 @@ _CCCL_NODISCARD _CCCL_DEVICE static inline shfl_return_values shfl_sync(
_CUDA_VSTD::uint32_t __lane_mask) noexcept
{
_CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
_CCCL_ASSERT(__clamp_segmask <= 0b111111111111, "clamp value + segmentation mask must be less or equal than 12 bits");
_CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
"clamp value + segmentation mask must be less than or equal to 12 bits");
_CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __lane = get_sreg_laneid();
auto __clamp = __clamp_segmask & 0b11111;
auto __segmask = __clamp_segmask >> 8;
auto __max_lane = (__lane & __segmask) | (__clamp & __segmask);
_CUDA_VSTD::uint32_t __dst = 0;
# endif
_CCCL_ASSERT(__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
"the destination lane must be a member of the lane mask");
_CUDA_VSTD::int32_t __pred;
_CUDA_VSTD::uint32_t __ret;
if constexpr (__shfl_mode == shfl_mode_idx)
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __min_lane = (__lane & __segmask);
auto __j = __min_lane | (__lane_idx_offset & ~__segmask);
__dst = __j <= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
else if constexpr (__shfl_mode == shfl_mode_up)
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __j = __lane - __lane_idx_offset;
__dst = __j >= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
else if constexpr (__shfl_mode == shfl_mode_down)
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __j = __lane + __lane_idx_offset;
__dst = __j <= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
else
{
# if defined(_CCCL_ENABLE_DEBUG_MODE)
auto __j = __lane ^ __lane_idx_offset;
__dst = __j <= __max_lane ? __j : __lane;
# endif
asm volatile(
"{ \n\t\t"
".reg .pred p; \n\t\t"
"shfl_sync.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"shfl_sync.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
"selp.s32 %1, 1, 0, p; \n\t"
"}"
: "=r"(__ret), "=r"(__pred)
: "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
}
# if defined(_CCCL_ENABLE_DEBUG_MODE)
_CCCL_ASSERT((1 << __dst) & __lane_mask, "the destination lane must be a member of the lane mask");
# endif
return shfl_return_values{__ret, __pred};
return shfl_return_values{__ret, static_cast<bool>(__pred)};
}

# endif // __cccl_ptx_isa >= 600
# endif // __cccl_ptx_isa >= 600

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

# endif // _CCCL_STD_VER >= 2017
#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA_PTX_SHFL_SYNC_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
@@ -83,6 +83,7 @@
#include <cuda/__ptx/instructions/mbarrier_init.h>
#include <cuda/__ptx/instructions/mbarrier_wait.h>
#include <cuda/__ptx/instructions/red_async.h>
#include <cuda/__ptx/instructions/shfl.h>
#include <cuda/__ptx/instructions/st_async.h>
#include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
#include <cuda/__ptx/instructions/tensormap_replace.h>
105 changes: 105 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/manual/shfl_test.h
@@ -0,0 +1,105 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: clang && !nvcc

// <cuda/ptx>

__host__ __device__ void test_shfl_full_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
constexpr unsigned FullMask = 0xFFFFFFFF;
auto data = threadIdx.x;
auto [res1, pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, FullMask);
_CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");

auto [res2, pred2] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, 0 /*clamp*/, FullMask);
if (threadIdx.x <= 1)
{
_CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
}
else
{
_CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
}

auto [res3, pred3] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
if (threadIdx.x >= 30)
{
_CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
}
else
{
_CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
}

auto [res4, pred4] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
_CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}

__host__ __device__ void test_shfl_partial_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
constexpr unsigned PartialMask = 0b1111;
auto data = threadIdx.x;
if (threadIdx.x <= 3)
{
auto [res1,
pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, PartialMask);
_CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
}
#endif // __cccl_ptx_isa >= 600
}

__host__ __device__ void test_shfl_partial_warp()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
constexpr unsigned FullMask = 0xFFFFFFFF;
unsigned max_lane_mask = 16;
unsigned clamp = 0b11111;
unsigned clamp_segmask = (max_lane_mask << 8) | clamp;
auto data = threadIdx.x;
auto [res1, pred1] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, clamp_segmask, FullMask);
if (threadIdx.x < 16)
{
_CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
}
else
{
_CCCL_ASSERT(res1 == 16 + 2 && pred1, "shfl_mode_idx failed");
}

auto [res2,
pred2] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, (max_lane_mask << 8), FullMask);
printf("%u: res2 = %u, pred2 = %d\n", threadIdx.x, res2, pred2);
if (threadIdx.x <= 1 || threadIdx.x == 16 || threadIdx.x == 17)
{
_CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
}
else
{
_CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
}

auto [res3, pred3] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, clamp_segmask, FullMask);
if (threadIdx.x == 14 || threadIdx.x == 15 || threadIdx.x >= 30)
{
_CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
}
else
{
_CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
}

auto [res4, pred4] = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, clamp_segmask, FullMask);
_CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}
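
The driver that includes this header is not part of the diff; a minimal sketch of one (the kernel, main function, and include path below are assumptions, not the actual test harness) would launch exactly one warp so that every lane participates in the full-mask shuffles:

#include "shfl_test.h" // assumed relative path of the header above

__global__ void shfl_test_kernel()
{
  test_shfl_full_mask();
  test_shfl_partial_mask();
  test_shfl_partial_warp();
}

int main()
{
  shfl_test_kernel<<<1, 32>>>(); // one block of exactly one warp
  return cudaDeviceSynchronize() != cudaSuccess;
}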