Skip to content

Commit

Permalink
switches to analytical verification
Browse files Browse the repository at this point in the history
  • Loading branch information
elstehle committed Jan 5, 2025
1 parent 005e539 commit b2d4555
Show file tree
Hide file tree
Showing 2 changed files with 248 additions and 96 deletions.
190 changes: 190 additions & 0 deletions cub/test/catch2_segmented_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

#include <cuda/std/limits>
#include <cuda/std/tuple>
Expand Down Expand Up @@ -71,6 +72,195 @@ MAKE_SEED_MOD_FUNCTION(offset_eraser, 0x3333333333333333)

#undef MAKE_SEED_MOD_FUNCTION

// Maps a segment index to that segment's begin-offset: the first `num_empty_segments` segments are
// empty, the remaining segments are all `segment_size` items wide, and indexes at or beyond
// `num_segments` map to `num_items` (one-past-the-end), so that index i+1 always yields segment i's
// end-offset.
template <typename OffsetT, typename SegmentIndexT>
struct segment_index_to_offset_op
{
  SegmentIndexT num_empty_segments;
  SegmentIndexT num_segments;
  OffsetT segment_size;
  OffsetT num_items;

  _CCCL_HOST_DEVICE __forceinline__ OffsetT operator()(SegmentIndexT segment_index)
  {
    // Indexes at or past the last segment clamp to the end of the data
    if (segment_index >= num_segments)
    {
      return num_items;
    }
    // Empty segments all begin (and end) at offset zero; non-empty segments are equi-sized
    return (segment_index < num_empty_segments) ? OffsetT{0} : segment_size * (segment_index - num_empty_segments);
  }
};

// Functor that maps an index to `index % mod`, cast to T. Used to generate the cyclically
// repeating key series [0, 1, ..., mod - 1, 0, 1, ...].
template <typename T>
struct mod_n
{
  std::size_t mod;

  template <typename IndexT>
  _CCCL_HOST_DEVICE __forceinline__ T operator()(IndexT index)
  {
    const auto remainder = index % mod;
    return static_cast<T>(remainder);
  }
};

// Verification helper for sorts over keys of a small type (e.g., a byte-sized key type). It computes
// a histogram over the input keys and later verifies that (a) the sorted output consists of at most
// one run of equal keys per distinct key value and (b) each run's length matches that key's input
// histogram count. Since equal keys are interchangeable, this verifies the sort without materializing
// a full reference result.
// NOTE(review): the histogram is indexed by the key's value directly, so this assumes key_t is an
// unsigned integral type — confirm before instantiating with signed key types.
template <typename KeyT>
class short_key_verification_helper
{
private:
  using key_t = KeyT;
  // Number of distinct values representable by key_t; also the histogram size
  static constexpr auto max_histo_size = 1ULL << (8 * sizeof(key_t));

  // Histogram over the input keys, filled by prepare_verification_data and consumed by verify_sorted
  c2h::host_vector<std::size_t> keys_histogram{};

public:
  void prepare_verification_data(const c2h::device_vector<key_t>& in_keys)
  {
    c2h::host_vector<key_t> h_in{in_keys};
    keys_histogram = c2h::host_vector<std::size_t>(max_histo_size, 0);
    for (const auto& key : h_in)
    {
      keys_histogram[key]++;
    }
  }

  void verify_sorted(const c2h::device_vector<key_t>& out_keys) const
  {
    // Verify keys are clustered: a sorted output has at most one run of equal keys per distinct value
    const auto count = static_cast<std::size_t>(
      thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to<key_t>()));
    REQUIRE(count <= max_histo_size);

    // For each run of equal keys, capture the key and the index at which the run begins
    auto index_it = thrust::make_counting_iterator(std::size_t{0});
    c2h::device_vector<key_t> unique_keys_out(count);
    c2h::device_vector<std::size_t> unique_indexes_out(count);
    thrust::unique_by_key_copy(
      c2h::device_policy,
      out_keys.cbegin(),
      out_keys.cend(),
      index_it,
      unique_keys_out.begin(),
      unique_indexes_out.begin());

    // Bulk-copy the run descriptors to the host: element-wise device_vector access in the loop below
    // would otherwise issue one device-to-host transfer per access
    c2h::host_vector<key_t> h_unique_keys_out{unique_keys_out};
    c2h::host_vector<std::size_t> h_unique_indexes_out{unique_indexes_out};

    // Verify each run's length against the input histogram of that run's key
    for (std::size_t i = 0; i < count; i++)
    {
      // The run ends where the next run begins; the last run ends at the end of the output
      const auto next_end = (i == count - 1) ? out_keys.size() : h_unique_indexes_out[i + 1];
      REQUIRE(keys_histogram[h_unique_keys_out[i]] == next_end - h_unique_indexes_out[i]);
    }
  }
};

// Verification helper for segmented sorts whose input keys are the cyclically repeating series
// [0, 1, ..., sequence_length - 1, 0, 1, ...]. Because the input is analytic, each segment's expected
// key histogram can be computed in closed form, which avoids materializing a reference sort for very
// large inputs.
template <typename KeyT>
class segmented_verification_helper
{
private:
  using key_t = KeyT;
  // Period of the repeating input key series
  const std::size_t sequence_length{};

  // Analytically computes the histogram for a segment of a series of keys: [0, 1, 2, ..., mod_n - 1, 0, 1, 2, ...].
  // `segment_end` is one-past-the-end of the segment to compute the histogram for.
  c2h::host_vector<int> compute_histogram_of_series(std::size_t segment_offset, std::size_t segment_end) const
  {
    // The i-th full cycle begins after segment_offset
    std::size_t start_cycle = cuda::ceil_div(segment_offset, sequence_length);

    // The last full cycle ending before segment_end
    std::size_t end_cycle = segment_end / sequence_length;

    // Number of full cycles repeating the sequence
    std::size_t full_cycles = (end_cycle > start_cycle) ? end_cycle - start_cycle : 0;

    // Every key value occurs exactly once per full cycle
    // NOTE(review): full_cycles (std::size_t) narrows to the int element type here — fine for the
    // segment sizes used in these tests, but worth confirming for larger configurations
    c2h::host_vector<int> histogram(sequence_length, full_cycles);

    // Partial cycle preceding the first full cycle
    for (std::size_t j = segment_offset; j < start_cycle * sequence_length; ++j)
    {
      std::size_t value = j % sequence_length;
      histogram[value]++;
    }

    // Partial cycle following the last full cycle
    for (std::size_t j = end_cycle * sequence_length; j < segment_end; ++j)
    {
      std::size_t value = j % sequence_length;
      histogram[value]++;
    }
    return histogram;
  }

public:
  segmented_verification_helper(int sequence_length)
      : sequence_length(sequence_length)
  {}

  // Fills `in_keys` with the repeating series [0, 1, ..., sequence_length - 1, 0, 1, ...]
  void prepare_input_data(c2h::device_vector<key_t>& in_keys) const
  {
    auto data_gen_it =
      thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), mod_n<key_t>{sequence_length});
    thrust::copy_n(data_gen_it, in_keys.size(), in_keys.begin());
  }

  // Verifies that each of the `num_segments` segments of `out_keys` is sorted, by comparing the runs
  // of equal keys against each segment's analytically computed histogram. `offsets` iterates the
  // segments' begin-offsets; segment i must span [offsets[i], offsets[i + 1]).
  template <typename SegmentOffsetItT>
  void verify_sorted(c2h::device_vector<key_t>& out_keys, SegmentOffsetItT offsets, std::size_t num_segments) const
  {
    // The segments' end-offsets are provided by the segments' begin-offset iterator, shifted by one
    auto offsets_plus_1 = offsets + 1;

    // Verify keys are clustered next to each other: at most sequence_length runs per segment
    const auto count = static_cast<std::size_t>(
      thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to<int>()));
    REQUIRE(count <= sequence_length * num_segments);

    // For each run of equal keys, capture the key and the index at which the run begins
    auto index_it = thrust::make_counting_iterator(std::size_t{0});
    c2h::device_vector<key_t> unique_keys_out(count);
    c2h::device_vector<std::size_t> unique_indexes_out(count);
    thrust::unique_by_key_copy(
      c2h::device_policy,
      out_keys.cbegin(),
      out_keys.cend(),
      index_it,
      unique_keys_out.begin(),
      unique_indexes_out.begin());

    // Copy the unique keys and indexes to host memory in bulk
    c2h::host_vector<key_t> h_unique_keys_out{unique_keys_out};
    c2h::host_vector<std::size_t> h_unique_indexes_out{unique_indexes_out};

    // Walk the runs segment by segment: within a segment, keys must appear in ascending order and
    // each key's run length must match its analytically computed histogram count
    std::size_t uniques_index = 0;
    std::size_t current_offset = 0; // running item offset; used for diagnostics (CAPTURE) only
    for (std::size_t seg_index = 0; seg_index < num_segments; ++seg_index)
    {
      const auto segment_offset = offsets[seg_index];
      const auto segment_end = offsets_plus_1[seg_index];
      const auto segment_histogram = compute_histogram_of_series(segment_offset, segment_end);
      for (std::size_t i = 0; i < sequence_length; i++)
      {
        if (segment_histogram[i] != 0)
        {
          CAPTURE(seg_index, i, uniques_index, current_offset, count);
          // The run ends where the next run begins; the last run ends at the end of the output
          auto const next_end =
            (uniques_index == count - 1) ? out_keys.size() : h_unique_indexes_out[uniques_index + 1];
          REQUIRE(h_unique_keys_out[uniques_index] == i);
          REQUIRE(next_end - h_unique_indexes_out[uniques_index] == segment_histogram[i]);
          current_offset += segment_histogram[i];
          uniques_index++;
        }
      }
    }
  }
};

template <typename T>
struct unwrap_value_t_impl
{
Expand Down
154 changes: 58 additions & 96 deletions cub/test/catch2_test_device_segmented_sort_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -222,73 +222,48 @@ C2H_TEST("DeviceSegmentedSortKeys: Unspecified segments, random keys", "[keys][s

#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)

// we can reuse the same structure of DeviceSegmentedRadixSortKeys for simplicity
C2H_TEST("DeviceSegmentedSortKeys: very large num. items and num. segments",
"[keys][segmented][sort][device]",
all_offset_types)
C2H_TEST("DeviceSegmentedSortKeys: very large number of segments", "[keys][segmented][sort][device]", all_offset_types)
try
{
using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
using offset_t = c2h::get<0, TestType>;
constexpr std::size_t Step = 500;
using segment_iterator_t = segment_iterator<offset_t, Step>;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr int num_key_seeds = 1;
const bool is_descending = GENERATE(false, true);
const bool is_overwrite = GENERATE(false, true);
using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
using segment_offset_t = int;
using offset_t = c2h::get<0, TestType>;
using segment_iterator_t = segment_index_to_offset_op<offset_t, segment_offset_t>;
constexpr std::size_t segment_size = 1000000;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr bool is_descending = false;
constexpr bool is_overwrite = false;
constexpr std::size_t num_items =
(sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
const std::size_t num_segments = ::cuda::ceil_div(num_items, Step);
constexpr std::size_t num_empty_segments = 1000;
const std::size_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size);
CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);

c2h::device_vector<key_t> in_keys(num_items);
c2h::device_vector<key_t> out_keys(num_items);
c2h::gen(C2H_SEED(num_key_seeds), in_keys);
auto offsets =
thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});
auto offsets_plus_1 = offsets + 1;
// Allocate host/device-accessible memory to communicate the selected output buffer
bool* selector_ptr = nullptr;
if (is_overwrite)
{
REQUIRE(cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)) == cudaSuccess);
}

auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1);
// Generate input keys
constexpr auto max_histo_size = 250;
segmented_verification_helper<key_t> verification_helper{max_histo_size};
verification_helper.prepare_input_data(in_keys);

auto offsets = thrust::make_transform_iterator(
thrust::make_counting_iterator(std::size_t{0}),
segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items});

auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
if (is_descending)
{
dispatch_segmented_sort_descending(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
selector_ptr,
is_overwrite);
}
else
{
dispatch_segmented_sort(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
selector_ptr,
is_overwrite);
}
if (is_overwrite)
{
if (*selector_ptr)
{
std::swap(out_keys, in_keys);
}
REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
}
REQUIRE((ref_keys == out_keys) == true);
dispatch_segmented_sort(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
(offsets + 1),
nullptr,
is_overwrite);

// Verify the keys are sorted correctly
verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments);
}
catch (std::bad_alloc& e)
{
Expand All @@ -302,8 +277,8 @@ try
using offset_t = c2h::get<0, TestType>;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr int num_key_seeds = 1;
const bool is_descending = GENERATE(false, true);
const bool is_overwrite = GENERATE(false, true);
constexpr bool is_descending = false;
const bool is_overwrite = true;
constexpr std::size_t num_items =
(sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
const std::size_t num_segments = 2;
Expand All @@ -317,47 +292,34 @@ try
offsets[1] = static_cast<offset_t>(num_items);
offsets[2] = static_cast<offset_t>(num_items);

// Allocate host/device-accessible memory to communicate the selected output buffer
// Prepare information for later verification
short_key_verification_helper<key_t> verification_helper{};
verification_helper.prepare_verification_data(in_keys);

// Handle double-buffer interface: allocate host/device-accessible memory to communicate the selected output buffer
bool* selector_ptr = nullptr;
if (is_overwrite)
{
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
}
auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets);
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));

auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
if (is_descending)
{
dispatch_segmented_sort_descending(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
is_overwrite);
}
else
dispatch_segmented_sort(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
is_overwrite);

// Handle double-buffer interface
if (*selector_ptr)
{
dispatch_segmented_sort(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
is_overwrite);
std::swap(out_keys, in_keys);
}
if (is_overwrite)
{
if (*selector_ptr)
{
std::swap(out_keys, in_keys);
}
REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
}
REQUIRE((ref_keys == out_keys) == true);
REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));

// Verify the keys are sorted correctly
verification_helper.verify_sorted(out_keys);
}
catch (std::bad_alloc& e)
{
Expand Down

0 comments on commit b2d4555

Please sign in to comment.