switches to analytical verification for pairs
elstehle committed Jan 5, 2025
1 parent b2d4555 commit f89b53d
Showing 1 changed file with 73 additions and 115 deletions.
188 changes: 73 additions & 115 deletions cub/test/catch2_test_device_segmented_sort_pairs.cu
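Before this commit, these tests verified results by materializing a full reference with segmented_radix_sort_reference and comparing both key and value arrays against it. The new version verifies analytically instead: the keys are generated as a known repeating pattern, the values are made identical to the keys with thrust::copy, and the output is accepted if every segment is sorted and contains the expected multiset of keys, which for uint8_t keys can be checked with a small histogram rather than a reference sort; value correctness then reduces to REQUIRE(thrust::equal(out_keys, out_values)). The harness helpers segmented_verification_helper and short_key_verification_helper implement this. The snippet below is only a minimal host-side sketch of the idea, under the assumption (not taken from this commit) that position i of the generated input holds the value i % sequence_length, with sequence_length <= 256.

// Minimal host-side sketch of analytical verification for one segment. It assumes the
// generated input held the value (i % sequence_length) at position i, with
// sequence_length <= 256; the real harness helpers differ in detail.
#include <cstddef>
#include <cstdint>
#include <vector>

bool segment_sorted_and_complete(const std::vector<std::uint8_t>& sorted_keys,
                                 std::size_t segment_begin,
                                 std::size_t segment_end,
                                 std::size_t sequence_length)
{
  // 1) The segment of the sorted output must be non-decreasing.
  for (std::size_t i = segment_begin + 1; i < segment_end; ++i)
  {
    if (sorted_keys[i - 1] > sorted_keys[i])
    {
      return false;
    }
  }

  // 2) Its histogram must match the histogram of the generated (unsorted) input for the
  //    same index range, which is known analytically, so no reference sort is required.
  std::vector<std::size_t> expected(256, 0);
  std::vector<std::size_t> actual(256, 0);
  for (std::size_t i = segment_begin; i < segment_end; ++i)
  {
    ++expected[i % sequence_length]; // what the generator placed into this range
    ++actual[sorted_keys[i]];        // what the sort produced
  }
  return expected == actual;
}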
@@ -253,88 +253,63 @@ C2H_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values",

#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)

// we can reuse the same structure of DeviceSegmentedRadixSortPairs for simplicity
C2H_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments",
"[pairs][segmented][sort][device]",
all_offset_types)
try
{
using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
using value_t = cuda::std::uint8_t;
using offset_t = c2h::get<0, TestType>;
constexpr std::size_t Step = 500;
using segment_iterator_t = segment_iterator<offset_t, Step>;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr int num_key_seeds = 1;
constexpr int num_value_seeds = 1;
const bool is_descending = GENERATE(false, true);
const bool is_overwrite = GENERATE(false, true);
using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
using value_t = cuda::std::uint8_t;
using segment_offset_t = int;
using offset_t = c2h::get<0, TestType>;
using segment_iterator_t = segment_index_to_offset_op<offset_t, segment_offset_t>;
constexpr std::size_t segment_size = 1000000;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr bool is_descending = false;
constexpr bool is_overwrite = false;
constexpr std::size_t num_items =
(sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
const std::size_t num_segments = ::cuda::ceil_div(num_items, Step);
constexpr std::size_t num_empty_segments = 1000;
const std::size_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size);
CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);

// Generate input
c2h::device_vector<key_t> in_keys(num_items);
c2h::device_vector<value_t> in_values(num_items);
c2h::gen(C2H_SEED(num_key_seeds), in_keys);
c2h::gen(C2H_SEED(num_value_seeds), in_values);
constexpr auto max_histo_size = 250;
segmented_verification_helper<key_t> verification_helper{max_histo_size};
verification_helper.prepare_input_data(in_keys);
thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin());

// Initialize the output vectors by copying the inputs since not all items may belong to a segment.
c2h::device_vector<key_t> out_keys(num_items);
c2h::device_vector<value_t> out_values(num_items);
auto offsets =
thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});

auto offsets = thrust::make_transform_iterator(
thrust::make_counting_iterator(std::size_t{0}),
segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items});
auto offsets_plus_1 = offsets + 1;
bool* selector_ptr = nullptr;
if (is_overwrite)
{
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
}

auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1);
auto& ref_keys = refs.first;
auto& ref_values = refs.second;
auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
auto out_values_ptr = thrust::raw_pointer_cast(out_values.data());
if (is_descending)
{
dispatch_segmented_sort_pairs_descending(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
thrust::raw_pointer_cast(in_values.data()),
out_values_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
selector_ptr,
is_overwrite);
}
else
{
dispatch_segmented_sort_pairs(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
thrust::raw_pointer_cast(in_values.data()),
out_values_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
selector_ptr,
is_overwrite);
}
if (is_overwrite)
{
if (*selector_ptr)
{
std::swap(out_keys, in_keys);
std::swap(out_values, in_values);
}
REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
}
REQUIRE(ref_keys == out_keys);
REQUIRE(ref_values == out_values);

dispatch_segmented_sort_pairs(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
thrust::raw_pointer_cast(in_values.data()),
out_values_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
nullptr,
is_overwrite);

// Verify the keys are sorted correctly
verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments);

// Verify values were sorted along with the keys
REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin()));
}
catch (std::bad_alloc& e)
{
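The rewritten test above describes its segments implicitly: a counting iterator over segment indices is passed through segment_index_to_offset_op, constructed from num_empty_segments, num_segments, segment_size and num_items, and offsets[i] / offsets[i + 1] then delimit segment i without materializing an offset array. A plausible reading of those constructor arguments, sketched below under a hypothetical name, is that the first num_empty_segments indices map to offset 0 and the remaining ones advance in segment_size steps, clamped to num_items; the real functor lives in the test helpers and may differ in detail.

// Hypothetical sketch of an index-to-offset functor consistent with how the test above
// constructs its offsets iterator; not the actual segment_index_to_offset_op.
#include <cstddef>

template <typename OffsetT>
struct index_to_offset_sketch
{
  std::size_t num_empty_segments; // leading segments with begin == end == 0
  std::size_t num_segments;       // kept only to mirror the constructor arguments above
  std::size_t segment_size;       // nominal size of every non-empty segment
  std::size_t num_items;          // total item count; the final offset is clamped to this

  __host__ __device__ OffsetT operator()(std::size_t segment_index) const
  {
    if (segment_index < num_empty_segments)
    {
      return OffsetT{0};
    }
    const std::size_t offset = (segment_index - num_empty_segments) * segment_size;
    return static_cast<OffsetT>(offset < num_items ? offset : num_items);
  }
};

Fed through thrust::make_transform_iterator over a counting iterator, a functor of this shape yields num_empty_segments empty segments followed by fixed-size segments whose last end offset is num_items, which matches how num_segments is computed in the test.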
@@ -349,9 +324,8 @@ try
using offset_t = c2h::get<0, TestType>;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr int num_key_seeds = 1;
constexpr int num_value_seeds = 1;
const bool is_descending = GENERATE(false, true);
const bool is_overwrite = GENERATE(false, true);
constexpr bool is_descending = false;
constexpr bool is_overwrite = true;
constexpr std::size_t num_items =
(sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
constexpr std::size_t num_segments = 2;
@@ -361,63 +335,47 @@ try
c2h::device_vector<value_t> in_values(num_items);
c2h::device_vector<key_t> out_keys(num_items);
c2h::gen(C2H_SEED(num_key_seeds), in_keys);
c2h::gen(C2H_SEED(num_value_seeds), in_values);
thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin());
c2h::device_vector<value_t> out_values(num_items);
c2h::device_vector<offset_t> offsets(num_segments + 1);
offsets[0] = 0;
offsets[1] = static_cast<offset_t>(num_items);
offsets[2] = static_cast<offset_t>(num_items);
offsets[0] = 0;
offsets[1] = static_cast<offset_t>(num_items);
offsets[2] = static_cast<offset_t>(num_items);

// Prepare information for later verification
short_key_verification_helper<key_t> verification_helper{};
verification_helper.prepare_verification_data(in_keys);

// Handle double-buffer interface: allocate host/device-accessible memory to communicate the selected output buffer
bool* selector_ptr = nullptr;
if (is_overwrite)
{
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
}
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));

auto refs = segmented_radix_sort_reference(
in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1);
auto& ref_keys = refs.first;
auto& ref_values = refs.second;
auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
auto out_values_ptr = thrust::raw_pointer_cast(out_values.data());
if (is_descending)
{
dispatch_segmented_sort_pairs_descending(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
thrust::raw_pointer_cast(in_values.data()),
out_values_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
is_overwrite);
}
else
{
dispatch_segmented_sort_pairs(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
thrust::raw_pointer_cast(in_values.data()),
out_values_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
is_overwrite);
}
if (is_overwrite)
dispatch_segmented_sort_pairs(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
thrust::raw_pointer_cast(in_values.data()),
out_values_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
is_overwrite);

if (*selector_ptr)
{
if (*selector_ptr)
{
std::swap(out_keys, in_keys);
std::swap(out_values, in_values);
}
REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
std::swap(out_keys, in_keys);
std::swap(out_values, in_values);
}
REQUIRE(ref_keys == out_keys);
REQUIRE(ref_values == out_values);
REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);

// Verify the keys are sorted correctly
verification_helper.verify_sorted(out_keys);

// Verify values were sorted along with the keys
REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin()));
}
catch (std::bad_alloc& e)
{
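The second test keeps the double-buffer ("is_overwrite") path: the sort may leave its results in either the input or the output buffers and reports the choice through a host-visible flag, which is why selector_ptr is allocated with cudaMallocHost and the in_*/out_* vectors are swapped when *selector_ptr is set before verification. The sketch below shows the same pattern expressed directly against the public cub::DeviceSegmentedSort::SortPairs overload that takes cub::DoubleBuffer arguments; the wrapper name and raw-pointer parameters are illustrative only, and the test itself goes through its dispatch helpers instead.

// Illustrative sketch of the double-buffer sort pattern the test exercises; the wrapper
// name and parameters are hypothetical. After the call, d_keys.selector records which of
// the two key buffers holds the sorted result (and likewise for d_values).
#include <cub/device/device_segmented_sort.cuh>
#include <cstddef>
#include <cstdint>

cudaError_t sort_pairs_double_buffer(
  std::uint8_t* d_keys_a, std::uint8_t* d_keys_b,
  std::uint8_t* d_values_a, std::uint8_t* d_values_b,
  int num_items, int num_segments,
  const int* d_begin_offsets, const int* d_end_offsets)
{
  cub::DoubleBuffer<std::uint8_t> d_keys(d_keys_a, d_keys_b);
  cub::DoubleBuffer<std::uint8_t> d_values(d_values_a, d_values_b);

  // First call only queries how much temporary storage the sort needs.
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cudaError_t error = cub::DeviceSegmentedSort::SortPairs(
    d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets);
  if (error != cudaSuccess)
  {
    return error;
  }
  error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (error != cudaSuccess)
  {
    return error;
  }

  // Second call performs the sort; results end up in d_keys.Current() / d_values.Current().
  error = cub::DeviceSegmentedSort::SortPairs(
    d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets);
  cudaFree(d_temp_storage);
  return error;
}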
