From f89b53deebe29265de8f40777440ab255a84a29b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sun, 5 Jan 2025 05:28:54 -0800 Subject: [PATCH] switches to analytical verification for pairs --- ...catch2_test_device_segmented_sort_pairs.cu | 188 +++++++----------- 1 file changed, 73 insertions(+), 115 deletions(-) diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index a3034608076..ded0b6f9f46 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -253,88 +253,63 @@ C2H_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values", #if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) -// we can reuse the same structure of DeviceSegmentedRadixSortPairs for simplicity C2H_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments", "[pairs][segmented][sort][device]", all_offset_types) try { - using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs - using value_t = cuda::std::uint8_t; - using offset_t = c2h::get<0, TestType>; - constexpr std::size_t Step = 500; - using segment_iterator_t = segment_iterator; - constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); - constexpr int num_key_seeds = 1; - constexpr int num_value_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using segment_offset_t = int; + using offset_t = c2h::get<0, TestType>; + using segment_iterator_t = segment_index_to_offset_op; + constexpr std::size_t segment_size = 1000000; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr bool is_descending = false; + constexpr bool is_overwrite = false; constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); + constexpr std::size_t num_empty_segments = 1000; + const std::size_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size); CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + // Generate input c2h::device_vector in_keys(num_items); c2h::device_vector in_values(num_items); - c2h::gen(C2H_SEED(num_key_seeds), in_keys); - c2h::gen(C2H_SEED(num_value_seeds), in_values); + constexpr auto max_histo_size = 250; + segmented_verification_helper verification_helper{max_histo_size}; + verification_helper.prepare_input_data(in_keys); + thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin()); // Initialize the output vectors by copying the inputs since not all items may belong to a segment. c2h::device_vector out_keys(num_items); c2h::device_vector out_values(num_items); - auto offsets = - thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + + auto offsets = thrust::make_transform_iterator( + thrust::make_counting_iterator(std::size_t{0}), + segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items}); auto offsets_plus_1 = offsets + 1; - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); - auto& ref_keys = refs.first; - auto& ref_values = refs.second; auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); - if (is_descending) - { - dispatch_segmented_sort_pairs_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort_pairs( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - std::swap(out_values, in_values); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE(ref_keys == out_keys); - REQUIRE(ref_values == out_values); + + dispatch_segmented_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + nullptr, + is_overwrite); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments); + + // Verify values were sorted along with the keys + REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin())); } catch (std::bad_alloc& e) { @@ -349,9 +324,8 @@ try using offset_t = c2h::get<0, TestType>; constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr int num_key_seeds = 1; - constexpr int num_value_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); + constexpr bool is_descending = false; + constexpr bool is_overwrite = true; constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); constexpr std::size_t num_segments = 2; @@ -361,63 +335,47 @@ try c2h::device_vector in_values(num_items); c2h::device_vector out_keys(num_items); c2h::gen(C2H_SEED(num_key_seeds), in_keys); - c2h::gen(C2H_SEED(num_value_seeds), in_values); + thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin()); c2h::device_vector out_values(num_items); c2h::device_vector offsets(num_segments + 1); - offsets[0] = 0; - offsets[1] = static_cast(num_items); - offsets[2] = static_cast(num_items); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + + // Prepare information for later verification + short_key_verification_helper verification_helper{}; + verification_helper.prepare_verification_data(in_keys); + + // Handle double-buffer interface: allocate host/device-accessible memory to communicate the selected output buffer bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - auto refs = segmented_radix_sort_reference( - in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1); - auto& ref_keys = refs.first; - auto& ref_values = refs.second; auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); - if (is_descending) - { - dispatch_segmented_sort_pairs_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort_pairs( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) + dispatch_segmented_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + is_overwrite); + + if (*selector_ptr) { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - std::swap(out_values, in_values); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); } - REQUIRE(ref_keys == out_keys); - REQUIRE(ref_values == out_values); + REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys); + + // Verify values were sorted along with the keys + REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin())); } catch (std::bad_alloc& e) {