Skip to content

Commit

Permalink
General code cleanup
Browse files Browse the repository at this point in the history
Signed-off-by: Matthew Michel <matthew.michel@intel.com>
  • Loading branch information
mmichel11 committed Dec 18, 2024
1 parent eb90206 commit 63031ab
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 134 deletions.
7 changes: 4 additions & 3 deletions include/oneapi/dpl/internal/async_impl/async_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ __pattern_walk1_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For

auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{{}, __f, std::size_t(__n)},
unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{
{}, __f, static_cast<std::size_t>(__n)},
__n, __view);
return __future_obj;
}
Expand Down Expand Up @@ -74,7 +75,7 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
{}, __f, std::size_t(__n)},
{}, __f, static_cast<std::size_t>(__n)},
__n, __view1, __view2);

return __future.__make_future(__first2 + __n);
Expand Down Expand Up @@ -106,7 +107,7 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2),
decltype(__view3)>{{}, __f, size_t(__n)},
decltype(__view3)>{{}, __f, static_cast<size_t>(__n)},
__n, __view1, __view2, __view3);

return __future.__make_future(__first3 + __n);
Expand Down
20 changes: 10 additions & 10 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ __pattern_walk2(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
{}, __f, size_t(__n)},
{}, __f, static_cast<std::size_t>(__n)},
__n, __view1, __view2);

// Call no wait, wait or deferrable wait depending on _WaitMode
Expand Down Expand Up @@ -157,7 +157,7 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Forw
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
{}, __f, size_t(__n)},
{}, __f, static_cast<std::size_t>(__n)},
__n, __view1, __view2);
__future.wait(__par_backend_hetero::__deferrable_mode{});
return __first2 + __n;
Expand Down Expand Up @@ -192,9 +192,9 @@ __pattern_walk3(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt
auto __view3 = __buf3.all_view();

oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2),
decltype(__view3)>{{}, __f, size_t(__n)},
decltype(__view3)>{{}, __f, static_cast<std::size_t>(__n)},
__n, __view1, __view2, __view3)
.__deferrable_wait();

Expand Down Expand Up @@ -1597,8 +1597,8 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato
auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>();
auto __buf = __keep(__first, __last);
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
unseq_backend::__reverse_functor<typename ::std::iterator_traits<_Iterator>::difference_type,
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
unseq_backend::__reverse_functor<typename std::iterator_traits<_Iterator>::difference_type,
decltype(__buf.all_view())>{{}, __n},
__n / 2, __buf.all_view())
.__deferrable_wait();
Expand Down Expand Up @@ -1626,8 +1626,8 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi
auto __view1 = __buf1.all_view();
auto __view2 = __buf2.all_view();
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
unseq_backend::__reverse_copy<typename ::std::iterator_traits<_BidirectionalIterator>::difference_type,
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
unseq_backend::__reverse_copy<typename std::iterator_traits<_BidirectionalIterator>::difference_type,
decltype(__view1), decltype(__view2)>{{}, __n},
__n, __view1, __view2)
.__deferrable_wait();
Expand Down Expand Up @@ -1670,7 +1670,7 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator
const auto __shift = __new_first - __first;
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec),
unseq_backend::__rotate_copy<typename ::std::iterator_traits<_Iterator>::difference_type, decltype(__view),
unseq_backend::__rotate_copy<typename std::iterator_traits<_Iterator>::difference_type, decltype(__view),
decltype(__temp_rng_w)>{{}, __n, __shift},
__n, __view, __temp_rng_w);

Expand All @@ -1683,7 +1683,7 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator
auto __brick = unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw),
decltype(__buf.all_view())>{
{}, _Function{}, static_cast<std::size_t>(__n)};
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick,
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick,
__n, __temp_rng_rw, __buf.all_view())
.__deferrable_wait();

Expand Down
2 changes: 0 additions & 2 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@
#include <cmath>
#include <limits>
#include <cstdint>
#include <tuple>

#include "../../iterator_impl.h"
#include "../../execution_impl.h"
#include "../../utils_ranges.h"
#include "../../utils.h"

#include "sycl_defs.h"
#include "parallel_backend_sycl_utils.h"
Expand Down
10 changes: 5 additions & 5 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name..
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) {
//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);

Expand Down Expand Up @@ -142,20 +142,20 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name..
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
auto __event = __exec.queue().submit([__rngs..., __brick, __exec, __count](sycl::handler& __cgh) {
//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
constexpr static std::uint16_t __iters_per_work_item = _Fp::__preferred_iters_per_item;
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::size_t __num_groups =
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * decltype(__brick)::__preferred_vector_size * __iters_per_work_item));
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * _Fp::__preferred_vector_size * __iters_per_work_item));
const std::size_t __num_items = __num_groups * __work_group_size;
__cgh.parallel_for<_Name...>(
sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
[=](sycl::nd_item</*dim=*/1> __item) {
auto [__idx, __stride, __is_full] =
__stride_recommender(__item, __count, __iters_per_work_item, decltype(__brick)::__preferred_vector_size, __work_group_size);
__stride_recommender(__item, __count, __iters_per_work_item, _Fp::__preferred_vector_size, __work_group_size);
__strided_loop<__iters_per_work_item> __execute_loop{static_cast<std::size_t>(__count)};
if (__is_full)
{
Expand Down Expand Up @@ -190,7 +190,7 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
// Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a
// single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1,
// then only compile the basic kernel as the two versions are effectively the same.
if constexpr (_Fp::__preferred_iters_per_item > 1)
if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1)
{
if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick))
{
Expand Down
57 changes: 29 additions & 28 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -845,26 +845,26 @@ struct __lazy_load_transform_op
}
};

template <std::uint16_t __vec_size>
template <std::uint8_t __vec_size>
struct __vector_load
{
static_assert(__vec_size <= 4);
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
std::size_t __n;
template <typename _IdxType, typename _LoadOp, typename... _Acc>
void
operator()(std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
__load_op(__start_idx + __i, __i, __acc...);
}

template <typename _IdxType, typename _LoadOp, typename... _Acc>
void
operator()(std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
{
std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
for (std::uint16_t __i = 0; __i < __elements; ++__i)
std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
for (std::uint8_t __i = 0; __i < __elements; ++__i)
__load_op(__start_idx + __i, __i, __acc...);
}
};
Expand All @@ -891,20 +891,19 @@ struct __lazy_store_transform_op
}
};

template <std::uint16_t __vec_size>
template <std::uint8_t __vec_size>
struct __vector_walk
{
static_assert(__vec_size <= 4);
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
std::size_t __n;

template <typename _IdxType, typename _WalkFunction, typename... _Rngs>
void
operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
{

__f(__rngs[__idx + __i]...);
}
}
Expand All @@ -914,61 +913,63 @@ struct __vector_walk
void
operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
{
std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __idx));
for (std::uint16_t __i = 0; __i < __elements; ++__i)
std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __idx});
for (std::uint8_t __i = 0; __i < __elements; ++__i)
{
__f(__rngs[__idx + __i]...);
}
}
};

template <std::uint16_t __vec_size>
template <std::uint8_t __vec_size>
struct __vector_store
{
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
std::size_t __n;
static_assert(__vec_size <= 4);
template <typename _IdxType, typename _StoreOp, typename... _Acc>

template <typename _IdxType, typename _StoreOp, typename... _Rngs>
void
operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
__store_op(__i, __start_idx + __i, __acc...);
for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
__store_op(__i, __start_idx + __i, __rngs...);
}
template <typename _IdxType, typename _StoreOp, typename... _Acc>
template <typename _IdxType, typename _StoreOp, typename... _Rngs>
void
operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
{
std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
for (std::uint16_t __i = 0; __i < __elements; ++__i)
__store_op(__i, __start_idx + __i, __acc...);
std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
for (std::uint8_t __i = 0; __i < __elements; ++__i)
__store_op(__i, __start_idx + __i, __rngs...);
}
};

template <std::uint16_t __vec_size>
template <std::uint8_t __vec_size>
struct __vector_reverse
{
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
template <typename _IsFull, typename _Idx, typename _Array>
void
operator()(_IsFull __is_full, const _Idx __elements_to_process, _Array __array) const
{
if constexpr (__is_full)
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint16_t __i = 0; __i != __vec_size / 2; ++__i)
for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i)
std::swap(__array[__i].__v, __array[__vec_size - __i - 1].__v);
}
else
{
for (std::uint16_t __i = 0; __i != __elements_to_process / 2; ++__i)
for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i)
std::swap(__array[__i].__v, __array[__elements_to_process - __i - 1].__v);
}
}
};

// Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns
// (potentially with vectorization)
template <std::uint16_t __num_strides>
template <std::uint8_t __num_strides>
struct __strided_loop
{
std::size_t __n;
Expand All @@ -978,7 +979,7 @@ struct __strided_loop
_Ranges&&... __rngs) const
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint16_t __i = 0; __i < __num_strides; ++__i)
for (std::uint8_t __i = 0; __i < __num_strides; ++__i)
{
__loop_body_op(std::true_type{}, __idx, __rngs...);
__idx += __stride;
Expand All @@ -992,7 +993,7 @@ struct __strided_loop
// Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation
const std::uint8_t __adjusted_iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__n - __idx, __stride);
for (std::uint16_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
{
__loop_body_op(std::false_type{}, __idx, __rngs...);
__idx += __stride;
Expand Down
Loading

0 comments on commit 63031ab

Please sign in to comment.