diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh new file mode 100644 index 00000000000..4ffbfc93b92 --- /dev/null +++ b/cub/cub/detail/nvtx.cuh @@ -0,0 +1,87 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// Enable the functionality of this header if +// * The NVTX3 C API is available in CTK +// * NVTX is not explicitly disabled +// * C++14 is availabl for cuda::std::optional +#if __has_include() && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +// Include our NVTX3 C++ wrapper if not available from the CTK +# if __has_include() // TODO(bgruber): replace by a check for the first CTK version shipping the header +# include +# else // __has_include() +# include "nvtx3.hpp" +# endif // __has_include() + +# include + +CUB_NAMESPACE_BEGIN +namespace detail +{ +struct NVTXCCCLDomain +{ + static constexpr const char* name = "CCCL"; +}; +} // namespace detail +CUB_NAMESPACE_END + +// Hook for the NestedNVTXRangeGuard from the unit tests +# ifndef CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE +# define CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) +# endif // !CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE + +// Conditionally inserts a NVTX range starting here until the end of the current function scope in host code. Does +// nothing in device code. +// The optional is needed to defer the construction of an NVTX range (host-only code) and message string registration +// into a dispatch region running only on the host, while preserving the semantic scope where the range is declared. +# define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) \ + CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) \ + ::cuda::std::optional<::nvtx3::scoped_range_in> __cub_nvtx3_range; \ + NV_IF_TARGET( \ + NV_IS_HOST, \ + static const ::nvtx3::registered_string_in __cub_nvtx3_func_name{name}; \ + static const ::nvtx3::event_attributes __cub_nvtx3_func_attr{__cub_nvtx3_func_name}; \ + if (condition) __cub_nvtx3_range.emplace(__cub_nvtx3_func_attr); \ + (void) __cub_nvtx3_range;) + +# define CUB_DETAIL_NVTX_RANGE_SCOPE(name) CUB_DETAIL_NVTX_RANGE_SCOPE_IF(true, name) +#else // __has_include() && !defined(NVTX_DISABLE) && _CCCL_STD_VER > 2011 +# define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) +# define CUB_DETAIL_NVTX_RANGE_SCOPE(name) +#endif // __has_include() && !defined(NVTX_DISABLE) && _CCCL_STD_VER > 2011 diff --git a/cub/cub/detail/nvtx3.hpp b/cub/cub/detail/nvtx3.hpp new file mode 100644 index 00000000000..dcbafd20ef7 --- /dev/null +++ b/cub/cub/detail/nvtx3.hpp @@ -0,0 +1,2953 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Temporary helper #defines, #undef'ed at end of header */ +#define NVTX3_CPP_VERSION_MAJOR 1 +#define NVTX3_CPP_VERSION_MINOR 0 + +/* This section handles the decision of whether to provide unversioned symbols. + * If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is #defined, unversioned symbols are + * not provided, and explicit-version symbols such as nvtx3::v1::scoped_range + * and NVTX3_V1_FUNC_RANGE must be used. By default, the first #include of this + * header will define the unversioned symbols such as nvtx3::scoped_range and + * NVTX3_FUNC_RANGE. Subsequently including a different major version of this + * header without #defining NVTX3_CPP_REQUIRE_EXPLICIT_VERSION triggers an error + * since the symbols would conflict. Subsequently including of a different + * minor version within the same major version is allowed. Functionality of + * minor versions is cumulative, regardless of include order. + * + * Since NVTX3_CPP_REQUIRE_EXPLICIT_VERSION allows all combinations of versions + * to coexist without problems within a translation unit, the recommended best + * practice for instrumenting header-based libraries with NVTX C++ Wrappers is + * is to #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before including nvtx3.hpp, + * #undef it afterward, and only use explicit-version symbols. This is not + * necessary in common cases, such as instrumenting a standalone application, or + * static/shared libraries in .cpp files or headers private to those projects. + */ +/* clang-format off */ +#if !defined(NVTX3_CPP_REQUIRE_EXPLICIT_VERSION) + /* Define macro used by all definitions in this header to indicate the + * unversioned symbols should be defined in addition to the versioned ones. + */ + #define NVTX3_INLINE_THIS_VERSION + + #if !defined(NVTX3_CPP_INLINED_VERSION_MAJOR) + /* First occurrence of this header in the translation unit. Define macros + * indicating which version shall be used for unversioned symbols. + */ + + /** + * @brief Semantic major version number for NVTX C++ wrappers of unversioned symbols + * + * Breaking changes may occur between major versions, and different major versions + * cannot provide unversioned symbols in the same translation unit (.cpp file). + * + * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. + * + * Not to be confused with the version number of the NVTX core library. + */ + #define NVTX3_CPP_INLINED_VERSION_MAJOR 1 // NVTX3_CPP_VERSION_MAJOR + + /** + * @brief Semantic minor version number for NVTX C++ wrappers of unversioned symbols + * + * No breaking changes occur between minor versions -- minor version changes within + * a major version are purely additive. + * + * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. + * + * Not to be confused with the version number of the NVTX core library. + */ + #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR + #elif NVTX3_CPP_INLINED_VERSION_MAJOR != NVTX3_CPP_VERSION_MAJOR + /* Unsupported case -- cannot define unversioned symbols for different major versions + * in the same translation unit. + */ + #error \ + "Two different major versions of the NVTX C++ Wrappers are being included in a single .cpp file, with unversioned symbols enabled in both. Only one major version can enable unversioned symbols in a .cpp file. To disable unversioned symbols, #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before #including nvtx3.hpp, and use the explicit-version symbols instead -- this is the preferred way to use nvtx3.hpp from a header file." + #elif (NVTX3_CPP_INLINED_VERSION_MAJOR == NVTX3_CPP_VERSION_MAJOR) && \ + (NVTX3_CPP_INLINED_VERSION_MINOR < NVTX3_CPP_VERSION_MINOR) + /* An older minor version of the same major version already defined unversioned + * symbols. The new features provided in this header will be inlined + * redefine the minor version macro to this header's version. + */ + #undef NVTX3_CPP_INLINED_VERSION_MINOR + #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR + // else, already have this version or newer, nothing to do + #endif +#endif +/* clang-format on */ + +/** + * @file nvtx3.hpp + * + * @brief Provides C++ constructs making the NVTX library safer and easier to + * use with zero overhead. + */ + +/** + * \mainpage + * \tableofcontents + * + * \section QUICK_START Quick Start + * + * To add NVTX ranges to your code, use the `nvtx3::scoped_range` RAII object. A + * range begins when the object is created, and ends when the object is + * destroyed. + * + * \code{.cpp} + * #include "nvtx3.hpp" + * void some_function() { + * // Begins a NVTX range with the messsage "some_function" + * // The range ends when some_function() returns and `r` is destroyed + * nvtx3::scoped_range r{"some_function"}; + * + * for(int i = 0; i < 6; ++i) { + * nvtx3::scoped_range loop{"loop range"}; + * std::this_thread::sleep_for(std::chrono::seconds{1}); + * } + * } // Range ends when `r` is destroyed + * \endcode + * + * The example code above generates the following timeline view in Nsight + * Systems: + * + * \image html + * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png + * + * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add + * ranges to your code that automatically use the name of the enclosing function + * as the range's message. + * + * \code{.cpp} + * #include "nvtx3.hpp" + * void some_function() { + * // Creates a range with a message "some_function" that ends when the + * // enclosing function returns + * NVTX3_FUNC_RANGE(); + * ... + * } + * \endcode + * + * + * \section Overview + * + * The NVTX library provides a set of functions for users to annotate their code + * to aid in performance profiling and optimization. These annotations provide + * information to tools like Nsight Systems to improve visualization of + * application timelines. + * + * \ref RANGES are one of the most commonly used NVTX constructs for annotating + * a span of time. For example, imagine a user wanted to see every time a + * function, `my_function`, is called and how long it takes to execute. This can + * be accomplished with an NVTX range created on the entry to the function and + * terminated on return from `my_function` using the push/pop C APIs: + * + * \code{.cpp} + * void my_function(...) { + * nvtxRangePushA("my_function"); // Begins NVTX range + * // do work + * nvtxRangePop(); // Ends NVTX range + * } + * \endcode + * + * One of the challenges with using the NVTX C API is that it requires manually + * terminating the end of the range with `nvtxRangePop`. This can be challenging + * if `my_function()` has multiple returns or can throw exceptions as it + * requires calling `nvtxRangePop()` before all possible return points. + * + * NVTX C++ solves this inconvenience through the "RAII" technique by providing + * a `nvtx3::scoped_range` class that begins a range at construction and ends + * the range on destruction. The above example then becomes: + * + * \code{.cpp} + * void my_function(...) { + * nvtx3::scoped_range r{"my_function"}; // Begins NVTX range + * // do work + * } // Range ends on exit from `my_function` when `r` is destroyed + * \endcode + * + * The range object `r` is deterministically destroyed whenever `my_function` + * returns---ending the NVTX range without manual intervention. For more + * information, see \ref RANGES and `nvtx3::scoped_range_in`. + * + * Another inconvenience of the NVTX C APIs are the several constructs where the + * user is expected to initialize an object at the beginning of an application + * and reuse that object throughout the lifetime of the application. For example + * see domains, categories, and registered messages. + * + * Example: + * \code{.cpp} + * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain"); + * // Reuse `D` throughout the rest of the application + * \endcode + * + * This can be problematic if the user application or library does not have an + * explicit initialization function called before all other functions to + * ensure that these long-lived objects are initialized before being used. + * + * NVTX C++ makes use of the "construct on first use" technique to alleviate + * this inconvenience. In short, a function local static object is constructed + * upon the first invocation of a function and returns a reference to that + * object on all future invocations. See the documentation for `nvtx3::domain`, + * `nvtx3::named_category`, `nvtx3::registered_string`, and + * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more + * information. + * + * Using construct on first use, the above example becomes: + * \code{.cpp} + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * + * // The first invocation of `domain::get` for the type `my_domain` will + * // construct a `nvtx3::domain` object and return a reference to it. Future + * // invocations simply return a reference. + * nvtx3::domain const& D = nvtx3::domain::get(); + * \endcode + * For more information about NVTX and how it can be used, see + * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and + * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ + * for more information. + * + * \section RANGES Ranges + * + * Ranges are used to describe a span of time during the execution of an + * application. Common examples are using ranges to annotate the time it takes + * to execute a function or an iteration of a loop. + * + * NVTX C++ uses RAII to automate the generation of ranges that are tied to the + * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard + * Template Library. + * + * \subsection scoped_range Scoped Range + * + * `nvtx3::scoped_range_in` is a class that begins a range upon construction + * and ends the range at destruction. This is one of the most commonly used + * constructs in NVTX C++ and is useful for annotating spans of time on a + * particular thread. These ranges can be nested to arbitrary depths. + * + * `nvtx3::scoped_range` is an alias for a `nvtx3::scoped_range_in` in the + * global NVTX domain. For more information about Domains, see \ref DOMAINS. + * + * Various attributes of a range can be configured constructing a + * `nvtx3::scoped_range_in` with a `nvtx3::event_attributes` object. For + * more information, see \ref ATTRIBUTES. + * + * Example: + * + * \code{.cpp} + * void some_function() { + * // Creates a range for the duration of `some_function` + * nvtx3::scoped_range r{}; + * + * while(true) { + * // Creates a range for every loop iteration + * // `loop_range` is nested inside `r` + * nvtx3::scoped_range loop_range{}; + * } + * } + * \endcode + * + * \subsection unique_range Unique Range + * + * `nvtx3::unique_range` is similar to `nvtx3::scoped_range`, with a few key differences: + * - `unique_range` objects can be destroyed in any order whereas `scoped_range` objects must be + * destroyed in exact reverse creation order + * - `unique_range` can start and end on different threads + * - `unique_range` is moveable + * - `unique_range` objects can be constructed as heap objects + * + * There is extra overhead associated with `unique_range` constructs and therefore use of + * `nvtx3::scoped_range_in` should be preferred. + * + * \section MARKS Marks + * + * `nvtx3::mark` annotates an instantaneous point in time with a "marker". + * + * Unlike a "range" which has a beginning and an end, a marker is a single event + * in an application, such as detecting a problem: + * + * \code{.cpp} + * bool success = do_operation(...); + * if (!success) { + * nvtx3::mark("operation failed!"); + * } + * \endcode + * + * \section DOMAINS Domains + * + * Similar to C++ namespaces, domains allow for scoping NVTX events. By default, + * all NVTX events belong to the "global" domain. Libraries and applications + * should scope their events to use a custom domain to differentiate where the + * events originate from. + * + * It is common for a library or application to have only a single domain and + * for the name of that domain to be known at compile time. Therefore, Domains + * in NVTX C++ are represented by _tag types_. + * + * For example, to define a custom domain, simply define a new concrete type + * (a `class` or `struct`) with a `static` member called `name` that contains + * the desired name of the domain. + * + * \code{.cpp} + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * \endcode + * + * For any NVTX C++ construct that can be scoped to a domain, the type + * `my_domain` can be passed as an explicit template argument to scope it to + * the custom domain. + * + * The tag type `nvtx3::domain::global` represents the global NVTX domain. + * + * \code{.cpp} + * // By default, `scoped_range_in` belongs to the global domain + * nvtx3::scoped_range_in<> r0{}; + * + * // Alias for a `scoped_range_in` in the global domain + * nvtx3::scoped_range r1{}; + * + * // `r` belongs to the custom domain + * nvtx3::scoped_range_in r{}; + * \endcode + * + * When using a custom domain, it is recommended to define type aliases for NVTX + * constructs in the custom domain. + * \code{.cpp} + * using my_scoped_range = nvtx3::scoped_range_in; + * using my_registered_string = nvtx3::registered_string_in; + * using my_named_category = nvtx3::named_category_in; + * \endcode + * + * See `nvtx3::domain` for more information. + * + * \section ATTRIBUTES Event Attributes + * + * NVTX events can be customized with various attributes to provide additional + * information (such as a custom message) or to control visualization of the + * event (such as the color used). These attributes can be specified per-event + * via arguments to a `nvtx3::event_attributes` object. + * + * NVTX events can be customized via four "attributes": + * - \ref COLOR : color used to visualize the event in tools. + * - \ref MESSAGES : Custom message string. + * - \ref PAYLOAD : User-defined numerical value. + * - \ref CATEGORY : Intra-domain grouping. + * + * It is possible to construct a `nvtx3::event_attributes` from any number of + * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload, + * nvtx3::category) in any order. If an attribute is not specified, a tool + * specific default value is used. See `nvtx3::event_attributes` for more + * information. + * + * \code{.cpp} + * // Set message, same as passing nvtx3::message{"message"} + * nvtx3::event_attributes attr{"message"}; + * + * // Set message and color + * nvtx3::event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; + * + * // Set message, color, payload, category + * nvtx3::event_attributes attr{"message", + * nvtx3::rgb{127, 255, 0}, + * nvtx3::payload{42}, + * nvtx3::category{1}}; + * + * // Same as above -- can use any order of arguments + * nvtx3::event_attributes attr{nvtx3::payload{42}, + * nvtx3::category{1}, + * "message", + * nvtx3::rgb{127, 255, 0}}; + * + * // Multiple arguments of the same type are allowed, but only the first is + * // used -- in this example, payload is set to 42: + * nvtx3::event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; + * + * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: + * using namespace nvtx3; + * event_attributes attr{"message", rgb{127, 255, 0}, payload{42}, category{1}}; + * \endcode + * + * \subsection MESSAGES message + * + * `nvtx3::message` sets the message string for an NVTX event. + * + * Example: + * \code{.cpp} + * // Create an `event_attributes` with the message "my message" + * nvtx3::event_attributes attr{nvtx3::message{"my message"}}; + * + * // strings and string literals implicitly assumed to be a `nvtx3::message` + * nvtx3::event_attributes attr{"my message"}; + * \endcode + * + * \subsubsection REGISTERED_MESSAGE Registered Messages + * + * Associating a `nvtx3::message` with an event requires copying the contents of + * the message every time the message is used, i.e., copying the entire message + * string. This may cause non-trivial overhead in performance sensitive code. + * + * To eliminate this overhead, NVTX allows registering a message string, + * yielding a "handle" that is inexpensive to copy that may be used in place of + * a message string. When visualizing the events, tools such as Nsight Systems + * will take care of mapping the message handle to its string. + * + * A message should be registered once and the handle reused throughout the rest + * of the application. This can be done by either explicitly creating static + * `nvtx3::registered_string` objects, or using the + * `nvtx3::registered_string::get` construct on first use helper (recommended). + * + * Similar to \ref DOMAINS, `nvtx3::registered_string::get` requires defining a + * custom tag type with a static `message` member whose value will be the + * contents of the registered string. + * + * Example: + * \code{.cpp} + * // Explicitly constructed, static `registered_string` in my_domain: + * static registered_string_in static_message{"my message"}; + * + * // Or use construct on first use: + * // Define a tag type with a `message` member string to register + * struct my_message{ static constexpr char const* message{ "my message" }; }; + * + * // Uses construct on first use to register the contents of + * // `my_message::message` + * auto& msg = nvtx3::registered_string_in::get(); + * \endcode + * + * \subsection COLOR color + * + * Associating a `nvtx3::color` with an event allows controlling how the event + * is visualized in a tool such as Nsight Systems. This is a convenient way to + * visually differentiate among different events. + * + * \code{.cpp} + * // Define a color via rgb color values + * nvtx3::color c{nvtx3::rgb{127, 255, 0}}; + * nvtx3::event_attributes attr{c}; + * + * // rgb color values can be passed directly to an `event_attributes` + * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}}; + * \endcode + * + * \subsection CATEGORY category + * + * A `nvtx3::category` is simply an integer id that allows for fine-grain + * grouping of NVTX events. For example, one might use separate categories for + * IO, memory allocation, compute, etc. + * + * \code{.cpp} + * nvtx3::event_attributes{nvtx3::category{1}}; + * \endcode + * + * \subsubsection NAMED_CATEGORIES Named Categories + * + * Associates a `name` string with a category `id` to help differentiate among + * categories. + * + * For any given category id `Id`, a `named_category{Id, "name"}` should only + * be constructed once and reused throughout an application. This can be done by + * either explicitly creating static `nvtx3::named_category` objects, or using + * the `nvtx3::named_category::get` construct on first use helper (recommended). + * + * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a + * custom tag type with static `name` and `id` members. + * + * \code{.cpp} + * // Explicitly constructed, static `named_category` in my_domain: + * static nvtx3::named_category_in static_category{42, "my category"}; + * + * // Or use construct on first use: + * // Define a tag type with `name` and `id` members + * struct my_category { + * static constexpr char const* name{"my category"}; // category name + * static constexpr uint32_t id{42}; // category id + * }; + * + * // Use construct on first use to name the category id `42` + * // with name "my category": + * auto& cat = named_category_in::get(); + * + * // Range `r` associated with category id `42` + * nvtx3::event_attributes attr{cat}; + * \endcode + * + * \subsection PAYLOAD payload + * + * Allows associating a user-defined numerical value with an event. + * + * \code{.cpp} + * // Constructs a payload from the `int32_t` value 42 + * nvtx3:: event_attributes attr{nvtx3::payload{42}}; + * \endcode + * + * + * \section EXAMPLE Example + * + * Putting it all together: + * \code{.cpp} + * // Define a custom domain tag type + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * + * // Define a named category tag type + * struct my_category{ + * static constexpr char const* name{"my category"}; + * static constexpr uint32_t id{42}; + * }; + * + * // Define a registered string tag type + * struct my_message{ static constexpr char const* message{"my message"}; }; + * + * // For convenience, use aliases for domain scoped objects + * using my_scoped_range = nvtx3::scoped_range_in; + * using my_registered_string = nvtx3::registered_string_in; + * using my_named_category = nvtx3::named_category_in; + * + * // Default values for all attributes + * nvtx3::event_attributes attr{}; + * my_scoped_range r0{attr}; + * + * // Custom (unregistered) message, and unnamed category + * nvtx3::event_attributes attr1{"message", nvtx3::category{2}}; + * my_scoped_range r1{attr1}; + * + * // Alternatively, pass arguments of `event_attributes` ctor directly to + * // `my_scoped_range` + * my_scoped_range r2{"message", nvtx3::category{2}}; + * + * // construct on first use a registered string + * auto& msg = my_registered_string::get(); + * + * // construct on first use a named category + * auto& cat = my_named_category::get(); + * + * // Use registered string and named category with a custom payload + * my_scoped_range r3{msg, cat, nvtx3::payload{42}}; + * + * // Any number of arguments in any order + * my_scoped_range r{nvtx3::rgb{127, 255,0}, msg}; + * + * \endcode + * \section MACROS Convenience Macros + * + * Oftentimes users want to quickly and easily add NVTX ranges to their library + * or application to aid in profiling and optimization. + * + * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and + * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an + * `nvtx3::scoped_range_in` with the name of the enclosing function as the + * range's message. + * + * \code{.cpp} + * void some_function() { + * // Automatically generates an NVTX range for the duration of the function + * // using "some_function" as the event's message. + * NVTX3_FUNC_RANGE(); + * } + * \endcode + * + */ + +/* Temporary helper #defines, removed with #undef at end of header */ + +#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET) +# if defined(_MSC_VER) && _MSC_VER < 1914 +/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser + * that does not work with domain::get's specialization for domain::global, + * and would require extra conditions to make SFINAE work for the overloaded + * get() functions. This macro disables use of overloaded get() in order to + * work with VS2015 and versions of VS2017 below 15.7, without penalizing + * users of newer compilers. Building with this flag set to 0 means errors + * when defining tag structs (see documentation for domain, named_category, + * and registered_string) will have more complex compiler error messages + * instead of the clear static_assert messages from the get() overloads. + */ +# define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0 +# else +# define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1 +# endif +# define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE +#endif + +/* Within this header, nvtx3::NVTX3_VERSION_NAMESPACE resolves to nvtx3::vX, + * where "X" is the major version number. */ +#define NVTX3_CONCAT(A, B) A##B +#define NVTX3_NAMESPACE_FOR(VERSION) NVTX3_CONCAT(v, VERSION) +#define NVTX3_VERSION_NAMESPACE NVTX3_NAMESPACE_FOR(NVTX3_CPP_VERSION_MAJOR) + +/* Avoid duplicating #if defined(NVTX3_INLINE_THIS_VERSION) for namespaces + * in each minor version by making a macro to use unconditionally, which + * resolves to "inline" or nothing as appropriate. */ +#if defined(NVTX3_INLINE_THIS_VERSION) +# define NVTX3_INLINE_IF_REQUESTED inline +#else +# define NVTX3_INLINE_IF_REQUESTED +#endif + +/* Enables the use of constexpr when support for C++14 constexpr is present. + * + * Initialization of a class member that is a union to a specific union member + * can only be done in the body of a constructor, not in a member initializer + * list. A constexpr constructor must have an empty body until C++14, so there + * is no way to make an initializer of a member union constexpr in C++11. This + * macro allows making functions constexpr in C++14 or newer, but non-constexpr + * in C++11 compilation. It is used here on constructors that initialize their + * member unions. + */ +#if __cpp_constexpr >= 201304L +# define NVTX3_CONSTEXPR_IF_CPP14 constexpr +#else +# define NVTX3_CONSTEXPR_IF_CPP14 +#endif + +/* Use a macro for static asserts, which defaults to static_assert, but that + * testing tools can replace with a logging function. For example: + * #define NVTX3_STATIC_ASSERT(c, m) \ + * do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0) + */ +#if !defined(NVTX3_STATIC_ASSERT) +# define NVTX3_STATIC_ASSERT(condition, message) static_assert(condition, message); +# define NVTX3_STATIC_ASSERT_DEFINED_HERE +#endif + +/* Implementation sections, enclosed in guard macros for each minor version */ + +#ifndef NVTX3_CPP_DEFINITIONS_V1_0 +# define NVTX3_CPP_DEFINITIONS_V1_0 + +# include +# include +# include +# include +# include + +# include // NOTE(bgruber): "nvtx3/" prefix added and switched to angle brackets + +namespace nvtx3 +{ + +NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE +{ + namespace detail + { + + template + struct always_false : std::false_type + {}; + + template + struct has_name : std::false_type + {}; + template + struct has_name : std::true_type + {}; + + template + struct has_id : std::false_type + {}; + template + struct has_id : std::true_type + {}; + + template + struct has_message : std::false_type + {}; + template + struct has_message : std::true_type + {}; + + template + struct is_c_string : std::false_type + {}; + template + struct is_c_string::value + || std::is_convertible::value>::type> : std::true_type + {}; + + template + using is_uint32 = std::is_same::type, uint32_t>; + + } // namespace detail + + /** + * @brief `domain`s allow for grouping NVTX events into a single scope to + * differentiate them from events in other `domain`s. + * + * By default, all NVTX constructs are placed in the "global" NVTX domain. + * + * A custom `domain` may be used in order to differentiate a library's or + * application's NVTX events from other events. + * + * `domain`s are expected to be long-lived and unique to a library or + * application. As such, it is assumed a domain's name is known at compile + * time. Therefore, all NVTX constructs that can be associated with a domain + * require the domain to be specified via a *type* `D` passed as an + * explicit template parameter. + * + * The type `domain::global` may be used to indicate that the global NVTX + * domain should be used. + * + * None of the C++ NVTX constructs require the user to manually construct a + * `domain` object. Instead, if a custom domain is desired, the user is + * expected to define a type `D` that contains a member + * `D::name` which resolves to either a `char const*` or `wchar_t + * const*`. The value of `D::name` is used to name and uniquely + * identify the custom domain. + * + * Upon the first use of an NVTX construct associated with the type + * `D`, the "construct on first use" pattern is used to construct a + * function local static `domain` object. All future NVTX constructs + * associated with `D` will use a reference to the previously + * constructed `domain` object. See `domain::get`. + * + * Example: + * \code{.cpp} + * // The type `my_domain` defines a `name` member used to name and identify + * // the `domain` object identified by `my_domain`. + * struct my_domain{ static constexpr char const* name{"my_domain"}; }; + * + * // The NVTX range `r` will be grouped with all other NVTX constructs + * // associated with `my_domain`. + * nvtx3::scoped_range_in r{}; + * + * // An alias can be created for a `scoped_range_in` in the custom domain + * using my_scoped_range = nvtx3::scoped_range_in; + * my_scoped_range my_range{}; + * + * // `domain::global` indicates that the global NVTX domain is used + * nvtx3::scoped_range_in r2{}; + * + * // For convenience, `nvtx3::scoped_range` is an alias for a range in the + * // global domain + * nvtx3::scoped_range r3{}; + * \endcode + */ + class domain + { + public: + domain(domain const&) = delete; + domain& operator=(domain const&) = delete; + domain(domain&&) = delete; + domain& operator=(domain&&) = delete; + + /** + * @brief Tag type for the "global" NVTX domain. + * + * This type may be passed as a template argument to any function/class + * expecting a type to identify a domain to indicate that the global domain + * should be used. + * + * All NVTX events in the global domain across all libraries and + * applications will be grouped together. + * + */ + struct global + {}; + +# if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET + /** + * @brief Returns reference to an instance of a function local static + * `domain` object. + * + * Uses the "construct on first use" idiom to safely ensure the `domain` + * object is initialized exactly once upon first invocation of + * `domain::get()`. All following invocations will return a + * reference to the previously constructed `domain` object. See + * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use + * + * None of the constructs in this header require the user to directly invoke + * `domain::get`. It is automatically invoked when constructing objects like + * a `scoped_range_in` or `category`. Advanced users may wish to use + * `domain::get` for the convenience of the "construct on first use" idiom + * when using domains with their own use of the NVTX C API. + * + * This function is threadsafe as of C++11. If two or more threads call + * `domain::get` concurrently, exactly one of them is guaranteed + * to construct the `domain` object and the other(s) will receive a + * reference to the object after it is fully constructed. + * + * The domain's name is specified via the type `D` pass as an + * explicit template parameter. `D` is required to contain a + * member `D::name` that resolves to either a `char const*` or + * `wchar_t const*`. The value of `D::name` is used to name and + * uniquely identify the `domain`. + * + * Example: + * \code{.cpp} + * // The type `my_domain` defines a `name` member used to name and identify + * // the `domain` object identified by `my_domain`. + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * + * auto& D1 = domain::get(); // First invocation constructs a + * // `domain` with the name "my domain" + * + * auto& D2 = domain::get(); // Quickly returns reference to + * // previously constructed `domain`. + * \endcode + * + * @tparam D Type that contains a `D::name` member used to + * name the `domain` object. + * @return Reference to the `domain` corresponding to the type `D`. + */ + template ::value, int>::type = 0> + static domain const& get() noexcept + { + static domain const d(D::name); + return d; + } + + /** + * @brief Overload of `domain::get` to provide a clear compile error when + * `D` has a `name` member that is not directly convertible to either + * `char const*` or `wchar_t const*`. + */ + template ::value, int>::type = 0> + static domain const& get() noexcept + { + NVTX3_STATIC_ASSERT(detail::always_false::value, + "Type used to identify an NVTX domain must contain a static constexpr member " + "called 'name' of type const char* or const wchar_t* -- 'name' member is not " + "convertible to either of those types"); + static domain const unused; + return unused; // Function must compile for static_assert to be triggered + } + + /** + * @brief Overload of `domain::get` to provide a clear compile error when + * `D` does not have a `name` member. + */ + template ::value, int>::type = 0> + static domain const& get() noexcept + { + NVTX3_STATIC_ASSERT(detail::always_false::value, + "Type used to identify an NVTX domain must contain a static constexpr member " + "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); + static domain const unused; + return unused; // Function must compile for static_assert to be triggered + } +# else + template + static domain const& get() noexcept + { + static domain const d(D::name); + return d; + } +# endif + + /** + * @brief Conversion operator to `nvtxDomainHandle_t`. + * + * Allows transparently passing a domain object into an API expecting a + * native `nvtxDomainHandle_t` object. + */ + operator nvtxDomainHandle_t() const noexcept + { + return _domain; + } + + private: + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(char const* name) noexcept + : _domain{nvtxDomainCreateA(name)} + {} + + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(wchar_t const* name) noexcept + : _domain{nvtxDomainCreateW(name)} + {} + + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(std::string const& name) noexcept + : domain{name.c_str()} + {} + + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(std::wstring const& name) noexcept + : domain{name.c_str()} + {} + + /** + * @brief Default constructor creates a `domain` representing the + * "global" NVTX domain. + * + * All events not associated with a custom `domain` are grouped in the + * "global" NVTX domain. + * + */ + domain() noexcept {} + + /** + * @brief Intentionally avoid calling nvtxDomainDestroy on the `domain` object. + * + * No currently-available tools attempt to free domain resources when the + * nvtxDomainDestroy function is called, due to the thread-safety and + * efficiency challenges of freeing thread-local storage for other threads. + * Since libraries may be disallowed from introducing static destructors, + * and destroying the domain is likely to have no effect, the destructor + * for `domain` intentionally chooses to not destroy the domain. + * + * In a situation where domain destruction is necessary, either manually + * call nvtxDomainDestroy on the domain's handle, or make a class that + * derives from `domain` and calls nvtxDomainDestroy in its destructor. + */ + ~domain() = default; + + private: + nvtxDomainHandle_t const _domain{}; ///< The `domain`s NVTX handle + }; + + /** + * @brief Returns reference to the `domain` object that represents the global + * NVTX domain. + * + * This specialization for `domain::global` returns a default constructed, + * `domain` object for use when the "global" domain is desired. + * + * All NVTX events in the global domain across all libraries and applications + * will be grouped together. + * + * @return Reference to the `domain` corresponding to the global NVTX domain. + * + */ + template <> + inline domain const& domain::get() noexcept + { + static domain const d{}; + return d; + } + + /** + * @brief Indicates the values of the red, green, and blue color channels for + * an RGB color to use as an event attribute (assumes no transparency). + * + */ + struct rgb + { + /// Type used for component values + using component_type = uint8_t; + + /** + * @brief Construct a rgb with red, green, and blue channels + * specified by `red_`, `green_`, and `blue_`, respectively. + * + * Valid values are in the range `[0,255]`. + * + * @param red_ Value of the red channel + * @param green_ Value of the green channel + * @param blue_ Value of the blue channel + */ + constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept + : red{red_} + , green{green_} + , blue{blue_} + {} + + component_type red{}; ///< Red channel value + component_type green{}; ///< Green channel value + component_type blue{}; ///< Blue channel value + }; + + /** + * @brief Indicates the value of the alpha, red, green, and blue color + * channels for an ARGB color to use as an event attribute. + * + */ + struct argb final : rgb + { + /** + * @brief Construct an argb with alpha, red, green, and blue channels + * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively. + * + * Valid values are in the range `[0,255]`. + * + * @param alpha_ Value of the alpha channel (opacity) + * @param red_ Value of the red channel + * @param green_ Value of the green channel + * @param blue_ Value of the blue channel + * + */ + constexpr argb(component_type alpha_, component_type red_, component_type green_, component_type blue_) noexcept + : rgb{red_, green_, blue_} + , alpha{alpha_} + {} + + component_type alpha{}; ///< Alpha channel value + }; + + /** + * @brief Represents a custom color that can be associated with an NVTX event + * via it's `event_attributes`. + * + * Specifying colors for NVTX events is a convenient way to visually + * differentiate among different events in a visualization tool such as Nsight + * Systems. + * + */ + class color + { + public: + /// Type used for the color's value + using value_type = uint32_t; + + /** + * @brief Constructs a `color` using the value provided by `hex_code`. + * + * `hex_code` is expected to be a 4 byte argb hex code. + * + * The most significant byte indicates the value of the alpha channel + * (opacity) (0-255) + * + * The next byte indicates the value of the red channel (0-255) + * + * The next byte indicates the value of the green channel (0-255) + * + * The least significant byte indicates the value of the blue channel + * (0-255) + * + * @param hex_code The hex code used to construct the `color` + */ + constexpr explicit color(value_type hex_code) noexcept + : _value{hex_code} + {} + + /** + * @brief Construct a `color` using the alpha, red, green, blue components + * in `argb`. + * + * @param argb The alpha, red, green, blue components of the desired `color` + */ + constexpr color(argb argb_) noexcept + : color{from_bytes_msb_to_lsb(argb_.alpha, argb_.red, argb_.green, argb_.blue)} + {} + + /** + * @brief Construct a `color` using the red, green, blue components in + * `rgb`. + * + * Uses maximum value for the alpha channel (opacity) of the `color`. + * + * @param rgb The red, green, blue components of the desired `color` + */ + constexpr color(rgb rgb_) noexcept + : color{from_bytes_msb_to_lsb(0xFF, rgb_.red, rgb_.green, rgb_.blue)} + {} + + /** + * @brief Returns the `color`s argb hex code + * + */ + constexpr value_type get_value() const noexcept + { + return _value; + } + + /** + * @brief Return the NVTX color type of the color. + * + */ + constexpr nvtxColorType_t get_type() const noexcept + { + return _type; + } + + color() = delete; + ~color() = default; + color(color const&) = default; + color& operator=(color const&) = default; + color(color&&) = default; + color& operator=(color&&) = default; + + private: + /** + * @brief Constructs an unsigned, 4B integer from the component bytes in + * most to least significant byte order. + * + */ + constexpr static value_type + from_bytes_msb_to_lsb(uint8_t byte3, uint8_t byte2, uint8_t byte1, uint8_t byte0) noexcept + { + return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0}; + } + + value_type _value{}; ///< color's argb color code + nvtxColorType_t _type{NVTX_COLOR_ARGB}; ///< NVTX color type code + }; + + /** + * @brief Object for intra-domain grouping of NVTX events. + * + * A `category` is simply an integer id that allows for fine-grain grouping of + * NVTX events. For example, one might use separate categories for IO, memory + * allocation, compute, etc. + * + * Example: + * \code{.cpp} + * nvtx3::category cat1{1}; + * + * // Range `r1` belongs to the category identified by the value `1`. + * nvtx3::scoped_range r1{cat1}; + * + * // Range `r2` belongs to the same category as `r1` + * nvtx3::scoped_range r2{nvtx3::category{1}}; + * \endcode + * + * To associate a name string with a category id, see `named_category`. + * + */ + class category + { + public: + /// Type used for `category`s integer id. + using id_type = uint32_t; + + /** + * @brief Construct a `category` with the specified `id`. + * + * The `category` will be unnamed and identified only by its `id` value. + * + * All `category`s in a domain sharing the same `id` are equivalent. + * + * @param[in] id The `category`'s identifying value + */ + constexpr explicit category(id_type id) noexcept + : id_{id} + {} + + /** + * @brief Returns the id of the category. + * + */ + constexpr id_type get_id() const noexcept + { + return id_; + } + + category() = delete; + ~category() = default; + category(category const&) = default; + category& operator=(category const&) = default; + category(category&&) = default; + category& operator=(category&&) = default; + + private: + id_type id_{}; ///< category's unique identifier + }; + + /** + * @brief A `category` with an associated name string. + * + * Associates a `name` string with a category `id` to help differentiate among + * categories. + * + * For any given category id `Id`, a `named_category(Id, "name")` should only + * be constructed once and reused throughout an application. This can be done + * by either explicitly creating static `named_category` objects, or using the + * `named_category::get` construct on first use helper (recommended). + * + * Creating two or more `named_category` objects with the same value for `id` + * in the same domain results in undefined behavior. + * + * Similarly, behavior is undefined when a `named_category` and `category` + * share the same value of `id`. + * + * Example: + * \code{.cpp} + * // Explicitly constructed, static `named_category` in global domain: + * static nvtx3::named_category static_category{42, "my category"}; + * + * // Range `r` associated with category id `42` + * nvtx3::scoped_range r{static_category}; + * + * // OR use construct on first use: + * + * // Define a type with `name` and `id` members + * struct my_category { + * static constexpr char const* name{"my category"}; // category name + * static constexpr uint32_t id{42}; // category id + * }; + * + * // Use construct on first use to name the category id `42` + * // with name "my category" + * auto& cat = named_category_in::get(); + * + * // Range `r` associated with category id `42` + * nvtx3::scoped_range r{cat}; + * \endcode + * + * `named_category_in`'s association of a name to a category id is local to + * the domain specified by the type `D`. An id may have a different name in + * another domain. + * + * @tparam D Type containing `name` member used to identify the `domain` to + * which the `named_category_in` belongs. Else, `domain::global` to indicate + * that the global NVTX domain should be used. + */ + template + class named_category_in final : public category + { + public: +# if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET + /** + * @brief Returns a global instance of a `named_category_in` as a + * function-local static. + * + * Creates a `named_category_in` with name and id specified by the contents + * of a type `C`. `C::name` determines the name and `C::id` determines the + * category id. + * + * This function is useful for constructing a named `category` exactly once + * and reusing the same instance throughout an application. + * + * Example: + * \code{.cpp} + * // Define a type with `name` and `id` members + * struct my_category { + * static constexpr char const* name{"my category"}; // category name + * static constexpr uint32_t id{42}; // category id + * }; + * + * // Use construct on first use to name the category id `42` + * // with name "my category" + * auto& cat = named_category_in::get(); + * + * // Range `r` associated with category id `42` + * nvtx3::scoped_range r{cat}; + * \endcode + * + * Uses the "construct on first use" idiom to safely ensure the `category` + * object is initialized exactly once. See + * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use + * + * @tparam C Type containing a member `C::name` that resolves to either a + * `char const*` or `wchar_t const*` and `C::id`. + */ + template < + typename C, + typename std::enable_if::value && detail::is_uint32::value, + int>::type = 0> + static named_category_in const& get() noexcept + { + static named_category_in const cat(C::id, C::name); + return cat; + } + + /** + * @brief Overload of `named_category_in::get` to provide a clear compile error + * when `C` has the required `name` and `id` members, but they are not the + * required types. `name` must be directly convertible to `char const*` or + * `wchar_t const*`, and `id` must be `uint32_t`. + */ + template ::value + || !detail::is_uint32::value, + int>::type = 0> + static named_category_in const& get() noexcept + { + NVTX3_STATIC_ASSERT(detail::is_c_string::value, + "Type used to name an NVTX category must contain a static constexpr member " + "called 'name' of type const char* or const wchar_t* -- 'name' member is not " + "convertible to either of those types"); + NVTX3_STATIC_ASSERT(detail::is_uint32::value, + "Type used to name an NVTX category must contain a static constexpr member " + "called 'id' of type uint32_t -- 'id' member is the wrong type"); + static named_category_in const unused; + return unused; // Function must compile for static_assert to be triggered + } + + /** + * @brief Overload of `named_category_in::get` to provide a clear compile error + * when `C` does not have the required `name` and `id` members. + */ + template ::value || !detail::has_id::value, int>::type = 0> + static named_category_in const& get() noexcept + { + NVTX3_STATIC_ASSERT(detail::has_name::value, + "Type used to name an NVTX category must contain a static constexpr member " + "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); + NVTX3_STATIC_ASSERT(detail::has_id::value, + "Type used to name an NVTX category must contain a static constexpr member " + "called 'id' of type uint32_t -- 'id' member is missing"); + static named_category_in const unused; + return unused; // Function must compile for static_assert to be triggered + } +# else + template + static named_category_in const& get() noexcept + { + static named_category_in const cat(C::id, C::name); + return cat; + } +# endif + + private: + // Default constructor is only used internally for static_assert(false) cases. + named_category_in() noexcept + : category{0} + {} + + public: + /** + * @brief Construct a `named_category_in` with the specified `id` and `name`. + * + * The name `name` will be registered with `id`. + * + * Every unique value of `id` should only be named once. + * + * @param[in] id The category id to name + * @param[in] name The name to associated with `id` + */ + named_category_in(id_type id, char const* name) noexcept + : category{id} + { +# ifndef NVTX_DISABLE + nvtxDomainNameCategoryA(domain::get(), get_id(), name); +# else + (void) id; + (void) name; +# endif + }; + + /** + * @brief Construct a `named_category_in` with the specified `id` and `name`. + * + * The name `name` will be registered with `id`. + * + * Every unique value of `id` should only be named once. + * + * @param[in] id The category id to name + * @param[in] name The name to associated with `id` + */ + named_category_in(id_type id, wchar_t const* name) noexcept + : category{id} + { +# ifndef NVTX_DISABLE + nvtxDomainNameCategoryW(domain::get(), get_id(), name); +# else + (void) id; + (void) name; +# endif + }; + }; + + /** + * @brief Alias for a `named_category_in` in the global NVTX domain. + * + */ + using named_category = named_category_in; + + /** + * @brief A message registered with NVTX. + * + * Normally, associating a `message` with an NVTX event requires copying the + * contents of the message string. This may cause non-trivial overhead in + * highly performance sensitive regions of code. + * + * message registration is an optimization to lower the overhead of + * associating a message with an NVTX event. Registering a message yields a + * handle that is inexpensive to copy that may be used in place of a message + * string. + * + * A particular message should only be registered once and the handle + * reused throughout the rest of the application. This can be done by either + * explicitly creating static `registered_string_in` objects, or using the + * `registered_string_in::get` construct on first use helper (recommended). + * + * Example: + * \code{.cpp} + * // Explicitly constructed, static `registered_string` in my_domain: + * static registered_string_in static_message{"message"}; + * + * // "message" is associated with the range `r` + * nvtx3::scoped_range r{static_message}; + * + * // Or use construct on first use: + * + * // Define a type with a `message` member that defines the contents of the + * // registered string + * struct my_message{ static constexpr char const* message{ "my message" }; }; + * + * // Uses construct on first use to register the contents of + * // `my_message::message` + * auto& msg = registered_string_in::get(); + * + * // "my message" is associated with the range `r` + * nvtx3::scoped_range r{msg}; + * \endcode + * + * `registered_string_in`s are local to a particular domain specified via + * the type `D`. + * + * @tparam D Type containing `name` member used to identify the `domain` to + * which the `registered_string_in` belongs. Else, `domain::global` to indicate + * that the global NVTX domain should be used. + */ + template + class registered_string_in + { + public: +# if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET + /** + * @brief Returns a global instance of a `registered_string_in` as a function + * local static. + * + * Provides a convenient way to register a message with NVTX without having + * to explicitly register the message. + * + * Upon first invocation, constructs a `registered_string_in` whose contents + * are specified by `message::message`. + * + * All future invocations will return a reference to the object constructed + * in the first invocation. + * + * Example: + * \code{.cpp} + * // Define a type with a `message` member that defines the contents of the + * // registered string + * struct my_message{ static constexpr char const* message{ "my message" }; + * }; + * + * // Uses construct on first use to register the contents of + * // `my_message::message` + * auto& msg = registered_string_in::get(); + * + * // "my message" is associated with the range `r` + * nvtx3::scoped_range r{msg}; + * \endcode + * + * @tparam M Type required to contain a member `M::message` that + * resolves to either a `char const*` or `wchar_t const*` used as the + * registered string's contents. + * @return Reference to a `registered_string_in` associated with the type `M`. + */ + template ::value, int>::type = 0> + static registered_string_in const& get() noexcept + { + static registered_string_in const regstr(M::message); + return regstr; + } + + /** + * @brief Overload of `registered_string_in::get` to provide a clear compile error + * when `M` has a `message` member that is not directly convertible to either + * `char const*` or `wchar_t const*`. + */ + template ::value, int>::type = 0> + static registered_string_in const& get() noexcept + { + NVTX3_STATIC_ASSERT(detail::always_false::value, + "Type used to register an NVTX string must contain a static constexpr member " + "called 'message' of type const char* or const wchar_t* -- 'message' member is " + "not convertible to either of those types"); + static registered_string_in const unused; + return unused; // Function must compile for static_assert to be triggered + } + + /** + * @brief Overload of `registered_string_in::get` to provide a clear compile error when + * `M` does not have a `message` member. + */ + template ::value, int>::type = 0> + static registered_string_in const& get() noexcept + { + NVTX3_STATIC_ASSERT(detail::always_false::value, + "Type used to register an NVTX string must contain a static constexpr member " + "called 'message' of type const char* or const wchar_t* -- 'message' member " + "is missing"); + static registered_string_in const unused; + return unused; // Function must compile for static_assert to be triggered + } +# else + template + static registered_string_in const& get() noexcept + { + static registered_string_in const regstr(M::message); + return regstr; + } +# endif + + /** + * @brief Constructs a `registered_string_in` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_string_in(char const* msg) noexcept + : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} + {} + + /** + * @brief Constructs a `registered_string_in` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_string_in(std::string const& msg) noexcept + : registered_string_in{msg.c_str()} + {} + + /** + * @brief Constructs a `registered_string_in` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_string_in(wchar_t const* msg) noexcept + : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} + {} + + /** + * @brief Constructs a `registered_string_in` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_string_in(std::wstring const& msg) noexcept + : registered_string_in{msg.c_str()} + {} + + /** + * @brief Returns the registered string's handle + * + */ + nvtxStringHandle_t get_handle() const noexcept + { + return handle_; + } + + private: + // Default constructor is only used internally for static_assert(false) cases. + registered_string_in() noexcept {}; + + public: + ~registered_string_in() = default; + registered_string_in(registered_string_in const&) = default; + registered_string_in& operator=(registered_string_in const&) = default; + registered_string_in(registered_string_in&&) = default; + registered_string_in& operator=(registered_string_in&&) = default; + + private: + nvtxStringHandle_t handle_{}; ///< The handle returned from + ///< registering the message with NVTX + }; + + /** + * @brief Alias for a `registered_string_in` in the global NVTX domain. + * + */ + using registered_string = registered_string_in; + + /** + * @brief Allows associating a message string with an NVTX event via + * its `EventAttribute`s. + * + * Associating a `message` with an NVTX event through its `event_attributes` + * allows for naming events to easily differentiate them from other events. + * + * Every time an NVTX event is created with an associated `message`, the + * contents of the message string must be copied. This may cause non-trivial + * overhead in highly performance sensitive sections of code. Use of a + * `nvtx3::registered_string` is recommended in these situations. + * + * Example: + * \code{.cpp} + * // Creates an `event_attributes` with message "message 0" + * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}}; + * + * // `range0` contains message "message 0" + * nvtx3::scoped_range range0{attr0}; + * + * // `std::string` and string literals are implicitly assumed to be + * // the contents of an `nvtx3::message` + * // Creates an `event_attributes` with message "message 1" + * nvtx3::event_attributes attr1{"message 1"}; + * + * // `range1` contains message "message 1" + * nvtx3::scoped_range range1{attr1}; + * + * // `range2` contains message "message 2" + * nvtx3::scoped_range range2{nvtx3::Mesage{"message 2"}}; + * + * // `std::string` and string literals are implicitly assumed to be + * // the contents of an `nvtx3::message` + * // `range3` contains message "message 3" + * nvtx3::scoped_range range3{"message 3"}; + * \endcode + */ + class message + { + public: + using value_type = nvtxMessageValue_t; + + /** + * @brief Construct a `message` whose contents are specified by `msg`. + * + * @param msg The contents of the message + */ + NVTX3_CONSTEXPR_IF_CPP14 message(char const* msg) noexcept + : type_{NVTX_MESSAGE_TYPE_ASCII} + { + value_.ascii = msg; + } + + /** + * @brief Construct a `message` whose contents are specified by `msg`. + * + * @param msg The contents of the message + */ + message(std::string const& msg) noexcept + : message{msg.c_str()} + {} + + /** + * @brief Disallow construction for `std::string` r-value + * + * `message` is a non-owning type and therefore cannot take ownership of an + * r-value. Therefore, constructing from an r-value is disallowed to prevent + * a dangling pointer. + * + */ + message(std::string&&) = delete; + + /** + * @brief Construct a `message` whose contents are specified by `msg`. + * + * @param msg The contents of the message + */ + NVTX3_CONSTEXPR_IF_CPP14 message(wchar_t const* msg) noexcept + : type_{NVTX_MESSAGE_TYPE_UNICODE} + { + value_.unicode = msg; + } + + /** + * @brief Construct a `message` whose contents are specified by `msg`. + * + * @param msg The contents of the message + */ + message(std::wstring const& msg) noexcept + : message{msg.c_str()} + {} + + /** + * @brief Disallow construction for `std::wstring` r-value + * + * `message` is a non-owning type and therefore cannot take ownership of an + * r-value. Therefore, constructing from an r-value is disallowed to prevent + * a dangling pointer. + * + */ + message(std::wstring&&) = delete; + + /** + * @brief Construct a `message` from a `registered_string_in`. + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `registered_string_in` belongs. Else, `domain::global` to + * indicate that the global NVTX domain should be used. + * @param msg The message that has already been registered with NVTX. + */ + template + NVTX3_CONSTEXPR_IF_CPP14 message(registered_string_in const& msg) noexcept + : type_{NVTX_MESSAGE_TYPE_REGISTERED} + { + value_.registered = msg.get_handle(); + } + + /** + * @brief Construct a `message` from NVTX C API type and value. + * + * @param type nvtxMessageType_t enum value indicating type of the payload + * @param value nvtxMessageValue_t union containing message + */ + constexpr message(nvtxMessageType_t const& type, nvtxMessageValue_t const& value) noexcept + : type_{type} + , value_(value) + {} + + /** + * @brief Construct a `message` from NVTX C API registered string handle. + * + * @param handle nvtxStringHandle_t value of registered string handle + */ + NVTX3_CONSTEXPR_IF_CPP14 message(nvtxStringHandle_t handle) noexcept + : type_{NVTX_MESSAGE_TYPE_REGISTERED} + { + value_.registered = handle; + } + + /** + * @brief Return the union holding the value of the message. + * + */ + constexpr value_type get_value() const noexcept + { + return value_; + } + + /** + * @brief Return the type information about the value the union holds. + * + */ + constexpr nvtxMessageType_t get_type() const noexcept + { + return type_; + } + + private: + nvtxMessageType_t type_{}; ///< message type + nvtxMessageValue_t value_{}; ///< message contents + }; + + /** + * @brief A numerical value that can be associated with an NVTX event via + * its `event_attributes`. + * + * Example: + * \code{.cpp} + * // Constructs a payload from the int32_t value 42 + * nvtx3:: event_attributes attr{nvtx3::payload{42}}; + * + * // `range0` will have an int32_t payload of 42 + * nvtx3::scoped_range range0{attr}; + * + * // range1 has double payload of 3.14 + * nvtx3::scoped_range range1{nvtx3::payload{3.14}}; + * \endcode + */ + class payload + { + public: + using value_type = typename nvtxEventAttributes_v2::payload_t; + + /** + * @brief Construct a `payload` from a signed, 8 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int64_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_INT64} + , value_{} + { + value_.llValue = value; + } + + /** + * @brief Construct a `payload` from a signed, 4 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int32_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_INT32} + , value_{} + { + value_.iValue = value; + } + + /** + * @brief Construct a `payload` from an unsigned, 8 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint64_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64} + , value_{} + { + value_.ullValue = value; + } + + /** + * @brief Construct a `payload` from an unsigned, 4 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint32_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32} + , value_{} + { + value_.uiValue = value; + } + + /** + * @brief Construct a `payload` from a single-precision floating point + * value. + * + * @param value Value to use as contents of the payload + */ + NVTX3_CONSTEXPR_IF_CPP14 explicit payload(float value) noexcept + : type_{NVTX_PAYLOAD_TYPE_FLOAT} + , value_{} + { + value_.fValue = value; + } + + /** + * @brief Construct a `payload` from a double-precision floating point + * value. + * + * @param value Value to use as contents of the payload + */ + NVTX3_CONSTEXPR_IF_CPP14 explicit payload(double value) noexcept + : type_{NVTX_PAYLOAD_TYPE_DOUBLE} + , value_{} + { + value_.dValue = value; + } + + /** + * @brief Construct a `payload` from NVTX C API type and value. + * + * @param type nvtxPayloadType_t enum value indicating type of the payload + * @param value nvtxEventAttributes_t::payload_t union containing payload + */ + constexpr payload(nvtxPayloadType_t const& type, value_type const& value) noexcept + : type_{type} + , value_(value) + {} + + /** + * @brief Return the union holding the value of the payload + * + */ + constexpr value_type get_value() const noexcept + { + return value_; + } + + /** + * @brief Return the information about the type the union holds. + * + */ + constexpr nvtxPayloadType_t get_type() const noexcept + { + return type_; + } + + private: + nvtxPayloadType_t type_; ///< Type of the payload value + value_type value_; ///< Union holding the payload value + }; + + /** + * @brief Describes the attributes of a NVTX event. + * + * NVTX events can be customized via four "attributes": + * + * - color: color used to visualize the event in tools such as Nsight + * Systems. See `color`. + * - message: Custom message string. See `message`. + * - payload: User-defined numerical value. See `payload`. + * - category: Intra-domain grouping. See `category`. + * + * These component attributes are specified via an `event_attributes` object. + * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and + * `nvtx3::category` for how these individual attributes are constructed. + * + * While it is possible to specify all four attributes, it is common to want + * to only specify a subset of attributes and use default values for the + * others. For convenience, `event_attributes` can be constructed from any + * number of attribute components in any order. + * + * Example: + * \code{.cpp} + * // Set message, same as using nvtx3::message{"message"} + * event_attributes attr{"message"}; + * + * // Set message and color + * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; + * + * // Set message, color, payload, category + * event_attributes attr{"message", + * nvtx3::rgb{127, 255, 0}, + * nvtx3::payload{42}, + * nvtx3::category{1}}; + * + * // Same as above -- can use any order of arguments + * event_attributes attr{nvtx3::payload{42}, + * nvtx3::category{1}, + * "message", + * nvtx3::rgb{127, 255, 0}}; + * + * // Multiple arguments of the same type are allowed, but only the first is + * // used -- in this example, payload is set to 42: + * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; + * + * // Range `r` will be customized according the attributes in `attr` + * nvtx3::scoped_range r{attr}; + * + * // For convenience, `event_attributes` constructor arguments may be passed + * // to the `scoped_range_in` contructor -- they are forwarded to the + * // `event_attributes` constructor + * nvtx3::scoped_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; + * + * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: + * using namespace nvtx3; + * scoped_range r{payload{42}, category{1}, "message"}; + * \endcode + * + */ + class event_attributes + { + public: + using value_type = nvtxEventAttributes_t; + + /** + * @brief Default constructor creates an `event_attributes` with no + * category, color, payload, nor message. + */ + constexpr event_attributes() noexcept + : attributes_{ + NVTX_VERSION, // version + sizeof(nvtxEventAttributes_t), // size + 0, // category + NVTX_COLOR_UNKNOWN, // color type + 0, // color value + NVTX_PAYLOAD_UNKNOWN, // payload type + 0, // reserved 4B + {0}, // payload value (union) // NOTE(bgruber): added braces + NVTX_MESSAGE_UNKNOWN, // message type + {0} // message value (union) // NOTE(bgruber): added braces + } + {} + + /** + * @brief Variadic constructor where the first argument is a `category`. + * + * Sets the value of the `EventAttribute`s category based on `c` and + * forwards the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(category const& c, Args const&... args) noexcept + : event_attributes(args...) + { + attributes_.category = c.get_id(); + } + + /** + * @brief Variadic constructor where the first argument is a `color`. + * + * Sets the value of the `EventAttribute`s color based on `c` and forwards + * the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(color const& c, Args const&... args) noexcept + : event_attributes(args...) + { + attributes_.color = c.get_value(); + attributes_.colorType = c.get_type(); + } + + /** + * @brief Variadic constructor where the first argument is a `payload`. + * + * Sets the value of the `EventAttribute`s payload based on `p` and forwards + * the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(payload const& p, Args const&... args) noexcept + : event_attributes(args...) + { + attributes_.payload = p.get_value(); + attributes_.payloadType = p.get_type(); + } + + /** + * @brief Variadic constructor where the first argument is a `message`. + * + * Sets the value of the `EventAttribute`s message based on `m` and forwards + * the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(message const& m, Args const&... args) noexcept + : event_attributes(args...) + { + attributes_.message = m.get_value(); + attributes_.messageType = m.get_type(); + } + + ~event_attributes() = default; + event_attributes(event_attributes const&) = default; + event_attributes& operator=(event_attributes const&) = default; + event_attributes(event_attributes&&) = default; + event_attributes& operator=(event_attributes&&) = default; + + /** + * @brief Get raw pointer to underlying NVTX attributes object. + * + */ + constexpr value_type const* get() const noexcept + { + return &attributes_; + } + + private: + value_type attributes_{}; ///< The NVTX attributes structure + }; + + /** + * @brief A RAII object for creating a NVTX range local to a thread within a + * domain. + * + * When constructed, begins a nested NVTX range on the calling thread in the + * specified domain. Upon destruction, ends the NVTX range. + * + * Behavior is undefined if a `scoped_range_in` object is + * created/destroyed on different threads. + * + * `scoped_range_in` is neither moveable nor copyable. + * + * `scoped_range_in`s may be nested within other ranges. + * + * The domain of the range is specified by the template type parameter `D`. + * By default, the `domain::global` is used, which scopes the range to the + * global NVTX domain. The convenience alias `scoped_range` is provided for + * ranges scoped to the global domain. + * + * A custom domain can be defined by creating a type, `D`, with a static + * member `D::name` whose value is used to name the domain associated with + * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*` + * + * Example: + * \code{.cpp} + * // Define a type `my_domain` with a member `name` used to name the domain + * // associated with the type `my_domain`. + * struct my_domain{ + * static constexpr char const* name{"my domain"}; + * }; + * \endcode + * + * Usage: + * \code{.cpp} + * nvtx3::scoped_range_in r1{"range 1"}; // Range in my domain + * + * // Three equivalent ways to make a range in the global domain: + * nvtx3::scoped_range_in r2{"range 2"}; + * nvtx3::scoped_range_in<> r3{"range 3"}; + * nvtx3::scoped_range r4{"range 4"}; + * + * // Create an alias to succinctly make ranges in my domain: + * using my_scoped_range = nvtx3::scoped_range_in; + * + * my_scoped_range r3{"range 3"}; + * \endcode + */ + template + class scoped_range_in + { + public: + /** + * @brief Construct a `scoped_range_in` with the specified + * `event_attributes` + * + * Example: + * \code{cpp} + * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; + * nvtx3::scoped_range range{attr}; // Creates a range with message contents + * // "msg" and green color + * \endcode + * + * @param[in] attr `event_attributes` that describes the desired attributes + * of the range. + */ + explicit scoped_range_in(event_attributes const& attr) noexcept + { +# ifndef NVTX_DISABLE + nvtxDomainRangePushEx(domain::get(), attr.get()); +# else + (void) attr; +# endif + } + + /** + * @brief Constructs a `scoped_range_in` from the constructor arguments + * of an `event_attributes`. + * + * Forwards the arguments `args...` to construct an + * `event_attributes` object. The `event_attributes` object is then + * associated with the `scoped_range_in`. + * + * For more detail, see `event_attributes` documentation. + * + * Example: + * \code{cpp} + * // Creates a range with message "message" and green color + * nvtx3::scoped_range r{"message", nvtx3::rgb{127,255,0}}; + * \endcode + * + * @param[in] args Arguments to used to construct an `event_attributes` associated with this + * range. + * + */ + template + explicit scoped_range_in(Args const&... args) noexcept + : scoped_range_in{event_attributes{args...}} + {} + + /** + * @brief Default constructor creates a `scoped_range_in` with no + * message, color, payload, nor category. + * + */ + scoped_range_in() noexcept + : scoped_range_in{event_attributes{}} + {} + + /** + * @brief Delete `operator new` to disallow heap allocated objects. + * + * `scoped_range_in` must follow RAII semantics to guarantee proper push/pop semantics. + * + */ + void* operator new(std::size_t) = delete; + + scoped_range_in(scoped_range_in const&) = delete; + scoped_range_in& operator=(scoped_range_in const&) = delete; + scoped_range_in(scoped_range_in&&) = delete; + scoped_range_in& operator=(scoped_range_in&&) = delete; + + /** + * @brief Destroy the scoped_range_in, ending the NVTX range event. + */ + ~scoped_range_in() noexcept + { +# ifndef NVTX_DISABLE + nvtxDomainRangePop(domain::get()); +# endif + } + }; + + /** + * @brief Alias for a `scoped_range_in` in the global NVTX domain. + * + */ + using scoped_range = scoped_range_in; + + namespace detail + { + + /// @cond internal + template + class optional_scoped_range_in + { + public: + optional_scoped_range_in() = default; + + void begin(event_attributes const& attr) noexcept + { +# ifndef NVTX_DISABLE + // This class is not meant to be part of the public NVTX C++ API and should + // only be used in the `NVTX3_FUNC_RANGE_IF` and `NVTX3_FUNC_RANGE_IF_IN` + // macros. However, to prevent developers from misusing this class, make + // sure to not start multiple ranges. + if (initialized) + { + return; + } + + nvtxDomainRangePushEx(domain::get(), attr.get()); + initialized = true; +# endif + } + + ~optional_scoped_range_in() noexcept + { +# ifndef NVTX_DISABLE + if (initialized) + { + nvtxDomainRangePop(domain::get()); + } +# endif + } + + void* operator new(std::size_t) = delete; + optional_scoped_range_in(optional_scoped_range_in const&) = delete; + optional_scoped_range_in& operator=(optional_scoped_range_in const&) = delete; + optional_scoped_range_in(optional_scoped_range_in&&) = delete; + optional_scoped_range_in& operator=(optional_scoped_range_in&&) = delete; + + private: +# ifndef NVTX_DISABLE + bool initialized = false; +# endif + }; + /// @endcond + + } // namespace detail + + /** + * @brief Handle used for correlating explicit range start and end events. + * + * A handle is "null" if it does not correspond to any range. + * + */ + struct range_handle + { + /// Type used for the handle's value + using value_type = nvtxRangeId_t; + + /** + * @brief Construct a `range_handle` from the given id. + * + */ + constexpr explicit range_handle(value_type id) noexcept + : _range_id{id} + {} + + /** + * @brief Constructs a null range handle. + * + * A null range_handle corresponds to no range. Calling `end_range` on a + * null handle is undefined behavior when a tool is active. + * + */ + constexpr range_handle() noexcept = default; + + /** + * @brief Checks whether this handle is null + * + * Provides contextual conversion to `bool`. + * + * \code{cpp} + * range_handle handle{}; + * if (handle) {...} + * \endcode + * + */ + constexpr explicit operator bool() const noexcept + { + return get_value() != null_range_id; + }; + + /** + * @brief Implicit conversion from `nullptr` constructs a null handle. + * + * Satisfies the "NullablePointer" requirement to make `range_handle` comparable with `nullptr`. + * + */ + constexpr range_handle(std::nullptr_t) noexcept {} + + /** + * @brief Returns the `range_handle`'s value + * + * @return value_type The handle's value + */ + constexpr value_type get_value() const noexcept + { + return _range_id; + } + + private: + /// Sentinel value for a null handle that corresponds to no range + static constexpr value_type null_range_id = nvtxRangeId_t{0}; + + value_type _range_id{null_range_id}; ///< The underlying NVTX range id + }; + + /** + * @brief Compares two range_handles for equality + * + * @param lhs The first range_handle to compare + * @param rhs The second range_handle to compare + */ + inline constexpr bool operator==(range_handle lhs, range_handle rhs) noexcept + { + return lhs.get_value() == rhs.get_value(); + } + + /** + * @brief Compares two range_handles for inequality + * + * @param lhs The first range_handle to compare + * @param rhs The second range_handle to compare + */ + inline constexpr bool operator!=(range_handle lhs, range_handle rhs) noexcept + { + return !(lhs == rhs); + } + + /** + * @brief Manually begin an NVTX range. + * + * Explicitly begins an NVTX range and returns a unique handle. To end the + * range, pass the handle to `end_range_in()`. + * + * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and + * `nvtx3::start_range_in(...)`. + * + * `start_range_in/end_range_in` are the most explicit and lowest level APIs + * provided for creating ranges. Use of `nvtx3::unique_range_in` should be + * preferred unless one is unable to tie the range to the lifetime of an object. + * + * Example: + * \code{.cpp} + * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; + * // Manually begin a range + * nvtx3::range_handle h = nvtx3::start_range_in(attr); + * ... + * nvtx3::end_range_in(h); // End the range + * \endcode + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the range belongs. Else, `domain::global` to indicate that the + * global NVTX domain should be used. + * @param[in] attr `event_attributes` that describes the desired attributes + * of the range. + * @return Unique handle to be passed to `end_range_in` to end the range. + */ + template + inline range_handle start_range_in(event_attributes const& attr) noexcept + { +# ifndef NVTX_DISABLE + return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())}; +# else + (void) attr; + return {}; +# endif + } + + /** + * @brief Manually begin an NVTX range. + * + * Explicitly begins an NVTX range and returns a unique handle. To end the + * range, pass the handle to `end_range_in()`. + * + * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and + * `nvtx3::start_range_in(...)`. + * + * `start_range_in/end_range_in` are the most explicit and lowest level APIs + * provided for creating ranges. Use of `nvtx3::unique_range_in` should be + * preferred unless one is unable to tie the range to the lifetime of an object. + * + * This overload uses `args...` to construct an `event_attributes` to + * associate with the range. For more detail, see `event_attributes`. + * + * Example: + * \code{cpp} + * // Manually begin a range + * nvtx3::range_handle h = nvtx3::start_range_in("msg", nvtx3::rgb{127,255,0}); + * ... + * nvtx3::end_range_in(h); // Ends the range + * \endcode + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the range belongs. Else, `domain::global` to indicate that the + * global NVTX domain should be used. + * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. + * @return Unique handle to be passed to `end_range` to end the range. + */ + template + inline range_handle start_range_in(Args const&... args) noexcept + { +# ifndef NVTX_DISABLE + return start_range_in(event_attributes{args...}); +# else + return {}; +# endif + } + + /** + * @brief Manually begin an NVTX range in the global domain. + * + * Explicitly begins an NVTX range and returns a unique handle. To end the + * range, pass the handle to `end_range()`. + * + * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and + * `nvtx3::start_range_in(...)`. + * + * `start_range/end_range` are the most explicit and lowest level APIs + * provided for creating ranges. Use of `nvtx3::unique_range` should be + * preferred unless one is unable to tie the range to the lifetime of an object. + * + * Example: + * \code{.cpp} + * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; + * // Manually begin a range + * nvtx3::range_handle h = nvtx3::start_range(attr); + * ... + * nvtx3::end_range(h); // End the range + * \endcode + * + * @param[in] attr `event_attributes` that describes the desired attributes + * of the range. + * @return Unique handle to be passed to `end_range_in` to end the range. + */ + inline range_handle start_range(event_attributes const& attr) noexcept + { +# ifndef NVTX_DISABLE + return start_range_in(attr); +# else + (void) attr; + return {}; +# endif + } + + /** + * @brief Manually begin an NVTX range in the global domain. + * + * Explicitly begins an NVTX range and returns a unique handle. To end the + * range, pass the handle to `end_range_in()`. + * + * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and + * `nvtx3::start_range_in(...)`. + * + * `start_range_in/end_range_in` are the most explicit and lowest level APIs + * provided for creating ranges. Use of `nvtx3::unique_range_in` should be + * preferred unless one is unable to tie the range to the lifetime of an object. + * + * This overload uses `args...` to construct an `event_attributes` to + * associate with the range. For more detail, see `event_attributes`. + * + * Example: + * \code{cpp} + * // Manually begin a range + * nvtx3::range_handle h = nvtx3::start_range("msg", nvtx3::rgb{127,255,0}); + * ... + * nvtx3::end_range(h); // Ends the range + * \endcode + * + * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. + * @return Unique handle to be passed to `end_range` to end the range. + */ + template + inline range_handle start_range(Args const&... args) noexcept + { +# ifndef NVTX_DISABLE + return start_range_in(args...); +# else + return {}; +# endif + } + + /** + * @brief Manually end the range associated with the handle `r` in domain `D`. + * + * Explicitly ends the NVTX range indicated by the handle `r` returned from a + * prior call to `start_range_in`. The range may end on a different thread + * from where it began. + * + * @tparam D Type containing `name` member used to identify the `domain` to + * which the range belongs. Else, `domain::global` to indicate that the global + * NVTX domain should be used. + * @param r Handle to a range started by a prior call to `start_range_in`. + * + * @warning The domain type specified as template parameter to this function + * must be the same that was specified on the associated `start_range_in` call. + */ + template + inline void end_range_in(range_handle r) noexcept + { +# ifndef NVTX_DISABLE + nvtxDomainRangeEnd(domain::get(), r.get_value()); +# else + (void) r; +# endif + } + + /** + * @brief Manually end the range associated with the handle `r` in the global + * domain. + * + * Explicitly ends the NVTX range indicated by the handle `r` returned from a + * prior call to `start_range`. The range may end on a different thread from + * where it began. + * + * @param r Handle to a range started by a prior call to `start_range`. + * + * @warning The domain type specified as template parameter to this function + * must be the same that was specified on the associated `start_range` call. + */ + inline void end_range(range_handle r) noexcept + { +# ifndef NVTX_DISABLE + end_range_in(r); +# else + (void) r; +# endif + } + + /** + * @brief A RAII object for creating a NVTX range within a domain that can + * be created and destroyed on different threads. + * + * When constructed, begins a NVTX range in the specified domain. Upon + * destruction, ends the NVTX range. + * + * Similar to `nvtx3::scoped_range_in`, with a few key differences: + * - `unique_range` objects can be destroyed in an order whereas `scoped_range` objects must be + * destroyed in exact reverse creation order + * - `unique_range` can start and end on different threads + * - `unique_range` is moveable + * - `unique_range` objects can be constructed as heap objects + * + * There is extra overhead associated with `unique_range` constructs and therefore use of + * `nvtx3::scoped_range_in` should be preferred. + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `unique_range_in` belongs. Else, `domain::global` to + * indicate that the global NVTX domain should be used. + */ + template + class unique_range_in + { + public: + /** + * @brief Construct a new unique_range_in object with the specified event attributes + * + * Example: + * \code{cpp} + * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; + * nvtx3::unique_range_in range{attr}; // Creates a range with message contents + * // "msg" and green color + * \endcode + * + * @param[in] attr `event_attributes` that describes the desired attributes + * of the range. + */ + explicit unique_range_in(event_attributes const& attr) noexcept + : handle_{start_range_in(attr)} + {} + + /** + * @brief Constructs a `unique_range_in` from the constructor arguments + * of an `event_attributes`. + * + * Forwards the arguments `args...` to construct an + * `event_attributes` object. The `event_attributes` object is then + * associated with the `unique_range_in`. + * + * For more detail, see `event_attributes` documentation. + * + * Example: + * \code{.cpp} + * // Creates a range with message "message" and green color + * nvtx3::unique_range_in<> r{"message", nvtx3::rgb{127,255,0}}; + * \endcode + * + * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` + * associated with this range. + */ + template + explicit unique_range_in(Args const&... args) noexcept + : unique_range_in{event_attributes{args...}} + {} + + /** + * @brief Default constructor creates a `unique_range_in` with no + * message, color, payload, nor category. + * + */ + constexpr unique_range_in() noexcept + : unique_range_in{event_attributes{}} + {} + + /** + * @brief Destroy the `unique_range_in` ending the range. + * + */ + ~unique_range_in() noexcept = default; + + /** + * @brief Move constructor allows taking ownership of the NVTX range from + * another `unique_range_in`. + * + * @param other The range to take ownership of + */ + unique_range_in(unique_range_in&& other) noexcept = default; + + /** + * @brief Move assignment operator allows taking ownership of an NVTX range + * from another `unique_range_in`. + * + * @param other The range to take ownership of + */ + unique_range_in& operator=(unique_range_in&& other) noexcept = default; + + /// Copy construction is not allowed to prevent multiple objects from owning + /// the same range handle + unique_range_in(unique_range_in const&) = delete; + + /// Copy assignment is not allowed to prevent multiple objects from owning the + /// same range handle + unique_range_in& operator=(unique_range_in const&) = delete; + + private: + struct end_range_handle + { + using pointer = range_handle; /// Override the pointer type of the unique_ptr + void operator()(range_handle h) const noexcept + { + end_range_in(h); + } + }; + + /// Range handle used to correlate the start/end of the range + std::unique_ptr handle_; + }; + + /** + * @brief Alias for a `unique_range_in` in the global NVTX domain. + * + */ + using unique_range = unique_range_in; + + /** + * @brief Annotates an instantaneous point in time with a "marker", using the + * attributes specified by `attr`. + * + * Unlike a "range" which has a beginning and an end, a marker is a single event + * in an application, such as detecting a problem: + * + * \code{.cpp} + * bool success = do_operation(...); + * if (!success) { + * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; + * nvtx3::mark_in(attr); + * } + * \endcode + * + * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `unique_range_in` belongs. Else, `domain::global` to + * indicate that the global NVTX domain should be used. + * @param[in] attr `event_attributes` that describes the desired attributes + * of the mark. + */ + template + inline void mark_in(event_attributes const& attr) noexcept + { +# ifndef NVTX_DISABLE + nvtxDomainMarkEx(domain::get(), attr.get()); +# else + (void) (attr); +# endif + } + + /** + * @brief Annotates an instantaneous point in time with a "marker", using the + * arguments to construct an `event_attributes`. + * + * Unlike a "range" which has a beginning and an end, a marker is a single event + * in an application, such as detecting a problem: + * + * \code{.cpp} + * bool success = do_operation(...); + * if (!success) { + * nvtx3::mark_in("operation failed!", nvtx3::rgb{255,0,0}); + * } + * \endcode + * + * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. + * + * Forwards the arguments `args...` to construct an `event_attributes` object. + * The attributes are then associated with the marker. For more detail, see + * the `event_attributes` documentation. + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `unique_range_in` belongs. Else `domain::global` to + * indicate that the global NVTX domain should be used. + * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` + * associated with this range. + * + */ + template + inline void mark_in(Args const&... args) noexcept + { +# ifndef NVTX_DISABLE + mark_in(event_attributes{args...}); +# endif + } + + /** + * @brief Annotates an instantaneous point in time with a "marker", using the + * attributes specified by `attr`, in the global domain. + * + * Unlike a "range" which has a beginning and an end, a marker is a single event + * in an application, such as detecting a problem: + * + * \code{.cpp} + * bool success = do_operation(...); + * if (!success) { + * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; + * nvtx3::mark(attr); + * } + * \endcode + * + * Note that nvtx3::mark is a function, not a class like scoped_range. + * + * @param[in] attr `event_attributes` that describes the desired attributes + * of the mark. + */ + inline void mark(event_attributes const& attr) noexcept + { +# ifndef NVTX_DISABLE + mark_in(attr); +# endif + } + + /** + * @brief Annotates an instantaneous point in time with a "marker", using the + * arguments to construct an `event_attributes`, in the global domain. + * + * Unlike a "range" which has a beginning and an end, a marker is a single event + * in an application, such as detecting a problem: + * + * \code{.cpp} + * bool success = do_operation(...); + * if (!success) { + * nvtx3::mark("operation failed!", nvtx3::rgb{255,0,0}); + * } + * \endcode + * + * Note that nvtx3::mark is a function, not a class like scoped_range. + * + * Forwards the arguments `args...` to construct an `event_attributes` object. + * The attributes are then associated with the marker. For more detail, see + * the `event_attributes` documentation. + * + * @param[in] args Variadic parameter pack of arguments to construct an + * `event_attributes` associated with this range. + * + */ + template + inline void mark(Args const&... args) noexcept + { +# ifndef NVTX_DISABLE + mark_in(args...); +# endif + } + +} // namespace NVTX3_VERSION_NAMESPACE + +} // namespace nvtx3 + +# ifndef NVTX_DISABLE +/** + * @brief Convenience macro for generating a range in the specified `domain` + * from the lifetime of a function + * + * This macro is useful for generating an NVTX range in `domain` from + * the entry point of a function to its exit. It is intended to be the first + * line of the function. + * + * Constructs a static `registered_string_in` using the name of the immediately + * enclosing function returned by `__func__` and constructs a + * `nvtx3::scoped_range` using the registered function name as the range's + * message. + * + * Example: + * \code{.cpp} + * struct my_domain{static constexpr char const* name{"my_domain"};}; + * + * void foo(...) { + * NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo() + * // do stuff + * ... + * } // Range ends on return from foo() + * \endcode + * + * @param[in] D Type containing `name` member used to identify the + * `domain` to which the `registered_string_in` belongs. Else, + * `domain::global` to indicate that the global NVTX domain should be used. + */ +# define NVTX3_V1_FUNC_RANGE_IN(D) \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + +/** + * @brief Convenience macro for generating a range in the specified `domain` + * from the lifetime of a function if the given boolean expression evaluates + * to true. + * + * Similar to `NVTX3_V1_FUNC_RANGE_IN(D)`, the only difference being that + * `NVTX3_V1_FUNC_RANGE_IF_IN(D, C)` only generates a range if the given boolean + * expression evaluates to true. + * + * @param[in] D Type containing `name` member used to identify the + * `domain` to which the `registered_string_in` belongs. Else, + * `domain::global` to indicate that the global NVTX domain should be used. + * + * @param[in] C Boolean expression used to determine if a range should be + * generated. + */ +# define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) \ + ::nvtx3::v1::detail::optional_scoped_range_in optional_nvtx3_range__; \ + if (C) \ + { \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + optional_nvtx3_range__.begin(nvtx3_func_attr__); \ + } +# else +# define NVTX3_V1_FUNC_RANGE_IN(D) +# define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) +# endif // NVTX_DISABLE + +/** + * @brief Convenience macro for generating a range in the global domain from the + * lifetime of a function. + * + * This macro is useful for generating an NVTX range in the global domain from + * the entry point of a function to its exit. It is intended to be the first + * line of the function. + * + * Constructs a static `registered_string_in` using the name of the immediately + * enclosing function returned by `__func__` and constructs a + * `nvtx3::scoped_range` using the registered function name as the range's + * message. + * + * Example: + * \code{.cpp} + * void foo(...) { + * NVTX3_FUNC_RANGE(); // Range begins on entry to foo() + * // do stuff + * ... + * } // Range ends on return from foo() + * \endcode + */ +# define NVTX3_V1_FUNC_RANGE() NVTX3_V1_FUNC_RANGE_IN(::nvtx3::v1::domain::global) + +/** + * @brief Convenience macro for generating a range in the global domain from the + * lifetime of a function if the given boolean expression evaluates to true. + * + * Similar to `NVTX3_V1_FUNC_RANGE()`, the only difference being that + * `NVTX3_V1_FUNC_RANGE_IF(C)` only generates a range if the given boolean + * expression evaluates to true. + * + * @param[in] C Boolean expression used to determine if a range should be + * generated. + */ +# define NVTX3_V1_FUNC_RANGE_IF(C) NVTX3_V1_FUNC_RANGE_IF_IN(::nvtx3::v1::domain::global, C) + +/* When inlining this version, versioned macros must have unversioned aliases. + * For each NVTX3_Vx_ #define, make an NVTX3_ alias of it here.*/ +# if defined(NVTX3_INLINE_THIS_VERSION) +/* clang format off */ +# define NVTX3_FUNC_RANGE NVTX3_V1_FUNC_RANGE +# define NVTX3_FUNC_RANGE_IF NVTX3_V1_FUNC_RANGE_IF +# define NVTX3_FUNC_RANGE_IN NVTX3_V1_FUNC_RANGE_IN +# define NVTX3_FUNC_RANGE_IF_IN NVTX3_V1_FUNC_RANGE_IF_IN +/* clang format on */ +# endif + +#endif // NVTX3_CPP_DEFINITIONS_V1_0 + +/* Add functionality for new minor versions here, by copying the above section enclosed + * in #ifndef NVTX3_CPP_DEFINITIONS_Vx_y, and incrementing the minor version. This code + * is an example of how additions for version 1.2 would look, indented for clarity. Note + * that the versioned symbols and macros are always provided, and the unversioned symbols + * are only provided if NVTX3_INLINE_THIS_VERSION was defined at the top of this header. + * + * \code{.cpp} + * #ifndef NVTX3_CPP_DEFINITIONS_V1_2 + * #define NVTX3_CPP_DEFINITIONS_V1_2 + * namespace nvtx3 { + * NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { + * class new_class {}; + * inline void new_function() {} + * } + * } + * + * // Macros must have the major version in their names: + * #define NVTX3_V1_NEW_MACRO_A() ... + * #define NVTX3_V1_NEW_MACRO_B() ... + * + * // If inlining, make aliases for the macros with the version number omitted + * #if defined(NVTX3_INLINE_THIS_VERSION) + * #define NVTX3_NEW_MACRO_A NVTX3_V1_NEW_MACRO_A + * #define NVTX3_NEW_MACRO_B NVTX3_V1_NEW_MACRO_B + * #endif + * #endif // NVTX3_CPP_DEFINITIONS_V1_2 + * \endcode + */ + +/* Undefine all temporarily-defined unversioned macros, which would conflict with + * subsequent includes of different versions of this header. */ +#undef NVTX3_CPP_VERSION_MAJOR +#undef NVTX3_CPP_VERSION_MINOR +#undef NVTX3_CONCAT +#undef NVTX3_NAMESPACE_FOR +#undef NVTX3_VERSION_NAMESPACE +#undef NVTX3_INLINE_IF_REQUESTED +#undef NVTX3_CONSTEXPR_IF_CPP14 + +#if defined(NVTX3_INLINE_THIS_VERSION) +# undef NVTX3_INLINE_THIS_VERSION +#endif + +#if defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE) +# undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE +# undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET +#endif + +#if defined(NVTX3_STATIC_ASSERT_DEFINED_HERE) +# undef NVTX3_STATIC_ASSERT_DEFINED_HERE +# undef NVTX3_STATIC_ASSERT +#endif diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index ca270d438b9..5917227bf55 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -38,6 +38,7 @@ #endif // no system header #include +#include #include #include #include @@ -256,6 +257,8 @@ public: DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy"); + constexpr bool may_alias = false; constexpr bool read_left = true; @@ -381,6 +384,8 @@ public: DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft"); + constexpr bool may_alias = true; constexpr bool read_left = true; @@ -524,6 +529,8 @@ public: DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRightCopy"); + constexpr bool may_alias = false; constexpr bool read_left = false; @@ -638,6 +645,8 @@ public: DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRight"); + constexpr bool may_alias = true; constexpr bool read_left = false; diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index 6d78d3d9b5b..0d222475b22 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -39,6 +39,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -170,6 +171,8 @@ struct DeviceCopy uint32_t num_ranges, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceCopy::Batched"); + // Integer type large enough to hold any offset in [0, num_ranges) using RangeOffsetT = uint32_t; diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh index bcc4a869406..d957ea29822 100644 --- a/cub/cub/device/device_for.cuh +++ b/cub/cub/device/device_for.cuh @@ -37,6 +37,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -574,11 +575,30 @@ public: template CUB_RUNTIME_FUNCTION static cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {}) { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk"); static_assert(::cuda::std::is_integral::value, "ShapeT must be an integral type"); using offset_t = ShapeT; return detail::for_each::dispatch_t::dispatch(static_cast(shape), op, stream); } +private: + // Internal version without NVTX raNGE + template + CUB_RUNTIME_FUNCTION static cudaError_t + ForEachNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) + { + using offset_t = NumItemsT; + using use_vectorization_t = ::cuda::std::integral_constant; + + // Disable auto-vectorization for now: + // constexpr bool use_vectorization = + // detail::for_each::can_regain_copy_freedom, OpT>::value + // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; + + return for_each_n(first, num_items, op, stream, use_vectorization_t{}); + } + +public: //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -630,15 +650,8 @@ public: CUB_RUNTIME_FUNCTION static cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { - using offset_t = NumItemsT; - using use_vectorization_t = ::cuda::std::integral_constant; - - // Disable auto-vectorization for now: - // constexpr bool use_vectorization = - // detail::for_each::can_regain_copy_freedom, OpT>::value - // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; - - return for_each_n(first, num_items, op, stream, use_vectorization_t{}); + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachN"); + return ForEachNNoNVTX(first, num_items, op, stream); } //! @rst @@ -689,13 +702,31 @@ public: CUB_RUNTIME_FUNCTION static cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEach"); + using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); - return ForEachN(first, num_items, op, stream); + return ForEachNNoNVTX(first, num_items, op, stream); + } + +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t + ForEachCopyNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) + { + static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, + "Iterator must be contiguous"); + + using offset_t = NumItemsT; + using use_vectorization_t = ::cuda::std::integral_constant; + + return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } +public: //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -750,13 +781,8 @@ public: CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { - static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, - "Iterator must be contiguous"); - - using offset_t = NumItemsT; - using use_vectorization_t = ::cuda::std::integral_constant; - - return for_each_n(first, num_items, op, stream, use_vectorization_t{}); + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopyN"); + return ForEachCopyNNoNVTX(first, num_items, op, stream); } //! @rst @@ -810,6 +836,7 @@ public: CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopy"); static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, "Iterator must be contiguous"); @@ -817,7 +844,7 @@ public: const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); - return ForEachCopyN(first, num_items, op, stream); + return ForEachCopyNNoNVTX(first, num_items, op, stream); } }; diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 8d6406624bf..085df42dc5d 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -796,6 +797,8 @@ struct DeviceHistogram size_t row_stride_bytes, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramEven"); + /// The sample value type of the input iterator using SampleT = cub::detail::value_t; Int2Type is_byte_sample; @@ -1533,6 +1536,8 @@ struct DeviceHistogram size_t row_stride_bytes, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange"); + /// The sample value type of the input iterator using SampleT = cub::detail::value_t; Int2Type is_byte_sample; diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh index a20fb8203d3..1359863a76f 100644 --- a/cub/cub/device/device_memcpy.cuh +++ b/cub/cub/device/device_memcpy.cuh @@ -39,6 +39,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -172,6 +173,7 @@ struct DeviceMemcpy uint32_t num_buffers, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMemcpy::Batched"); static_assert(std::is_pointer>::value, "DeviceMemcpy::Batched only supports copying of memory buffers." "Please consider using DeviceCopy::Batched instead."); diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh index c740138937d..3c28e405ab0 100644 --- a/cub/cub/device/device_merge_sort.cuh +++ b/cub/cub/device/device_merge_sort.cuh @@ -38,6 +38,7 @@ #endif // no system header #include +#include #include #include #include @@ -111,6 +112,34 @@ CUB_NAMESPACE_BEGIN */ struct DeviceMergeSort { +private: + // Name reported for NVTX ranges + _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* + { + return "cub::DeviceMergeSort"; + } + + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + cudaStream_t stream = 0) + { + using PromotedOffsetT = detail::promote_small_offset_t; + + using DispatchMergeSortT = + DispatchMergeSort; + + return DispatchMergeSortT::Dispatch( + d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream); + } + +public: /** * @brief Sorts items using a merge sorting method. * @@ -213,13 +242,8 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { - using PromotedOffsetT = detail::promote_small_offset_t; - - using DispatchMergeSortT = - DispatchMergeSort; - - return DispatchMergeSortT::Dispatch( - d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream); + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } template @@ -367,6 +391,7 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; using DispatchMergeSortT = @@ -416,6 +441,35 @@ struct DeviceMergeSort stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + cudaStream_t stream = 0) + { + using PromotedOffsetT = detail::promote_small_offset_t; + + using DispatchMergeSortT = + DispatchMergeSort; + + return DispatchMergeSortT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + static_cast(nullptr), + d_keys, + static_cast(nullptr), + num_items, + compare_op, + stream); + } + +public: /** * @brief Sorts items using a merge sorting method. * @@ -508,21 +562,8 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { - using PromotedOffsetT = detail::promote_small_offset_t; - - using DispatchMergeSortT = - DispatchMergeSort; - - return DispatchMergeSortT::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - static_cast(nullptr), - d_keys, - static_cast(nullptr), - num_items, - compare_op, - stream); + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } template @@ -541,6 +582,36 @@ struct DeviceMergeSort d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopyNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + cudaStream_t stream = 0) + { + using PromotedOffsetT = detail::promote_small_offset_t; + + using DispatchMergeSortT = + DispatchMergeSort; + + return DispatchMergeSortT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_input_keys, + static_cast(nullptr), + d_output_keys, + static_cast(nullptr), + num_items, + compare_op, + stream); + } + +public: /** * @brief Sorts items using a merge sorting method. * @@ -647,21 +718,9 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { - using PromotedOffsetT = detail::promote_small_offset_t; - - using DispatchMergeSortT = - DispatchMergeSort; - - return DispatchMergeSortT::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_input_keys, - static_cast(nullptr), - d_output_keys, - static_cast(nullptr), - num_items, - compare_op, - stream); + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysCopyNoNVTX( + d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } template @@ -783,9 +842,10 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; - return SortPairs( + return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } @@ -899,9 +959,10 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; - return SortKeys( + return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } @@ -1028,9 +1089,9 @@ struct DeviceMergeSort CompareOpT compare_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; - - return SortKeysCopy( + return SortKeysCopyNoNVTX( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } }; diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 9b5cce18cda..5c24a0ec200 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -41,6 +41,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -178,6 +179,7 @@ struct DevicePartition int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged"); using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -337,6 +339,7 @@ struct DevicePartition SelectOp select_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If"); using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -382,6 +385,63 @@ struct DevicePartition d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } +private: + template + friend class DispatchSegmentedSort; + + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t IfNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + InputIteratorT d_in, + FirstOutputIteratorT d_first_part_out, + SecondOutputIteratorT d_second_part_out, + UnselectedOutputIteratorT d_unselected_out, + NumSelectedIteratorT d_num_selected_out, + int num_items, + SelectFirstPartOp select_first_part_op, + SelectSecondPartOp select_second_part_op, + cudaStream_t stream = 0) + { + using OffsetT = int; + using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< + InputIteratorT, + FirstOutputIteratorT, + SecondOutputIteratorT, + UnselectedOutputIteratorT, + NumSelectedIteratorT, + SelectFirstPartOp, + SelectSecondPartOp, + OffsetT>; + + return DispatchThreeWayPartitionIfT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_first_part_out, + d_second_part_out, + d_unselected_out, + d_num_selected_out, + select_first_part_op, + select_second_part_op, + num_items, + stream); + } + +public: //! @rst //! Uses two functors to split the corresponding items from ``d_in`` into a three partitioned sequences //! ``d_first_part_out``, ``d_second_part_out``, and ``d_unselected_out``. @@ -581,18 +641,8 @@ struct DevicePartition SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { - using OffsetT = int; - using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< - InputIteratorT, - FirstOutputIteratorT, - SecondOutputIteratorT, - UnselectedOutputIteratorT, - NumSelectedIteratorT, - SelectFirstPartOp, - SelectSecondPartOp, - OffsetT>; - - return DispatchThreeWayPartitionIfT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If"); + return IfNoNVTX( d_temp_storage, temp_storage_bytes, d_in, @@ -600,9 +650,9 @@ struct DevicePartition d_second_part_out, d_unselected_out, d_num_selected_out, + num_items, select_first_part_op, select_second_part_op, - num_items, stream); } diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh index 605fda3698a..db63745d4de 100644 --- a/cub/cub/device/device_radix_sort.cuh +++ b/cub/cub/device/device_radix_sort.cuh @@ -43,6 +43,7 @@ #endif // no system header #include +#include #include #include @@ -206,6 +207,12 @@ private: stream); } + // Name reported for NVTX ranges + _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* + { + return "cub::DeviceRadixSort"; + } + public: //! @name KeyT-value pairs //@{ @@ -329,6 +336,7 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -506,6 +514,7 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -644,6 +653,7 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -797,6 +807,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -934,6 +946,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -1078,6 +1092,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -1219,6 +1235,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -1386,6 +1404,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -1525,6 +1545,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -1673,6 +1695,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -1811,6 +1835,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -1956,6 +1982,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -2092,6 +2120,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -2222,6 +2252,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -2349,6 +2381,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -2504,6 +2538,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -2629,6 +2665,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -2761,6 +2799,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -2888,6 +2928,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -3030,6 +3072,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -3155,6 +3199,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -3285,6 +3331,8 @@ public: int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; @@ -3412,6 +3460,8 @@ public: DecomposerT decomposer, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; @@ -3545,6 +3595,8 @@ public: int end_bit, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index efa4c156ead..dabe8f9d4e4 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -43,6 +43,7 @@ #endif // no system header #include +#include #include #include #include @@ -195,6 +196,8 @@ struct DeviceReduce T init, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce"); + // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; @@ -304,6 +307,8 @@ struct DeviceReduce NumItemsT num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum"); + // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; @@ -423,6 +428,8 @@ struct DeviceReduce NumItemsT num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min"); + // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; @@ -547,6 +554,8 @@ struct DeviceReduce int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin"); + // Signed integer type for global offsets using OffsetT = int; @@ -673,6 +682,8 @@ struct DeviceReduce NumItemsT num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max"); + // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; @@ -802,6 +813,8 @@ struct DeviceReduce int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax"); + // Signed integer type for global offsets using OffsetT = int; @@ -968,6 +981,8 @@ struct DeviceReduce T init, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce"); + using OffsetT = detail::choose_offset_t; return DispatchTransformReduce::Dispatch( @@ -1135,6 +1150,8 @@ struct DeviceReduce NumItemsT num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey"); + // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 6b6a589d34e..96570e31f4b 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -187,6 +188,8 @@ struct DeviceRunLengthEncode int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::Encode"); + using offset_t = int; // Signed integer type for global offsets using equality_op = Equality; // Default == operator using reduction_op = cub::Sum; // Value reduction operator @@ -357,6 +360,8 @@ struct DeviceRunLengthEncode int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns"); + using OffsetT = int; // Signed integer type for global offsets using EqualityOp = Equality; // Default == operator diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index b7118bbf999..ff7b6d25677 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -41,6 +41,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -180,6 +181,8 @@ struct DeviceScan int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); + // Signed integer type for global offsets using OffsetT = int; using InitT = cub::detail::value_t; @@ -407,6 +410,8 @@ struct DeviceScan int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); + // Signed integer type for global offsets using OffsetT = int; @@ -683,6 +688,8 @@ struct DeviceScan int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); + // Signed integer type for global offsets using OffsetT = int; @@ -933,6 +940,8 @@ struct DeviceScan int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); + // Signed integer type for global offsets using OffsetT = int; @@ -1146,6 +1155,8 @@ struct DeviceScan int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); + // Signed integer type for global offsets using OffsetT = int; @@ -1389,6 +1400,8 @@ struct DeviceScan EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey"); + // Signed integer type for global offsets using OffsetT = int; using InitT = cub::detail::value_t; @@ -1590,6 +1603,8 @@ struct DeviceScan EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey"); + // Signed integer type for global offsets using OffsetT = int; @@ -1757,6 +1772,8 @@ struct DeviceScan EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey"); + // Signed integer type for global offsets using OffsetT = int; @@ -1938,6 +1955,8 @@ struct DeviceScan EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey"); + // Signed integer type for global offsets using OffsetT = int; diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index 60001e78797..4bae106e1de 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -41,6 +41,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -83,6 +84,14 @@ CUB_NAMESPACE_BEGIN //! @endrst struct DeviceSegmentedRadixSort { +private: + // Name reported for NVTX ranges + _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* + { + return "cub::DeviceSegmentedRadixSort"; + } + +public: //! @name Key-value pairs //! @{ @@ -232,6 +241,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -442,6 +453,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -642,6 +655,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -856,6 +871,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -1042,6 +1059,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -1238,6 +1257,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -1422,6 +1443,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; @@ -1615,6 +1638,8 @@ struct DeviceSegmentedRadixSort int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + // Signed integer type for global offsets using OffsetT = int; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index d43fc260de3..0a75b71e1db 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -43,6 +43,7 @@ #endif // no system header #include +#include #include #include #include @@ -249,6 +250,8 @@ public: T initial_value, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce"); + // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; using integral_offset_check = ::cuda::std::is_integral; @@ -391,6 +394,8 @@ public: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum"); + // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; @@ -525,6 +530,8 @@ public: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min"); + // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; @@ -666,6 +673,8 @@ public: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin"); + // Integer type for global offsets // Using common iterator value type is a breaking change, see: // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615 @@ -822,6 +831,8 @@ public: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max"); + // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; @@ -966,6 +977,8 @@ public: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax"); + // Integer type for global offsets // Using common iterator value type is a breaking change, see: // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615 diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 62cf6829d51..1b56e7e7435 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -41,6 +41,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -126,6 +127,48 @@ CUB_NAMESPACE_BEGIN //! @endrst struct DeviceSegmentedSort { +private: + // Name reported for NVTX ranges + _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* + { + return "cub::DeviceSegmentedRadixSort"; + } + + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = false; + constexpr bool is_overwrite_okay = false; + using DispatchT = + DispatchSegmentedSort; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @name Keys-only //! @{ @@ -250,24 +293,16 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; - constexpr bool is_overwrite_okay = false; - using DispatchT = - DispatchSegmentedSort; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, - d_keys, - d_values, + d_keys_in, + d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, - is_overwrite_okay, stream); } @@ -298,6 +333,42 @@ struct DeviceSegmentedSort stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = true; + constexpr bool is_overwrite_okay = false; + using DispatchT = + DispatchSegmentedSort; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @rst //! Sorts segments of keys into descending order. Approximately //! ``num_items + 2 * num_segments`` auxiliary storage required. @@ -417,24 +488,16 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; - constexpr bool is_overwrite_okay = false; - using DispatchT = - DispatchSegmentedSort; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, - d_keys, - d_values, + d_keys_in, + d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, - is_overwrite_okay, stream); } @@ -465,6 +528,41 @@ struct DeviceSegmentedSort stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = false; + constexpr bool is_overwrite_okay = true; + + using DispatchT = + DispatchSegmentedSort; + + DoubleBuffer d_values; + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @rst //! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required. //! @@ -595,25 +693,9 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; - constexpr bool is_overwrite_okay = true; - - using DispatchT = - DispatchSegmentedSort; - - DoubleBuffer d_values; - - return DispatchT::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite_okay, - stream); + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysNoNVTX( + d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template @@ -634,6 +716,41 @@ struct DeviceSegmentedSort d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = true; + constexpr bool is_overwrite_okay = true; + + using DispatchT = + DispatchSegmentedSort; + + DoubleBuffer d_values; + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @rst //! Sorts segments of keys into descending order. Approximately //! ``2 * num_segments`` auxiliary storage required. @@ -765,25 +882,9 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; - constexpr bool is_overwrite_okay = true; - - using DispatchT = - DispatchSegmentedSort; - - DoubleBuffer d_values; - - return DispatchT::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite_okay, - stream); + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysDescendingNoNVTX( + d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template @@ -927,7 +1028,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortKeys( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, @@ -1089,7 +1191,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortKeysDescending( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, @@ -1261,7 +1364,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortKeys( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } @@ -1415,7 +1519,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortKeysDescending( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } @@ -1437,6 +1542,43 @@ struct DeviceSegmentedSort d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = false; + constexpr bool is_overwrite_okay = false; + using DispatchT = DispatchSegmentedSort; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @} end member group //! @name Key-value pairs //! @{ @@ -1584,23 +1726,18 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; - constexpr bool is_overwrite_okay = false; - using DispatchT = DispatchSegmentedSort; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, - d_keys, - d_values, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, - is_overwrite_okay, stream); } @@ -1635,6 +1772,43 @@ struct DeviceSegmentedSort stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = true; + constexpr bool is_overwrite_okay = false; + using DispatchT = DispatchSegmentedSort; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. @@ -1778,23 +1952,18 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; - constexpr bool is_overwrite_okay = false; - using DispatchT = DispatchSegmentedSort; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, - d_keys, - d_values, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, - is_overwrite_okay, stream); } @@ -1829,6 +1998,38 @@ struct DeviceSegmentedSort stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = false; + constexpr bool is_overwrite_okay = true; + using DispatchT = DispatchSegmentedSort; + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_segments`` auxiliary storage required. @@ -1979,11 +2180,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; - constexpr bool is_overwrite_okay = true; - using DispatchT = DispatchSegmentedSort; - - return DispatchT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, @@ -1992,7 +2190,6 @@ struct DeviceSegmentedSort num_segments, d_begin_offsets, d_end_offsets, - is_overwrite_okay, stream); } @@ -2023,6 +2220,38 @@ struct DeviceSegmentedSort stream); } +private: + // Internal version without NVTX range + template + CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) + { + constexpr bool is_descending = true; + constexpr bool is_overwrite_okay = true; + using DispatchT = DispatchSegmentedSort; + + return DispatchT::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite_okay, + stream); + } + +public: //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_segments`` auxiliary storage required. @@ -2172,11 +2401,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; - constexpr bool is_overwrite_okay = true; - using DispatchT = DispatchSegmentedSort; - - return DispatchT::Dispatch( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, @@ -2185,7 +2411,6 @@ struct DeviceSegmentedSort num_segments, d_begin_offsets, d_end_offsets, - is_overwrite_okay, stream); } @@ -2359,7 +2584,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortPairs( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, @@ -2547,7 +2773,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortPairsDescending( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, @@ -2743,7 +2970,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortPairs( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, @@ -2932,7 +3160,8 @@ struct DeviceSegmentedSort EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - return SortPairsDescending( + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); + return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 2ba2cc83f59..f5cf533f4b3 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -43,6 +43,7 @@ #endif // no system header #include +#include #include #include #include @@ -176,6 +177,8 @@ struct DeviceSelect int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged"); + using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -307,6 +310,8 @@ struct DeviceSelect int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged"); + using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -463,6 +468,8 @@ struct DeviceSelect SelectOp select_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If"); + using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -606,6 +613,8 @@ struct DeviceSelect SelectOp select_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If"); + using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -743,6 +752,8 @@ struct DeviceSelect SelectOp select_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf"); + using OffsetT = int; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) @@ -846,6 +857,8 @@ struct DeviceSelect SelectOp select_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf"); + using OffsetT = int; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) @@ -963,6 +976,8 @@ struct DeviceSelect int num_items, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique"); + using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using SelectOp = NullType; // Selection op (not used) @@ -1143,6 +1158,8 @@ struct DeviceSelect EqualityOpT equality_op, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::UniqueByKey"); + using OffsetT = detail::choose_offset_t; return DispatchUniqueByKey< diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index f4494766992..30e8545da92 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -44,6 +44,7 @@ #include +#include #include #include @@ -188,6 +189,8 @@ struct DeviceSpmv int num_nonzeros, cudaStream_t stream = 0) { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV"); + SpmvParams spmv_params; spmv_params.d_values = d_values; spmv_params.d_row_end_offsets = d_row_offsets + 1; diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 80fcb5560ac..656fc2574d9 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -1231,7 +1231,7 @@ struct DispatchSegmentedSort : SelectedPolicy auto medium_indices_iterator = THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get()); - cub::DevicePartition::If( + cub::DevicePartition::IfNoNVTX( nullptr, three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), @@ -1511,7 +1511,7 @@ private: auto medium_indices_iterator = THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get() + num_segments); - error = CubDebug(cub::DevicePartition::If( + error = CubDebug(cub::DevicePartition::IfNoNVTX( device_partition_temp_storage.get(), three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu index 495af1c2229..2dcf3276018 100644 --- a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu +++ b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_right.cu b/cub/test/catch2_test_device_adjacent_difference_substract_right.cu index e63663957a1..5f464c33ec8 100644 --- a/cub/test/catch2_test_device_adjacent_difference_substract_right.cu +++ b/cub/test/catch2_test_device_adjacent_difference_substract_right.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_bulk.cu b/cub/test/catch2_test_device_bulk.cu index 4282fcd6130..0f0a5172117 100644 --- a/cub/test/catch2_test_device_bulk.cu +++ b/cub/test/catch2_test_device_bulk.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_decoupled_look_back.cu b/cub/test/catch2_test_device_decoupled_look_back.cu index 619f2f4ce91..dab95aa1873 100644 --- a/cub/test/catch2_test_device_decoupled_look_back.cu +++ b/cub/test/catch2_test_device_decoupled_look_back.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #undef NDEBUG #include diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu index fd9d991082c..62ccfec02cd 100644 --- a/cub/test/catch2_test_device_for.cu +++ b/cub/test/catch2_test_device_for.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_for_api.cu b/cub/test/catch2_test_device_for_api.cu index b964d8663b0..49ae911fc65 100644 --- a/cub/test/catch2_test_device_for_api.cu +++ b/cub/test/catch2_test_device_for_api.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu index bf0d06e935e..2263b3987e4 100644 --- a/cub/test/catch2_test_device_for_copy.cu +++ b/cub/test/catch2_test_device_for_copy.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_for_utils.cu b/cub/test/catch2_test_device_for_utils.cu index edfb9b43c77..5ecbf0773e4 100644 --- a/cub/test/catch2_test_device_for_utils.cu +++ b/cub/test/catch2_test_device_for_utils.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include "catch2_test_helper.h" diff --git a/cub/test/catch2_test_device_merge_sort.cu b/cub/test/catch2_test_device_merge_sort.cu index d9c229b9383..ec448199711 100644 --- a/cub/test/catch2_test_device_merge_sort.cu +++ b/cub/test/catch2_test_device_merge_sort.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_merge_sort_iterators.cu b/cub/test/catch2_test_device_merge_sort_iterators.cu index 11e01dd2177..cb555f70224 100644 --- a/cub/test/catch2_test_device_merge_sort_iterators.cu +++ b/cub/test/catch2_test_device_merge_sort_iterators.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_merge_sort_vsmem.cu b/cub/test/catch2_test_device_merge_sort_vsmem.cu index 593c59e7b3d..3fe114d4690 100644 --- a/cub/test/catch2_test_device_merge_sort_vsmem.cu +++ b/cub/test/catch2_test_device_merge_sort_vsmem.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 96d03c3d324..2317c4bfb2e 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index 2973c749fcd..84890a1233f 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_radix_sort_custom.cu b/cub/test/catch2_test_device_radix_sort_custom.cu index 6db869d5d53..4ffc5c008f3 100644 --- a/cub/test/catch2_test_device_radix_sort_custom.cu +++ b/cub/test/catch2_test_device_radix_sort_custom.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu index 701c5118bf1..961361622d5 100644 --- a/cub/test/catch2_test_device_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_radix_sort_pairs.cu b/cub/test/catch2_test_device_radix_sort_pairs.cu index 9feb01a850e..bcd853a96e1 100644 --- a/cub/test/catch2_test_device_radix_sort_pairs.cu +++ b/cub/test/catch2_test_device_radix_sort_pairs.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index 78e514dad2f..1e9e08c9113 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 0a9a583e6c6..39f31d5e781 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ b/cub/test/catch2_test_device_reduce_by_key.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include "c2h/custom_type.cuh" diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 0b484ce4059..3637813b5f6 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_reduce_fp_inf.cu b/cub/test/catch2_test_device_reduce_fp_inf.cu index b03d37b8389..61fa9b67889 100644 --- a/cub/test/catch2_test_device_reduce_fp_inf.cu +++ b/cub/test/catch2_test_device_reduce_fp_inf.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index 5b273ae4e63..7c7f74ec63a 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_run_length_encode.cu b/cub/test/catch2_test_device_run_length_encode.cu index d6c3ce60818..79a062f778c 100644 --- a/cub/test/catch2_test_device_run_length_encode.cu +++ b/cub/test/catch2_test_device_run_length_encode.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index bf363d9e013..185c0598ce6 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 646912012e0..8bbb9d64311 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_scan_by_key.cu b/cub/test/catch2_test_device_scan_by_key.cu index 45989a916c6..f5e40b1e2e9 100644 --- a/cub/test/catch2_test_device_scan_by_key.cu +++ b/cub/test/catch2_test_device_scan_by_key.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_scan_by_key_iterators.cu b/cub/test/catch2_test_device_scan_by_key_iterators.cu index e433d1458fd..ca242cf6105 100644 --- a/cub/test/catch2_test_device_scan_by_key_iterators.cu +++ b/cub/test/catch2_test_device_scan_by_key_iterators.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index a146bc82fc9..576d0d3f747 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu index 497cd9e8e9a..70c5a63f2f8 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu index 089e9b51165..10237a6460b 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index f796a09e0e4..770b85b0194 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_reduce_api.cu b/cub/test/catch2_test_device_segmented_reduce_api.cu index a33d7121d3d..935f4afa87c 100644 --- a/cub/test/catch2_test_device_segmented_reduce_api.cu +++ b/cub/test/catch2_test_device_segmented_reduce_api.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index de4ada78965..8ab495ddc59 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu index 6629a271dd8..7036d50ae45 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_sort_keys.cu b/cub/test/catch2_test_device_segmented_sort_keys.cu index d0ef8182e33..4be32f66c89 100644 --- a/cub/test/catch2_test_device_segmented_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_sort_keys.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index b2a3bd9fb38..144ea13e762 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_api.cu b/cub/test/catch2_test_device_select_api.cu index 7323bd5b2d3..6c230566f5c 100644 --- a/cub/test/catch2_test_device_select_api.cu +++ b/cub/test/catch2_test_device_select_api.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_flagged.cu b/cub/test/catch2_test_device_select_flagged.cu index 5abd2425fc8..f3477787ecd 100644 --- a/cub/test/catch2_test_device_select_flagged.cu +++ b/cub/test/catch2_test_device_select_flagged.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_flagged_if.cu b/cub/test/catch2_test_device_select_flagged_if.cu index 6add4c22da5..652e6986723 100644 --- a/cub/test/catch2_test_device_select_flagged_if.cu +++ b/cub/test/catch2_test_device_select_flagged_if.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index 36dda800b8c..38a071e004c 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_if_vsmem.cu b/cub/test/catch2_test_device_select_if_vsmem.cu index 9d60fe23ad5..3552478b071 100644 --- a/cub/test/catch2_test_device_select_if_vsmem.cu +++ b/cub/test/catch2_test_device_select_if_vsmem.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index b3b855a104f..51c6200c624 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_select_unique_by_key.cu b/cub/test/catch2_test_device_select_unique_by_key.cu index 3cb20c4a836..4603f000704 100644 --- a/cub/test/catch2_test_device_select_unique_by_key.cu +++ b/cub/test/catch2_test_device_select_unique_by_key.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_three_way_partition.cu b/cub/test/catch2_test_device_three_way_partition.cu index 5c10f0e572e..f94f48a5b2a 100644 --- a/cub/test/catch2_test_device_three_way_partition.cu +++ b/cub/test/catch2_test_device_three_way_partition.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/catch2_test_device_transform_reduce.cu b/cub/test/catch2_test_device_transform_reduce.cu index 5dd8e2d3de9..7c313562d7c 100644 --- a/cub/test/catch2_test_device_transform_reduce.cu +++ b/cub/test/catch2_test_device_transform_reduce.cu @@ -25,6 +25,9 @@ * ******************************************************************************/ +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + #include #include diff --git a/cub/test/insert_nested_NVTX_range_guard.h b/cub/test/insert_nested_NVTX_range_guard.h new file mode 100644 index 00000000000..1626aaff9b5 --- /dev/null +++ b/cub/test/insert_nested_NVTX_range_guard.h @@ -0,0 +1,35 @@ +#pragma once + +// Include this file at the top of a unit test for CUB device algorithms to check whether any inserted NVTX ranges nest. + +#include +#include + +#include + +#if defined(__cpp_inline_variables) +inline thread_local bool entered = false; + +struct NestedNVTXRangeGuard +{ + NestedNVTXRangeGuard(const char* name) + { + UNSCOPED_INFO("Entering NVTX range " << name); + if (entered) + { + FAIL("Nested NVTX range detected"); + } + entered = true; + } + + ~NestedNVTXRangeGuard() + { + entered = false; + UNSCOPED_INFO("Leaving NVTX range"); + } +}; + +# define CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) \ + ::cuda::std::optional<::NestedNVTXRangeGuard> __cub_nvtx3_reentrency_guard; \ + NV_IF_TARGET(NV_IS_HOST, __cub_nvtx3_reentrency_guard.emplace(name);); +#endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/optional b/libcudacxx/include/cuda/std/detail/libcxx/include/optional index a21d5592f70..b4940773eae 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/optional +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/optional @@ -264,6 +264,7 @@ struct __optional_destruct_base<_Tp, false> }; bool __engaged_; + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX20 ~__optional_destruct_base() {