Implement a way to provide a memory_resource through the execution policy

miscco · miscco · commit 5947190b1623 · 2025-12-25T10:02:24.000+01:00
diff --git a/libcudacxx/include/cuda/__execution/policy.h b/libcudacxx/include/cuda/__execution/policy.h
@@ -23,10 +23,14 @@
 #if _CCCL_HAS_BACKEND_CUDA()
 
 #  include <cuda/__fwd/execution_policy.h>
+#  include <cuda/__memory_resource/device_memory_pool.h>
+#  include <cuda/__memory_resource/get_memory_resource.h>
+#  include <cuda/__memory_resource/resource.h>
 #  include <cuda/__stream/get_stream.h>
 #  include <cuda/__stream/stream_ref.h>
 #  include <cuda/std/__execution/policy.h>
 #  include <cuda/std/__type_traits/is_execution_policy.h>
+#  include <cuda/std/__utility/forward.h>
 
 #  include <cuda/std/__cccl/prologue.h>
 
@@ -37,7 +41,7 @@ struct __policy_stream_holder
 {
   ::cuda::stream_ref __stream_;
 
-  _CCCL_API constexpr __policy_stream_holder(::cuda::stream_ref __stream) noexcept
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref __stream) noexcept
       : __stream_(__stream)
   {}
 };
@@ -51,27 +55,64 @@ struct __policy_stream_holder<false>
   _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref) noexcept {}
 };
 
+template <bool _HasResource>
+struct __policy_memory_resource_holder
+{
+  using __resource_t = ::cuda::mr::any_resource<::cuda::mr::device_accessible>;
+  //! Resource owned by the policy; handed out when the policy is queried with `get_memory_resource`
+  __resource_t __resource_;
+  //! @brief Wraps the resource. NOTE(review): noexcept, but can any_resource construction throw? Confirm
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&& __resource) noexcept
+      : __resource_(::cuda::std::forward<_Resource>(__resource))
+  {}
+};
+
+template <>
+struct __policy_memory_resource_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_memory_resource_holder() = default;
+
+  //! @brief Accepts and discards a resource so the cuda policy can initialize both holder variants the same way
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&&) noexcept {}
+};
+
 template <uint32_t _Policy>
 struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
     : __execution_policy_base<_Policy, __execution_backend::__none>
     , protected __policy_stream_holder<__cuda_policy_with_stream<_Policy>>
+    , protected __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>
 {
 private:
   template <uint32_t, __execution_backend>
   friend struct __execution_policy_base;
 
-  using __stream_holder = __policy_stream_holder<__cuda_policy_with_stream<_Policy>>;
+  using __stream_holder   = __policy_stream_holder<__cuda_policy_with_stream<_Policy>>;
+  using __resource_holder = __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>;
 
   template <uint32_t _OtherPolicy>
   _CCCL_HOST_API constexpr __execution_policy_base(
     const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy) noexcept
       : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
   {}
 
   template <uint32_t _OtherPolicy>
   _CCCL_HOST_API constexpr __execution_policy_base(
-    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>&, ::cuda::stream_ref __stream) noexcept
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy,
+    ::cuda::stream_ref __stream) noexcept
       : __stream_holder(__stream)
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy, class _Resource>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy, _Resource&& __resource) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(::cuda::std::forward<_Resource>(__resource))
   {}
 
 public:
@@ -109,6 +150,40 @@ struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_b
     }
   }
 
+  //! @brief Replace the stored memory resource in place; rvalue resources are moved rather than copied
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND _WithResource)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_memory_resource(_Resource&& __resource) noexcept
+  {
+    this->__resource_ = ::cuda::std::forward<_Resource>(__resource);
+    return *this;
+  }
+
+  //! @brief Return a new policy that additionally holds `__resource`; the current stream is carried over
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND(!_WithResource))
+  [[nodiscard]] _CCCL_HOST_API auto set_memory_resource(_Resource&& __resource) const noexcept
+  {
+    constexpr uint32_t __new_policy =
+      __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_memory_resource>;
+    return __execution_policy_base<__new_policy>{*this, ::cuda::std::forward<_Resource>(__resource)};
+  }
+
+  //! @brief Answers the `get_memory_resource` query with the stored resource or, absent one, a default pool
+  //! @note Lives here rather than in __policy_memory_resource_holder because the default needs the stream's device
+  [[nodiscard]] _CCCL_HOST_API auto query(const ::cuda::mr::get_memory_resource_t&) const noexcept
+  {
+    if constexpr (!__cuda_policy_with_memory_resource<_Policy>)
+    {
+      ::cuda::stream_ref __current_stream = this->query(::cuda::get_stream);
+      return ::cuda::device_default_memory_pool(__current_stream.device());
+    }
+    else
+    {
+      return this->__resource_;
+    }
+  }
+
   template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
   [[nodiscard]] _CCCL_API friend constexpr bool operator==(
     const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
@@ -126,6 +201,14 @@ struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_b
       }
     }
 
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      if (__lhs.query(::cuda::mr::get_memory_resource) != __rhs.query(::cuda::mr::get_memory_resource))
+      {
+        return false;
+      }
+    }
+
     return true;
   }
 
diff --git a/libcudacxx/include/cuda/__fwd/execution_policy.h b/libcudacxx/include/cuda/__fwd/execution_policy.h
@@ -30,7 +30,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
 enum __cuda_backend_options : uint16_t
 {
-  __with_stream = 1 << 0, ///> Determines whether the policy holds a stream
+  __with_stream          = 1 << 0, ///> Determines whether the policy holds a stream
+  __with_memory_resource = 1 << 1, ///> Determines whether the policy holds a memory resource
 };
 
 //! @brief Sets the execution backend to cuda
@@ -58,6 +59,11 @@ template <uint32_t _Policy>
 inline constexpr bool __cuda_policy_with_stream =
   __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_stream;
 
+//! @brief True if the `__with_memory_resource` bit is set in the cuda backend options encoded in @c _Policy
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_memory_resource =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_memory_resource;
+
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
 
 #  include <cuda/std/__cccl/epilogue.h>
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h b/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h
@@ -31,6 +31,7 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
 _CCCL_DIAG_POP
 
 #  include <cuda/__execution/policy.h>
+#  include <cuda/__memory_resource/get_memory_resource.h>
 #  include <cuda/__runtime/api_wrapper.h>
 #  include <cuda/__stream/get_stream.h>
 #  include <cuda/std/__exception/cuda_error.h>
@@ -58,7 +59,7 @@ template <>
 struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
 {
   //! Ensures we properly deallocate the memory allocated for the result
-  template <class _Tp>
+  template <class _Tp, class _Resource>
   struct __allocation_guard
   {
     //! This helper struct ensures that we can properly assign types with a nontrivial assignment operator
@@ -78,24 +79,18 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
     };
 
     ::cuda::stream_ref __stream_;
+    _Resource& __resource_;
     _Tp* __ptr_;
 
-    _CCCL_HOST_API __allocation_guard(::cuda::stream_ref __stream)
+    _CCCL_HOST_API __allocation_guard(::cuda::stream_ref __stream, _Resource& __resource)
         : __stream_(__stream)
-        , __ptr_(nullptr)
-    {
-      _CCCL_TRY_CUDA_API(
-        ::cudaMallocAsync,
-        "__pstl_cuda_reduce: allocation failed",
-        reinterpret_cast<void**>(&__ptr_),
-        sizeof(_Tp),
-        __stream_.get());
-    }
+        , __resource_(__resource)
+        , __ptr_(static_cast<_Tp*>(__resource_.allocate(__stream_, sizeof(_Tp), alignof(_Tp))))
+    {}
 
     _CCCL_HOST_API ~__allocation_guard()
     {
-      _CCCL_TRY_CUDA_API(::cudaFreeAsync, "__pstl_cuda_reduce: deallocate failed", __ptr_, __stream_.get());
-
+      __resource_.deallocate(__stream_, __ptr_, sizeof(_Tp), alignof(_Tp));
       __stream_.sync();
     }
 
@@ -113,8 +108,9 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
 
     {
       // Allocate memory for result
-      auto __stream = __policy.query(::cuda::get_stream);
-      __allocation_guard<_Tp> __guard{__stream};
+      auto __stream   = __policy.query(::cuda::get_stream);
+      auto __resource = __policy.query(::cuda::mr::get_memory_resource);
+      __allocation_guard<_Tp, decltype(__resource)> __guard{__stream, __resource};
 
       const auto __count = ::cuda::std::distance(__first, __last);
       _CCCL_TRY_CUDA_API(
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/memory_resource>
+#include <cuda/std/__pstl/for_each.h>
+#include <cuda/std/execution>
+#include <cuda/std/memory>
+#include <cuda/std/type_traits>
+#include <cuda/stream>
+
+struct test_resource
+{
+  __host__ __device__ void* allocate_sync(std::size_t, std::size_t)
+  {
+    return &_val;
+  }
+
+  __host__ __device__ void deallocate_sync(void* ptr, std::size_t, std::size_t) noexcept
+  {
+    // read back through the pointer to verify it was forwarded; allocate_sync hands out &_val so this is safe
+    _val = *static_cast<int*>(ptr);
+  }
+
+  __host__ __device__ void* allocate(cuda::stream_ref, std::size_t, std::size_t)
+  {
+    return &_val;
+  }
+
+  __host__ __device__ void deallocate(cuda::stream_ref, void* ptr, std::size_t, std::size_t)
+  {
+    // read back through the pointer to verify it was forwarded; allocate hands out &_val so this is safe
+    _val = *static_cast<int*>(ptr);
+  }
+
+  __host__ __device__ bool operator==(const test_resource& other) const
+  {
+    return _val == other._val;
+  }
+  __host__ __device__ bool operator!=(const test_resource& other) const
+  {
+    return _val != other._val;
+  }
+
+  friend constexpr void get_property(const test_resource&, ::cuda::mr::device_accessible) noexcept {}
+
+  int _val = 0;
+};
+
+template <class Policy>
+void test(Policy pol)
+{
+  auto old_stream = ::cuda::get_stream(pol);
+  { // Without an attached resource the policy must report the default memory pool of device 0
+    auto expected_resource = ::cuda::device_default_memory_pool(cuda::device_ref{0});
+    assert(cuda::mr::get_memory_resource(pol) == expected_resource);
+  }
+
+  { // Attaching a resource yields a policy that reports that resource and keeps the original stream
+    test_resource resource{42};
+    auto pol_with_resource = pol.set_memory_resource(resource);
+    assert(cuda::mr::get_memory_resource(pol_with_resource) == resource);
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+
+    using policy_t = decltype(pol_with_resource);
+    static_assert(noexcept(pol.set_memory_resource(resource)));
+    static_assert(cuda::std::is_execution_policy_v<policy_t>);
+  }
+
+  { // Attaching a second resource overwrites the first in place and returns a reference to the same policy
+    test_resource resource{42};
+    auto pol_with_resource = pol.set_memory_resource(resource);
+    assert(cuda::mr::get_memory_resource(pol_with_resource) == resource);
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+
+    using policy_t = decltype(pol_with_resource);
+    test_resource other_resource{1337};
+    decltype(auto) pol_with_other_resource = pol_with_resource.set_memory_resource(other_resource);
+    static_assert(cuda::std::is_same_v<decltype(pol_with_other_resource), policy_t&>);
+    assert(::cuda::mr::get_memory_resource(pol_with_resource) == other_resource);
+    assert(::cuda::mr::get_memory_resource(pol_with_other_resource) == other_resource);
+    assert(cuda::std::addressof(pol_with_resource) == cuda::std::addressof(pol_with_other_resource));
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+  }
+}
+
+void test()
+{
+  namespace execution = cuda::std::execution;
+  static_assert(!execution::__queryable_with<execution::sequenced_policy, ::cuda::mr::get_memory_resource_t>);
+  static_assert(!execution::__queryable_with<execution::parallel_policy, ::cuda::mr::get_memory_resource_t>);
+  static_assert(
+    !execution::__queryable_with<execution::parallel_unsequenced_policy, ::cuda::mr::get_memory_resource_t>);
+  static_assert(!execution::__queryable_with<execution::unsequenced_policy, ::cuda::mr::get_memory_resource_t>);
+
+  test(cuda::execution::__cub_par_unseq);
+
+  // Repeat the same checks with a policy that already has a stream attached
+  test(cuda::execution::__cub_par_unseq.set_stream(::cuda::stream{cuda::device_ref{0}}));
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // the policy members under test are host-only, so run on the host only
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp
@@ -22,12 +22,6 @@
 template <class Policy>
 void test(Policy pol)
 {
-  namespace execution = cuda::std::execution;
-  static_assert(!execution::__queryable_with<execution::sequenced_policy, ::cuda::get_stream_t>);
-  static_assert(!execution::__queryable_with<execution::parallel_policy, ::cuda::get_stream_t>);
-  static_assert(!execution::__queryable_with<execution::parallel_unsequenced_policy, ::cuda::get_stream_t>);
-  static_assert(!execution::__queryable_with<execution::unsequenced_policy, ::cuda::get_stream_t>);
-
   { // Ensure that the plain policy returns a well defined stream
     cuda::stream_ref expected_stream{cudaStreamPerThread};
     assert(cuda::get_stream(pol) == expected_stream);
@@ -41,7 +35,6 @@ void test(Policy pol)
     using stream_policy_t = decltype(pol_with_stream);
     static_assert(noexcept(pol.set_stream(stream)));
     static_assert(cuda::std::is_execution_policy_v<stream_policy_t>);
-    static_assert(cuda::std::is_base_of_v<cuda::std::execution::__policy_stream_holder<true>, stream_policy_t>);
   }
 
   { // Ensure that attaching a stream multiple times just overwrites the old stream
@@ -68,6 +61,9 @@ void test()
   static_assert(!execution::__queryable_with<execution::unsequenced_policy, ::cuda::get_stream_t>);
 
   test(cuda::execution::__cub_par_unseq);
+
+  // Ensure that all works even if we have a memory resource
+  test(cuda::execution::__cub_par_unseq.set_memory_resource(::cuda::device_default_memory_pool(::cuda::device_ref{0})));
 }
 
 int main(int, char**)