#ifndef TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED
#define TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED

#include <TiledArray/config.h>

#ifdef TILEDARRAY_HAS_CUDA
#include <algorithm>
#include <atomic>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <cuda_runtime.h>
#include <nvToolsExt.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <umpire/Umpire.hpp>
#include <umpire/strategy/DynamicPool.hpp>
#include <umpire/strategy/SizeLimiter.hpp>
#include <umpire/strategy/ThreadSafeAllocator.hpp>

#include <madness/world/print.h>
#include <madness/world/safempi.h>
#include <madness/world/thread.h>
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaSafeCallNoThrow(err) __cudaSafeCallNoThrow(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)
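// A minimal usage sketch: wrap every CUDA runtime call in CudaSafeCall so
// that failures are reported with the file/line of the call site (checks are
// compiled in only when TILEDARRAY_CHECK_CUDA_ERROR is defined), e.g.
//
//   void* buf = nullptr;
//   CudaSafeCall(cudaMalloc(&buf, 1024));
//   CudaSafeCall(cudaFree(buf));
//   CudaCheckError();  // probe for errors from preceding asynchronous work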
/// Checks the result of a CUDA runtime call; throws on error if
/// TILEDARRAY_CHECK_CUDA_ERROR is defined, otherwise is a no-op.
inline void __cudaSafeCall(cudaError err, const char* file, const int line) {
#ifdef TILEDARRAY_CHECK_CUDA_ERROR
  if (cudaSuccess != err) {
    std::stringstream ss;
    ss << "cudaSafeCall() failed at: " << file << ":" << line;
    std::string what = ss.str();
    throw thrust::system_error(err, thrust::cuda_category(), what);
  }
#endif
}
/// Like __cudaSafeCall(), but prints the error instead of throwing;
/// safe to use in destructors.
inline void __cudaSafeCallNoThrow(cudaError err, const char* file,
                                  const int line) {
#ifdef TILEDARRAY_CHECK_CUDA_ERROR
  if (cudaSuccess != err) {
    madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line);
  }
#endif
}
/// Checks for a pending CUDA error (e.g. from an earlier asynchronous
/// launch); throws if one is found and TILEDARRAY_CHECK_CUDA_ERROR is defined.
inline void __cudaCheckError(const char* file, const int line) {
#ifdef TILEDARRAY_CHECK_CUDA_ERROR
  cudaError err = cudaGetLastError();
  if (cudaSuccess != err) {
    std::stringstream ss;
    ss << "cudaCheckError() failed at: " << file << ":" << line;
    std::string what = ss.str();
    throw thrust::system_error(err, thrust::cuda_category(), what);
  }
#endif
}

namespace TiledArray {

namespace detail {
/// @return {rank, size} of this process within its node-local communicator
inline std::pair<int, int> mpi_local_rank_size(World& world) {
  auto host_comm =
      world.mpi.comm().Split_type(SafeMPI::Intracomm::SHARED_SPLIT_TYPE, 0);
  return std::make_pair(host_comm.Get_rank(), host_comm.Get_size());
}
/// @return the number of CUDA streams to create, read from the
/// TA_CUDA_NUM_STREAMS environment variable (default: 3)
inline int num_cuda_streams() {
  int num_streams = -1;
  char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS");
  if (num_stream_char) {
    num_streams = std::atoi(num_stream_char);
  } else {
    num_streams = 3;  // default number of streams
  }
  return num_streams;
}
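// A usage sketch (assuming a POSIX shell): the stream count can be tuned at
// run time without recompiling, e.g.
//
//   export TA_CUDA_NUM_STREAMS=5
//   mpirun -n 2 ./my_ta_app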
/// @return the number of CUDA devices visible to this process
inline int num_cuda_devices() {
  int num_devices = -1;
  CudaSafeCall(cudaGetDeviceCount(&num_devices));
  return num_devices;
}
/// Maps MPI ranks to CUDA devices: each node-local rank gets its own device.
/// Only the 1 MPI process / 1 CUDA device model is currently supported.
inline int current_cuda_device_id(World& world) {
  int mpi_local_size = -1;
  int mpi_local_rank = -1;
  std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world);

  int num_devices = detail::num_cuda_devices();

  int cuda_device_id = -1;
  // enough devices for every node-local rank: map ranks to devices round-robin
  if (mpi_local_size <= num_devices) {
    cuda_device_id = mpi_local_rank % num_devices;
  } else {  // more node-local ranks than devices
    char* cvd_cstr = std::getenv("CUDA_VISIBLE_DEVICES");
    if (cvd_cstr) {  // CUDA_VISIBLE_DEVICES is set, assume devices are
                     // pre-mapped to ranks; each rank must then see exactly
                     // one device
      if (num_devices != 1) {
        throw std::runtime_error(
            std::string(
                "CUDA_VISIBLE_DEVICES environment variable is set, hence using "
                "the provided device-to-rank mapping; BUT TiledArray found ") +
            std::to_string(num_devices) +
            " CUDA devices; only 1 CUDA device / MPI process is supported");
      }
      cuda_device_id = 0;
    } else {  // devices are oversubscribed and not pre-mapped
      throw std::runtime_error(
          std::string("TiledArray found ") + std::to_string(mpi_local_size) +
          " MPI ranks on a node with " + std::to_string(num_devices) +
          " CUDA devices; only 1 MPI process / CUDA device model is currently "
          "supported");
    }
  }

  return cuda_device_id;
}
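// Worked example of the mapping above: with 4 node-local ranks and 4 visible
// devices, ranks 0..3 get devices 0..3; with 6 ranks and 2 devices, either
// CUDA_VISIBLE_DEVICES must pre-map exactly one device per rank, or
// initialization throws.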
/// Host-side callback enqueued on a stream: flips the flag once all
/// preceding work on the stream has completed.
inline void CUDART_CB cuda_readyflag_callback(void* userData) {
  // convert void* to std::atomic<bool>
  std::atomic<bool>* flag = static_cast<std::atomic<bool>*>(userData);
  // set the flag to true to signal completion
  flag->store(true);
}

/// Callable probe for madness::ThreadPool::await(): true once the flag is set
struct ProbeFlag {
  ProbeFlag(std::atomic<bool>* f) : flag(f) {}

  bool operator()() const { return flag->load(); }

  std::atomic<bool>* flag;
};
/// Blocks the calling MADNESS thread until all work submitted to @p stream
/// has completed, without busy-waiting on the device.
inline void thread_wait_cuda_stream(const cudaStream_t& stream) {
  std::atomic<bool>* flag = new std::atomic<bool>(false);

  CudaSafeCall(
      cudaLaunchHostFunc(stream, detail::cuda_readyflag_callback, flag));

  detail::ProbeFlag probe(flag);

  // wait with sleep and do not do work while waiting
  madness::ThreadPool::await(probe, false, true);

  delete flag;
}
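// A minimal usage sketch (the stream choice is illustrative): launch
// asynchronous work on a stream, then park the current MADNESS task until the
// stream drains, instead of blocking the OS thread in cudaStreamSynchronize():
//
//   const auto& s = TiledArray::cudaEnv::instance()->cuda_stream(0);
//   // ... enqueue kernels / cudaMemcpyAsync on s ...
//   TiledArray::detail::thread_wait_cuda_stream(s);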
}  // namespace detail

/// @return a (mutable) reference to the thread-local pointer to the current
/// CUDA stream; nullptr means "no stream to synchronize against"
inline const cudaStream_t*& tls_cudastream_accessor() {
  static thread_local const cudaStream_t* thread_local_stream_ptr{nullptr};
  return thread_local_stream_ptr;
}

/// Records @p stream as the stream the current thread's result depends on
inline void synchronize_stream(const cudaStream_t* stream) {
  tls_cudastream_accessor() = stream;
}
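// A usage sketch of the thread-local handoff (the task body is illustrative):
// a task that produces its result asynchronously records the stream it used,
// and the runtime later reads the pointer back via tls_cudastream_accessor()
// to know which stream to synchronize before the result is consumed:
//
//   void my_device_task(/* ... */) {
//     const auto& s = TiledArray::cudaEnv::instance()->cuda_stream(0);
//     // ... enqueue asynchronous work on s ...
//     TiledArray::synchronize_stream(&s);  // publish the dependency
//   }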
/// cudaEnv maintains the CUDA-related part of the runtime environment,
/// such as the CUDA streams and CUDA-specific memory allocators.
/// \note this is a singleton
class cudaEnv {
 public:
  ~cudaEnv() {
    // destroy CUDA streams on the current device
    for (auto& stream : cuda_streams_) {
      CudaSafeCallNoThrow(cudaStreamDestroy(stream));
    }
  }

  cudaEnv(const cudaEnv&) = delete;
  cudaEnv& operator=(const cudaEnv&) = delete;
  /// access the singleton instance; if not yet initialized, initializes it
  /// via initialize() with the default world
  static std::unique_ptr<cudaEnv>& instance() {
    static std::unique_ptr<cudaEnv> instance_{nullptr};
    if (!instance_) {
      initialize(instance_, TiledArray::get_default_world());
    }
    return instance_;
  }
  /// initialize the singleton instance using explicit params
  static void initialize(std::unique_ptr<cudaEnv>& instance, World& world) {
    // initialize only once
    if (instance == nullptr) {
      int num_streams = detail::num_cuda_streams();
      int num_devices = detail::num_cuda_devices();
      int device_id = detail::current_cuda_device_id(world);
      // set the device for the current MPI process; also set in the ctor
      CudaSafeCall(cudaSetDevice(device_id));
      CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));

      // make thread-safe dynamically-sized pools for UM and device memory
      auto& rm = umpire::ResourceManager::getInstance();

      auto mem_total_free = cudaEnv::memory_total_and_free_device();

      // turn on Umpire introspection for non-Release builds
#ifndef NDEBUG
      constexpr auto introspect = true;
#else
      constexpr auto introspect = false;
#endif

      // the UM pool starts at all currently-free memory, growing in
      // 10%-of-free-memory increments
      auto alloc_grain = mem_total_free.second / 10;
      auto um_dynamic_pool =
          rm.makeAllocator<umpire::strategy::DynamicPool, introspect>(
              "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second,
              alloc_grain);
      auto thread_safe_um_dynamic_pool =
          rm.makeAllocator<umpire::strategy::ThreadSafeAllocator, introspect>(
              "ThreadSafeUMDynamicPool", um_dynamic_pool);

      // the device pool starts empty, uses the same grain for subsequent
      // allocations, and is capped at the total device memory
      auto dev_size_limited_alloc =
          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
              "size_limited_alloc", rm.getAllocator("DEVICE"),
              mem_total_free.first);
      auto dev_dynamic_pool =
          rm.makeAllocator<umpire::strategy::DynamicPool, introspect>(
              "CUDADynamicPool", dev_size_limited_alloc, 0, alloc_grain);
      auto thread_safe_dev_dynamic_pool =
          rm.makeAllocator<umpire::strategy::ThreadSafeAllocator, introspect>(
              "ThreadSafeCUDADynamicPool", dev_dynamic_pool);

      auto cuda_env = std::unique_ptr<cudaEnv>(new cudaEnv(
          world, num_devices, device_id, num_streams,
          thread_safe_um_dynamic_pool, thread_safe_dev_dynamic_pool));
      instance = std::move(cuda_env);
    }
  }
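  // A minimal usage sketch: the singleton is created lazily on first access,
  // so typical code just queries it:
  //
  //   auto& env = *TiledArray::cudaEnv::instance();
  //   std::cout << "rank " << env.world().rank() << " uses device "
  //             << env.current_cuda_device_id() << " with "
  //             << env.num_cuda_streams() << " compute streams\n";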
  World& world() const { return *world_; }

  int num_cuda_devices() const { return num_cuda_devices_; }

  int current_cuda_device_id() const { return current_cuda_device_id_; }

  int num_cuda_streams() const { return num_cuda_streams_; }

  bool concurrent_managed_access() const {
    return cuda_device_concurrent_managed_access_;
  }

  /// @return the position of @p stream among this env's streams; aborts if
  /// the stream is unknown
  size_t stream_id(const cudaStream_t& stream) const {
    auto it = std::find(cuda_streams_.begin(), cuda_streams_.end(), stream);
    if (it == cuda_streams_.end()) abort();
    return it - cuda_streams_.begin();
  }
  /// @return the {total, free} sizes of memory on the current device, in bytes
  static std::pair<size_t, size_t> memory_total_and_free_device() {
    std::pair<size_t, size_t> result;
    // N.B. cudaMemGetInfo reports {free, total}, so swap into {total, free}
    CudaSafeCall(cudaMemGetInfo(&result.second, &result.first));
    return result;
  }
  /// Collective call to probe device memory on every rank
  /// @return a vector of {total, free} memory sizes (in bytes), one entry per
  /// rank of this env's world
  std::vector<std::pair<size_t, size_t>> memory_total_and_free() const {
    auto world_size = world_->size();
    std::vector<size_t> total_memory(world_size, 0), free_memory(world_size, 0);
    auto rank = world_->rank();
    std::tie(total_memory.at(rank), free_memory.at(rank)) =
        cudaEnv::memory_total_and_free_device();
    world_->gop.sum(total_memory.data(), total_memory.size());
    world_->gop.sum(free_memory.data(), free_memory.size());
    std::vector<std::pair<size_t, size_t>> result(world_size);
    for (int r = 0; r != world_size; ++r) {
      result.at(r) = {total_memory.at(r), free_memory.at(r)};
    }
    return result;
  }
  const cudaStream_t& cuda_stream(std::size_t i) const {
    return cuda_streams_.at(i);
  }

  /// @return the stream dedicated to host-to-device transfers
  const cudaStream_t& cuda_stream_h2d() const {
    return cuda_streams_[num_cuda_streams_];
  }

  /// @return the stream dedicated to device-to-host transfers
  const cudaStream_t& cuda_stream_d2h() const {
    return cuda_streams_[num_cuda_streams_ + 1];
  }

  umpire::Allocator& um_dynamic_pool() { return um_dynamic_pool_; }

  umpire::Allocator& device_dynamic_pool() { return device_dynamic_pool_; }
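  // A minimal usage sketch (the size is illustrative): allocate and
  // deallocate through the UM pool; the pool recycles blocks instead of
  // hitting the underlying UM allocator on every request:
  //
  //   auto& um = TiledArray::cudaEnv::instance()->um_dynamic_pool();
  //   double* p = static_cast<double*>(um.allocate(1024 * sizeof(double)));
  //   // ... use p from host or device (managed memory) ...
  //   um.deallocate(p);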
 protected:
  cudaEnv(World& world, int num_devices, int device_id, int num_streams,
          umpire::Allocator um_alloc, umpire::Allocator device_alloc)
      : world_(&world),
        um_dynamic_pool_(um_alloc),
        device_dynamic_pool_(device_alloc),
        num_cuda_devices_(num_devices),
        current_cuda_device_id_(device_id),
        num_cuda_streams_(num_streams) {
    if (num_devices <= 0) {
      throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
    }

    // set the device for the current MPI process
    CudaSafeCall(cudaSetDevice(current_cuda_device_id_));

    // check the capabilities of the CUDA device
    cudaDeviceProp prop;
    CudaSafeCall(cudaGetDeviceProperties(&prop, device_id));
    if (!prop.managedMemory) {
      throw std::runtime_error("CUDA Device doesn't support managedMemory\n");
    }
    int concurrent_managed_access;
    CudaSafeCall(cudaDeviceGetAttribute(&concurrent_managed_access,
                                        cudaDevAttrConcurrentManagedAccess,
                                        device_id));
    cuda_device_concurrent_managed_access_ = concurrent_managed_access;
    if (!cuda_device_concurrent_managed_access_) {
      std::cout << "\nWarning: CUDA Device doesn't support "
                   "ConcurrentManagedAccess!\n\n";
    }
    // create CUDA streams on the current device: num_cuda_streams_ compute
    // streams plus 2 dedicated I/O (h2d/d2h) streams
    cuda_streams_.resize(num_cuda_streams_ + 2);
    for (auto& stream : cuda_streams_) {
      CudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    }
    std::cout << "created " << num_cuda_streams_
              << " CUDA streams + 2 I/O streams" << std::endl;
  }
 private:
  // the world used to initialize this
  World* world_;

  /// allocator backed by a thread-safe dynamically-sized pool of UM
  umpire::Allocator um_dynamic_pool_;
  /// allocator backed by a thread-safe dynamically-sized pool of device memory
  umpire::Allocator device_dynamic_pool_;

  int num_cuda_devices_;
  int current_cuda_device_id_;
  bool cuda_device_concurrent_managed_access_;

  int num_cuda_streams_;
  std::vector<cudaStream_t> cuda_streams_;
};
namespace detail {

/// Maps a tile's Range to one of the compute streams (round-robin by the
/// range's offset) so that work on different tiles spreads across streams
template <typename Range>
const cudaStream_t& get_stream_based_on_range(const Range& range) {
  // pick a stream deterministically from the range's offset
  auto stream_id = range.offset() % cudaEnv::instance()->num_cuda_streams();
  auto& stream = cudaEnv::instance()->cuda_stream(stream_id);
  return stream;
}

}  // namespace detail
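// A usage sketch (assuming a TiledArray::Range-like object with offset()):
//
//   TiledArray::Range range{10, 10};
//   const auto& s = TiledArray::detail::get_stream_based_on_range(range);
//   // ... enqueue work for this tile on s ...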
namespace nvidia {

// color values (standard ARGB encodings) for use with NVTX ranges
enum class argbColor : uint32_t {
  red = 0xFFFF0000,
  blue = 0xFF0000FF,
  green = 0xFF008000,
  yellow = 0xFFFFFF00,
  cyan = 0xFF00FFFF,
  magenta = 0xFFFF00FF,
  gray = 0xFF808080,
  purple = 0xFF800080
};
/// enters a profiling range by calling nvtxRangePushEx
/// \param[in] range_title a C string containing the range title
/// \param[in] range_color the range color
inline void range_push(const char* range_title, argbColor range_color) {
  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = static_cast<uint32_t>(range_color);
  eventAttrib.message.ascii = range_title;
  nvtxRangePushEx(&eventAttrib);
}

/// exits the current profiling range by calling nvtxRangePop
inline void range_pop() { nvtxRangePop(); }
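// A minimal usage sketch: bracket a region of interest so it shows up as a
// named, colored range in Nsight Systems / nvprof timelines:
//
//   TiledArray::nvidia::range_push("contraction",
//                                  TiledArray::nvidia::argbColor::magenta);
//   // ... work to be profiled ...
//   TiledArray::nvidia::range_pop();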
}  // namespace nvidia

}  // namespace TiledArray

#endif  // TILEDARRAY_HAS_CUDA

#endif  // TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED