cuda.h
/*
 * This file is a part of TiledArray.
 * Copyright (C) 2018 Virginia Tech
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Chong Peng
 * Department of Chemistry, Virginia Tech
 * July 23, 2018
 *
 */

#ifndef TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED
#define TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <TiledArray/config.h>

#ifdef TILEDARRAY_HAS_CUDA

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvToolsExt.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>

// for memory management
#include <umpire/Umpire.hpp>
#include <umpire/strategy/DynamicPool.hpp>
#include <umpire/strategy/SizeLimiter.hpp>
#include <umpire/strategy/ThreadSafeAllocator.hpp>

#include <madness/world/print.h>
#include <madness/world/safempi.h>
#include <madness/world/thread.h>

#include <TiledArray/error.h>

#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaSafeCallNoThrow(err) __cudaSafeCallNoThrow(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

inline void __cudaSafeCall(cudaError err, const char* file, const int line) {
#ifdef TILEDARRAY_CHECK_CUDA_ERROR
  if (cudaSuccess != err) {
    std::stringstream ss;
    ss << "cudaSafeCall() failed at: " << file << ":" << line;
    std::string what = ss.str();
    throw thrust::system_error(err, thrust::cuda_category(), what);
  }
#endif
}

inline void __cudaSafeCallNoThrow(cudaError err, const char* file,
                                  const int line) {
#ifdef TILEDARRAY_CHECK_CUDA_ERROR
  if (cudaSuccess != err) {
    madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line);
  }
#endif
}

inline void __cudaCheckError(const char* file, const int line) {
#ifdef TILEDARRAY_CHECK_CUDA_ERROR
  cudaError err = cudaGetLastError();
  if (cudaSuccess != err) {
    std::stringstream ss;
    ss << "cudaCheckError() failed at: " << file << ":" << line;
    std::string what = ss.str();
    throw thrust::system_error(err, thrust::cuda_category(), what);
  }
#endif
}
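
// Example usage (a minimal sketch; `buf`, `nbytes`, and `my_kernel` are
// hypothetical): wrap CUDA runtime calls in CudaSafeCall() and call
// CudaCheckError() right after a kernel launch (a launch does not return a
// cudaError_t itself). Both checks are no-ops unless
// TILEDARRAY_CHECK_CUDA_ERROR is defined.
//
//   void* buf = nullptr;
//   CudaSafeCall(cudaMalloc(&buf, nbytes));  // throws thrust::system_error on failure
//   my_kernel<<<grid, block>>>(/* ... */);
//   CudaCheckError();                        // surfaces launch errors via cudaGetLastError()
//   CudaSafeCall(cudaFree(buf));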

namespace TiledArray {

namespace detail {

inline std::pair<int, int> mpi_local_rank_size(World& world) {
  auto host_comm =
      world.mpi.comm().Split_type(SafeMPI::Intracomm::SHARED_SPLIT_TYPE, 0);
  return std::make_pair(host_comm.Get_rank(), host_comm.Get_size());
}

/// queries the number of compute streams from the TA_CUDA_NUM_STREAMS
/// environment variable; defaults to 3 if it is not set
inline int num_cuda_streams() {
  int num_streams = -1;
  char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS");
  if (num_stream_char) {
    num_streams = std::atoi(num_stream_char);
  } else {
    num_streams = 3;
  }
  return num_streams;
}

inline int num_cuda_devices() {
  int num_devices = -1;
  CudaSafeCall(cudaGetDeviceCount(&num_devices));
  return num_devices;
}

inline int current_cuda_device_id(World& world) {
  int mpi_local_size = -1;
  int mpi_local_rank = -1;
  std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world);

  int num_devices = detail::num_cuda_devices();

  int cuda_device_id = -1;
  // devices may already be pre-mapped
  // if mpi_local_size <= num_devices: all ranks are in the same resource set,
  // map round robin
  if (mpi_local_size <= num_devices) {
    cuda_device_id = mpi_local_rank % num_devices;
  } else {  // mpi_local_size > num_devices
    char* cvd_cstr = std::getenv("CUDA_VISIBLE_DEVICES");
    if (cvd_cstr) {  // CUDA_VISIBLE_DEVICES is set, assume devices are pre-mapped
      // make sure that there is only 1 device available here
      if (num_devices != 1) {
        throw std::runtime_error(
            std::string(
                "CUDA_VISIBLE_DEVICES environment variable is set, hence "
                "using the provided device-to-rank mapping; BUT TiledArray "
                "found ") +
            std::to_string(num_devices) +
            " CUDA devices; only 1 CUDA device / MPI process is supported");
      }
      cuda_device_id = 0;
    } else {  // not enough devices + devices are not pre-mapped
      throw std::runtime_error(
          std::string("TiledArray found ") + std::to_string(mpi_local_size) +
          " MPI ranks on a node with " + std::to_string(num_devices) +
          " CUDA devices; only 1 MPI process / CUDA device model is "
          "currently supported");
    }
  }

  return cuda_device_id;
}
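
// Example of the mapping above (hypothetical node layouts): with 2 local MPI
// ranks and 4 visible devices, ranks 0 and 1 map round robin to devices 0 and
// 1; with 4 local ranks each restricted to a single device via
// CUDA_VISIBLE_DEVICES (e.g. by a job scheduler), every rank uses device 0 of
// its own visible set; 4 local ranks sharing 1 unrestricted device is an
// error.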

inline void CUDART_CB cuda_readyflag_callback(void* userData) {
  // convert void* to std::atomic<bool>
  std::atomic<bool>* flag = static_cast<std::atomic<bool>*>(userData);
  // set the flag to be true
  flag->store(true);
}

struct ProbeFlag {
  ProbeFlag(std::atomic<bool>* f) : flag(f) {}

  bool operator()() const { return flag->load(); }

  std::atomic<bool>* flag;
};

inline void thread_wait_cuda_stream(const cudaStream_t& stream) {
  std::atomic<bool>* flag = new std::atomic<bool>(false);

  CudaSafeCall(
      cudaLaunchHostFunc(stream, detail::cuda_readyflag_callback, flag));

  detail::ProbeFlag probe(flag);

  // wait with sleep and do not do work
  madness::ThreadPool::await(probe, false, true);
  // madness::ThreadPool::await(probe, true, true);

  delete flag;
}

}  // namespace detail

inline const cudaStream_t*& tls_cudastream_accessor() {
  static thread_local const cudaStream_t* thread_local_stream_ptr{nullptr};
  return thread_local_stream_ptr;
}

inline void synchronize_stream(const cudaStream_t* stream) {
  tls_cudastream_accessor() = stream;
}

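// A minimal usage sketch (`device_op`, `grid`, and `block` are hypothetical):
// a task that enqueues asynchronous device work records the stream it used via
// synchronize_stream(); later code on the same thread can fetch the stream
// from tls_cudastream_accessor() and wait on it (e.g. with
// detail::thread_wait_cuda_stream) before touching the results.
//
//   void run_device_op(const cudaStream_t& s) {
//     device_op<<<grid, block, 0, s>>>(/* ... */);
//     TiledArray::synchronize_stream(&s);  // publish the stream used by this thread
//   }
//
//   // ... later, on the same thread:
//   if (const cudaStream_t* s = TiledArray::tls_cudastream_accessor()) {
//     TiledArray::detail::thread_wait_cuda_stream(*s);
//     TiledArray::synchronize_stream(nullptr);  // reset the thread-local marker
//   }
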
/// cudaEnv maintains the CUDA-related part of the global TiledArray state on
/// this rank: the set of CUDA streams and the Umpire memory pools
class cudaEnv {
 public:
  ~cudaEnv() {
    // destroy cuda streams on current device
    for (auto& stream : cuda_streams_) {
      CudaSafeCallNoThrow(cudaStreamDestroy(stream));
    }
  }

  cudaEnv(cudaEnv& cuda_global) = delete;

  cudaEnv operator=(cudaEnv& cuda_global) = delete;

  /// access the singleton instance; initializes it with the default world on
  /// first use
  static std::unique_ptr<cudaEnv>& instance() {
    static std::unique_ptr<cudaEnv> instance_{nullptr};
    if (!instance_) {
      initialize(instance_, TiledArray::get_default_world());
    }
    return instance_;
  }

  /// initialize the singleton using the given world; no-op if already
  /// initialized
  static void initialize(std::unique_ptr<cudaEnv>& instance, World& world) {
    // initialize only when not initialized
    if (instance == nullptr) {
      int num_streams = detail::num_cuda_streams();
      int num_devices = detail::num_cuda_devices();
      int device_id = detail::current_cuda_device_id(world);
      // set device for current MPI process .. will be set in the ctor as well
      CudaSafeCall(cudaSetDevice(device_id));
      CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));

      // uncomment to debug umpire ops
      //
      // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel(
      //     umpire::util::message::Debug);

      // make thread-safe UM and DEVICE dynamic pools

      auto& rm = umpire::ResourceManager::getInstance();

      auto mem_total_free = cudaEnv::memory_total_and_free_device();

      // turn off Umpire introspection for non-Debug builds
#ifndef NDEBUG
      constexpr auto introspect = true;
#else
      constexpr auto introspect = false;
#endif

      // start the UM pool with all of the currently free device memory;
      // subsequent allocations grow it by 1/10 of that amount
      auto alloc_grain = mem_total_free.second / 10;
      auto um_dynamic_pool =
          rm.makeAllocator<umpire::strategy::DynamicPool, introspect>(
              "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second,
              alloc_grain);
      auto thread_safe_um_dynamic_pool =
          rm.makeAllocator<umpire::strategy::ThreadSafeAllocator, introspect>(
              "ThreadSafeUMDynamicPool", um_dynamic_pool);

      // the DEVICE pool starts empty (capped at the total device memory) and
      // grows with the same grain
      auto dev_size_limited_alloc =
          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
              "size_limited_alloc", rm.getAllocator("DEVICE"),
              mem_total_free.first);
      auto dev_dynamic_pool =
          rm.makeAllocator<umpire::strategy::DynamicPool, introspect>(
              "CUDADynamicPool", dev_size_limited_alloc, 0, alloc_grain);
      auto thread_safe_dev_dynamic_pool =
          rm.makeAllocator<umpire::strategy::ThreadSafeAllocator, introspect>(
              "ThreadSafeCUDADynamicPool", dev_dynamic_pool);

      auto cuda_env = std::unique_ptr<cudaEnv>(new cudaEnv(
          world, num_devices, device_id, num_streams,
          thread_safe_um_dynamic_pool, thread_safe_dev_dynamic_pool));
      instance = std::move(cuda_env);
    }
  }

  World& world() const { return *world_; }

  int num_cuda_devices() const { return num_cuda_devices_; }

  int current_cuda_device_id() const { return current_cuda_device_id_; }

  int num_cuda_streams() const { return num_cuda_streams_; }

  bool concurrent_managed_access() const {
    return cuda_device_concurrent_managed_access_;
  }

  size_t stream_id(const cudaStream_t& stream) const {
    auto it = std::find(cuda_streams_.begin(), cuda_streams_.end(), stream);
    if (it == cuda_streams_.end()) abort();
    return it - cuda_streams_.begin();
  }

  /// @return {total, free} device memory, in bytes, on the current device
  static std::pair<size_t, size_t> memory_total_and_free_device() {
    std::pair<size_t, size_t> result;
    // N.B. cudaMemGetInfo returns {free,total}
    CudaSafeCall(cudaMemGetInfo(&result.second, &result.first));
    return result;
  }

  /// @return a vector of {total, free} device memory, in bytes, indexed by
  /// the rank that reported it (collective over this world)
  std::vector<std::pair<size_t, size_t>> memory_total_and_free() const {
    auto world_size = world_->size();
    std::vector<size_t> total_memory(world_size, 0), free_memory(world_size, 0);
    auto rank = world_->rank();
    std::tie(total_memory.at(rank), free_memory.at(rank)) =
        cudaEnv::memory_total_and_free_device();
    world_->gop.sum(total_memory.data(), total_memory.size());
    world_->gop.sum(free_memory.data(), free_memory.size());
    std::vector<std::pair<size_t, size_t>> result(world_size);
    for (int r = 0; r != world_size; ++r) {
      result.at(r) = {total_memory.at(r), free_memory.at(r)};
    }
    return result;
  }

  const cudaStream_t& cuda_stream(std::size_t i) const {
    return cuda_streams_.at(i);
  }

  const cudaStream_t& cuda_stream_h2d() const {
    return cuda_streams_[num_cuda_streams_];
  }

  const cudaStream_t& cuda_stream_d2h() const {
    return cuda_streams_[num_cuda_streams_ + 1];
  }

  umpire::Allocator& um_dynamic_pool() { return um_dynamic_pool_; }

  umpire::Allocator& device_dynamic_pool() { return device_dynamic_pool_; }

 protected:
  cudaEnv(World& world, int num_devices, int device_id, int num_streams,
          umpire::Allocator um_alloc, umpire::Allocator device_alloc)
      : world_(&world),
        um_dynamic_pool_(um_alloc),
        device_dynamic_pool_(device_alloc),
        num_cuda_devices_(num_devices),
        current_cuda_device_id_(device_id),
        num_cuda_streams_(num_streams) {
    if (num_devices <= 0) {
      throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
    }

    // set device for current MPI process
    CudaSafeCall(cudaSetDevice(current_cuda_device_id_));

    // the device must support Unified (managed) Memory
    cudaDeviceProp prop;
    CudaSafeCall(cudaGetDeviceProperties(&prop, device_id));
    if (!prop.managedMemory) {
      throw std::runtime_error("CUDA Device doesn't support managedMemory\n");
    }
    int concurrent_managed_access;
    CudaSafeCall(cudaDeviceGetAttribute(&concurrent_managed_access,
                                        cudaDevAttrConcurrentManagedAccess,
                                        device_id));
    cuda_device_concurrent_managed_access_ = concurrent_managed_access;
    if (!cuda_device_concurrent_managed_access_) {
      std::cout << "\nWarning: CUDA Device doesn't support "
                   "ConcurrentManagedAccess!\n\n";
    }

    // create the cuda streams on the current device: num_cuda_streams_
    // compute streams plus 2 dedicated I/O (h2d and d2h) streams
    cuda_streams_.resize(num_cuda_streams_ + 2);
    for (auto& stream : cuda_streams_) {
      CudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    }
    std::cout << "created " << num_cuda_streams_
              << " CUDA streams + 2 I/O streams" << std::endl;
  }
388  private:
389  // the world used to initialize this
390  World* world_;
391 
393  umpire::Allocator um_dynamic_pool_;
395  umpire::Allocator device_dynamic_pool_;
396 
397  int num_cuda_devices_;
398  int current_cuda_device_id_;
399  bool cuda_device_concurrent_managed_access_;
400 
401  int num_cuda_streams_;
402  std::vector<cudaStream_t> cuda_streams_;
403 };
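
// A minimal usage sketch (`nbytes` is hypothetical; the singleton is
// initialized lazily on first access):
//
//   auto& env = *TiledArray::cudaEnv::instance();
//   const cudaStream_t& s = env.cuda_stream(0);      // one of the compute streams
//   umpire::Allocator& um = env.um_dynamic_pool();   // Unified Memory pool
//   void* buf = um.allocate(nbytes);
//   // ... enqueue work that uses buf on stream s ...
//   um.deallocate(buf);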

namespace detail {

template <typename Range>
const cudaStream_t& get_stream_based_on_range(const Range& range) {
  // TODO better way to get stream based on the id of tensor
  auto stream_id = range.offset() % cudaEnv::instance()->num_cuda_streams();
  auto& stream = cudaEnv::instance()->cuda_stream(stream_id);
  return stream;
}

}  // namespace detail

namespace nvidia {

// Color definitions for nvtx calls
enum class argbColor : uint32_t {
  red = 0xFFFF0000,
  blue = 0xFF0000FF,
  green = 0xFF008000,
  yellow = 0xFFFFFF00,
  cyan = 0xFF00FFFF,
  magenta = 0xFFFF00FF,
  gray = 0xFF808080,
  purple = 0xFF800080
};

/// pushes an NVTX range with the given title and color onto the profiler's
/// range stack
inline void range_push(const char* range_title, argbColor range_color) {
  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = static_cast<uint32_t>(range_color);
  eventAttrib.message.ascii = range_title;
  nvtxRangePushEx(&eventAttrib);
}

/// pops the innermost NVTX range
inline void range_pop() { nvtxRangePop(); }
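
// A minimal usage sketch: bracket a region of interest so it shows up as a
// named, colored range in NVIDIA profiler timelines (the range title is
// arbitrary):
//
//   TiledArray::nvidia::range_push("contract_tile",
//                                  TiledArray::nvidia::argbColor::green);
//   // ... launch kernels / do work to be profiled ...
//   TiledArray::nvidia::range_pop();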

}  // namespace nvidia

}  // namespace TiledArray

#endif  // TILEDARRAY_HAS_CUDA

#endif  // TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED