btas_um_tensor.h
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2018 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Chong Peng
19  * Department of Chemistry, Virginia Tech
20  * July 24, 2018
21  *
22  */
23 
24 #ifndef TILEDARRAY_CUDA_CUDA_UM_TENSOR_H
25 #define TILEDARRAY_CUDA_CUDA_UM_TENSOR_H
26 
27 #include <tiledarray_fwd.h>
28 
30 
31 #ifdef TILEDARRAY_HAS_CUDA
32 
36 #include <TiledArray/tile.h>
37 
38 namespace TiledArray {
39 
40 namespace detail {
41 template <typename T, typename Range>
42 struct is_cuda_tile<
43  ::btas::Tensor<T, Range, TiledArray::cuda_um_btas_varray<T>>>
44  : public std::true_type {};
45 
46 template <typename T>
47 void to_cuda(const TiledArray::btasUMTensorVarray<T> &tile) {
48  cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id());
49  auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range());
50  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
51  tile.storage(), stream);
52 }
53 
54 } // end of namespace detail
55 
56 } // end of namespace TiledArray
57 
59 namespace madness {
60 namespace archive {
61 
62 template <class Archive, typename T>
63 struct ArchiveLoadImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
64  static inline void load(const Archive &ar,
65  TiledArray::btasUMTensorVarray<T> &t) {
66  TiledArray::Range range{};
67  TiledArray::cuda_um_btas_varray<T> store{};
68  ar &range &store;
69  t = TiledArray::btasUMTensorVarray<T>(std::move(range), std::move(store));
70  // cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id());
71  // auto &stream = TiledArray::detail::get_stream_based_on_range(range);
72  // TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(t.storage(),
73  // stream);
74  }
75 };
76 
77 template <class Archive, typename T>
78 struct ArchiveStoreImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
79  static inline void store(const Archive &ar,
80  const TiledArray::btasUMTensorVarray<T> &t) {
81  CudaSafeCall(cudaSetDevice(
82  TiledArray::cudaEnv::instance()->current_cuda_device_id()));
83  auto &stream = TiledArray::detail::get_stream_based_on_range(t.range());
84  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(t.storage(),
85  stream);
86  ar &t.range() & t.storage();
87  }
88 };
89 
90 } // namespace archive
91 } // namespace madness
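
The two archive specializations above migrate a tile's storage to the CPU before writing and rebuild the tile from a range plus a unified-memory varray when reading. A minimal sketch of exercising them through the ordinary MADNESS archive interface, assuming TiledArray's CUDA environment has already been initialized; the buffer size, header path, and function name are illustrative only:

#include <madness/world/buffer_archive.h>
#include <vector>

void serialize_roundtrip(const TiledArray::btasUMTensorVarray<double> &t) {
  std::vector<unsigned char> buf(1 << 20);  // in-memory archive buffer
  madness::archive::BufferOutputArchive oar(buf.data(), buf.size());
  oar & t;  // dispatches to ArchiveStoreImpl::store above
  madness::archive::BufferInputArchive iar(buf.data(), oar.size());
  TiledArray::btasUMTensorVarray<double> t2;
  iar & t2;  // dispatches to ArchiveLoadImpl::load above
}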
92 
93 namespace TiledArray {
97 
98 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
99 btasUMTensorVarray<T, Range> gemm(
100  const btasUMTensorVarray<T, Range> &left,
101  const btasUMTensorVarray<T, Range> &right, Scalar factor,
102  const TiledArray::math::GemmHelper &gemm_helper) {
103  return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper);
104 }
105 
106 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
107 void gemm(btasUMTensorVarray<T, Range> &result,
108  const btasUMTensorVarray<T, Range> &left,
109  const btasUMTensorVarray<T, Range> &right, Scalar factor,
110  const TiledArray::math::GemmHelper &gemm_helper) {
111  return btas_tensor_gemm_cuda_impl(result, left, right, factor, gemm_helper);
112 }
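
These gemm() overloads are normally reached through TiledArray's expression layer, but a direct call shows the contract compactly: both arguments and the result are unified-memory btas tensors, and the contraction pattern is described by a math::GemmHelper. A minimal sketch, assuming rank-2 (matrix-shaped) tiles; the helper arguments and function name are illustrative:

using UMTensorD = TiledArray::btasUMTensorVarray<double>;

UMTensorD matmul_tile(const UMTensorD &a, const UMTensorD &b) {
  // plain C = A*B over rank-2 tiles: no transposes, result/left/right rank = 2
  TiledArray::math::GemmHelper helper(madness::cblas::NoTrans,
                                      madness::cblas::NoTrans,
                                      /*result_rank=*/2,
                                      /*left_rank=*/2,
                                      /*right_rank=*/2);
  return TiledArray::gemm(a, b, 1.0, helper);
}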
113 
117 
118 template <typename T, typename Range>
119 btasUMTensorVarray<T, Range> clone(const btasUMTensorVarray<T, Range> &arg) {
120  // TODO how to copy Unified Memory? from CPU or GPU? currently
121  // always copy on GPU, but need to investigate
122  return btas_tensor_clone_cuda_impl(arg);
123 }
124 
128 template <typename T, typename Range, typename Index>
129 btasUMTensorVarray<T, Range> shift(const btasUMTensorVarray<T, Range> &arg,
130  const Index &range_shift) {
131  // make a copy of the old range
132  Range result_range(arg.range());
133  // shift the range
134  result_range.inplace_shift(range_shift);
135 
136  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
137 
138  // @important select the stream using the shifted range
139  auto &cuda_stream = detail::get_stream_based_on_range(result_range);
140 
141  typename btasUMTensorVarray<T, Range>::storage_type result_storage;
142 
143  make_device_storage(result_storage, result_range.volume(), cuda_stream);
144  btasUMTensorVarray<T, Range> result(std::move(result_range),
145  std::move(result_storage));
146 
147  // call cublasCopy
148  const auto &handle = cuBLASHandlePool::handle();
149  CublasSafeCall(cublasSetStream(handle, cuda_stream));
150 
151  CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()),
152  1, device_data(result.storage()), 1));
153 
154  synchronize_stream(&cuda_stream);
155  return result;
156 }
157 
161 template <typename T, typename Range, typename Index>
162 btasUMTensorVarray<T, Range>& shift_to(btasUMTensorVarray<T, Range> &arg,
163  const Index &range_shift) {
164  const_cast<Range &>(arg.range()).inplace_shift(range_shift);
165  return arg;
166 }
167 
171 
172 template <typename T, typename Range>
173 btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
174  const TiledArray::Permutation &perm) {
175  // compute result range
176  auto result_range = perm * arg.range();
177  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
178 
179  // compute the stream to use
180  auto &stream = detail::get_stream_based_on_range(result_range);
181 
182  // allocate result memory
183  typename btasUMTensorVarray<T, Range>::storage_type storage;
184  make_device_storage(storage, result_range.area(), stream);
185 
186  btasUMTensorVarray<T, Range> result(std::move(result_range),
187  std::move(storage));
188 
189  // invoke the permute function
190  cutt_permute(const_cast<T *>(device_data(arg.storage())),
191  device_data(result.storage()), arg.range(), perm, stream);
192 
193  synchronize_stream(&stream);
194 
195  return result;
196 }
197 
201 
202 template <typename T, typename Range, typename Scalar, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
203 btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
204  const Scalar factor) {
205  detail::to_cuda(arg);
206  return btas_tensor_scale_cuda_impl(arg, factor);
207 }
208 
209 template <typename T, typename Range, typename Scalar, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
210 btasUMTensorVarray<T, Range>& scale_to(btasUMTensorVarray<T, Range> &arg, const Scalar factor) {
211  detail::to_cuda(arg);
212  btas_tensor_scale_to_cuda_impl(arg, factor);
213  return arg;
214 }
215 
216 template <typename T, typename Range, typename Scalar, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
217 btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
218  const Scalar factor,
219  const Perm &perm) {
220  auto result = scale(arg, factor);
221 
222  // wait to finish before switch stream
223  auto stream = tls_cudastream_accessor();
224  cudaStreamSynchronize(*stream);
225 
226  return permute(result, perm);
227 }
228 
232 
233 template <typename T, typename Range>
234 btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg) {
235  detail::to_cuda(arg);
236  return btas_tensor_scale_cuda_impl(arg, T(-1.0));
237 }
238 
239 template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
240 btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg,
241  const Perm &perm) {
242  auto result = neg(arg);
243 
244  // wait to finish before switch stream
245  auto stream = tls_cudastream_accessor();
246  cudaStreamSynchronize(*stream);
247 
248  return permute(result, perm);
249 }
250 
251 template <typename T, typename Range>
252 btasUMTensorVarray<T, Range>& neg_to(btasUMTensorVarray<T, Range> &arg) {
253  detail::to_cuda(arg);
254  btas_tensor_scale_to_cuda_impl(arg, T(-1.0));
255  return arg;
256 }
257 
261 
262 template <typename T, typename Range>
263 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
264  const btasUMTensorVarray<T, Range> &arg2) {
265  detail::to_cuda(arg1);
266  detail::to_cuda(arg2);
267  return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0));
268 }
269 
270 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
271 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
272  const btasUMTensorVarray<T, Range> &arg2,
273  const Scalar factor) {
274  auto result = subt(arg1, arg2);
275  btas_tensor_scale_to_cuda_impl(result, factor);
276  return result;
277 }
278 
279 template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
280 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
281  const btasUMTensorVarray<T, Range> &arg2,
282  const Perm &perm) {
283  auto result = subt(arg1, arg2);
284 
285  // wait to finish before switch stream
286  auto stream = tls_cudastream_accessor();
287  cudaStreamSynchronize(*stream);
288 
289  return permute(result, perm);
290 }
291 
292 template <typename T, typename Scalar, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
293 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
294  const btasUMTensorVarray<T, Range> &arg2,
295  const Scalar factor,
296  const Perm &perm) {
297  auto result = subt(arg1, arg2, factor);
298 
299  // wait to finish before switch stream
300  auto stream = tls_cudastream_accessor();
301  cudaStreamSynchronize(*stream);
302 
303  return permute(result, perm);
304 }
305 
309 
310 template <typename T, typename Range>
311 btasUMTensorVarray<T, Range>& subt_to(btasUMTensorVarray<T, Range> &result,
312  const btasUMTensorVarray<T, Range> &arg1) {
313  detail::to_cuda(result);
314  detail::to_cuda(arg1);
315  btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0));
316  return result;
317 }
318 
319 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
320 btasUMTensorVarray<T, Range>& subt_to(btasUMTensorVarray<T, Range> &result,
321  const btasUMTensorVarray<T, Range> &arg1, const Scalar factor) {
322  subt_to(result, arg1);
323  btas_tensor_scale_to_cuda_impl(result, factor);
324  return result;
325 }
326 
330 
331 template <typename T, typename Range>
332 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
333  const btasUMTensorVarray<T, Range> &arg2) {
334  detail::to_cuda(arg1);
335  detail::to_cuda(arg2);
336  return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0));
337 }
338 
339 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
340 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
341  const btasUMTensorVarray<T, Range> &arg2,
342  const Scalar factor) {
343  auto result = add(arg1, arg2);
344  btas_tensor_scale_to_cuda_impl(result, factor);
345  return result;
346 }
347 
348 template <typename T, typename Scalar, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
349 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
350  const btasUMTensorVarray<T, Range> &arg2,
351  const Scalar factor,
352  const Perm &perm) {
353  auto result = add(arg1, arg2, factor);
354 
355  // wait to finish before switch stream
356  auto stream = tls_cudastream_accessor();
357  cudaStreamSynchronize(*stream);
358 
359  return permute(result, perm);
360 }
361 
362 template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
363 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
364  const btasUMTensorVarray<T, Range> &arg2,
365  const Perm &perm) {
366  auto result = add(arg1, arg2);
367 
368  // wait to finish before switch stream
369  auto stream = tls_cudastream_accessor();
370  cudaStreamSynchronize(*stream);
371 
372  return permute(result, perm);
373 }
374 
378 
379 template <typename T, typename Range>
380 btasUMTensorVarray<T, Range>& add_to(btasUMTensorVarray<T, Range> &result,
381  const btasUMTensorVarray<T, Range> &arg) {
382  detail::to_cuda(result);
383  detail::to_cuda(arg);
384  btas_tensor_add_to_cuda_impl(result, arg, T(1.0));
385  return result;
386 }
387 
388 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
389 btasUMTensorVarray<T, Range>& add_to(btasUMTensorVarray<T, Range> &result,
390  const btasUMTensorVarray<T, Range> &arg, const Scalar factor) {
391  add_to(result, arg);
392  btas_tensor_scale_to_cuda_impl(result, factor);
393  return result;
394 }
395 
399 template <typename T, typename Range>
400 typename btasUMTensorVarray<T, Range>::value_type dot(
401  const btasUMTensorVarray<T, Range> &arg1,
402  const btasUMTensorVarray<T, Range> &arg2) {
403  detail::to_cuda(arg1);
404  detail::to_cuda(arg2);
405  return btas_tensor_dot_cuda_impl(arg1, arg2);
406 }
407 
411 template <typename T, typename Range>
412 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
413  const btasUMTensorVarray<T, Range> &arg2) {
414  detail::to_cuda(arg1);
415  detail::to_cuda(arg2);
416  return btas_tensor_mult_cuda_impl(arg1, arg2);
417 }
418 
419 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
420 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
421  const btasUMTensorVarray<T, Range> &arg2,
422  const Scalar factor) {
423  auto result = mult(arg1, arg2);
424  btas_tensor_scale_to_cuda_impl(result, factor);
425  return result;
426 }
427 
428 template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
429 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
430  const btasUMTensorVarray<T, Range> &arg2,
431  const Perm &perm) {
432  auto result = mult(arg1, arg2);
433 
434  // wait to finish before switch stream
435  auto stream = tls_cudastream_accessor();
436  cudaStreamSynchronize(*stream);
437 
438  return permute(result, perm);
439 }
440 
441 template <typename T, typename Range, typename Scalar, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
442 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
443  const btasUMTensorVarray<T, Range> &arg2,
444  const Scalar factor,
445  const Perm &perm) {
446  auto result = mult(arg1, arg2, factor);
447 
448  // wait to finish before switch stream
449  auto stream = tls_cudastream_accessor();
450  cudaStreamSynchronize(*stream);
451 
452  return permute(result, perm);
453 }
454 
458 template <typename T, typename Range>
459 btasUMTensorVarray<T, Range>& mult_to(btasUMTensorVarray<T, Range> &result,
460  const btasUMTensorVarray<T, Range> &arg) {
461  detail::to_cuda(result);
462  detail::to_cuda(arg);
463  btas_tensor_mult_to_cuda_impl(result, arg);
464  return result;
465 }
466 
467 template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
468 btasUMTensorVarray<T, Range>& mult_to(btasUMTensorVarray<T, Range> &result,
469  const btasUMTensorVarray<T, Range> &arg, const Scalar factor) {
470  mult_to(result, arg);
471  btas_tensor_scale_to_cuda_impl(result, factor);
472  return result;
473 }
474 
478 
482 
483 template <typename T, typename Range>
484 typename btasUMTensorVarray<T, Range>::value_type squared_norm(
485  const btasUMTensorVarray<T, Range> &arg) {
486  detail::to_cuda(arg);
487  return btas_tensor_squared_norm_cuda_impl(arg);
488 }
489 
493 
494 template <typename T, typename Range>
495 typename btasUMTensorVarray<T, Range>::value_type norm(
496  const btasUMTensorVarray<T, Range> &arg) {
497  detail::to_cuda(arg);
498  return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg));
499 }
500 
504 template <typename T, typename Range>
505 typename btasUMTensorVarray<T, Range>::value_type trace(
506  const btasUMTensorVarray<T, Range> &arg) {
507  assert(false);
508 }
509 
513 template <typename T, typename Range>
514 typename btasUMTensorVarray<T, Range>::value_type sum(
515  const btasUMTensorVarray<T, Range> &arg) {
516  detail::to_cuda(arg);
517  return btas_tensor_sum_cuda_impl(arg);
518 }
519 
523 template <typename T, typename Range>
524 typename btasUMTensorVarray<T, Range>::value_type product(
525  const btasUMTensorVarray<T, Range> &arg) {
526  detail::to_cuda(arg);
527  return btas_tensor_product_cuda_impl(arg);
528 }
529 
533 template <typename T, typename Range>
534 typename btasUMTensorVarray<T, Range>::value_type max(
535  const btasUMTensorVarray<T, Range> &arg) {
536  detail::to_cuda(arg);
537  return btas_tensor_max_cuda_impl(arg);
538 }
539 
543 template <typename T, typename Range>
544 typename btasUMTensorVarray<T, Range>::value_type abs_max(
545  const btasUMTensorVarray<T, Range> &arg) {
546  detail::to_cuda(arg);
547  return btas_tensor_absmax_cuda_impl(arg);
548 }
549 
553 template <typename T, typename Range>
554 typename btasUMTensorVarray<T, Range>::value_type min(
555  const btasUMTensorVarray<T, Range> &arg) {
556  detail::to_cuda(arg);
557  return btas_tensor_min_cuda_impl(arg);
558 }
559 
563 template <typename T, typename Range>
564 typename btasUMTensorVarray<T, Range>::value_type abs_min(
565  const btasUMTensorVarray<T, Range> &arg) {
566  detail::to_cuda(arg);
567  return btas_tensor_absmin_cuda_impl(arg);
568 }
569 
571 template <typename UMTensor, typename Policy>
572 void to_host(
573  TiledArray::DistArray<TiledArray::Tile<UMTensor>, Policy> &um_array) {
574  auto to_host = [](TiledArray::Tile<UMTensor> &tile) {
575  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
576  auto &stream = detail::get_stream_based_on_range(tile.range());
577 
578  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
579  tile.tensor().storage(), stream);
580  };
581 
582  auto &world = um_array.world();
583 
584  auto start = um_array.pmap()->begin();
585  auto end = um_array.pmap()->end();
586 
587  for (; start != end; ++start) {
588  if (!um_array.is_zero(*start)) {
589  world.taskq.add(to_host, um_array.find(*start));
590  }
591  }
592 
593  world.gop.fence();
594  CudaSafeCall(cudaDeviceSynchronize());
595 };
596 
598 template <typename UMTensor, typename Policy>
599 void to_device(
600  TiledArray::DistArray<TiledArray::Tile<UMTensor>, Policy> &um_array) {
601  auto to_device = [](TiledArray::Tile<UMTensor> &tile) {
602  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
603  auto &stream = detail::get_stream_based_on_range(tile.range());
604 
605  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
606  tile.tensor().storage(), stream);
607  };
608 
609  auto &world = um_array.world();
610 
611  auto start = um_array.pmap()->begin();
612  auto end = um_array.pmap()->end();
613 
614  for (; start != end; ++start) {
615  if (!um_array.is_zero(*start)) {
616  world.taskq.add(to_device, um_array.find(*start));
617  }
618  }
619 
620  world.gop.fence();
621  CudaSafeCall(cudaDeviceSynchronize());
622 };
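
to_host() and to_device() prefetch every non-zero tile of a UM-backed DistArray to the CPU or CUDA execution space and fence before returning, so they act as collective, blocking migrations. A hedged sketch of the intended call pattern; the wrapper name and array type are assumptions:

// um_array: e.g. DistArray<Tile<btasUMTensorVarray<double>>, DensePolicy>
template <typename UMArray>
void inspect_on_host(UMArray &um_array) {
  TiledArray::to_host(um_array);    // migrate storage to CPU; fences world + device
  // ... read or modify tile data on the CPU here ...
  TiledArray::to_device(um_array);  // prefetch storage back to the GPU
}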
623 
625 template <typename UMTensor, typename TATensor, typename Policy>
626 typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
627  TiledArray::DistArray<TATensor, Policy>>::type
628 um_tensor_to_ta_tensor(
629  const TiledArray::DistArray<UMTensor, Policy> &um_array) {
630  const auto convert_tile_memcpy = [](const UMTensor &tile) {
631  TATensor result(tile.tensor().range());
632 
633  auto &stream = cudaEnv::instance()->cuda_stream_d2h();
634  CudaSafeCall(
635  cudaMemcpyAsync(result.data(), tile.data(),
636  tile.size() * sizeof(typename TATensor::value_type),
637  cudaMemcpyDefault, stream));
638  synchronize_stream(&stream);
639 
640  return result;
641  };
642 
643  const auto convert_tile_um = [](const UMTensor &tile) {
644  TATensor result(tile.tensor().range());
645  using std::begin;
646  const auto n = tile.tensor().size();
647 
648  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
649  auto &stream = detail::get_stream_based_on_range(tile.range());
650 
651  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
652  tile.tensor().storage(), stream);
653 
654  std::copy_n(tile.data(), n, result.data());
655 
656  return result;
657  };
658 
659  const char *use_legacy_conversion =
660  std::getenv("TA_CUDA_LEGACY_UM_CONVERSION");
661  auto ta_array = use_legacy_conversion
662  ? to_new_tile_type(um_array, convert_tile_um)
663  : to_new_tile_type(um_array, convert_tile_memcpy);
664 
665  um_array.world().gop.fence();
666  return ta_array;
667 }
668 
670 template <typename UMTensor, typename TATensor, typename Policy>
671 typename std::enable_if<std::is_same<UMTensor, TATensor>::value,
672  TiledArray::DistArray<TATensor, Policy>>::type
673 um_tensor_to_ta_tensor(
674  const TiledArray::DistArray<UMTensor, Policy> &um_array) {
675  return um_array;
676 }
677 
679 template <typename UMTensor, typename TATensor, typename Policy>
680 typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
681  TiledArray::DistArray<UMTensor, Policy>>::type
682 ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
683  auto convert_tile_memcpy = [](const TATensor &tile) {
685 
686  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
687 
688  using Tensor = typename UMTensor::tensor_type;
689 
690  auto &stream = cudaEnv::instance()->cuda_stream_h2d();
691  typename Tensor::storage_type storage;
692  make_device_storage(storage, tile.range().area(), stream);
693  Tensor result(tile.range(), std::move(storage));
694 
695  CudaSafeCall(
696  cudaMemcpyAsync(result.data(), tile.data(),
697  tile.size() * sizeof(typename Tensor::value_type),
698  cudaMemcpyDefault, stream));
699 
700  synchronize_stream(&stream);
701  return TiledArray::Tile<Tensor>(std::move(result));
702  };
703 
704  auto convert_tile_um = [](const TATensor &tile) {
706 
707  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
708 
709  using Tensor = typename UMTensor::tensor_type;
710  typename Tensor::storage_type storage(tile.range().area());
711 
712  Tensor result(tile.range(), std::move(storage));
713 
714  const auto n = tile.size();
715 
716  std::copy_n(tile.data(), n, result.data());
717 
718  auto &stream = detail::get_stream_based_on_range(result.range());
719 
720  // prefetch data to GPU
721  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
722  result.storage(), stream);
723 
724  return TiledArray::Tile<Tensor>(std::move(result));
725  };
726 
727  const char *use_legacy_conversion =
728  std::getenv("TA_CUDA_LEGACY_UM_CONVERSION");
729  auto um_array = use_legacy_conversion
730  ? to_new_tile_type(array, convert_tile_um)
731  : to_new_tile_type(array, convert_tile_memcpy);
732 
733  array.world().gop.fence();
734  return um_array;
735 }
736 
738 template <typename UMTensor, typename TATensor, typename Policy>
739 typename std::enable_if<std::is_same<UMTensor, TATensor>::value,
740  TiledArray::DistArray<UMTensor, Policy>>::type
741 ta_tensor_to_um_tensor(const TiledArray::DistArray<UMTensor, Policy> &array) {
742  return array;
743 }
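
The conversion helpers above change only the tile type of a DistArray; by default they copy tile data with cudaMemcpyAsync on the dedicated h2d/d2h streams, and fall back to an element-wise copy through unified memory when the TA_CUDA_LEGACY_UM_CONVERSION environment variable is set. A round-trip sketch, with the alias and function names chosen here purely for illustration:

using UMTile   = TiledArray::Tile<TiledArray::btasUMTensorVarray<double>>;
using HostTile = TiledArray::Tensor<double>;

auto round_trip(const TiledArray::DistArray<HostTile> &host_array) {
  // host tiles -> unified-memory tiles
  auto um_array = TiledArray::ta_tensor_to_um_tensor<UMTile, HostTile>(host_array);
  // unified-memory tiles -> host tiles
  return TiledArray::um_tensor_to_ta_tensor<UMTile, HostTile>(um_array);
}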
744 
745 } // namespace TiledArray
746 
747 #ifndef TILEDARRAY_HEADER_ONLY
748 
749 extern template class btas::varray<double,
750  TiledArray::cuda_um_allocator<double>>;
751 extern template class btas::varray<float, TiledArray::cuda_um_allocator<float>>;
752 extern template class btas::varray<int, TiledArray::cuda_um_allocator<int>>;
753 extern template class btas::varray<long, TiledArray::cuda_um_allocator<long>>;
754 
755 extern template class btas::Tensor<double, TiledArray::Range,
756  TiledArray::cuda_um_btas_varray<double>>;
757 extern template class btas::Tensor<float, TiledArray::Range,
758  TiledArray::cuda_um_btas_varray<float>>;
759 extern template class btas::Tensor<int, TiledArray::Range,
760  TiledArray::cuda_um_btas_varray<int>>;
761 extern template class btas::Tensor<long, TiledArray::Range,
762  TiledArray::cuda_um_btas_varray<long>>;
763 
764 extern template class TiledArray::Tile<btas::Tensor<
765  double, TiledArray::Range, TiledArray::cuda_um_btas_varray<double>>>;
766 extern template class TiledArray::Tile<btas::Tensor<
767  float, TiledArray::Range, TiledArray::cuda_um_btas_varray<float>>>;
768 extern template class TiledArray::Tile<
769  btas::Tensor<int, TiledArray::Range, TiledArray::cuda_um_btas_varray<int>>>;
770 extern template class TiledArray::Tile<btas::Tensor<
771  long, TiledArray::Range, TiledArray::cuda_um_btas_varray<long>>>;
772 
773 #endif // TILEDARRAY_HEADER_ONLY
774 
775 #endif // TILEDARRAY_HAS_CUDA
776 
777 #endif // TILEDARRAY_CUDA_CUDA_UM_TENSOR_H