#ifndef TILEDARRAY_CUDA_CUDA_UM_TENSOR_H
#define TILEDARRAY_CUDA_CUDA_UM_TENSOR_H

#ifdef TILEDARRAY_HAS_CUDA

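// Tile operations for TiledArray tensors backed by CUDA unified memory
// (btasUMTensorVarray, a btas::Tensor whose storage is a
// cuda_um_btas_varray).
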
template <typename T, typename Range>
struct is_cuda_tile<
    ::btas::Tensor<T, Range, TiledArray::cuda_um_btas_varray<T>>>
    : public std::true_type {};

template <typename T>
void to_cuda(const TiledArray::btasUMTensorVarray<T> &tile) {
  cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id());
  auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range());
  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
      tile.storage(), stream);
}

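// MADNESS serialization of btasUMTensorVarray: tiles are staged to the CPU
// execution space before being written to an archive, and are reconstructed
// into unified-memory storage on load.
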
template <class Archive, typename T>
struct ArchiveLoadImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
  static inline void load(const Archive &ar,
                          TiledArray::btasUMTensorVarray<T> &t) {
    TiledArray::Range range{};
    TiledArray::cuda_um_btas_varray<T> store{};
    ar &range &store;
    t = TiledArray::btasUMTensorVarray<T>(std::move(range), std::move(store));
  }
};

template <class Archive, typename T>
struct ArchiveStoreImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
  static inline void store(const Archive &ar,
                           const TiledArray::btasUMTensorVarray<T> &t) {
    CudaSafeCall(cudaSetDevice(
        TiledArray::cudaEnv::instance()->current_cuda_device_id()));
    auto &stream = TiledArray::detail::get_stream_based_on_range(t.range());
    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(t.storage(),
                                                                    stream);
    ar &t.range() & t.storage();
  }
};

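// Tile-level operations (gemm, clone, shift, permute, scale, neg, subt, add,
// mult, and reductions) for btasUMTensorVarray. These forward to the
// btas_tensor_*_cuda_impl kernels and are the entry points invoked by
// TiledArray expressions on device-resident tiles.
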
template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> gemm(
    const btasUMTensorVarray<T, Range> &left,
    const btasUMTensorVarray<T, Range> &right, Scalar factor,
    const TiledArray::math::GemmHelper &gemm_helper) {
  return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper);
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
void gemm(btasUMTensorVarray<T, Range> &result,
          const btasUMTensorVarray<T, Range> &left,
          const btasUMTensorVarray<T, Range> &right, Scalar factor,
          const TiledArray::math::GemmHelper &gemm_helper) {
  return btas_tensor_gemm_cuda_impl(result, left, right, factor, gemm_helper);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> clone(const btasUMTensorVarray<T, Range> &arg) {
  return btas_tensor_clone_cuda_impl(arg);
}

template <typename T, typename Range, typename Index>
btasUMTensorVarray<T, Range> shift(const btasUMTensorVarray<T, Range> &arg,
                                   const Index &range_shift) {
  Range result_range(arg.range());
  result_range.inplace_shift(range_shift);

  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
  auto &cuda_stream = detail::get_stream_based_on_range(result_range);

  // allocate device storage for the result and construct the shifted tensor
  typename btasUMTensorVarray<T, Range>::storage_type result_storage;
  make_device_storage(result_storage, result_range.area(), cuda_stream);
  btasUMTensorVarray<T, Range> result(std::move(result_range),
                                      std::move(result_storage));

  // copy the data into the result via cuBLAS
  const auto &handle = cuBLASHandlePool::handle();
  CublasSafeCall(cublasSetStream(handle, cuda_stream));
  CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()),
                            1, device_data(result.storage()), 1));

  synchronize_stream(&cuda_stream);
  return result;
}

template <typename T, typename Range, typename Index>
btasUMTensorVarray<T, Range> &shift_to(btasUMTensorVarray<T, Range> &arg,
                                       const Index &range_shift) {
  const_cast<Range &>(arg.range()).inplace_shift(range_shift);
  return arg;
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
                                     const TiledArray::Permutation &perm) {
  // compute the permuted range
  auto result_range = perm * arg.range();
  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
  auto &stream = detail::get_stream_based_on_range(result_range);

  // allocate device storage for the permuted result
  typename btasUMTensorVarray<T, Range>::storage_type storage;
  make_device_storage(storage, result_range.area(), stream);
  btasUMTensorVarray<T, Range> result(std::move(result_range),
                                      std::move(storage));

  // permute on the device via cuTT
  cutt_permute(const_cast<T *>(device_data(arg.storage())),
               device_data(result.storage()), arg.range(), perm, stream);

  synchronize_stream(&stream);
  return result;
}

template <typename T, typename Range, typename Scalar,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
                                   const Scalar factor) {
  detail::to_cuda(arg);
  return btas_tensor_scale_cuda_impl(arg, factor);
}

template <typename T, typename Range, typename Scalar,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> &scale_to(btasUMTensorVarray<T, Range> &arg,
                                       const Scalar factor) {
  detail::to_cuda(arg);
  btas_tensor_scale_to_cuda_impl(arg, factor);
  return arg;
}

template <typename T, typename Range, typename Scalar, typename Perm,
          typename = std::enable_if_t<
              TiledArray::detail::is_numeric_v<Scalar> &&
              TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
                                   const Scalar factor, const Perm &perm) {
  auto result = scale(arg, factor);

  // wait for the scale kernel to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_scale_cuda_impl(arg, T(-1.0));
}

template <typename T, typename Range, typename Perm,
          typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg,
                                 const Perm &perm) {
  auto result = neg(arg);

  // wait for the negation kernel to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> &neg_to(btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  btas_tensor_scale_to_cuda_impl(arg, T(-1.0));
  return arg;
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2) {
  detail::to_cuda(arg1);
  detail::to_cuda(arg2);
  return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0));
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Scalar factor) {
  auto result = subt(arg1, arg2);
  btas_tensor_scale_to_cuda_impl(result, factor);
  return result;
}

template <typename T, typename Range, typename Perm,
          typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Perm &perm) {
  auto result = subt(arg1, arg2);

  // wait for the subtraction to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Scalar, typename Range, typename Perm,
          typename = std::enable_if_t<
              TiledArray::detail::is_numeric_v<Scalar> &&
              TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Scalar factor, const Perm &perm) {
  auto result = subt(arg1, arg2, factor);

  // wait for the subtraction to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> &subt_to(
    btasUMTensorVarray<T, Range> &result,
    const btasUMTensorVarray<T, Range> &arg1) {
  detail::to_cuda(result);
  detail::to_cuda(arg1);
  btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0));
  return result;
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> &subt_to(
    btasUMTensorVarray<T, Range> &result,
    const btasUMTensorVarray<T, Range> &arg1, const Scalar factor) {
  subt_to(result, arg1);
  btas_tensor_scale_to_cuda_impl(result, factor);
  return result;
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                 const btasUMTensorVarray<T, Range> &arg2) {
  detail::to_cuda(arg1);
  detail::to_cuda(arg2);
  return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0));
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                 const btasUMTensorVarray<T, Range> &arg2,
                                 const Scalar factor) {
  auto result = add(arg1, arg2);
  btas_tensor_scale_to_cuda_impl(result, factor);
  return result;
}

template <typename T, typename Scalar, typename Range, typename Perm,
          typename = std::enable_if_t<
              TiledArray::detail::is_numeric_v<Scalar> &&
              TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                 const btasUMTensorVarray<T, Range> &arg2,
                                 const Scalar factor, const Perm &perm) {
  auto result = add(arg1, arg2, factor);

  // wait for the addition to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range, typename Perm,
          typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                 const btasUMTensorVarray<T, Range> &arg2,
                                 const Perm &perm) {
  auto result = add(arg1, arg2);

  // wait for the addition to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> &add_to(btasUMTensorVarray<T, Range> &result,
                                     const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(result);
  detail::to_cuda(arg);
  btas_tensor_add_to_cuda_impl(result, arg, T(1.0));
  return result;
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> &add_to(btasUMTensorVarray<T, Range> &result,
                                     const btasUMTensorVarray<T, Range> &arg,
                                     const Scalar factor) {
  add_to(result, arg);
  btas_tensor_scale_to_cuda_impl(result, factor);
  return result;
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type dot(
    const btasUMTensorVarray<T, Range> &arg1,
    const btasUMTensorVarray<T, Range> &arg2) {
  detail::to_cuda(arg1);
  detail::to_cuda(arg2);
  return btas_tensor_dot_cuda_impl(arg1, arg2);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2) {
  detail::to_cuda(arg1);
  detail::to_cuda(arg2);
  return btas_tensor_mult_cuda_impl(arg1, arg2);
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Scalar factor) {
  auto result = mult(arg1, arg2);
  btas_tensor_scale_to_cuda_impl(result, factor);
  return result;
}

template <typename T, typename Range, typename Perm,
          typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Perm &perm) {
  auto result = mult(arg1, arg2);

  // wait for the multiplication to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range, typename Scalar, typename Perm,
          typename = std::enable_if_t<
              TiledArray::detail::is_numeric_v<Scalar> &&
              TiledArray::detail::is_permutation_v<Perm>>>
btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Scalar factor, const Perm &perm) {
  auto result = mult(arg1, arg2, factor);

  // wait for the multiplication to finish before permuting
  auto stream = tls_cudastream_accessor();
  cudaStreamSynchronize(*stream);

  return permute(result, perm);
}

template <typename T, typename Range>
btasUMTensorVarray<T, Range> &mult_to(btasUMTensorVarray<T, Range> &result,
                                      const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(result);
  detail::to_cuda(arg);
  btas_tensor_mult_to_cuda_impl(result, arg);
  return result;
}

template <typename T, typename Scalar, typename Range,
          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
btasUMTensorVarray<T, Range> &mult_to(btasUMTensorVarray<T, Range> &result,
                                      const btasUMTensorVarray<T, Range> &arg,
                                      const Scalar factor) {
  mult_to(result, arg);
  btas_tensor_scale_to_cuda_impl(result, factor);
  return result;
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type squared_norm(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_squared_norm_cuda_impl(arg);
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type norm(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg));
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type trace(
    const btasUMTensorVarray<T, Range> &arg) {
  // trace is not implemented for UM tensors
  assert(false);
  return {};
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type sum(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_sum_cuda_impl(arg);
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type product(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_product_cuda_impl(arg);
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type max(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_max_cuda_impl(arg);
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type abs_max(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_absmax_cuda_impl(arg);
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type min(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_min_cuda_impl(arg);
}

template <typename T, typename Range>
typename btasUMTensorVarray<T, Range>::value_type abs_min(
    const btasUMTensorVarray<T, Range> &arg) {
  detail::to_cuda(arg);
  return btas_tensor_absmin_cuda_impl(arg);
}

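/// migrate every tile of a UM-backed DistArray to host (CPU) memory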
template <typename UMTensor, typename Policy>
void to_host(
    TiledArray::DistArray<TiledArray::Tile<UMTensor>, Policy> &um_array) {
  auto to_host = [](TiledArray::Tile<UMTensor> &tile) {
    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
    auto &stream = detail::get_stream_based_on_range(tile.range());

    // prefetch the tile data to the CPU execution space
    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
        tile.tensor().storage(), stream);
  };

  auto &world = um_array.world();

  auto start = um_array.pmap()->begin();
  auto end = um_array.pmap()->end();

  for (; start != end; ++start) {
    if (!um_array.is_zero(*start)) {
      world.taskq.add(to_host, um_array.find(*start));
    }
  }

  world.gop.fence();
  CudaSafeCall(cudaDeviceSynchronize());
}

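/// migrate every tile of a UM-backed DistArray to device (CUDA) memory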
template <typename UMTensor, typename Policy>
void to_device(
    TiledArray::DistArray<TiledArray::Tile<UMTensor>, Policy> &um_array) {
  auto to_device = [](TiledArray::Tile<UMTensor> &tile) {
    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
    auto &stream = detail::get_stream_based_on_range(tile.range());

    // prefetch the tile data to the CUDA execution space
    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
        tile.tensor().storage(), stream);
  };

  auto &world = um_array.world();

  auto start = um_array.pmap()->begin();
  auto end = um_array.pmap()->end();

  for (; start != end; ++start) {
    if (!um_array.is_zero(*start)) {
      world.taskq.add(to_device, um_array.find(*start));
    }
  }

  world.gop.fence();
  CudaSafeCall(cudaDeviceSynchronize());
}

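/// convert a DistArray of UM tiles into a DistArray of host TiledArray tensors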
template <typename UMTensor, typename TATensor, typename Policy>
typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
                        TiledArray::DistArray<TATensor, Policy>>::type
um_tensor_to_ta_tensor(
    const TiledArray::DistArray<UMTensor, Policy> &um_array) {
  // fast path: copy each tile with a single asynchronous device-to-host memcpy
  const auto convert_tile_memcpy = [](const UMTensor &tile) {
    TATensor result(tile.tensor().range());

    auto &stream = cudaEnv::instance()->cuda_stream_d2h();
    CudaSafeCall(
        cudaMemcpyAsync(result.data(), tile.data(),
                        tile.size() * sizeof(typename TATensor::value_type),
                        cudaMemcpyDefault, stream));
    synchronize_stream(&stream);

    return result;
  };

  // legacy path: prefetch the UM tile to the CPU and copy element-wise
  const auto convert_tile_um = [](const UMTensor &tile) {
    TATensor result(tile.tensor().range());

    const auto n = tile.tensor().size();

    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
    auto &stream = detail::get_stream_based_on_range(tile.range());

    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
        tile.tensor().storage(), stream);

    std::copy_n(tile.data(), n, result.data());

    return result;
  };

  const char *use_legacy_conversion =
      std::getenv("TA_CUDA_LEGACY_UM_CONVERSION");
  auto ta_array = use_legacy_conversion
                      ? to_new_tile_type(um_array, convert_tile_um)
                      : to_new_tile_type(um_array, convert_tile_memcpy);

  um_array.world().gop.fence();
  return ta_array;
}

template <typename UMTensor, typename TATensor, typename Policy>
typename std::enable_if<std::is_same<UMTensor, TATensor>::value,
                        TiledArray::DistArray<TATensor, Policy>>::type
um_tensor_to_ta_tensor(
    const TiledArray::DistArray<UMTensor, Policy> &um_array) {
  return um_array;
}

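/// convert a DistArray of host TiledArray tensors into a DistArray of UM tiles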
template <typename UMTensor, typename TATensor, typename Policy>
typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
                        TiledArray::DistArray<UMTensor, Policy>>::type
ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
  // fast path: copy each host tile with a single asynchronous host-to-device
  // memcpy; the UM tensor is wrapped into a TiledArray::Tile
  auto convert_tile_memcpy = [](const TATensor &tile) {
    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));

    using Tensor = typename UMTensor::tensor_type;

    auto &stream = cudaEnv::instance()->cuda_stream_h2d();
    typename Tensor::storage_type storage;
    make_device_storage(storage, tile.range().area(), stream);
    Tensor result(tile.range(), std::move(storage));

    CudaSafeCall(
        cudaMemcpyAsync(result.data(), tile.data(),
                        tile.size() * sizeof(typename Tensor::value_type),
                        cudaMemcpyDefault, stream));

    synchronize_stream(&stream);
    return TiledArray::Tile<Tensor>(std::move(result));
  };

  // legacy path: copy element-wise on the host, then prefetch to the device
  auto convert_tile_um = [](const TATensor &tile) {
    CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));

    using Tensor = typename UMTensor::tensor_type;
    typename Tensor::storage_type storage(tile.range().area());

    Tensor result(tile.range(), std::move(storage));

    const auto n = tile.size();

    std::copy_n(tile.data(), n, result.data());

    auto &stream = detail::get_stream_based_on_range(result.range());

    TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
        result.storage(), stream);

    return TiledArray::Tile<Tensor>(std::move(result));
  };

  const char *use_legacy_conversion =
      std::getenv("TA_CUDA_LEGACY_UM_CONVERSION");
  auto um_array = use_legacy_conversion
                      ? to_new_tile_type(array, convert_tile_um)
                      : to_new_tile_type(array, convert_tile_memcpy);

  array.world().gop.fence();
  return um_array;
}

template <typename UMTensor, typename TATensor, typename Policy>
typename std::enable_if<std::is_same<UMTensor, TATensor>::value,
                        TiledArray::DistArray<UMTensor, Policy>>::type
ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
  return array;
}

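// Usage sketch (illustrative only; the aliases below are assumptions, not
// definitions provided by this header):
//
//   using UMTile    = TiledArray::Tile<TiledArray::btasUMTensorVarray<double>>;
//   using TATensorD = TiledArray::Tensor<double>;
//
//   // copy a host array into CUDA unified memory, compute, then copy back
//   auto um_array    = ta_tensor_to_um_tensor<UMTile, TATensorD>(host_array);
//   auto host_result = um_tensor_to_ta_tensor<UMTile, TATensorD>(um_array);
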
#ifndef TILEDARRAY_HEADER_ONLY

extern template class btas::varray<double,
                                   TiledArray::cuda_um_allocator<double>>;
extern template class btas::varray<float, TiledArray::cuda_um_allocator<float>>;
extern template class btas::varray<int, TiledArray::cuda_um_allocator<int>>;
extern template class btas::varray<long, TiledArray::cuda_um_allocator<long>>;

extern template class btas::Tensor<double, TiledArray::Range,
                                   TiledArray::cuda_um_btas_varray<double>>;
extern template class btas::Tensor<float, TiledArray::Range,
                                   TiledArray::cuda_um_btas_varray<float>>;
extern template class btas::Tensor<int, TiledArray::Range,
                                   TiledArray::cuda_um_btas_varray<int>>;
extern template class btas::Tensor<long, TiledArray::Range,
                                   TiledArray::cuda_um_btas_varray<long>>;

extern template class TiledArray::Tile<
    btas::Tensor<int, TiledArray::Range, TiledArray::cuda_um_btas_varray<int>>>;

#endif  // TILEDARRAY_HEADER_ONLY

#endif  // TILEDARRAY_HAS_CUDA

#endif  // TILEDARRAY_CUDA_CUDA_UM_TENSOR_H