24 #ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
25 #define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
30 #ifdef TILEDARRAY_HAS_CUDA
33 #include <btas/tensor.h>
43 template <
typename T,
typename Scalar,
typename Range,
typename Storage,
44 typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
45 btas::Tensor<T, Range, Storage> btas_tensor_gemm_cuda_impl(
46 const btas::Tensor<T, Range, Storage> &left,
47 const btas::Tensor<T, Range, Storage> &right, Scalar factor,
59 std::cbegin(right.range().lobound())));
63 std::cbegin(right.range().upbound())));
65 std::cbegin(left.range().extent()), std::cbegin(right.range().extent())));
74 (gemm_helper.
left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
76 (gemm_helper.
right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
78 T factor_t = T(factor);
81 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
87 auto &cuda_stream = detail::get_stream_based_on_range(result_range);
90 typedef btas::Tensor<T, Range, Storage> Tensor;
99 Storage result_storage;
101 result = Tensor(std::move(result_range), std::move(result_storage));
112 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
113 left.storage(), cuda_stream);
114 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
115 right.storage(), cuda_stream);
117 const auto &handle = cuBLASHandlePool::handle();
118 CublasSafeCall(cublasSetStream(handle, cuda_stream));
120 CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.
right_op()),
121 to_cublas_op(gemm_helper.
left_op()), n, m, k,
128 synchronize_stream(&cuda_stream);
132 Storage result_storage(result_range.area());
133 result = Tensor(std::move(result_range), std::move(result_storage));
135 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
136 result.storage(), cuda_stream);
139 cudaMemAdvise(
device_data(left), left.size() *
sizeof(T),
140 cudaMemAdviseSetReadMostly,
141 cudaEnv::instance()->current_cuda_device_id());
142 cudaMemAdvise(
device_data(right), right.size() *
sizeof(T),
143 cudaMemAdviseSetReadMostly,
144 cudaEnv::instance()->current_cuda_device_id());
147 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
148 left.storage(), cuda_stream);
149 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
150 right.storage(), cuda_stream);
153 k, factor_t, left.data(), lda, right.data(), ldb,
154 zero, result.data(), n);
160 template <
typename T,
typename Scalar,
typename Range,
typename Storage,
typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
161 void btas_tensor_gemm_cuda_impl(
162 btas::Tensor<T, Range, Storage> &result,
163 const btas::Tensor<T, Range, Storage> &left,
164 const btas::Tensor<T, Range, Storage> &right, Scalar factor,
181 std::cbegin(result.range().lobound())));
185 std::cbegin(result.range().upbound())));
188 std::cbegin(result.range().extent())));
194 std::cbegin(right.range().lobound()),
195 std::cbegin(result.range().lobound())));
198 std::cbegin(right.range().upbound()),
199 std::cbegin(result.range().upbound())));
202 std::cbegin(result.range().extent())));
208 std::cbegin(right.range().lobound())));
212 std::cbegin(right.range().upbound())));
214 std::cbegin(left.range().extent()), std::cbegin(right.range().extent())));
223 (gemm_helper.
left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
225 (gemm_helper.
right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
227 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
228 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
230 T factor_t = T(factor);
247 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
248 left.storage(), cuda_stream);
249 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
250 right.storage(), cuda_stream);
251 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
252 result.storage(), cuda_stream);
254 const auto &handle = cuBLASHandlePool::handle();
255 CublasSafeCall(cublasSetStream(handle, cuda_stream));
256 CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.
right_op()),
257 to_cublas_op(gemm_helper.
left_op()), n, m, k,
261 synchronize_stream(&cuda_stream);
267 cudaMemAdvise(
device_data(left), left.size() *
sizeof(T),
268 cudaMemAdviseSetReadMostly,
269 cudaEnv::instance()->current_cuda_device_id());
270 cudaMemAdvise(
device_data(right), right.size() *
sizeof(T),
271 cudaMemAdviseSetReadMostly,
272 cudaEnv::instance()->current_cuda_device_id());
275 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
276 left.storage(), cuda_stream);
277 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
278 right.storage(), cuda_stream);
279 TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
280 result.storage(), cuda_stream);
283 k, factor_t, left.data(), lda, right.data(), ldb,
284 one, result.data(), n);
289 template <
typename T,
typename Range,
typename Storage>
290 btas::Tensor<T, Range, Storage> btas_tensor_clone_cuda_impl(
291 const btas::Tensor<T, Range, Storage> &arg) {
292 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
294 Storage result_storage;
295 auto result_range = arg.range();
296 auto &cuda_stream = detail::get_stream_based_on_range(result_range);
299 btas::Tensor<T, Range, Storage> result(std::move(result_range),
300 std::move(result_storage));
303 const auto &handle = cuBLASHandlePool::handle();
304 CublasSafeCall(cublasSetStream(handle, cuda_stream));
306 CublasSafeCall(cublasCopy(handle, result.size(),
device_data(arg.storage()),
309 synchronize_stream(&cuda_stream);
314 template <
typename T,
typename Range,
typename Storage,
typename Scalar,
315 typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
316 btas::Tensor<T, Range, Storage> btas_tensor_scale_cuda_impl(
317 const btas::Tensor<T, Range, Storage> &arg,
const Scalar a) {
318 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
319 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
323 auto result = btas_tensor_clone_cuda_impl(arg);
326 const auto &handle = cuBLASHandlePool::handle();
327 CublasSafeCall(cublasSetStream(handle, cuda_stream));
329 cublasScal(handle, result.size(), &a,
device_data(result.storage()), 1));
331 synchronize_stream(&cuda_stream);
337 template <
typename T,
typename Range,
typename Storage,
typename Scalar,
338 typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
339 void btas_tensor_scale_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
341 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
342 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
344 const auto &handle = cuBLASHandlePool::handle();
345 CublasSafeCall(cublasSetStream(handle, cuda_stream));
347 cublasScal(handle, result.size(), &a,
device_data(result.storage()), 1));
349 synchronize_stream(&cuda_stream);
353 template <
typename T,
typename Scalar,
typename Range,
typename Storage,
354 typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
355 btas::Tensor<T, Range, Storage> btas_tensor_subt_cuda_impl(
356 const btas::Tensor<T, Range, Storage> &arg1,
357 const btas::Tensor<T, Range, Storage> &arg2,
const Scalar a) {
358 auto result = btas_tensor_clone_cuda_impl(arg1);
363 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
364 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
366 if (in_memory_space<MemorySpace::CUDA>(result.storage())) {
367 const auto &handle = cuBLASHandlePool::handle();
368 CublasSafeCall(cublasSetStream(handle, cuda_stream));
369 CublasSafeCall(cublasAxpy(handle, result.size(), &b,
377 synchronize_stream(&cuda_stream);
382 template <
typename T,
typename Scalar,
typename Range,
typename Storage,
typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
383 void btas_tensor_subt_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
384 const btas::Tensor<T, Range, Storage> &arg1,
386 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
387 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
392 const auto &handle = cuBLASHandlePool::handle();
393 CublasSafeCall(cublasSetStream(handle, cuda_stream));
394 CublasSafeCall(cublasAxpy(handle, result.size(), &b,
397 synchronize_stream(&cuda_stream);
401 template <
typename T,
typename Scalar,
typename Range,
typename Storage,
typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
402 btas::Tensor<T, Range, Storage> btas_tensor_add_cuda_impl(
403 const btas::Tensor<T, Range, Storage> &arg1,
404 const btas::Tensor<T, Range, Storage> &arg2,
const Scalar a) {
405 auto result = btas_tensor_clone_cuda_impl(arg1);
407 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
408 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
410 const auto &handle = cuBLASHandlePool::handle();
411 CublasSafeCall(cublasSetStream(handle, cuda_stream));
412 CublasSafeCall(cublasAxpy(handle, result.size(), &a,
416 synchronize_stream(&cuda_stream);
421 template <
typename T,
typename Scalar,
typename Range,
typename Storage,
typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
422 void btas_tensor_add_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
423 const btas::Tensor<T, Range, Storage> &arg,
425 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
426 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
431 const auto &handle = cuBLASHandlePool::handle();
432 CublasSafeCall(cublasSetStream(handle, cuda_stream));
433 CublasSafeCall(cublasAxpy(handle, result.size(), &a,
437 synchronize_stream(&cuda_stream);
441 template <
typename T,
typename Range,
typename Storage>
442 void btas_tensor_mult_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
443 const btas::Tensor<T, Range, Storage> &arg) {
444 auto device_id = cudaEnv::instance()->current_cuda_device_id();
445 auto &cuda_stream = detail::get_stream_based_on_range(result.range());
447 std::size_t n = result.size();
451 mult_to_cuda_kernel(result.data(), arg.data(), n, cuda_stream, device_id);
452 synchronize_stream(&cuda_stream);
456 template <
typename T,
typename Range,
typename Storage>
457 btas::Tensor<T, Range, Storage> btas_tensor_mult_cuda_impl(
458 const btas::Tensor<T, Range, Storage> &arg1,
459 const btas::Tensor<T, Range, Storage> &arg2) {
460 std::size_t n = arg1.size();
464 auto device_id = cudaEnv::instance()->current_cuda_device_id();
465 CudaSafeCall(cudaSetDevice(device_id));
466 auto &cuda_stream = detail::get_stream_based_on_range(arg1.range());
468 Storage result_storage;
470 btas::Tensor<T, Range, Storage> result(arg1.range(),
471 std::move(result_storage));
473 mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, cuda_stream,
476 synchronize_stream(&cuda_stream);
481 template <
typename T,
typename Range,
typename Storage>
482 typename btas::Tensor<T, Range, Storage>::value_type
483 btas_tensor_squared_norm_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
484 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
486 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
488 auto &storage = arg.storage();
492 if (in_memory_space<MemorySpace::CUDA>(storage)) {
493 const auto &handle = cuBLASHandlePool::handle();
494 CublasSafeCall(cublasSetStream(handle, cuda_stream));
495 CublasSafeCall(cublasDot(handle, size,
device_data(storage), 1,
501 synchronize_stream(&cuda_stream);
506 template <
typename T,
typename Range,
typename Storage>
507 typename btas::Tensor<T, Range, Storage>::value_type btas_tensor_dot_cuda_impl(
508 const btas::Tensor<T, Range, Storage> &arg1,
509 const btas::Tensor<T, Range, Storage> &arg2) {
510 CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
512 auto &cuda_stream = detail::get_stream_based_on_range(arg1.range());
515 integer size = arg1.storage().size();
517 TA_ASSERT(size == arg2.storage().size());
520 if (in_memory_space<MemorySpace::CUDA>(arg1.storage()) &&
521 in_memory_space<MemorySpace::CUDA>(arg2.storage())) {
522 const auto &handle = cuBLASHandlePool::handle();
523 CublasSafeCall(cublasSetStream(handle, cuda_stream));
524 CublasSafeCall(cublasDot(handle, size,
device_data(arg1.storage()), 1,
530 synchronize_stream(&cuda_stream);
534 template <
typename T,
typename Range,
typename Storage>
535 T btas_tensor_sum_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
536 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
537 auto device_id = cudaEnv::instance()->current_cuda_device_id();
539 auto &storage = arg.storage();
540 auto n = storage.size();
542 auto result = sum_cuda_kernel(arg.data(), n, cuda_stream, device_id);
544 synchronize_stream(&cuda_stream);
548 template <
typename T,
typename Range,
typename Storage>
549 T btas_tensor_product_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
550 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
551 auto device_id = cudaEnv::instance()->current_cuda_device_id();
553 auto &storage = arg.storage();
554 auto n = storage.size();
556 auto result = product_cuda_kernel(arg.data(), n, cuda_stream, device_id);
558 synchronize_stream(&cuda_stream);
562 template <
typename T,
typename Range,
typename Storage>
563 T btas_tensor_min_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
564 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
565 auto device_id = cudaEnv::instance()->current_cuda_device_id();
567 auto &storage = arg.storage();
568 auto n = storage.size();
570 auto result = min_cuda_kernel(arg.data(), n, cuda_stream, device_id);
572 synchronize_stream(&cuda_stream);
576 template <
typename T,
typename Range,
typename Storage>
577 T btas_tensor_max_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
578 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
579 auto device_id = cudaEnv::instance()->current_cuda_device_id();
581 auto &storage = arg.storage();
582 auto n = storage.size();
584 auto result = max_cuda_kernel(arg.data(), n, cuda_stream, device_id);
586 synchronize_stream(&cuda_stream);
590 template <
typename T,
typename Range,
typename Storage>
591 T btas_tensor_absmin_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
592 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
593 auto device_id = cudaEnv::instance()->current_cuda_device_id();
595 auto &storage = arg.storage();
596 auto n = storage.size();
598 auto result = absmin_cuda_kernel(arg.data(), n, cuda_stream, device_id);
600 synchronize_stream(&cuda_stream);
604 template <
typename T,
typename Range,
typename Storage>
605 T btas_tensor_absmax_cuda_impl(
const btas::Tensor<T, Range, Storage> &arg) {
606 auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
607 auto device_id = cudaEnv::instance()->current_cuda_device_id();
609 auto &storage = arg.storage();
610 auto n = storage.size();
612 auto result = absmax_cuda_kernel(arg.data(), n, cuda_stream, device_id);
614 synchronize_stream(&cuda_stream);
620 #endif // TILEDARRAY_HAS_CUDA
622 #endif // TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED