btas_cublas.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2018 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Chong Peng
19  * Department of Chemistry, Virginia Tech
20  * July 24, 2018
21  *
22  */
23 
24 #ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
25 #define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
26 
27 #include <TiledArray/cuda/cublas.h>
28 #include <TiledArray/math/blas.h>
29 
30 #ifdef TILEDARRAY_HAS_CUDA
31 
33 #include <btas/tensor.h>
34 
40 
41 namespace TiledArray {
42 
43 template <typename T, typename Scalar, typename Range, typename Storage,
44  typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
45 btas::Tensor<T, Range, Storage> btas_tensor_gemm_cuda_impl(
46  const btas::Tensor<T, Range, Storage> &left,
47  const btas::Tensor<T, Range, Storage> &right, Scalar factor,
48  const TiledArray::math::GemmHelper &gemm_helper) {
49  // Check that the arguments are not empty and have the correct ranks
50  TA_ASSERT(!left.empty());
51  TA_ASSERT(left.range().rank() == gemm_helper.left_rank());
52  TA_ASSERT(!right.empty());
53  TA_ASSERT(right.range().rank() == gemm_helper.right_rank());
54 
55  // Check that the inner dimensions of left and right match
56  TA_ASSERT(
58  gemm_helper.left_right_congruent(std::cbegin(left.range().lobound()),
59  std::cbegin(right.range().lobound())));
60  TA_ASSERT(
62  gemm_helper.left_right_congruent(std::cbegin(left.range().upbound()),
63  std::cbegin(right.range().upbound())));
64  TA_ASSERT(gemm_helper.left_right_congruent(
65  std::cbegin(left.range().extent()), std::cbegin(right.range().extent())));
66 
67  // Compute gemm dimensions
69  integer m = 1, n = 1, k = 1;
70  gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range());
71 
72  // Get the leading dimension for left and right matrices.
73  const integer lda =
74  (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
75  const integer ldb =
76  (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
77 
78  T factor_t = T(factor);
79  T zero(0);
80 
81  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
82 
83  // typedef typename Tensor::storage_type storage_type;
84  auto result_range =
85  gemm_helper.make_result_range<Range>(left.range(), right.range());
86 
87  auto &cuda_stream = detail::get_stream_based_on_range(result_range);
88 
89  // the result Tensor type
90  typedef btas::Tensor<T, Range, Storage> Tensor;
91  Tensor result;
92 
93  // check if stream is busy
94  // auto stream_status = cudaStreamQuery(cuda_stream);
95 
96  // if stream is completed, use GPU
97  // if (stream_status == cudaSuccess) {
98  if (true) {
99  Storage result_storage;
100  make_device_storage(result_storage, result_range.area(), cuda_stream);
101  result = Tensor(std::move(result_range), std::move(result_storage));
102 
103  // left and right are readonly!!
104  // cudaMemAdvise(device_data(left), left.size() * sizeof(T),
105  // cudaMemAdviseSetReadMostly,
106  // cudaEnv::instance()->current_cuda_device_id());
107  // cudaMemAdvise(device_data(right), right.size() * sizeof(T),
108  // cudaMemAdviseSetReadMostly,
109  // cudaEnv::instance()->current_cuda_device_id());
110 
111  // prefetch data
112  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
113  left.storage(), cuda_stream);
114  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
115  right.storage(), cuda_stream);
116 
117  const auto &handle = cuBLASHandlePool::handle();
118  CublasSafeCall(cublasSetStream(handle, cuda_stream));
119 
120  CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()),
121  to_cublas_op(gemm_helper.left_op()), n, m, k,
122  &factor_t, device_data(right.storage()), ldb,
123  device_data(left.storage()), lda, &zero,
124  device_data(result.storage()), n));
125 
126  // wait for cuda calls to finish
127  // detail::thread_wait_cuda_stream(cuda_stream);
128  synchronize_stream(&cuda_stream);
129  }
130  // otherwise, use CPU
131  else {
132  Storage result_storage(result_range.area());
133  result = Tensor(std::move(result_range), std::move(result_storage));
134 
135  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
136  result.storage(), cuda_stream);
137 
138  // left and right are readonly!!
139  cudaMemAdvise(device_data(left), left.size() * sizeof(T),
140  cudaMemAdviseSetReadMostly,
141  cudaEnv::instance()->current_cuda_device_id());
142  cudaMemAdvise(device_data(right), right.size() * sizeof(T),
143  cudaMemAdviseSetReadMostly,
144  cudaEnv::instance()->current_cuda_device_id());
145 
146  // prefetch data
147  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
148  left.storage(), cuda_stream);
149  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
150  right.storage(), cuda_stream);
151 
152  TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n,
153  k, factor_t, left.data(), lda, right.data(), ldb,
154  zero, result.data(), n);
155  }
156 
157  return result;
158 }
159 
160 template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
161 void btas_tensor_gemm_cuda_impl(
162  btas::Tensor<T, Range, Storage> &result,
163  const btas::Tensor<T, Range, Storage> &left,
164  const btas::Tensor<T, Range, Storage> &right, Scalar factor,
165  const TiledArray::math::GemmHelper &gemm_helper) {
166  // Check that the result is not empty and has the correct rank
167  TA_ASSERT(!result.empty());
168  TA_ASSERT(result.range().rank() == gemm_helper.result_rank());
169 
170  // Check that the arguments are not empty and have the correct ranks
171  TA_ASSERT(!left.empty());
172  TA_ASSERT(left.range().rank() == gemm_helper.left_rank());
173  TA_ASSERT(!right.empty());
174  TA_ASSERT(right.range().rank() == gemm_helper.right_rank());
175 
176  // Check that the outer dimensions of left match the the corresponding
177  // dimensions in result
178  TA_ASSERT(
180  gemm_helper.left_result_congruent(std::cbegin(left.range().lobound()),
181  std::cbegin(result.range().lobound())));
182  TA_ASSERT(
184  gemm_helper.left_result_congruent(std::cbegin(left.range().upbound()),
185  std::cbegin(result.range().upbound())));
186  TA_ASSERT(
187  gemm_helper.left_result_congruent(std::cbegin(left.range().extent()),
188  std::cbegin(result.range().extent())));
189 
190  // Check that the outer dimensions of right match the the corresponding
191  // dimensions in result
193  gemm_helper.right_result_congruent(
194  std::cbegin(right.range().lobound()),
195  std::cbegin(result.range().lobound())));
197  gemm_helper.right_result_congruent(
198  std::cbegin(right.range().upbound()),
199  std::cbegin(result.range().upbound())));
200  TA_ASSERT(
201  gemm_helper.right_result_congruent(std::cbegin(right.range().extent()),
202  std::cbegin(result.range().extent())));
203 
204  // Check that the inner dimensions of left and right match
205  TA_ASSERT(
207  gemm_helper.left_right_congruent(std::cbegin(left.range().lobound()),
208  std::cbegin(right.range().lobound())));
209  TA_ASSERT(
211  gemm_helper.left_right_congruent(std::cbegin(left.range().upbound()),
212  std::cbegin(right.range().upbound())));
213  TA_ASSERT(gemm_helper.left_right_congruent(
214  std::cbegin(left.range().extent()), std::cbegin(right.range().extent())));
215 
216  // Compute gemm dimensions
218  integer m, n, k;
219  gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range());
220 
221  // Get the leading dimension for left and right matrices.
222  const integer lda =
223  (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
224  const integer ldb =
225  (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
226 
227  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
228  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
229 
230  T factor_t = T(factor);
231  T one(1);
232  // check if stream is busy
233  // auto stream_status = cudaStreamQuery(cuda_stream);
234 
235  // if stream is completed, use GPU
236  // if (stream_status == cudaSuccess) {
237  if (true) {
238  // left and right are readonly!!
239  // cudaMemAdvise(device_data(left), left.size() * sizeof(T),
240  // cudaMemAdviseSetReadMostly,
241  // cudaEnv::instance()->current_cuda_device_id());
242  // cudaMemAdvise(device_data(right), right.size() * sizeof(T),
243  // cudaMemAdviseSetReadMostly,
244  // cudaEnv::instance()->current_cuda_device_id());
245 
246  // prefetch all data
247  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
248  left.storage(), cuda_stream);
249  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
250  right.storage(), cuda_stream);
251  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(
252  result.storage(), cuda_stream);
253 
254  const auto &handle = cuBLASHandlePool::handle();
255  CublasSafeCall(cublasSetStream(handle, cuda_stream));
256  CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()),
257  to_cublas_op(gemm_helper.left_op()), n, m, k,
258  &factor_t, device_data(right.storage()), ldb,
259  device_data(left.storage()), lda, &one,
260  device_data(result.storage()), n));
261  synchronize_stream(&cuda_stream);
262 
263  // detail::thread_wait_cuda_stream(cuda_stream);
264 
265  } else {
266  // left and right are readonly!!
267  cudaMemAdvise(device_data(left), left.size() * sizeof(T),
268  cudaMemAdviseSetReadMostly,
269  cudaEnv::instance()->current_cuda_device_id());
270  cudaMemAdvise(device_data(right), right.size() * sizeof(T),
271  cudaMemAdviseSetReadMostly,
272  cudaEnv::instance()->current_cuda_device_id());
273 
274  // prefetch data
275  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
276  left.storage(), cuda_stream);
277  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
278  right.storage(), cuda_stream);
279  TiledArray::to_execution_space<TiledArray::ExecutionSpace::CPU>(
280  result.storage(), cuda_stream);
281 
282  TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n,
283  k, factor_t, left.data(), lda, right.data(), ldb,
284  one, result.data(), n);
285  }
286 }
287 
289 template <typename T, typename Range, typename Storage>
290 btas::Tensor<T, Range, Storage> btas_tensor_clone_cuda_impl(
291  const btas::Tensor<T, Range, Storage> &arg) {
292  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
293 
294  Storage result_storage;
295  auto result_range = arg.range();
296  auto &cuda_stream = detail::get_stream_based_on_range(result_range);
297 
298  make_device_storage(result_storage, arg.size(), cuda_stream);
299  btas::Tensor<T, Range, Storage> result(std::move(result_range),
300  std::move(result_storage));
301 
302  // call cublasCopy
303  const auto &handle = cuBLASHandlePool::handle();
304  CublasSafeCall(cublasSetStream(handle, cuda_stream));
305 
306  CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()),
307  1, device_data(result.storage()), 1));
308 
309  synchronize_stream(&cuda_stream);
310  return result;
311 }
312 
314 template <typename T, typename Range, typename Storage, typename Scalar,
315  typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
316 btas::Tensor<T, Range, Storage> btas_tensor_scale_cuda_impl(
317  const btas::Tensor<T, Range, Storage> &arg, const Scalar a) {
318  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
319  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
320  // std::cout << "scale, tile offset: " << arg.range().offset() << " stream: "
321  // << arg.range().offset() % cudaEnv::instance()->num_cuda_streams() << "\n";
322 
323  auto result = btas_tensor_clone_cuda_impl(arg);
324 
325  // call cublasScale
326  const auto &handle = cuBLASHandlePool::handle();
327  CublasSafeCall(cublasSetStream(handle, cuda_stream));
328  CublasSafeCall(
329  cublasScal(handle, result.size(), &a, device_data(result.storage()), 1));
330 
331  synchronize_stream(&cuda_stream);
332 
333  return result;
334 }
335 
337 template <typename T, typename Range, typename Storage, typename Scalar,
338  typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
339 void btas_tensor_scale_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
340  const Scalar a) {
341  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
342  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
343  // call cublasScale
344  const auto &handle = cuBLASHandlePool::handle();
345  CublasSafeCall(cublasSetStream(handle, cuda_stream));
346  CublasSafeCall(
347  cublasScal(handle, result.size(), &a, device_data(result.storage()), 1));
348 
349  synchronize_stream(&cuda_stream);
350 }
351 
353 template <typename T, typename Scalar, typename Range, typename Storage,
354  typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
355 btas::Tensor<T, Range, Storage> btas_tensor_subt_cuda_impl(
356  const btas::Tensor<T, Range, Storage> &arg1,
357  const btas::Tensor<T, Range, Storage> &arg2, const Scalar a) {
358  auto result = btas_tensor_clone_cuda_impl(arg1);
359 
360  // revert the sign of a
361  auto b = -a;
362 
363  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
364  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
365 
366  if (in_memory_space<MemorySpace::CUDA>(result.storage())) {
367  const auto &handle = cuBLASHandlePool::handle();
368  CublasSafeCall(cublasSetStream(handle, cuda_stream));
369  CublasSafeCall(cublasAxpy(handle, result.size(), &b,
370  device_data(arg2.storage()), 1,
371  device_data(result.storage()), 1));
372  } else {
373  TA_ASSERT(false);
374  // btas::axpy(1.0, arg, result);
375  }
376 
377  synchronize_stream(&cuda_stream);
378  return result;
379 }
380 
382 template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
383 void btas_tensor_subt_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
384  const btas::Tensor<T, Range, Storage> &arg1,
385  const Scalar a) {
386  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
387  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
388 
389  // revert the sign of a
390  auto b = -a;
391 
392  const auto &handle = cuBLASHandlePool::handle();
393  CublasSafeCall(cublasSetStream(handle, cuda_stream));
394  CublasSafeCall(cublasAxpy(handle, result.size(), &b,
395  device_data(arg1.storage()), 1,
396  device_data(result.storage()), 1));
397  synchronize_stream(&cuda_stream);
398 }
399 
401 template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
402 btas::Tensor<T, Range, Storage> btas_tensor_add_cuda_impl(
403  const btas::Tensor<T, Range, Storage> &arg1,
404  const btas::Tensor<T, Range, Storage> &arg2, const Scalar a) {
405  auto result = btas_tensor_clone_cuda_impl(arg1);
406 
407  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
408  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
409 
410  const auto &handle = cuBLASHandlePool::handle();
411  CublasSafeCall(cublasSetStream(handle, cuda_stream));
412  CublasSafeCall(cublasAxpy(handle, result.size(), &a,
413  device_data(arg2.storage()), 1,
414  device_data(result.storage()), 1));
415 
416  synchronize_stream(&cuda_stream);
417  return result;
418 }
419 
421 template <typename T, typename Scalar, typename Range, typename Storage, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
422 void btas_tensor_add_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
423  const btas::Tensor<T, Range, Storage> &arg,
424  const Scalar a) {
425  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
426  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
427 
428  // TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(result.storage(),cuda_stream);
429  // TiledArray::to_execution_space<TiledArray::ExecutionSpace::CUDA>(arg.storage(),cuda_stream);
430 
431  const auto &handle = cuBLASHandlePool::handle();
432  CublasSafeCall(cublasSetStream(handle, cuda_stream));
433  CublasSafeCall(cublasAxpy(handle, result.size(), &a,
434  device_data(arg.storage()), 1,
435  device_data(result.storage()), 1));
436 
437  synchronize_stream(&cuda_stream);
438 }
439 
441 template <typename T, typename Range, typename Storage>
442 void btas_tensor_mult_to_cuda_impl(btas::Tensor<T, Range, Storage> &result,
443  const btas::Tensor<T, Range, Storage> &arg) {
444  auto device_id = cudaEnv::instance()->current_cuda_device_id();
445  auto &cuda_stream = detail::get_stream_based_on_range(result.range());
446 
447  std::size_t n = result.size();
448 
449  TA_ASSERT(n == arg.size());
450 
451  mult_to_cuda_kernel(result.data(), arg.data(), n, cuda_stream, device_id);
452  synchronize_stream(&cuda_stream);
453 }
454 
456 template <typename T, typename Range, typename Storage>
457 btas::Tensor<T, Range, Storage> btas_tensor_mult_cuda_impl(
458  const btas::Tensor<T, Range, Storage> &arg1,
459  const btas::Tensor<T, Range, Storage> &arg2) {
460  std::size_t n = arg1.size();
461 
462  TA_ASSERT(arg2.size() == n);
463 
464  auto device_id = cudaEnv::instance()->current_cuda_device_id();
465  CudaSafeCall(cudaSetDevice(device_id));
466  auto &cuda_stream = detail::get_stream_based_on_range(arg1.range());
467 
468  Storage result_storage;
469  make_device_storage(result_storage, n, cuda_stream);
470  btas::Tensor<T, Range, Storage> result(arg1.range(),
471  std::move(result_storage));
472 
473  mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, cuda_stream,
474  device_id);
475 
476  synchronize_stream(&cuda_stream);
477  return result;
478 }
479 
480 // foreach(i) result += arg[i] * arg[i]
481 template <typename T, typename Range, typename Storage>
482 typename btas::Tensor<T, Range, Storage>::value_type
483 btas_tensor_squared_norm_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
484  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
485 
486  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
487 
488  auto &storage = arg.storage();
490  integer size = storage.size();
491  T result = 0;
492  if (in_memory_space<MemorySpace::CUDA>(storage)) {
493  const auto &handle = cuBLASHandlePool::handle();
494  CublasSafeCall(cublasSetStream(handle, cuda_stream));
495  CublasSafeCall(cublasDot(handle, size, device_data(storage), 1,
496  device_data(storage), 1, &result));
497  } else {
498  TA_ASSERT(false);
499  // result = TiledArray::math::dot(size, storage.data(), storage.data());
500  }
501  synchronize_stream(&cuda_stream);
502  return result;
503 }
504 
505 // foreach(i) result += arg1[i] * arg2[i]
506 template <typename T, typename Range, typename Storage>
507 typename btas::Tensor<T, Range, Storage>::value_type btas_tensor_dot_cuda_impl(
508  const btas::Tensor<T, Range, Storage> &arg1,
509  const btas::Tensor<T, Range, Storage> &arg2) {
510  CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id()));
511 
512  auto &cuda_stream = detail::get_stream_based_on_range(arg1.range());
513 
515  integer size = arg1.storage().size();
516 
517  TA_ASSERT(size == arg2.storage().size());
518 
519  T result = 0;
520  if (in_memory_space<MemorySpace::CUDA>(arg1.storage()) &&
521  in_memory_space<MemorySpace::CUDA>(arg2.storage())) {
522  const auto &handle = cuBLASHandlePool::handle();
523  CublasSafeCall(cublasSetStream(handle, cuda_stream));
524  CublasSafeCall(cublasDot(handle, size, device_data(arg1.storage()), 1,
525  device_data(arg2.storage()), 1, &result));
526  } else {
527  TA_ASSERT(false);
528  // result = TiledArray::math::dot(size, storage.data(), storage.data());
529  }
530  synchronize_stream(&cuda_stream);
531  return result;
532 }
533 
534 template <typename T, typename Range, typename Storage>
535 T btas_tensor_sum_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
536  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
537  auto device_id = cudaEnv::instance()->current_cuda_device_id();
538 
539  auto &storage = arg.storage();
540  auto n = storage.size();
541 
542  auto result = sum_cuda_kernel(arg.data(), n, cuda_stream, device_id);
543 
544  synchronize_stream(&cuda_stream);
545  return result;
546 }
547 
548 template <typename T, typename Range, typename Storage>
549 T btas_tensor_product_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
550  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
551  auto device_id = cudaEnv::instance()->current_cuda_device_id();
552 
553  auto &storage = arg.storage();
554  auto n = storage.size();
555 
556  auto result = product_cuda_kernel(arg.data(), n, cuda_stream, device_id);
557 
558  synchronize_stream(&cuda_stream);
559  return result;
560 }
561 
562 template <typename T, typename Range, typename Storage>
563 T btas_tensor_min_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
564  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
565  auto device_id = cudaEnv::instance()->current_cuda_device_id();
566 
567  auto &storage = arg.storage();
568  auto n = storage.size();
569 
570  auto result = min_cuda_kernel(arg.data(), n, cuda_stream, device_id);
571 
572  synchronize_stream(&cuda_stream);
573  return result;
574 }
575 
576 template <typename T, typename Range, typename Storage>
577 T btas_tensor_max_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
578  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
579  auto device_id = cudaEnv::instance()->current_cuda_device_id();
580 
581  auto &storage = arg.storage();
582  auto n = storage.size();
583 
584  auto result = max_cuda_kernel(arg.data(), n, cuda_stream, device_id);
585 
586  synchronize_stream(&cuda_stream);
587  return result;
588 }
589 
590 template <typename T, typename Range, typename Storage>
591 T btas_tensor_absmin_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
592  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
593  auto device_id = cudaEnv::instance()->current_cuda_device_id();
594 
595  auto &storage = arg.storage();
596  auto n = storage.size();
597 
598  auto result = absmin_cuda_kernel(arg.data(), n, cuda_stream, device_id);
599 
600  synchronize_stream(&cuda_stream);
601  return result;
602 }
603 
604 template <typename T, typename Range, typename Storage>
605 T btas_tensor_absmax_cuda_impl(const btas::Tensor<T, Range, Storage> &arg) {
606  auto &cuda_stream = detail::get_stream_based_on_range(arg.range());
607  auto device_id = cudaEnv::instance()->current_cuda_device_id();
608 
609  auto &storage = arg.storage();
610  auto n = storage.size();
611 
612  auto result = absmax_cuda_kernel(arg.data(), n, cuda_stream, device_id);
613 
614  synchronize_stream(&cuda_stream);
615  return result;
616 }
617 
618 } // namespace TiledArray
619 
620 #endif // TILEDARRAY_HAS_CUDA
621 
622 #endif // TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED
R make_result_range(const Left &left, const Right &right) const
Construct a result range based on left and right ranges.
Definition: gemm_helper.h:165
Contraction to *GEMM helper.
Definition: gemm_helper.h:40
unsigned int left_rank() const
Left-hand argument rank accessor.
Definition: gemm_helper.h:138
blas::Op left_op() const
Definition: gemm_helper.h:275
int64_t integer
Definition: blas.h:44
bool right_result_congruent(const Right &right, const Result &result) const
Definition: gemm_helper.h:221
bool left_right_congruent(const Left &left, const Right &right) const
Definition: gemm_helper.h:238
bool left_result_congruent(const Left &left, const Result &result) const
Definition: gemm_helper.h:205
void make_device_storage(cpu_cuda_vector< T > &storage, std::size_t n, cudaStream_t stream=0)
#define TA_ASSERT(EXPR,...)
Definition: error.h:39
unsigned int result_rank() const
Result rank accessor.
Definition: gemm_helper.h:133
void ignore_tile_position(bool b)
Definition: utility.h:81
void gemm(Op op_a, Op op_b, const integer m, const integer n, const integer k, const S1 alpha, const T1 *a, const integer lda, const T2 *b, const integer ldb, const S2 beta, T3 *c, const integer ldc)
Definition: blas.h:71
T * device_data(cpu_cuda_vector< T > &storage)
void compute_matrix_sizes(blas::integer &m, blas::integer &n, blas::integer &k, const Left &left, const Right &right) const
Compute the matrix dimension that can be used in a *GEMM call.
Definition: gemm_helper.h:254
blas::Op right_op() const
Definition: gemm_helper.h:276
unsigned int right_rank() const
Right-hand argument rank accessor.
Definition: gemm_helper.h:143
void zero(DistArray< Tile, Policy > &a)
Definition: basic.h:51