reduce_kernel_impl.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2018 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Chong Peng
19  * Department of Chemistry, Virginia Tech
20  * April 11, 2018
21  *
22  */
23 
24 #ifndef TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED
25 #define TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED
26 
27 #include <limits>
28 
30 #include <thrust/device_vector.h>
31 #include <thrust/execution_policy.h>
32 #include <thrust/functional.h>
33 #include <thrust/reduce.h>
34 #include <thrust/transform_reduce.h>
35 
36 namespace TiledArray {
37 
38 namespace detail {
39 
40 template <typename T>
41 struct absolute_value : public thrust::unary_function<T, T> {
42  __host__ __device__ T operator()(const T &x) const {
43  return x < T(0) ? -x : x;
44  }
45 };
46 
47 } // namespace detail
48 
50 template <typename T, typename ReduceOp>
51 T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init,
52  cudaStream_t stream, int device_id) {
53  CudaSafeCall(cudaSetDevice(device_id));
54 
55  auto arg_p = thrust::device_pointer_cast(arg);
56 
57  auto result = thrust::reduce(thrust::cuda::par.on(stream), arg_p, arg_p + n,
58  init, std::forward<ReduceOp>(op));
59 
60  return result;
61 }
62 
63 template <typename T>
64 T product_reduce_cuda_kernel_impl(const T *arg, std::size_t n,
65  cudaStream_t stream, int device_id) {
66  T init(1);
67  thrust::multiplies<T> mul_op;
68  return reduce_cuda_kernel_impl(mul_op, arg, n, init, stream, device_id);
69 }
70 
71 template <typename T>
72 T sum_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream,
73  int device_id) {
74  T init(0);
75  thrust::plus<T> plus_op;
76  return reduce_cuda_kernel_impl(plus_op, arg, n, init, stream, device_id);
77 }
78 
79 template <typename T>
80 T max_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream,
81  int device_id) {
82  T init = std::numeric_limits<T>::lowest();
83  thrust::maximum<T> max_op;
84  return reduce_cuda_kernel_impl(max_op, arg, n, init, stream, device_id);
85 }
86 
87 template <typename T>
88 T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream,
89  int device_id) {
90  T init = std::numeric_limits<T>::max();
91  thrust::minimum<T> min_op;
92  return reduce_cuda_kernel_impl(min_op, arg, n, init, stream, device_id);
93 }
94 
95 template <typename T>
96 T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n,
97  cudaStream_t stream, int device_id) {
98  T init(0);
99  thrust::maximum<T> max_op;
101 
102  CudaSafeCall(cudaSetDevice(device_id));
103 
104  auto arg_p = thrust::device_pointer_cast(arg);
105 
106  auto result = thrust::transform_reduce(thrust::cuda::par.on(stream), arg_p,
107  arg_p + n, abs_op, init, max_op);
108 
109  return result;
110 }
111 
112 template <typename T>
113 T absmin_reduce_cuda_kernel_impl(const T *arg, std::size_t n,
114  cudaStream_t stream, int device_id) {
115  T init(0);
116  thrust::minimum<T> min_op;
118 
119  CudaSafeCall(cudaSetDevice(device_id));
120 
121  auto arg_p = thrust::device_pointer_cast(arg);
122 
123  auto result = thrust::transform_reduce(thrust::cuda::par.on(stream), arg_p,
124  arg_p + n, abs_op, init, min_op);
125  return result;
126 }
127 
128 } // namespace TiledArray
129 
130 #endif // TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED
__host__ __device__ T operator()(const T &x) const
T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init, cudaStream_t stream, int device_id)
T = reduce(T* arg)
T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, int device_id)
T max_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, int device_id)
T product_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, int device_id)
KroneckerDeltaTile< _N >::numeric_type max(const KroneckerDeltaTile< _N > &arg)
T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, int device_id)
T sum_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, int device_id)
T absmin_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, int device_id)