TiledArray  0.7.0
transpose.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2014 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * transpose.h
22  * Jun 9, 2014
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
27 #define TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
28 
29 #include <TiledArray/error.h>
31 
32 namespace TiledArray {
33  namespace math {
34 
36 
38  template <std::size_t N> class TransposeUnwind;
39 
40  template <>
41  class TransposeUnwind<0> {
42  public:
43 
44  static constexpr std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1;
45 
46  template <typename Op, typename Result, typename... Args>
47  static TILEDARRAY_FORCE_INLINE void
48  gather_trans(Op&& op, Result* MADNESS_RESTRICT const result,
49  const std::size_t arg_stride, const Args* MADNESS_RESTRICT const... args)
50  {
51  // Load arg block
52  Block<Result> result_block;
53  for_each_block(op, result_block, Block<Args>(args)...);
54 
55  // Transpose arg_block
56  result_block.scatter_to(result, TILEDARRAY_LOOP_UNWIND);
57  }
58 
59  template <typename Op, typename Result>
60  static TILEDARRAY_FORCE_INLINE void
61  block_scatter(Op&& op, Result* const result, const Result* const arg,
62  const std::size_t /*result_stride*/)
63  {
64  for_each_block_ptr(op, result, arg);
65  }
66 
67  }; // class TransposeUnwind<0>
68 
69  template <std::size_t N>
70  class TransposeUnwind : public TransposeUnwind<N - 1> {
71  public:
72 
74 
75  static constexpr std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1;
76 
77  template <typename Op, typename Result, typename... Args>
78  static TILEDARRAY_FORCE_INLINE void
79  gather_trans(Op&& op, Result* MADNESS_RESTRICT const result,
80  const std::size_t arg_stride, const Args* MADNESS_RESTRICT const... args)
81  {
82  {
83  // Load arg block
84  Block<Result> result_block;
85  for_each_block(op, result_block, Block<Args>(args)...);
86 
87  // Transpose arg_block
88  result_block.scatter_to(result, TILEDARRAY_LOOP_UNWIND);
89  }
90 
91  TransposeUnwindN1::gather_trans(op, result + 1,
92  arg_stride, (args + arg_stride)...);
93  }
94 
95  template <typename Op, typename Result>
96  static TILEDARRAY_FORCE_INLINE void
97  block_scatter(Op&& op, Result* const result, const Result* const arg,
98  const std::size_t result_stride)
99  {
100  for_each_block_ptr(op, result, arg);
101  TransposeUnwindN1::block_scatter(op, result + result_stride,
102  arg + TILEDARRAY_LOOP_UNWIND, result_stride);
103  }
104 
105  }; // class TransposeUnwind
106 
107  // Convenience typedef
109 
110 
111  template <typename InputOp, typename OutputOp, typename Result, typename... Args>
112  TILEDARRAY_FORCE_INLINE void
113  transpose_block(InputOp&& input_op, OutputOp&& output_op,
114  const std::size_t result_stride, Result* const result,
115  const std::size_t arg_stride, const Args* const... args)
116  {
117  constexpr std::size_t block_size = TILEDARRAY_LOOP_UNWIND * TILEDARRAY_LOOP_UNWIND;
118  TILEDARRAY_ALIGNED_STORAGE Result temp[block_size];
119 
120  // Transpose block
121  TransposeUnwindN::gather_trans(input_op, temp,
122  arg_stride, args...);
123 
124  TransposeUnwindN::block_scatter(output_op, result,
125  temp, result_stride);
126  }
127 
128 
129  template <typename InputOp, typename OutputOp, typename Result, typename... Args>
130  TILEDARRAY_FORCE_INLINE void
131  transpose_block(InputOp&& input_op, OutputOp&& output_op,
132  const std::size_t m, const std::size_t n,
133  const std::size_t result_stride, Result* MADNESS_RESTRICT const result,
134  const std::size_t arg_stride, const Args* MADNESS_RESTRICT const... args)
135  {
138 
139  constexpr std::size_t block_size = TILEDARRAY_LOOP_UNWIND * TILEDARRAY_LOOP_UNWIND;
140  TILEDARRAY_ALIGNED_STORAGE Result temp[block_size];
141 
142  // Copy and transpose arg data into temp block
143  for(std::size_t i = 0ul; i < m; ++i) {
144  std::size_t offset = i * arg_stride;
145  for(std::size_t j = 0ul, x = i; j < n; ++j, x += TILEDARRAY_LOOP_UNWIND, ++offset)
146  input_op(temp[x], args[offset]...);
147  }
148 
149  // Copy the temp block into result
150  for(std::size_t j = 0ul; j < n; ++j) {
151  Result* MADNESS_RESTRICT const result_j = result + (j * result_stride);
152  const Result* MADNESS_RESTRICT const temp_j = temp + (j * TILEDARRAY_LOOP_UNWIND);
153  for(std::size_t i = 0ul; i < m; ++i)
154  output_op(result_j + i, temp_j[i]);
155  }
156  }
157 
159 
175  template <typename InputOp, typename OutputOp, typename Result, typename... Args>
176  void transpose(InputOp&& input_op, OutputOp&& output_op,
177  const std::size_t m, const std::size_t n,
178  const std::size_t result_stride, Result* result,
179  const std::size_t arg_stride, const Args* const... args)
180  {
181  // Compute block iteration control variables
182  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
183  const std::size_t mx = m & index_mask; // = m - m % TILEDARRAY_LOOP_UNWIND
184  const std::size_t nx = n & index_mask; // = n - n % TILEDARRAY_LOOP_UNWIND
185  const std::size_t m_tail = m - mx;
186  const std::size_t n_tail = n - nx;
187  const std::size_t result_block_step = result_stride * TILEDARRAY_LOOP_UNWIND;
188  const std::size_t arg_block_step = arg_stride * TILEDARRAY_LOOP_UNWIND;
189  const std::size_t arg_end = mx * arg_stride;
190  const Result* result_end = result + (nx * result_stride);
191 
192  const auto wrapper_input_op =
193  [&] (Result& res, param_type<Args>... a) { res = input_op(a...); };
194 
195  // Iterate over block rows
196  std::size_t arg_start = 0;
197  for(; arg_start < arg_end; arg_start += arg_block_step, result += TILEDARRAY_LOOP_UNWIND) {
198  std::size_t arg_offset = arg_start;
199  Result* result_ij = result;
200  for(; result_ij < result_end; result_ij += result_block_step,
201  arg_offset += TILEDARRAY_LOOP_UNWIND)
202  transpose_block(wrapper_input_op, output_op, result_stride, result_ij,
203  arg_stride, (args + arg_offset)...);
204 
205  if(n_tail)
206  transpose_block(wrapper_input_op, output_op, TILEDARRAY_LOOP_UNWIND,
207  n_tail, result_stride, result_ij, arg_stride, (args + arg_offset)...);
208  }
209 
210  if(m_tail) {
211  std::size_t arg_offset = arg_start;
212  Result* result_ij = result;
213  for(; result_ij < result_end; result_ij += result_block_step,
214  arg_offset += TILEDARRAY_LOOP_UNWIND)
215  transpose_block(wrapper_input_op, output_op, m_tail,
216  TILEDARRAY_LOOP_UNWIND, result_stride, result_ij, arg_stride,
217  (args + arg_offset)...);
218 
219  if(n_tail)
220  transpose_block(wrapper_input_op, output_op, m_tail, n_tail,
221  result_stride, result_ij, arg_stride, (args + arg_offset)...);
222  }
223  }
224 
225  } // namespace math
226 } // namespace TiledArray
227 
228 #endif // TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
void scatter_to(T *const data, std::size_t stride) const
Definition: vector_op.h:280
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:152
static TILEDARRAY_FORCE_INLINE void gather_trans(Op &&op, Result *MADNESS_RESTRICT const result, const std::size_t arg_stride, const Args *MADNESS_RESTRICT const ... args)
Definition: transpose.h:79
STL namespace.
std::integral_constant< std::size_t, ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)> index_mask
Definition: vector_op.h:44
TILEDARRAY_FORCE_INLINE void transpose_block(InputOp &&input_op, OutputOp &&output_op, const std::size_t result_stride, Result *const result, const std::size_t arg_stride, const Args *const ... args)
Definition: transpose.h:113
Partial transpose algorithm automatic loop unwinding.
Definition: transpose.h:38
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:33
#define TA_ASSERT(a)
Definition: error.h:107
static TILEDARRAY_FORCE_INLINE void block_scatter(Op &&op, Result *const result, const Result *const arg, const std::size_t result_stride)
Definition: transpose.h:97
static TILEDARRAY_FORCE_INLINE void block_scatter(Op &&op, Result *const result, const Result *const arg, const std::size_t)
Definition: transpose.h:61
static TILEDARRAY_FORCE_INLINE void gather_trans(Op &&op, Result *MADNESS_RESTRICT const result, const std::size_t arg_stride, const Args *MADNESS_RESTRICT const ... args)
Definition: transpose.h:48
void transpose(InputOp &&input_op, OutputOp &&output_op, const std::size_t m, const std::size_t n, const std::size_t result_stride, Result *result, const std::size_t arg_stride, const Args *const ... args)
Matrix transpose and initialization.
Definition: transpose.h:176
TILEDARRAY_FORCE_INLINE std::enable_if<(sizeof...(Args) >=0)>::type for_each_block_ptr(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:175
TransposeUnwind< TILEDARRAY_LOOP_UNWIND - 1 > TransposeUnwindN
Definition: transpose.h:108
TransposeUnwind< N - 1 > TransposeUnwindN1
Definition: transpose.h:73
static constexpr std::size_t offset
Definition: transpose.h:75