transpose.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2014 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * transpose.h
22  * Jun 9, 2014
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
27 #define TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
28 
29 #include <TiledArray/error.h>
31 
32 namespace TiledArray {
33 namespace math {
34 
36 
38 template <std::size_t N>
39 class TransposeUnwind;
40 
41 template <>
42 class TransposeUnwind<0> {
43  public:
44  static constexpr std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1;
45 
46  template <typename Op, typename Result, typename... Args>
47  static TILEDARRAY_FORCE_INLINE void gather_trans(
48  Op&& op, Result* MADNESS_RESTRICT const result,
49  const std::size_t arg_stride,
50  const Args* MADNESS_RESTRICT const... args) {
51  // Load arg block
52  Block<Result> result_block;
53  for_each_block(std::forward<Op>(op), result_block, Block<Args>(args)...);
54 
55  // Transpose arg_block
56  result_block.scatter_to(result, TILEDARRAY_LOOP_UNWIND);
57  }
58 
59  template <typename Op, typename Result>
60  static TILEDARRAY_FORCE_INLINE void block_scatter(
61  Op&& op, Result* const result, const Result* const arg,
62  const std::size_t /*result_stride*/) {
63  for_each_block_ptr(std::forward<Op>(op), result, arg);
64  }
65 
66 }; // class TransposeUnwind<0>
67 
68 template <std::size_t N>
69 class TransposeUnwind : public TransposeUnwind<N - 1> {
70  public:
72 
73  static constexpr std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1;
74 
75  template <typename Op, typename Result, typename... Args>
76  static TILEDARRAY_FORCE_INLINE void gather_trans(
77  Op&& op, Result* MADNESS_RESTRICT const result,
78  const std::size_t arg_stride,
79  const Args* MADNESS_RESTRICT const... args) {
80  {
81  // Load arg block
82  Block<Result> result_block;
83  for_each_block(op, result_block, Block<Args>(args)...);
84 
85  // Transpose arg_block
86  result_block.scatter_to(result, TILEDARRAY_LOOP_UNWIND);
87  }
88 
89  TransposeUnwindN1::gather_trans(std::forward<Op>(op), result + 1,
90  arg_stride, (args + arg_stride)...);
91  }
92 
93  template <typename Op, typename Result>
94  static TILEDARRAY_FORCE_INLINE void block_scatter(
95  Op&& op, Result* const result, const Result* const arg,
96  const std::size_t result_stride) {
97  for_each_block_ptr(op, result, arg);
99  std::forward<Op>(op), result + result_stride,
100  arg + TILEDARRAY_LOOP_UNWIND, result_stride);
101  }
102 
103 }; // class TransposeUnwind
104 
105 // Convenience typedef
107 
108 template <typename InputOp, typename OutputOp, typename Result,
109  typename... Args>
110 TILEDARRAY_FORCE_INLINE void transpose_block(InputOp&& input_op,
111  OutputOp&& output_op,
112  const std::size_t result_stride,
113  Result* const result,
114  const std::size_t arg_stride,
115  const Args* const... args) {
116  constexpr std::size_t block_size =
118  TILEDARRAY_ALIGNED_STORAGE Result temp[block_size];
119 
120  // Transpose block
121  TransposeUnwindN::gather_trans(std::forward<InputOp>(input_op), temp,
122  arg_stride, args...);
123 
124  TransposeUnwindN::block_scatter(std::forward<OutputOp>(output_op), result,
125  temp, result_stride);
126 }
127 
128 template <typename InputOp, typename OutputOp, typename Result,
129  typename... Args>
130 TILEDARRAY_FORCE_INLINE void transpose_block(
131  InputOp&& input_op, OutputOp&& output_op, const std::size_t m,
132  const std::size_t n, const std::size_t result_stride,
133  Result* MADNESS_RESTRICT const result, const std::size_t arg_stride,
134  const Args* MADNESS_RESTRICT const... args) {
137 
138  constexpr std::size_t block_size =
140  TILEDARRAY_ALIGNED_STORAGE Result temp[block_size];
141 
142  // Copy and transpose arg data into temp block
143  for (std::size_t i = 0ul; i < m; ++i) {
144  std::size_t offset = i * arg_stride;
145  for (std::size_t j = 0ul, x = i; j < n;
146  ++j, x += TILEDARRAY_LOOP_UNWIND, ++offset)
147  input_op(temp[x], args[offset]...);
148  }
149 
150  // Copy the temp block into result
151  for (std::size_t j = 0ul; j < n; ++j) {
152  Result* MADNESS_RESTRICT const result_j = result + (j * result_stride);
153  const Result* MADNESS_RESTRICT const temp_j =
154  temp + (j * TILEDARRAY_LOOP_UNWIND);
155  for (std::size_t i = 0ul; i < m; ++i) output_op(result_j + i, temp_j[i]);
156  }
157 }
158 
160 
176 template <typename InputOp, typename OutputOp, typename Result,
177  typename... Args>
178 void transpose(InputOp&& input_op, OutputOp&& output_op, const std::size_t m,
179  const std::size_t n, const std::size_t result_stride,
180  Result* result, const std::size_t arg_stride,
181  const Args* const... args) {
182  // Compute block iteration control variables
183  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
184  const std::size_t mx = m & index_mask; // = m - m % TILEDARRAY_LOOP_UNWIND
185  const std::size_t nx = n & index_mask; // = n - n % TILEDARRAY_LOOP_UNWIND
186  const std::size_t m_tail = m - mx;
187  const std::size_t n_tail = n - nx;
188  const std::size_t result_block_step = result_stride * TILEDARRAY_LOOP_UNWIND;
189  const std::size_t arg_block_step = arg_stride * TILEDARRAY_LOOP_UNWIND;
190  const std::size_t arg_end = mx * arg_stride;
191  const Result* result_end = result + (nx * result_stride);
192 
193  const auto wrapper_input_op = [&](Result& res, param_type<Args>... a) {
194  res = input_op(a...);
195  };
196 
197  // Iterate over block rows
198  std::size_t arg_start = 0;
199  for (; arg_start < arg_end;
200  arg_start += arg_block_step, result += TILEDARRAY_LOOP_UNWIND) {
201  std::size_t arg_offset = arg_start;
202  Result* result_ij = result;
203  for (; result_ij < result_end;
204  result_ij += result_block_step, arg_offset += TILEDARRAY_LOOP_UNWIND)
205  transpose_block(wrapper_input_op, output_op, result_stride, result_ij,
206  arg_stride, (args + arg_offset)...);
207 
208  if (n_tail)
209  transpose_block(wrapper_input_op, output_op, TILEDARRAY_LOOP_UNWIND,
210  n_tail, result_stride, result_ij, arg_stride,
211  (args + arg_offset)...);
212  }
213 
214  if (m_tail) {
215  std::size_t arg_offset = arg_start;
216  Result* result_ij = result;
217  for (; result_ij < result_end;
218  result_ij += result_block_step, arg_offset += TILEDARRAY_LOOP_UNWIND)
219  transpose_block(wrapper_input_op, output_op, m_tail,
220  TILEDARRAY_LOOP_UNWIND, result_stride, result_ij,
221  arg_stride, (args + arg_offset)...);
222 
223  if (n_tail)
224  transpose_block(wrapper_input_op, output_op, m_tail, n_tail,
225  result_stride, result_ij, arg_stride,
226  (args + arg_offset)...);
227  }
228 }
229 
230 } // namespace math
231 } // namespace TiledArray
232 
233 #endif // TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:162
::blas::Op Op
Definition: blas.h:46
static TILEDARRAY_FORCE_INLINE void block_scatter(Op &&op, Result *const result, const Result *const arg, const std::size_t result_stride)
Definition: transpose.h:94
void scatter_to(T *const data, std::size_t stride) const
Definition: vector_op.h:279
static constexpr std::size_t offset
Definition: transpose.h:73
TransposeUnwind< N - 1 > TransposeUnwindN1
Definition: transpose.h:71
static TILEDARRAY_FORCE_INLINE void gather_trans(Op &&op, Result *MADNESS_RESTRICT const result, const std::size_t arg_stride, const Args *MADNESS_RESTRICT const ... args)
Definition: transpose.h:47
#define TA_ASSERT(EXPR,...)
Definition: error.h:39
TILEDARRAY_FORCE_INLINE std::enable_if<(sizeof...(Args) >=0)>::type for_each_block_ptr(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:182
static TILEDARRAY_FORCE_INLINE void gather_trans(Op &&op, Result *MADNESS_RESTRICT const result, const std::size_t arg_stride, const Args *MADNESS_RESTRICT const ... args)
Definition: transpose.h:76
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:40
void transpose(InputOp &&input_op, OutputOp &&output_op, const std::size_t m, const std::size_t n, const std::size_t result_stride, Result *result, const std::size_t arg_stride, const Args *const ... args)
Matrix transpose and initialization.
Definition: transpose.h:178
TransposeUnwind< TILEDARRAY_LOOP_UNWIND - 1 > TransposeUnwindN
Definition: transpose.h:106
TILEDARRAY_FORCE_INLINE void transpose_block(InputOp &&input_op, OutputOp &&output_op, const std::size_t result_stride, Result *const result, const std::size_t arg_stride, const Args *const ... args)
Definition: transpose.h:110
std::integral_constant< std::size_t, ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)> index_mask
Definition: vector_op.h:54
static TILEDARRAY_FORCE_INLINE void block_scatter(Op &&op, Result *const result, const Result *const arg, const std::size_t)
Definition: transpose.h:60
Partial transpose algorithm automatic loop unwinding.
Definition: transpose.h:69