partial_reduce.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2014 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * justus
19  * Department of Chemistry, Virginia Tech
20  *
21  * partial_reduce.h
22  * Feb 27, 2014
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_PARTIAL_REDUCE_H__INCLUDED
27 #define TILEDARRAY_MATH_PARTIAL_REDUCE_H__INCLUDED
28 
30 #include <TiledArray/utility.h>
31 
32 namespace TiledArray {
33 namespace math {
34 
36 
38 template <std::size_t N>
39 class PartialReduceUnwind;
40 
41 template <>
43  public:
44  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1;
45 
46  template <typename Left, typename Right, typename Result, typename Op>
47  static TILEDARRAY_FORCE_INLINE void row_reduce(
48  const Left* MADNESS_RESTRICT const left, const std::size_t,
49  const Right* MADNESS_RESTRICT const right,
50  Result* MADNESS_RESTRICT const result, const Op& op) {
51  // Load the left block
52  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
53  copy_block(left_block, left);
54 
55  reduce_block(op, result[offset], left_block, right);
56  }
57 
58  template <typename Arg, typename Result, typename Op>
59  static TILEDARRAY_FORCE_INLINE void row_reduce(
60  const Arg* MADNESS_RESTRICT const arg, const std::size_t,
61  Result* MADNESS_RESTRICT const result, const Op& op) {
62  // Load the left block
63  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
64  copy_block(arg_block, arg);
65 
66  reduce_block(op, result[offset], arg_block);
67  }
68 
69  template <typename Left, typename Right, typename Result, typename Op>
70  static TILEDARRAY_FORCE_INLINE void col_reduce(
71  const Left* MADNESS_RESTRICT const left, const std::size_t /*stride*/,
72  const Right* MADNESS_RESTRICT const right,
73  Result* MADNESS_RESTRICT const result, const Op& op) {
74  // Load right block
75  const Right right_j = right[offset];
76 
77  // Load left block
78  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
79  copy_block(left_block, left);
80 
82  [right_j, &op](Result& result_ij, const Left left_i) {
83  op(result_ij, left_i, right_j);
84  },
85  result, left_block);
86  }
87 
88  template <typename Arg, typename Result, typename Op>
89  static TILEDARRAY_FORCE_INLINE void col_reduce(
90  const Arg* MADNESS_RESTRICT const arg, const std::size_t /*stride*/,
91  Result* MADNESS_RESTRICT const result, const Op& op) {
92  // Load the arg block
93  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
94  copy_block(arg_block, arg);
95 
96  for_each_block(op, result, arg_block);
97  }
98 
99 }; // class PartialReduceUnwind<0>
100 
101 template <std::size_t N>
103  public:
105 
106  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1;
107 
108  template <typename Left, typename Right, typename Result, typename Op>
109  static TILEDARRAY_FORCE_INLINE void row_reduce(
110  const Left* MADNESS_RESTRICT const left, const std::size_t stride,
111  const Right* MADNESS_RESTRICT const right,
112  Result* MADNESS_RESTRICT const result, const Op& op) {
113  {
114  // Load the left block
115  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
116  copy_block(left_block, left);
117 
118  reduce_block(op, result[offset], left_block, right);
119  }
120 
121  PartialReduceUnwindN1::row_reduce(left + stride, stride, right, result, op);
122  }
123 
124  template <typename Arg, typename Result, typename Op>
125  static TILEDARRAY_FORCE_INLINE void row_reduce(
126  const Arg* MADNESS_RESTRICT const arg, const std::size_t stride,
127  Result* MADNESS_RESTRICT const result, const Op& op) {
128  {
129  // Load the left block
130  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
131  copy_block(arg_block, arg);
132 
133  reduce_block(op, result[offset], arg_block);
134  }
135 
136  PartialReduceUnwindN1::row_reduce(arg + stride, stride, result, op);
137  }
138 
139  template <typename Left, typename Right, typename Result, typename Op>
140  static TILEDARRAY_FORCE_INLINE void col_reduce(
141  const Left* MADNESS_RESTRICT const left, const std::size_t stride,
142  const Right* MADNESS_RESTRICT const right,
143  Result* MADNESS_RESTRICT const result, const Op& op) {
144  {
145  // Load right block
146  const Right right_j = right[offset];
147 
148  // Load left block
149  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
150  copy_block(left_block, left);
151 
153  [right_j, &op](Result& result_ij, const Left left_i) {
154  op(result_ij, left_i, right_j);
155  },
156  result, left_block);
157  }
158 
159  PartialReduceUnwindN1::col_reduce(left + stride, stride, right, result, op);
160  }
161 
162  template <typename Arg, typename Result, typename Op>
163  static TILEDARRAY_FORCE_INLINE void col_reduce(
164  const Arg* MADNESS_RESTRICT const arg, const std::size_t stride,
165  Result* MADNESS_RESTRICT const result, const Op& op) {
166  {
167  // Load the left block
168  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
169  copy_block(arg_block, arg);
170 
171  for_each_block(op, result, arg_block);
172  }
173 
174  PartialReduceUnwindN1::col_reduce(arg + stride, stride, result, op);
175  }
176 }; // class OuterVectorOpUnwind
177 
178 // Convenience typedef
180 
181 // TODO reduce_op
183 
194 template <typename Left, typename Right, typename Result, typename Op>
195 void row_reduce(const std::size_t m, const std::size_t n,
196  const Left* MADNESS_RESTRICT const left,
197  const Right* MADNESS_RESTRICT const right,
198  Result* MADNESS_RESTRICT const result, const Op& op) {
199  std::size_t i = 0ul;
200 
201  // Compute block iteration limit
202  const std::size_t mx =
203  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
204  const std::size_t nx =
205  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
206 
207  for (; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
208  // Load result block
209  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
210  copy_block(result_block, result + i);
211 
212  // Compute left pointer offset
213  const Left* MADNESS_RESTRICT const left_i = left + (i * n);
214 
215  std::size_t j = 0ul;
216  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
217  // Load right block
218  TILEDARRAY_ALIGNED_STORAGE Right right_block[TILEDARRAY_LOOP_UNWIND];
219  copy_block(right_block, right + j);
220 
221  // Compute and store a block
222  PartialReduceUnwindN::row_reduce(left_i + j, n, right_block, result_block,
223  op);
224  }
225 
226  for (; j < n; ++j) {
227  // Load right block
228  const Right right_j = right[j];
229 
230  // Compute a block
231  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
232  gather_block(left_block, left_i + j, n);
234  [right_j, &op](Result& result_ij, const Left left_i) {
235  op(result_ij, left_i, right_j);
236  },
237  result_block, left_block);
238  }
239 
240  // Post store result
241  copy_block(result + i, result_block);
242  }
243 
244  for (; i < m; ++i) {
245  // Load result block
246  Result result_block = result[i];
247  reduce_op_serial(op, n, result_block, left + (i * n), right);
248  result[i] = result_block;
249  }
250 }
251 
253 
263 template <typename Arg, typename Result, typename Op>
264 void row_reduce(const std::size_t m, const std::size_t n,
265  const Arg* MADNESS_RESTRICT const arg,
266  Result* MADNESS_RESTRICT const result, const Op& op) {
267  std::size_t i = 0ul;
268 
269  // Compute block iteration limit
270  const std::size_t mx =
271  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
272  const std::size_t nx =
273  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
274 
275  for (; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
276  // Load result block
277  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
278  copy_block(result_block, result + i);
279 
280  // Compute left pointer offset
281  const Arg* MADNESS_RESTRICT const arg_i = arg + (i * n);
282 
283  std::size_t j = 0ul;
284  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
285  // Compute and store a block
286  PartialReduceUnwindN::row_reduce(arg_i + j, n, result_block, op);
287  }
288 
289  for (; j < n; ++j) {
290  // Compute a block
291  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
292  gather_block(arg_block, arg_i + j, n);
293  for_each_block(op, result_block, arg_block);
294  }
295 
296  // Post process and store result
297  copy_block(result + i, result_block);
298  }
299 
300  for (; i < m; ++i) {
301  // Load result block
302  Result result_block = result[i];
303  reduce_op_serial(op, n, result_block, arg + (i * n));
304  result[i] = result_block;
305  }
306 }
307 
309 
321 template <typename Left, typename Right, typename Result, typename Op>
322 void col_reduce(const std::size_t m, const std::size_t n,
323  const Left* MADNESS_RESTRICT const left,
324  const Right* MADNESS_RESTRICT const right,
325  Result* MADNESS_RESTRICT const result, const Op& op) {
326  std::size_t i = 0ul;
327 
328  // Compute block iteration limit
329  const std::size_t mx =
330  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
331  const std::size_t nx =
332  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
333 
334  for (; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
335  // Load right block
336  TILEDARRAY_ALIGNED_STORAGE Right right_block[TILEDARRAY_LOOP_UNWIND];
337  copy_block(right_block, right + i);
338 
339  // Compute left pointer offset
340  const Left* MADNESS_RESTRICT const left_i = left + (i * n);
341 
342  std::size_t j = 0ul;
343  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
344  // Load result block
345  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
346  copy_block(result_block, result + j);
347 
348  // Compute and store a block
349  PartialReduceUnwindN::col_reduce(left_i + j, n, right_block, result_block,
350  op);
351 
352  // Store the result
353  copy_block(result + j, result_block);
354  }
355 
356  for (; j < n; ++j) {
357  // Load result block
358  Result result_block = result[j];
359 
360  // Compute a block
361  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
362  gather_block(left_block, left_i + j, n);
363  reduce_block(op, result_block, left_block, right_block);
364 
365  result[j] = result_block;
366  }
367  }
368 
369  for (; i < m; ++i) {
370  const Right right_i = right[i];
371 
372  // Reduce row i to result
374  [&op, right_i](Result& result_j, const Left left_ij) {
375  op(result_j, left_ij, right_i);
376  },
377  n, result, left + (i * n));
378  }
379 }
380 
382 
392 template <typename Arg, typename Result, typename Op>
393 void col_reduce(const std::size_t m, const std::size_t n,
394  const Arg* MADNESS_RESTRICT const arg,
395  Result* MADNESS_RESTRICT const result, const Op& op) {
396  std::size_t i = 0ul;
397 
398  // Compute block iteration limit
399  const std::size_t mx =
400  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
401  const std::size_t nx =
402  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
403 
404  for (; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
405  // Compute left pointer offset
406  const Arg* MADNESS_RESTRICT const arg_i = arg + (i * n);
407 
408  std::size_t j = 0ul;
409  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
410  // Load result block
411  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
412  copy_block(result_block, result + j);
413 
414  // Compute and store a block
415  PartialReduceUnwindN::col_reduce(arg_i + j, n, result_block, op);
416 
417  // Store the result
418  copy_block(result + j, result_block);
419  }
420 
421  for (; j < n; ++j) {
422  // Load result block
423  Result result_block = result[j];
424 
425  // Compute a block
426  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
427  gather_block(arg_block, arg_i + j, n);
428  reduce_block(op, result_block, arg_block);
429 
430  result[j] = result_block;
431  }
432  }
433 
434  for (; i < m; ++i) {
435  // Reduce row i to result
436  inplace_vector_op(op, n, result, arg + (i * n));
437  }
438 }
439 
440 } // namespace math
441 } // namespace TiledArray
442 
443 #endif // TILEDARRAY_MATH_PARTIAL_REDUCE_H__INCLUDED
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:162
::blas::Op Op
Definition: blas.h:46
static TILEDARRAY_FORCE_INLINE void col_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t stride, Result *MADNESS_RESTRICT const result, const Op &op)
void reduce_op_serial(Op &&op, const std::size_t n, Result &result, const Args *const ... args)
Definition: vector_op.h:562
TILEDARRAY_FORCE_INLINE void gather_block(Result *const result, const Arg *const arg, const std::size_t stride)
Definition: vector_op.h:248
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
Definition: vector_op.h:219
void inplace_vector_op(Op &&op, const std::size_t n, Result *const result, const Args *const ... args)
Definition: vector_op.h:391
static TILEDARRAY_FORCE_INLINE void row_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
Partial reduce algorithm automatic loop unwinding.
void row_reduce(const std::size_t m, const std::size_t n, const Left *MADNESS_RESTRICT const left, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
Reduce the rows of a matrix.
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:40
PartialReduceUnwind< N - 1 > PartialReduceUnwindN1
PartialReduceUnwind< TILEDARRAY_LOOP_UNWIND - 1 > PartialReduceUnwindN
static TILEDARRAY_FORCE_INLINE void col_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t stride, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
TILEDARRAY_FORCE_INLINE void reduce_block(Op &&op, Result &result, const Args *const ... args)
Definition: vector_op.h:200
static TILEDARRAY_FORCE_INLINE void col_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void row_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t stride, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void row_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void row_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t stride, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void col_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
void col_reduce(const std::size_t m, const std::size_t n, const Left *MADNESS_RESTRICT const left, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
Reduce the columns of a matrix.