TiledArray  0.7.0
partial_reduce.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2014 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * justus
19  * Department of Chemistry, Virginia Tech
20  *
21  * partial_reduce.h
22  * Feb 27, 2014
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_PARTIAL_REDUCE_H__INCLUDED
27 #define TILEDARRAY_MATH_PARTIAL_REDUCE_H__INCLUDED
28 
30 #include <TiledArray/utility.h>
31 
32 namespace TiledArray {
33  namespace math {
34 
36 
38  template <std::size_t N>
40 
41  template <>
43  public:
44 
45  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1;
46 
47  template <typename Left, typename Right, typename Result, typename Op>
48  static TILEDARRAY_FORCE_INLINE void
49  row_reduce(const Left* MADNESS_RESTRICT const left, const std::size_t,
50  const Right* MADNESS_RESTRICT const right, Result* MADNESS_RESTRICT const result,
51  const Op& op)
52  {
53  // Load the left block
54  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
55  copy_block(left_block, left);
56 
57  reduce_block(op, result[offset], left_block, right);
58  }
59 
60  template <typename Arg, typename Result, typename Op>
61  static TILEDARRAY_FORCE_INLINE void
62  row_reduce(const Arg* MADNESS_RESTRICT const arg, const std::size_t,
63  Result* MADNESS_RESTRICT const result, const Op& op)
64  {
65  // Load the left block
66  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
67  copy_block(arg_block, arg);
68 
69  reduce_block(op, result[offset], arg_block);
70  }
71 
72  template <typename Left, typename Right, typename Result, typename Op>
73  static TILEDARRAY_FORCE_INLINE void
74  col_reduce(const Left* MADNESS_RESTRICT const left, const std::size_t /*stride*/,
75  const Right* MADNESS_RESTRICT const right, Result* MADNESS_RESTRICT const result,
76  const Op& op)
77  {
78  // Load right block
79  const Right right_j = right[offset];
80 
81  // Load left block
82  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
83  copy_block(left_block, left);
84 
85  for_each_block([right_j,&op] (Result& result_ij, const Left left_i)
86  { op(result_ij, left_i, right_j); }, result, left_block);
87  }
88 
89 
90  template <typename Arg, typename Result, typename Op>
91  static TILEDARRAY_FORCE_INLINE void
92  col_reduce(const Arg* MADNESS_RESTRICT const arg, const std::size_t /*stride*/,
93  Result* MADNESS_RESTRICT const result, const Op& op)
94  {
95  // Load the arg block
96  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
97  copy_block(arg_block, arg);
98 
99  for_each_block(op, result, arg_block);
100  }
101 
102  }; // class PartialReduceUnwind<0>
103 
104 
105  template <std::size_t N>
106  class PartialReduceUnwind : public PartialReduceUnwind<N - 1> {
107  public:
108 
110 
111  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1;
112 
113  template <typename Left, typename Right, typename Result, typename Op>
114  static TILEDARRAY_FORCE_INLINE void
115  row_reduce(const Left* MADNESS_RESTRICT const left, const std::size_t stride,
116  const Right* MADNESS_RESTRICT const right, Result* MADNESS_RESTRICT const result,
117  const Op& op)
118  {
119  {
120  // Load the left block
121  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
122  copy_block(left_block, left);
123 
124  reduce_block(op, result[offset], left_block, right);
125  }
126 
127  PartialReduceUnwindN1::row_reduce(left + stride, stride, right, result, op);
128  }
129 
130 
131  template <typename Arg, typename Result, typename Op>
132  static TILEDARRAY_FORCE_INLINE void
133  row_reduce(const Arg* MADNESS_RESTRICT const arg, const std::size_t stride,
134  Result* MADNESS_RESTRICT const result, const Op& op)
135  {
136  {
137  // Load the left block
138  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
139  copy_block(arg_block, arg);
140 
141  reduce_block(op, result[offset], arg_block);
142  }
143 
144  PartialReduceUnwindN1::row_reduce(arg + stride, stride, result, op);
145  }
146 
147 
148  template <typename Left, typename Right, typename Result, typename Op>
149  static TILEDARRAY_FORCE_INLINE void
150  col_reduce(const Left* MADNESS_RESTRICT const left, const std::size_t stride,
151  const Right* MADNESS_RESTRICT const right, Result* MADNESS_RESTRICT const result,
152  const Op& op)
153  {
154  {
155  // Load right block
156  const Right right_j = right[offset];
157 
158  // Load left block
159  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
160  copy_block(left_block, left);
161 
162  for_each_block([right_j,&op] (Result& result_ij, const Left left_i)
163  { op(result_ij, left_i, right_j); }, result, left_block);
164  }
165 
166  PartialReduceUnwindN1::col_reduce(left + stride, stride, right, result, op);
167  }
168 
169  template <typename Arg, typename Result, typename Op>
170  static TILEDARRAY_FORCE_INLINE void
171  col_reduce(const Arg* MADNESS_RESTRICT const arg, const std::size_t stride,
172  Result* MADNESS_RESTRICT const result, const Op& op)
173  {
174  {
175  // Load the left block
176  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
177  copy_block(arg_block, arg);
178 
179  for_each_block(op, result, arg_block);
180  }
181 
182  PartialReduceUnwindN1::col_reduce(arg + stride, stride, result, op);
183  }
184  }; // class OuterVectorOpUnwind
185 
186  // Convenience typedef
188 
189 
190  //TODO reduce_op
192 
203  template <typename Left, typename Right, typename Result, typename Op>
204  void row_reduce(const std::size_t m, const std::size_t n,
205  const Left* MADNESS_RESTRICT const left, const Right* MADNESS_RESTRICT const right,
206  Result* MADNESS_RESTRICT const result, const Op& op)
207  {
208  std::size_t i = 0ul;
209 
210  // Compute block iteration limit
211  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
212  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
213 
214  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
215 
216  // Load result block
217  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
218  copy_block(result_block, result + i);
219 
220  // Compute left pointer offset
221  const Left* MADNESS_RESTRICT const left_i = left + (i * n);
222 
223  std::size_t j = 0ul;
224  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
225 
226  // Load right block
227  TILEDARRAY_ALIGNED_STORAGE Right right_block[TILEDARRAY_LOOP_UNWIND];
228  copy_block(right_block, right + j);
229 
230  // Compute and store a block
231  PartialReduceUnwindN::row_reduce(left_i + j, n, right_block, result_block, op);
232 
233  }
234 
235  for(; j < n; ++j) {
236 
237  // Load right block
238  const Right right_j = right[j];
239 
240  // Compute a block
241  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
242  gather_block(left_block, left_i + j, n);
243  for_each_block([right_j,&op] (Result& result_ij, const Left left_i)
244  { op(result_ij, left_i, right_j); }, result_block, left_block);
245 
246  }
247 
248  // Post store result
249  copy_block(result + i, result_block);
250  }
251 
252  for(; i < m; ++i) {
253 
254  // Load result block
255  Result result_block = result[i];
256  reduce_op_serial(op, n, result_block, left + (i * n), right);
257  result[i] = result_block;
258  }
259  }
260 
261 
263 
273  template <typename Arg, typename Result, typename Op>
274  void row_reduce(const std::size_t m, const std::size_t n,
275  const Arg* MADNESS_RESTRICT const arg, Result* MADNESS_RESTRICT const result, const Op& op)
276  {
277  std::size_t i = 0ul;
278 
279  // Compute block iteration limit
280  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
281  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
282 
283  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
284 
285  // Load result block
286  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
287  copy_block(result_block, result + i);
288 
289  // Compute left pointer offset
290  const Arg* MADNESS_RESTRICT const arg_i = arg + (i * n);
291 
292  std::size_t j = 0ul;
293  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
294 
295  // Compute and store a block
296  PartialReduceUnwindN::row_reduce(arg_i + j, n, result_block, op);
297 
298  }
299 
300  for(; j < n; ++j) {
301 
302  // Compute a block
303  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
304  gather_block(arg_block, arg_i + j, n);
305  for_each_block(op, result_block, arg_block);
306 
307  }
308 
309  // Post process and store result
310  copy_block(result + i, result_block);
311  }
312 
313  for(; i < m; ++i) {
314 
315  // Load result block
316  Result result_block = result[i];
317  reduce_op_serial(op, n, result_block, arg + (i * n));
318  result[i] = result_block;
319  }
320  }
321 
323 
335  template <typename Left, typename Right, typename Result, typename Op>
336  void col_reduce(const std::size_t m, const std::size_t n,
337  const Left* MADNESS_RESTRICT const left, const Right* MADNESS_RESTRICT const right,
338  Result* MADNESS_RESTRICT const result, const Op& op)
339  {
340  std::size_t i = 0ul;
341 
342  // Compute block iteration limit
343  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
344  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
345 
346  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
347 
348  // Load right block
349  TILEDARRAY_ALIGNED_STORAGE Right right_block[TILEDARRAY_LOOP_UNWIND];
350  copy_block(right_block, right + i);
351 
352  // Compute left pointer offset
353  const Left* MADNESS_RESTRICT const left_i = left + (i * n);
354 
355  std::size_t j = 0ul;
356  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
357 
358  // Load result block
359  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
360  copy_block(result_block, result + j);
361 
362  // Compute and store a block
363  PartialReduceUnwindN::col_reduce(left_i + j, n, right_block, result_block, op);
364 
365  // Store the result
366  copy_block(result + j, result_block);
367  }
368 
369  for(; j < n; ++j) {
370 
371  // Load result block
372  Result result_block = result[j];
373 
374  // Compute a block
375  TILEDARRAY_ALIGNED_STORAGE Left left_block[TILEDARRAY_LOOP_UNWIND];
376  gather_block(left_block, left_i + j, n);
377  reduce_block(op, result_block, left_block, right_block);
378 
379  result[j] = result_block;
380 
381  }
382 
383  }
384 
385  for(; i < m; ++i) {
386 
387  const Right right_i = right[i];
388 
389  // Reduce row i to result
390  inplace_vector_op([&op,right_i] (Result& result_j, const Left left_ij) {
391  op(result_j, left_ij, right_i);
392  }, n, result, left + (i * n));
393  }
394  }
395 
397 
407  template <typename Arg, typename Result, typename Op>
408  void col_reduce(const std::size_t m, const std::size_t n,
409  const Arg* MADNESS_RESTRICT const arg, Result* MADNESS_RESTRICT const result, const Op& op)
410  {
411  std::size_t i = 0ul;
412 
413  // Compute block iteration limit
414  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
415  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
416 
417  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND) {
418 
419  // Compute left pointer offset
420  const Arg* MADNESS_RESTRICT const arg_i = arg + (i * n);
421 
422  std::size_t j = 0ul;
423  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
424 
425  // Load result block
426  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
427  copy_block(result_block, result + j);
428 
429  // Compute and store a block
430  PartialReduceUnwindN::col_reduce(arg_i + j, n, result_block, op);
431 
432  // Store the result
433  copy_block(result + j, result_block);
434  }
435 
436  for(; j < n; ++j) {
437 
438  // Load result block
439  Result result_block = result[j];
440 
441  // Compute a block
442  TILEDARRAY_ALIGNED_STORAGE Arg arg_block[TILEDARRAY_LOOP_UNWIND];
443  gather_block(arg_block, arg_i + j, n);
444  reduce_block(op, result_block, arg_block);
445 
446  result[j] = result_block;
447 
448  }
449 
450  }
451 
452  for(; i < m; ++i) {
453 
454  // Reduce row i to result
455  inplace_vector_op(op, n, result, arg + (i * n));
456  }
457  }
458 
459  } // namespace math
460 } // namespace TiledArray
461 
462 #endif // TILEDARRAY_MATH_PARTIAL_REDUCE_H__INCLUDED
PartialReduceUnwind< N - 1 > PartialReduceUnwindN1
void row_reduce(const std::size_t m, const std::size_t n, const Left *MADNESS_RESTRICT const left, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
Reduce the rows of a matrix.
static TILEDARRAY_FORCE_INLINE void col_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t stride, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:152
static TILEDARRAY_FORCE_INLINE void row_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t, Result *MADNESS_RESTRICT const result, const Op &op)
TILEDARRAY_FORCE_INLINE void gather_block(Result *const result, const Arg *const arg, const std::size_t stride)
Definition: vector_op.h:250
static TILEDARRAY_FORCE_INLINE void row_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t stride, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void row_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t stride, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void col_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t, Result *MADNESS_RESTRICT const result, const Op &op)
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
Definition: vector_op.h:220
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:33
static TILEDARRAY_FORCE_INLINE void col_reduce(const Arg *MADNESS_RESTRICT const arg, const std::size_t stride, Result *MADNESS_RESTRICT const result, const Op &op)
static TILEDARRAY_FORCE_INLINE void col_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
void inplace_vector_op(Op &&op, const std::size_t n, Result *const result, const Args *const ... args)
Definition: vector_op.h:397
static TILEDARRAY_FORCE_INLINE void row_reduce(const Left *MADNESS_RESTRICT const left, const std::size_t, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
PartialReduceUnwind< TILEDARRAY_LOOP_UNWIND - 1 > PartialReduceUnwindN
Partial reduce algorithm automatic loop unwinding.
void col_reduce(const std::size_t m, const std::size_t n, const Left *MADNESS_RESTRICT const left, const Right *MADNESS_RESTRICT const right, Result *MADNESS_RESTRICT const result, const Op &op)
Reduce the columns of a matrix.
void reduce_op_serial(Op &&op, const std::size_t n, Result &result, const Args *const ... args)
Definition: vector_op.h:567
TILEDARRAY_FORCE_INLINE void reduce_block(Op &&op, Result &result, const Args *const ... args)
Definition: vector_op.h:199