TiledArray  0.7.0
outer.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2014 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * outer.h
22  * Feb 16, 2014
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_OUTER_H__INCLUDED
27 #define TILEDARRAY_MATH_OUTER_H__INCLUDED
28 
30 #include <TiledArray/utility.h>
31 
32 namespace TiledArray {
33  namespace math {
34 
36 
38  template <std::size_t N>
40 
41  template <>
43  public:
44 
45  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1;
46 
47  template <typename X, typename Y, typename Result, typename Op>
48  static TILEDARRAY_FORCE_INLINE void
49  outer(const X* const x_block, const Y* const y_block,
50  Result* const result, const std::size_t /*stride*/, const Op& op)
51  {
52  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
53  copy_block(result_block, result);
54 
55  const X x = x_block[offset];
56  for_each_block([x,&op] (Result& r, const Y y) { op(r, x, y); },
57  result_block, y_block);
58 
59  copy_block(result, result_block);
60  }
61 
62  template <typename X, typename Y, typename Init, typename Result, typename Op>
63  static TILEDARRAY_FORCE_INLINE void
64  fill(const X* const x_block, const Y* const y_block,
65  const Init* const init, Result* const result,
66  const std::size_t /*stride*/, const Op& op)
67  {
68  TILEDARRAY_ALIGNED_STORAGE Init init_block[TILEDARRAY_LOOP_UNWIND];
69  copy_block(init_block, init);
70 
71  const X x = x_block[offset];
72  for_each_block([x,&op] (Init& i, const Y y) { op(i, x, y); },
73  init_block, y_block);
74 
75  copy_block(result, init_block);
76  }
77 
78  template <typename X, typename Y, typename Result, typename Op>
79  static TILEDARRAY_FORCE_INLINE void
80  fill(const X* const x_block, const Y* const y_block,
81  Result* const result, const std::size_t /*stride*/, const Op& op)
82  {
83  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
84 
85  const X x = x_block[offset];
86 
87  for_each_block([x,&op] (Result& res, const Y y) { res = op(x, y); },
88  result_block, y_block);
89 
90  copy_block(result, result_block);
91  }
92  }; // class OuterVectorOpUnwind<0>
93 
94 
95  template <std::size_t N>
96  class OuterVectorOpUnwind : public OuterVectorOpUnwind<N - 1> {
97  public:
98 
100 
101  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1;
102 
103  template <typename X, typename Y, typename Result, typename Op>
104  static TILEDARRAY_FORCE_INLINE void
105  outer(const X* const x_block, const Y* const y_block,
106  Result* const result, const std::size_t stride, const Op& op)
107  {
108  {
109  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
110  copy_block(result_block, result);
111 
112  const X x = x_block[offset];
113  for_each_block([x,&op] (Result& r, const Y y) { op(r, x, y); },
114  result_block, y_block);
115 
116  copy_block(result, result_block);
117  }
118 
119  OuterVectorOpUnwindN1::outer(x_block, y_block, result + stride, stride, op);
120  }
121 
122 
123  template <typename X, typename Y, typename Init, typename Result, typename Op>
124  static TILEDARRAY_FORCE_INLINE void
125  fill(const X* const x_block, const Y* const y_block,
126  const Init* const init, Result* const result,
127  const std::size_t stride, const Op& op)
128  {
129  {
130  TILEDARRAY_ALIGNED_STORAGE Init init_block[TILEDARRAY_LOOP_UNWIND];
131  copy_block(init_block, init);
132 
133  const X x = x_block[offset];
134  for_each_block([x,&op] (Init& i, const Y y) { op(i, x, y); },
135  init_block, y_block);
136 
137  copy_block(result, init_block);
138  }
139 
140  OuterVectorOpUnwindN1::fill(x_block, y_block, init + stride, result + stride, stride, op);
141  }
142 
143  template <typename X, typename Y, typename Result, typename Op>
144  static TILEDARRAY_FORCE_INLINE void
145  fill(const X* const x_block, const Y* const y_block,
146  Result* MADNESS_RESTRICT const result, const std::size_t stride, const Op& op)
147  {
148  {
149  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
150 
151  const X x = x_block[offset];
152 
153  for_each_block([x,&op] (Result& res, const Y y) { res = op(x, y); },
154  result_block, y_block);
155 
156  copy_block(result, result_block);
157  }
158 
159  OuterVectorOpUnwindN1::fill(x_block, y_block, result + stride, stride, op);
160  }
161  }; // class OuterVectorOpUnwind
162 
163  // Convenience typedef
165 
166 
168 
179  template <typename X, typename Y, typename A, typename Op>
180  void outer_fill(const std::size_t m, const std::size_t n,
181  const X* const x, const Y* const y, A* a, const Op& op)
182  {
183  std::size_t i = 0ul;
184 
185  // Compute block iteration limit
186  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
187  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
188  const std::size_t a_block_stride = n * TILEDARRAY_LOOP_UNWIND;
189 
190  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND, a += a_block_stride) {
191 
192  // Load x block
193  TILEDARRAY_ALIGNED_STORAGE X x_block[TILEDARRAY_LOOP_UNWIND];
194  copy_block(x_block, x + i);
195 
196  std::size_t j = 0ul;
197  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
198 
199  // Load y block
200  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
201  copy_block(y_block, y + j);
202 
203  // Compute and store a block
204  OuterVectorOpUnwindN::fill(x_block, y_block, a + j, n, op);
205 
206  }
207 
208  for(; j < n; ++j) {
209 
210  // Load y block
211  const Y y_j = y[j];
212 
213  // Compute a block
214  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
215 
216  const auto bind_first_op = [y_j,&op] (A& a_ij, const X x_i) { a_ij = op(x_i, y_j); };
217  for_each_block(bind_first_op, a_block, x_block);
218 
219  // Store a block
220  scatter_block(a + j, n, a_block);
221  }
222  }
223 
224  for(; i < m; ++i, a += n) {
225 
226  const X x_i = x[i];
227  vector_op([x_i,&op] (const Y y_j)
228  { return op(x_i, y_j); }, n, a, y);
229 
230  }
231  }
232 
234 
247  template <typename X, typename Y, typename A, typename Op>
248  void outer(const std::size_t m, const std::size_t n,
249  const X* const x, const Y* const y, A* a, const Op& op)
250  {
251  std::size_t i = 0ul;
252 
253  // Compute block iteration limit
254  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
255  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
256  const std::size_t a_block_stride = n * TILEDARRAY_LOOP_UNWIND;
257 
258  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND, a += a_block_stride) {
259 
260  // Load x block
261  TILEDARRAY_ALIGNED_STORAGE X x_block[TILEDARRAY_LOOP_UNWIND];
262  copy_block(x_block, x + i);
263 
264  std::size_t j = 0ul;
265  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
266 
267  // Load y block
268  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
269  copy_block(y_block, y + j);
270 
271  // Load, compute, and store a block
272  OuterVectorOpUnwindN::outer(x_block, y_block, a + j, n, op);
273 
274  }
275 
276  for(; j < n; ++j) {
277 
278  // Load a block
279  A* const a_ij = a + j;
280  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
281  gather_block(a_block, a_ij, n);
282 
283  // Load y block
284  const Y y_j = y[j];
285 
286  // Compute a block
287  for_each_block([y_j,&op] (A& a_ij, const X x_i)
288  { return op(a_ij, x_i, y_j); },
289  a_block, x_block);
290 
291  // Store a block
292  scatter_block(a_ij, n, a_block);
293 
294  }
295  }
296 
297  for(; i < m; ++i, a += n) {
298  const X x_i = x[i];
299  inplace_vector_op([x_i,&op] (A& a_ij, const Y y_j)
300  { return op(a_ij, x_i, y_j); },
301  n, a, y);
302  }
303  }
304 
305 
307 
328  template <typename X, typename Y, typename A, typename B, typename Op>
329  void outer_fill(const std::size_t m, const std::size_t n,
330  const X* MADNESS_RESTRICT const x, const Y* MADNESS_RESTRICT const y,
331  const A* MADNESS_RESTRICT a, B* MADNESS_RESTRICT b, const Op& op)
332  {
333  std::size_t i = 0ul;
334 
335  // Compute block iteration limit
336  const std::size_t mx = m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
337  const std::size_t nx = n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
338  const std::size_t a_block_stride = n * TILEDARRAY_LOOP_UNWIND;
339 
340  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND, a += a_block_stride, b += a_block_stride) {
341 
342  // Load x block
343  TILEDARRAY_ALIGNED_STORAGE X x_block[TILEDARRAY_LOOP_UNWIND];
344  copy_block(x_block, x + i);
345 
346  std::size_t j = 0ul;
347  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
348 
349  // Load y block
350  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
351  copy_block(y_block, y + j);
352 
353  // Load, compute, and store a block
354  OuterVectorOpUnwindN::fill(x_block, y_block, a + j, b + j, n, op);
355 
356  }
357 
358  for(; j < n; ++j) {
359 
360  // Load a block
361  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
362  gather_block(a_block, a + j, n);
363 
364  // Load y block
365  const Y y_j = y[j];
366 
367  // Compute a block
368  for_each_block([y_j,&op] (A& a_ij, const X x_i)
369  { return op(a_ij, x_i, y_j); },
370  a_block, x_block);
371 
372  // Store a block
373  scatter_block(b + j, n, a_block);
374  }
375  }
376 
377  for(; i < m; ++i, a += n, b += n) {
378 
379  // Load x block
380  const X x_i = x[i];
381 
382  std::size_t j = 0ul;
383 
384  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
385 
386  // Load a block
387  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
388  copy_block(a_block, a + j);
389 
390  // Load y block
391  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
392  copy_block(y_block, y + j);
393 
394  // Compute outer block
395  for_each_block([x_i,&op] (A& a_ij, const Y y_j)
396  { return op(a_ij, x_i, y_j); },
397  a_block, y_block);
398 
399  // Store a block
400  copy_block(b + j, a_block);
401 
402  }
403 
404  for(; j < n; ++j) {
405 
406  A a_ij = a[j];
407  const Y y_j = y[j];
408  op(a_ij, x_i, y_j);
409  b[j] = a_ij;
410  }
411  }
412  }
413 
414  } // namespace math
415 } // namespace TiledArray
416 
417 #endif // TILEDARRAY_MATH_OUTER_H__INCLUDED
TILEDARRAY_FORCE_INLINE void scatter_block(Result *const result, const std::size_t stride, const Arg *const arg)
Definition: vector_op.h:234
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:152
TILEDARRAY_FORCE_INLINE void gather_block(Result *const result, const Arg *const arg, const std::size_t stride)
Definition: vector_op.h:250
void outer(const std::size_t m, const std::size_t n, const X *const x, const Y *const y, A *a, const Op &op)
Compute the outer of x and y to modify a.
Definition: outer.h:248
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
Definition: vector_op.h:220
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:33
void outer_fill(const std::size_t m, const std::size_t n, const X *const x, const Y *const y, A *a, const Op &op)
Compute and store outer of x and y in a.
Definition: outer.h:180
void vector_op(Op &&op, const std::size_t n, Result *const result, const Args *const ... args)
Definition: vector_op.h:478
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, const Init *const init, Result *const result, const std::size_t stride, const Op &op)
Definition: outer.h:125
Outer algorithm automatic loop unwinding.
Definition: outer.h:39
void inplace_vector_op(Op &&op, const std::size_t n, Result *const result, const Args *const ... args)
Definition: vector_op.h:397
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, Result *const result, const std::size_t, const Op &op)
Definition: outer.h:80
static TILEDARRAY_FORCE_INLINE void outer(const X *const x_block, const Y *const y_block, Result *const result, const std::size_t stride, const Op &op)
Definition: outer.h:105
OuterVectorOpUnwind< TILEDARRAY_LOOP_UNWIND - 1 > OuterVectorOpUnwindN
Definition: outer.h:164
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, Result *MADNESS_RESTRICT const result, const std::size_t stride, const Op &op)
Definition: outer.h:145
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, const Init *const init, Result *const result, const std::size_t, const Op &op)
Definition: outer.h:64
static TILEDARRAY_FORCE_INLINE void outer(const X *const x_block, const Y *const y_block, Result *const result, const std::size_t, const Op &op)
Definition: outer.h:49
OuterVectorOpUnwind< N - 1 > OuterVectorOpUnwindN1
Definition: outer.h:99
static const std::size_t offset
Definition: outer.h:101