outer.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2014 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * outer.h
22  * Feb 16, 2014
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_OUTER_H__INCLUDED
27 #define TILEDARRAY_MATH_OUTER_H__INCLUDED
28 
30 #include <TiledArray/utility.h>
31 
32 namespace TiledArray {
33 namespace math {
34 
36 
38 template <std::size_t N>
39 class OuterVectorOpUnwind;
40 
41 template <>
43  public:
44  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1;
45 
46  template <typename X, typename Y, typename Result, typename Op>
47  static TILEDARRAY_FORCE_INLINE void outer(const X* const x_block,
48  const Y* const y_block,
49  Result* const result,
50  const std::size_t /*stride*/,
51  const Op& op) {
52  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
53  copy_block(result_block, result);
54 
55  const X x = x_block[offset];
56  for_each_block([x, &op](Result& r, const Y y) { op(r, x, y); },
57  result_block, y_block);
58 
59  copy_block(result, result_block);
60  }
61 
62  template <typename X, typename Y, typename Init, typename Result, typename Op>
63  static TILEDARRAY_FORCE_INLINE void fill(
64  const X* const x_block, const Y* const y_block, const Init* const init,
65  Result* const result, const std::size_t /*stride*/, const Op& op) {
66  TILEDARRAY_ALIGNED_STORAGE Init init_block[TILEDARRAY_LOOP_UNWIND];
67  copy_block(init_block, init);
68 
69  const X x = x_block[offset];
70  for_each_block([x, &op](Init& i, const Y y) { op(i, x, y); }, init_block,
71  y_block);
72 
73  copy_block(result, init_block);
74  }
75 
76  template <typename X, typename Y, typename Result, typename Op>
77  static TILEDARRAY_FORCE_INLINE void fill(const X* const x_block,
78  const Y* const y_block,
79  Result* const result,
80  const std::size_t /*stride*/,
81  const Op& op) {
82  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
83 
84  const X x = x_block[offset];
85 
86  for_each_block([x, &op](Result& res, const Y y) { res = op(x, y); },
87  result_block, y_block);
88 
89  copy_block(result, result_block);
90  }
91 }; // class OuterVectorOpUnwind<0>
92 
93 template <std::size_t N>
95  public:
97 
98  static const std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1;
99 
100  template <typename X, typename Y, typename Result, typename Op>
101  static TILEDARRAY_FORCE_INLINE void outer(const X* const x_block,
102  const Y* const y_block,
103  Result* const result,
104  const std::size_t stride,
105  const Op& op) {
106  {
107  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
108  copy_block(result_block, result);
109 
110  const X x = x_block[offset];
111  for_each_block([x, &op](Result& r, const Y y) { op(r, x, y); },
112  result_block, y_block);
113 
114  copy_block(result, result_block);
115  }
116 
117  OuterVectorOpUnwindN1::outer(x_block, y_block, result + stride, stride, op);
118  }
119 
120  template <typename X, typename Y, typename Init, typename Result, typename Op>
121  static TILEDARRAY_FORCE_INLINE void fill(
122  const X* const x_block, const Y* const y_block, const Init* const init,
123  Result* const result, const std::size_t stride, const Op& op) {
124  {
125  TILEDARRAY_ALIGNED_STORAGE Init init_block[TILEDARRAY_LOOP_UNWIND];
126  copy_block(init_block, init);
127 
128  const X x = x_block[offset];
129  for_each_block([x, &op](Init& i, const Y y) { op(i, x, y); }, init_block,
130  y_block);
131 
132  copy_block(result, init_block);
133  }
134 
135  OuterVectorOpUnwindN1::fill(x_block, y_block, init + stride,
136  result + stride, stride, op);
137  }
138 
139  template <typename X, typename Y, typename Result, typename Op>
140  static TILEDARRAY_FORCE_INLINE void fill(
141  const X* const x_block, const Y* const y_block,
142  Result* MADNESS_RESTRICT const result, const std::size_t stride,
143  const Op& op) {
144  {
145  TILEDARRAY_ALIGNED_STORAGE Result result_block[TILEDARRAY_LOOP_UNWIND];
146 
147  const X x = x_block[offset];
148 
149  for_each_block([x, &op](Result& res, const Y y) { res = op(x, y); },
150  result_block, y_block);
151 
152  copy_block(result, result_block);
153  }
154 
155  OuterVectorOpUnwindN1::fill(x_block, y_block, result + stride, stride, op);
156  }
157 }; // class OuterVectorOpUnwind
158 
159 // Convenience typedef
161 
163 
174 template <typename X, typename Y, typename A, typename Op>
175 void outer_fill(const std::size_t m, const std::size_t n, const X* const x,
176  const Y* const y, A* a, const Op& op) {
177  std::size_t i = 0ul;
178 
179  // Compute block iteration limit
180  const std::size_t mx =
181  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
182  const std::size_t nx =
183  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
184  const std::size_t a_block_stride = n * TILEDARRAY_LOOP_UNWIND;
185 
186  for (; i < mx; i += TILEDARRAY_LOOP_UNWIND, a += a_block_stride) {
187  // Load x block
188  TILEDARRAY_ALIGNED_STORAGE X x_block[TILEDARRAY_LOOP_UNWIND];
189  copy_block(x_block, x + i);
190 
191  std::size_t j = 0ul;
192  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
193  // Load y block
194  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
195  copy_block(y_block, y + j);
196 
197  // Compute and store a block
198  OuterVectorOpUnwindN::fill(x_block, y_block, a + j, n, op);
199  }
200 
201  for (; j < n; ++j) {
202  // Load y block
203  const Y y_j = y[j];
204 
205  // Compute a block
206  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
207 
208  const auto bind_first_op = [y_j, &op](A& a_ij, const X x_i) {
209  a_ij = op(x_i, y_j);
210  };
211  for_each_block(bind_first_op, a_block, x_block);
212 
213  // Store a block
214  scatter_block(a + j, n, a_block);
215  }
216  }
217 
218  for (; i < m; ++i, a += n) {
219  const X x_i = x[i];
220  vector_op([x_i, &op](const Y y_j) { return op(x_i, y_j); }, n, a, y);
221  }
222 }
223 
225 
238 template <typename X, typename Y, typename A, typename Op>
239 void outer(const std::size_t m, const std::size_t n, const X* const x,
240  const Y* const y, A* a, const Op& op) {
241  std::size_t i = 0ul;
242 
243  // Compute block iteration limit
244  const std::size_t mx =
245  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
246  const std::size_t nx =
247  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
248  const std::size_t a_block_stride = n * TILEDARRAY_LOOP_UNWIND;
249 
250  for (; i < mx; i += TILEDARRAY_LOOP_UNWIND, a += a_block_stride) {
251  // Load x block
252  TILEDARRAY_ALIGNED_STORAGE X x_block[TILEDARRAY_LOOP_UNWIND];
253  copy_block(x_block, x + i);
254 
255  std::size_t j = 0ul;
256  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
257  // Load y block
258  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
259  copy_block(y_block, y + j);
260 
261  // Load, compute, and store a block
262  OuterVectorOpUnwindN::outer(x_block, y_block, a + j, n, op);
263  }
264 
265  for (; j < n; ++j) {
266  // Load a block
267  A* const a_ij = a + j;
268  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
269  gather_block(a_block, a_ij, n);
270 
271  // Load y block
272  const Y y_j = y[j];
273 
274  // Compute a block
276  [y_j, &op](A& a_ij, const X x_i) { return op(a_ij, x_i, y_j); },
277  a_block, x_block);
278 
279  // Store a block
280  scatter_block(a_ij, n, a_block);
281  }
282  }
283 
284  for (; i < m; ++i, a += n) {
285  const X x_i = x[i];
287  [x_i, &op](A& a_ij, const Y y_j) { return op(a_ij, x_i, y_j); }, n, a,
288  y);
289  }
290 }
291 
293 
314 template <typename X, typename Y, typename A, typename B, typename Op>
315 void outer_fill(const std::size_t m, const std::size_t n,
316  const X* MADNESS_RESTRICT const x,
317  const Y* MADNESS_RESTRICT const y, const A* MADNESS_RESTRICT a,
318  B* MADNESS_RESTRICT b, const Op& op) {
319  std::size_t i = 0ul;
320 
321  // Compute block iteration limit
322  const std::size_t mx =
323  m & index_mask::value; // = m - m % TILEDARRAY_LOOP_UNWIND
324  const std::size_t nx =
325  n & index_mask::value; // = n - n % TILEDARRAY_LOOP_UNWIND
326  const std::size_t a_block_stride = n * TILEDARRAY_LOOP_UNWIND;
327 
328  for (; i < mx;
329  i += TILEDARRAY_LOOP_UNWIND, a += a_block_stride, b += a_block_stride) {
330  // Load x block
331  TILEDARRAY_ALIGNED_STORAGE X x_block[TILEDARRAY_LOOP_UNWIND];
332  copy_block(x_block, x + i);
333 
334  std::size_t j = 0ul;
335  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
336  // Load y block
337  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
338  copy_block(y_block, y + j);
339 
340  // Load, compute, and store a block
341  OuterVectorOpUnwindN::fill(x_block, y_block, a + j, b + j, n, op);
342  }
343 
344  for (; j < n; ++j) {
345  // Load a block
346  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
347  gather_block(a_block, a + j, n);
348 
349  // Load y block
350  const Y y_j = y[j];
351 
352  // Compute a block
354  [y_j, &op](A& a_ij, const X x_i) { return op(a_ij, x_i, y_j); },
355  a_block, x_block);
356 
357  // Store a block
358  scatter_block(b + j, n, a_block);
359  }
360  }
361 
362  for (; i < m; ++i, a += n, b += n) {
363  // Load x block
364  const X x_i = x[i];
365 
366  std::size_t j = 0ul;
367 
368  for (; j < nx; j += TILEDARRAY_LOOP_UNWIND) {
369  // Load a block
370  TILEDARRAY_ALIGNED_STORAGE A a_block[TILEDARRAY_LOOP_UNWIND];
371  copy_block(a_block, a + j);
372 
373  // Load y block
374  TILEDARRAY_ALIGNED_STORAGE Y y_block[TILEDARRAY_LOOP_UNWIND];
375  copy_block(y_block, y + j);
376 
377  // Compute outer block
379  [x_i, &op](A& a_ij, const Y y_j) { return op(a_ij, x_i, y_j); },
380  a_block, y_block);
381 
382  // Store a block
383  copy_block(b + j, a_block);
384  }
385 
386  for (; j < n; ++j) {
387  A a_ij = a[j];
388  const Y y_j = y[j];
389  op(a_ij, x_i, y_j);
390  b[j] = a_ij;
391  }
392  }
393 }
394 
395 } // namespace math
396 } // namespace TiledArray
397 
398 #endif // TILEDARRAY_MATH_OUTER_H__INCLUDED
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, Result *MADNESS_RESTRICT const result, const std::size_t stride, const Op &op)
Definition: outer.h:140
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
Definition: vector_op.h:162
void outer_fill(const std::size_t m, const std::size_t n, const X *const x, const Y *const y, A *a, const Op &op)
Compute and store outer of x and y in a.
Definition: outer.h:175
::blas::Op Op
Definition: blas.h:46
void outer(const std::size_t m, const std::size_t n, const X *const x, const Y *const y, A *a, const Op &op)
Compute the outer of x and y to modify a.
Definition: outer.h:239
void vector_op(Op &&op, const std::size_t n, Result *const result, const Args *const ... args)
Definition: vector_op.h:472
TILEDARRAY_FORCE_INLINE void gather_block(Result *const result, const Arg *const arg, const std::size_t stride)
Definition: vector_op.h:248
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
Definition: vector_op.h:219
void inplace_vector_op(Op &&op, const std::size_t n, Result *const result, const Args *const ... args)
Definition: vector_op.h:391
TILEDARRAY_FORCE_INLINE void scatter_block(Result *const result, const std::size_t stride, const Arg *const arg)
Definition: vector_op.h:233
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, Result *const result, const std::size_t, const Op &op)
Definition: outer.h:77
static TILEDARRAY_FORCE_INLINE void outer(const X *const x_block, const Y *const y_block, Result *const result, const std::size_t stride, const Op &op)
Definition: outer.h:101
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, const Init *const init, Result *const result, const std::size_t, const Op &op)
Definition: outer.h:63
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:40
Outer algorithm automatic loop unwinding.
Definition: outer.h:94
static TILEDARRAY_FORCE_INLINE void fill(const X *const x_block, const Y *const y_block, const Init *const init, Result *const result, const std::size_t stride, const Op &op)
Definition: outer.h:121
OuterVectorOpUnwind< TILEDARRAY_LOOP_UNWIND - 1 > OuterVectorOpUnwindN
Definition: outer.h:160
OuterVectorOpUnwind< N - 1 > OuterVectorOpUnwindN1
Definition: outer.h:96
static TILEDARRAY_FORCE_INLINE void outer(const X *const x_block, const Y *const y_block, Result *const result, const std::size_t, const Op &op)
Definition: outer.h:47
static const std::size_t offset
Definition: outer.h:98