vector_op.h
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2013 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * vector_op.h
22  * Nov 17, 2013
23  *
24  */
25 
26 #ifndef TILEDARRAY_MATH_VECTOR_OP_H__INCLUDED
27 #define TILEDARRAY_MATH_VECTOR_OP_H__INCLUDED
28 
29 #include <TiledArray/config.h>
31 
32 #if HAVE_INTEL_TBB
33 #include <tbb/parallel_for.h>
34 #include <tbb/partitioner.h>
35 #include <tbb/tbb_stddef.h>
36 #endif
37 
38 #include <TiledArray/type_traits.h>
39 
40 #define TILEDARRAY_LOOP_UNWIND ::TiledArray::math::LoopUnwind::value
41 
42 namespace TiledArray {
43 namespace math {
44 
45 // Import param_type into the TiledArray::math namespace
46 using ::TiledArray::detail::param_type;
47 
48 // Define compile-time constants for loop unwinding.
49 typedef std::integral_constant<std::size_t,
50                                TILEDARRAY_CACHELINE_SIZE / sizeof(double)>
51     LoopUnwind;
52 typedef std::integral_constant<std::size_t,
53                                ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)>
54     index_mask;
55 
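// Worked example (assuming a 64-byte cache line): TILEDARRAY_CACHELINE_SIZE /
// sizeof(double) == 8, so LoopUnwind::value == 8 and index_mask::value == ~7ul.
// For n == 27 the unrolled block loops below then cover n & index_mask::value
// == 24 elements, and the scalar tail loops handle the remaining 3.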
56 template <std::size_t>
57 struct VectorOpUnwind;
58 
60 
63 template <>
64 struct VectorOpUnwind<0ul> {
65  static constexpr std::size_t offset = TILEDARRAY_LOOP_UNWIND - 1ul;
66 
67  template <typename Op, typename Result, typename... Args>
68  static TILEDARRAY_FORCE_INLINE void for_each(
69  Op&& op, Result* MADNESS_RESTRICT const result,
70  const Args* MADNESS_RESTRICT const... args) {
71  op(result[offset], args[offset]...);
72  }
73 
74  template <typename Op, typename Result, typename... Args>
75  static TILEDARRAY_FORCE_INLINE void for_each_ptr(
76  Op&& op, Result* MADNESS_RESTRICT const result,
77  const Args* MADNESS_RESTRICT const... args) {
78  op(result + offset, args[offset]...);
79  }
80 
81  template <typename Op, typename Result, typename... Args>
82  static TILEDARRAY_FORCE_INLINE void reduce(
83  Op&& op, Result& MADNESS_RESTRICT result,
84  const Args* MADNESS_RESTRICT const... args) {
85  op(result, args[offset]...);
86  }
87 
88  template <typename Result, typename Arg>
89  static TILEDARRAY_FORCE_INLINE void scatter(
90  Result* MADNESS_RESTRICT const result,
91  const Arg* MADNESS_RESTRICT const arg,
92  const std::size_t /*result_stride*/) {
93  *result = arg[offset];
94  }
95 
96  template <typename Result, typename Arg>
97  static TILEDARRAY_FORCE_INLINE void gather(
98  Result* MADNESS_RESTRICT const result,
99  const Arg* MADNESS_RESTRICT const arg, std::size_t /*arg_stride*/) {
100  result[offset] = *arg;
101  }
102 
103 }; // struct VectorOpUnwind
104 
105 /// Vector loop unwind helper class
106 
109 template <std::size_t N>
110 struct VectorOpUnwind : public VectorOpUnwind<N - 1ul> {
111   typedef VectorOpUnwind<N - 1ul> VectorOpUnwindN1;
112 
113  static constexpr std::size_t offset = TILEDARRAY_LOOP_UNWIND - N - 1ul;
114 
115  template <typename Op, typename Result, typename... Args>
116  static TILEDARRAY_FORCE_INLINE void for_each(
117  Op&& op, Result* MADNESS_RESTRICT const result,
118  const Args* MADNESS_RESTRICT const... args) {
119  op(result[offset], args[offset]...);
120  VectorOpUnwindN1::for_each(op, result, args...);
121  }
122 
123  template <typename Op, typename Result, typename... Args>
124  static TILEDARRAY_FORCE_INLINE void for_each_ptr(
125  Op&& op, Result* MADNESS_RESTRICT const result,
126  const Args* MADNESS_RESTRICT const... args) {
127  op(result + offset, args[offset]...);
128  VectorOpUnwindN1::for_each_ptr(op, result, args...);
129  }
130 
131  template <typename Op, typename Result, typename... Args>
132  static TILEDARRAY_FORCE_INLINE void reduce(
133  Op&& op, Result& MADNESS_RESTRICT result,
134  const Args* MADNESS_RESTRICT const... args) {
135  op(result, args[offset]...);
136  VectorOpUnwindN1::reduce(op, result, args...);
137  }
138 
139  template <typename Result, typename Arg>
140  static TILEDARRAY_FORCE_INLINE void scatter(
141  Result* MADNESS_RESTRICT const result,
142  const Arg* MADNESS_RESTRICT const arg, const std::size_t result_stride) {
143  *result = arg[offset];
144  VectorOpUnwindN1::scatter(result + result_stride, arg, result_stride);
145  }
146 
147  template <typename Result, typename Arg>
148  static TILEDARRAY_FORCE_INLINE void gather(
149  Result* MADNESS_RESTRICT const result,
150  const Arg* MADNESS_RESTRICT const arg, std::size_t arg_stride) {
151  result[offset] = *arg;
152  VectorOpUnwindN1::gather(result, arg + arg_stride, arg_stride);
153  }
154 
155 }; // struct VectorOpUnwind
156 
157 typedef VectorOpUnwind<TILEDARRAY_LOOP_UNWIND - 1> VecOpUnwindN;
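// Illustrative expansion (assuming TILEDARRAY_LOOP_UNWIND == 8):
// VecOpUnwindN::for_each(op, r, a) unrolls at compile time into
//   op(r[0], a[0]); op(r[1], a[1]); ...; op(r[7], a[7]);
// i.e. one cache-line-sized block is processed with no runtime loop.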
158 template <typename>
159 class Block;
160 
161 template <typename Op, typename Result, typename... Args>
162 TILEDARRAY_FORCE_INLINE void for_each_block(Op&& op, Result* const result,
163  const Args* const... args) {
164  VecOpUnwindN::for_each(op, result, args...);
165 }
166 
167 template <typename Op, typename Result, typename... Args>
168 TILEDARRAY_FORCE_INLINE void for_each_block(Op&& op, Block<Result>& result,
169  Block<Args>&&... args) {
170  VecOpUnwindN::for_each(op, result.data(), args.data()...);
171 }
172 
173 template <typename Op, typename Result, typename... Args>
174 TILEDARRAY_FORCE_INLINE void for_each_block_n(
175  Op&& op, const std::size_t n, Result* MADNESS_RESTRICT const result,
176  const Args* MADNESS_RESTRICT const... args) {
177  for (std::size_t i = 0ul; i < n; ++i) op(result[i], args[i]...);
178 }
179 
180 template <typename Op, typename Result, typename... Args>
181 TILEDARRAY_FORCE_INLINE typename std::enable_if<(sizeof...(Args) >= 0)>::type
182 for_each_block_ptr(Op&& op, Result* const result, const Args* const... args) {
183  VecOpUnwindN::for_each_ptr(op, result, args...);
184 }
185 
186 template <typename Op, typename Result, typename... Args>
187 TILEDARRAY_FORCE_INLINE typename std::enable_if<(sizeof...(Args) > 0)>::type
188 for_each_block_ptr(Op&& op, Result* const result, Block<Args>&&... args) {
189  VecOpUnwindN::for_each_ptr(op, result, args.data()...);
190 }
191 
192 template <typename Op, typename Result, typename... Args>
193 TILEDARRAY_FORCE_INLINE void for_each_block_ptr_n(
194  Op&& op, const std::size_t n, Result* MADNESS_RESTRICT const result,
195  const Args* MADNESS_RESTRICT const... args) {
196  for (std::size_t i = 0ul; i < n; ++i) op(result + i, args[i]...);
197 }
198 
199 template <typename Op, typename Result, typename... Args>
200 TILEDARRAY_FORCE_INLINE void reduce_block(Op&& op, Result& result,
201  const Args* const... args) {
202  VecOpUnwindN::reduce(op, result, args...);
203 }
204 
205 template <typename Op, typename Result, typename... Args>
206 TILEDARRAY_FORCE_INLINE void reduce_block(Op&& op, Result& result,
207  Block<Args>&&... args) {
208  VecOpUnwindN::reduce(op, result, args.data()...);
209 }
210 
211 template <typename Op, typename Result, typename... Args>
212 TILEDARRAY_FORCE_INLINE void reduce_block_n(
213  Op&& op, const std::size_t n, Result& MADNESS_RESTRICT result,
214  const Args* MADNESS_RESTRICT const... args) {
215  for (std::size_t i = 0ul; i < n; ++i) op(result, args[i]...);
216 }
217 
218 template <typename Result, typename Arg>
219 TILEDARRAY_FORCE_INLINE void copy_block(Result* const result,
220  const Arg* const arg) {
221  for_each_block([](Result& lhs, param_type<Arg> rhs) { lhs = rhs; }, result,
222  arg);
223 }
224 
225 template <typename Arg, typename Result>
226 TILEDARRAY_FORCE_INLINE void copy_block_n(std::size_t n, Result* const result,
227  const Arg* const arg) {
228  for_each_block_n([](Result& lhs, param_type<Arg> rhs) { lhs = rhs; }, n,
229  result, arg);
230 }
231 
232 template <typename Arg, typename Result>
233 TILEDARRAY_FORCE_INLINE void scatter_block(Result* const result,
234  const std::size_t stride,
235  const Arg* const arg) {
236  VecOpUnwindN::scatter(result, arg, stride);
237 }
238 
239 template <typename Result, typename Arg>
240 TILEDARRAY_FORCE_INLINE void scatter_block_n(const std::size_t n,
241  Result* result,
242  const std::size_t stride,
243  const Arg* const arg) {
244  for (std::size_t i = 0; i < n; ++i, result += stride) *result = arg[i];
245 }
246 
247 template <typename Result, typename Arg>
248 TILEDARRAY_FORCE_INLINE void gather_block(Result* const result,
249  const Arg* const arg,
250  const std::size_t stride) {
251  VecOpUnwindN::gather(result, arg, stride);
252 }
253 
254 template <typename Arg, typename Result>
255 TILEDARRAY_FORCE_INLINE void gather_block_n(const std::size_t n,
256  Result* const result,
257  const Arg* arg,
258  const std::size_t stride) {
259  for (std::size_t i = 0; i < n; ++i, arg += stride) result[i] = *arg;
260 }
261 
262 template <typename T>
263 class Block {
264  TILEDARRAY_ALIGNED_STORAGE T block_[TILEDARRAY_LOOP_UNWIND];
265 
266  public:
267  Block() {}
268  explicit Block(const T* const data) { load(data); }
269 
270  void load(const T* const data) { copy_block(block_, data); }
271 
272  void store(T* const data) const { copy_block(data, block_); }
273 
274  Block<T>& gather(const T* const data, const std::size_t stride) {
275  gather_block(block_, data, stride);
276  return *this;
277  }
278 
279  void scatter_to(T* const data, std::size_t stride) const {
280  scatter_block(data, stride, block_);
281  }
282 
283  T* data() { return block_; }
284  const T* data() const { return block_; }
285 
286 }; // class Block
287 
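// Usage sketch for Block (p and q are hypothetical pointers, each addressing
// at least TILEDARRAY_LOOP_UNWIND elements at the stride used):
//
//   Block<double> blk(p);               // contiguous load of one block
//   blk.store(q);                       // contiguous store back to memory
//   blk.gather(p, 4).scatter_to(q, 4);  // strided load, then strided store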
288 #ifdef HAVE_INTEL_TBB
289 
290 struct SizeTRange {
291  static constexpr std::size_t block_size = TILEDARRAY_LOOP_UNWIND;
292 
293  // GRAIN_SIZE is set to 8 so that the TiledArray unit tests exercise
294  // range splitting; in practice, partitioning is controlled by
295  // tbb::auto_partitioner rather than by GRAIN_SIZE.
296  static constexpr std::size_t GRAIN_SIZE = 8ul;
297 
298  size_t lower;
299  size_t upper;
300 
301  SizeTRange(const size_t start, const size_t end) : lower(start), upper(end) {}
302 
303  // SizeTRange(const size_t n, const size_t g_size)
304  // : lower(0), upper(n - 1), grain_size(g_size) { }
305 
306  SizeTRange() = default;
307  SizeTRange(const SizeTRange& r) = default;
308 
309  ~SizeTRange() {}
310 
311  // void set_grain_size(std::size_t grain_size){
312  // GRAIN_SIZE = grain_size;
313  // }
314 
315  bool empty() const { return lower > upper; }
316 
317  bool is_divisible() const { return size() >= 2 * GRAIN_SIZE; }
318 
319  size_t begin() const { return lower; }
320 
321  size_t end() const { return upper; }
322 
323  size_t size() const { return upper - lower; }
324 
325  SizeTRange(SizeTRange& r, tbb::split) {
326  size_t nblock = (r.upper - r.lower) / block_size;
327  nblock = (nblock + 1) / 2;
328  lower = r.lower + nblock * block_size;
329  upper = r.upper;
330  r.upper = lower;
331  }
332 };
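// Splitting sketch: a range [0, 100) with block_size == 8 holds 12 full
// blocks, so SizeTRange(r, tbb::split) assigns (12 + 1) / 2 == 6 blocks
// (48 elements) to the new half, leaving r == [0, 48) and *this == [48, 100).
// Splits therefore always fall on TILEDARRAY_LOOP_UNWIND boundaries.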
333 
334 // SizeTRange::set_grain_size(1024ul);
335 
336 #endif
337 
338 template <typename Op, typename Result, typename... Args,
339  typename std::enable_if<std::is_void<typename std::result_of<
340  Op(Result&, Args...)>::type>::value>::type* = nullptr>
341 void inplace_vector_op_serial(Op&& op, const std::size_t n,
342  Result* const result, const Args* const... args) {
343  std::size_t i = 0ul;
344 
345  // Compute block iteration limit
346  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
347  const std::size_t nx = n & index_mask;
348 
349  for (; i < nx; i += TILEDARRAY_LOOP_UNWIND) {
350  Block<Result> result_block(result + i);
351  for_each_block(op, result_block, Block<Args>(args + i)...);
352  result_block.store(result + i);
353  }
354 
355  for_each_block_n(op, n - i, result + i, (args + i)...);
356 }
357 
358 #ifdef HAVE_INTEL_TBB
359 
360 template <typename Op, typename Result, typename... Args>
361 class ApplyInplaceVectorOp {
362  public:
363  ApplyInplaceVectorOp(Op& op, Result* const result, const Args* const... args)
364  : op_(op), result_(result), args_(args...) {}
365 
366  ~ApplyInplaceVectorOp() {}
367 
368  template <std::size_t... Is>
369  void helper(SizeTRange& range, const std::index_sequence<Is...>&) const {
370  std::size_t offset = range.begin();
371  std::size_t n_range = range.size();
372  inplace_vector_op_serial(op_, n_range, result_ + offset,
373  (std::get<Is>(args_) + offset)...);
374  }
375 
376  void operator()(SizeTRange& range) const {
377  helper(range, std::make_index_sequence<sizeof...(Args)>());
378  }
379 
380  private:
381  Op& op_;
382  Result* const result_;
383  std::tuple<const Args* const...> args_;
384 };
385 
386 #endif
387 
388 template <typename Op, typename Result, typename... Args,
389  typename std::enable_if<std::is_void<typename std::result_of<
390  Op(Result&, Args...)>::type>::value>::type* = nullptr>
391 void inplace_vector_op(Op&& op, const std::size_t n, Result* const result,
392  const Args* const... args) {
393 #ifdef HAVE_INTEL_TBB
394  // std::cout << "INPLACE_TBB_VECTOR_OP" << std::endl;
395  SizeTRange range(0, n);
396 
397  // if variadic lambda captures were supported:
398  // auto apply_inplace_vector_op = [op, result, args...](SizeTRange
399  // &range) {
400  // size_t offset = range.begin();
401  // size_t n_range = range.size();
402  // inplace_vector_op_serial(op, n_range, result + offset, (args +
403  // offset)...);
404  // };
405  // else
406  auto apply_inplace_vector_op =
407  ApplyInplaceVectorOp<Op, Result, Args...>(op, result, args...);
408 
409  tbb::parallel_for(range, apply_inplace_vector_op, tbb::auto_partitioner());
410 #else
411  inplace_vector_op_serial(op, n, result, args...);
412 #endif
413 }
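// Usage sketch (illustrative only; x and y are hypothetical double buffers of
// length n): perform the in-place update y[i] += 2.0 * x[i], taking the
// TBB-parallel path when available and the serial path otherwise:
//
//   inplace_vector_op([](double& yi, const double xi) { yi += 2.0 * xi; },
//                     n, y, x);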
414 
415 template <typename Op, typename Result, typename... Args,
416  typename std::enable_if<!std::is_void<typename std::result_of<
417  Op(Args...)>::type>::value>::type* = nullptr>
418 void vector_op_serial(Op&& op, const std::size_t n, Result* const result,
419  const Args* const... args) {
420  auto wrapper_op = [&op](Result& res, param_type<Args>... a) {
421  res = op(a...);
422  };
423 
424  std::size_t i = 0ul;
425 
426  // Compute block iteration limit
427  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
428  const std::size_t nx = n & index_mask;
429 
430  for (; i < nx; i += TILEDARRAY_LOOP_UNWIND) {
431  Block<Result> result_block;
432  for_each_block(wrapper_op, result_block, Block<Args>(args + i)...);
433  result_block.store(result + i);
434  }
435 
436  for_each_block_n(wrapper_op, n - i, result + i, (args + i)...);
437 }
438 
439 #ifdef HAVE_INTEL_TBB
440 
441 template <typename Op, typename Result, typename... Args>
442 class ApplyVectorOp {
443  public:
444  ApplyVectorOp(Op& op, Result* const result, const Args* const... args)
445  : op_(op), result_(result), args_(args...) {}
446 
447  ~ApplyVectorOp() {}
448 
449  template <std::size_t... Is>
450  void helper(SizeTRange& range, const std::index_sequence<Is...>&) const {
451  std::size_t offset = range.begin();
452  std::size_t n_range = range.size();
453  vector_op_serial(op_, n_range, result_ + offset,
454  (std::get<Is>(args_) + offset)...);
455  }
456 
457  void operator()(SizeTRange& range) const {
458  helper(range, std::make_index_sequence<sizeof...(Args)>());
459  }
460 
461  private:
462  Op& op_;
463  Result* const result_;
464  std::tuple<const Args* const...> args_;
465 };
466 
467 #endif
468 
469 template <typename Op, typename Result, typename... Args,
470  typename std::enable_if<!std::is_void<typename std::result_of<
471  Op(Args...)>::type>::value>::type* = nullptr>
472 void vector_op(Op&& op, const std::size_t n, Result* const result,
473  const Args* const... args) {
474 #ifdef HAVE_INTEL_TBB
475  // std::cout << "TBB_VECTOR_OP" << std::endl;
476  SizeTRange range(0, n);
477 
478  // if variadic lambda captures were supported:
479  // auto apply_vector_op = [op, result, args...](SizeTRange &range) {
480  // size_t offset = range.begin();
481  // size_t n_range = range.size();
482  // vector_op_serial(op, n_range, result + offset, (args +
483  // offset)...);
484  // };
485  // else
486  auto apply_vector_op =
487  ApplyVectorOp<Op, Result, Args...>(op, result, args...);
488 
489  tbb::parallel_for(range, apply_vector_op, tbb::auto_partitioner());
490 #else
491  vector_op_serial(op, n, result, args...);
492 #endif
493 }
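// Usage sketch (illustrative only; a, b, and c are hypothetical double buffers
// of length n): compute c[i] = a[i] + b[i]; the functor returns the new value,
// which vector_op assigns into the result buffer:
//
//   vector_op([](const double ai, const double bi) { return ai + bi; },
//             n, c, a, b);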
494 
495 template <typename Op, typename Result, typename... Args>
496 void vector_ptr_op_serial(Op&& op, const std::size_t n, Result* const result,
497  const Args* const... args) {
498  std::size_t i = 0ul;
499 
500  // Compute block iteration limit
501  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
502  const std::size_t nx = n & index_mask;
503 
504  for (; i < nx; i += TILEDARRAY_LOOP_UNWIND)
505  for_each_block_ptr(op, result + i, Block<Args>(args + i)...);
506  for_each_block_ptr_n(op, n - i, result + i, (args + i)...);
507 }
508 
509 #ifdef HAVE_INTEL_TBB
510 template <typename Op, typename Result, typename... Args>
511 class ApplyVectorPtrOp {
512  public:
513  ApplyVectorPtrOp(Op& op, Result* const result, const Args* const... args)
514  : op_(op), result_(result), args_(args...) {}
515 
516  ~ApplyVectorPtrOp() {}
517 
518  template <std::size_t... Is>
519  void helper(SizeTRange& range, const std::index_sequence<Is...>&) const {
520  std::size_t offset = range.begin();
521  std::size_t n_range = range.size();
522  vector_ptr_op_serial(op_, n_range, result_ + offset,
523  (std::get<Is>(args_) + offset)...);
524  }
525 
526  void operator()(SizeTRange& range) const {
527  helper(range, std::make_index_sequence<sizeof...(Args)>());
528  }
529 
530  private:
531  Op& op_;
532  Result* const result_;
533  std::tuple<const Args* const...> args_;
534 };
535 #endif
536 
537 template <typename Op, typename Result, typename... Args>
538 void vector_ptr_op(Op&& op, const std::size_t n, Result* const result,
539  const Args* const... args) {
540 #ifdef HAVE_INTEL_TBB
541  // std::cout << "TBB_VECTOR_PTR_OP" << std::endl;
542  SizeTRange range(0, n);
543 
544  // if variadic lambda captures were supported:
545  // auto apply_vector_ptr_op = [op, result, args...](SizeTRange &range)
546  // {
547  // size_t offset = range.begin();
548  // size_t n_range = range.size();
549  // vector_ptr_op_serial(op, n_range, result + offset, (args +
550  // offset)...);
551  // };
552  // else
553  auto apply_vector_ptr_op =
554  ApplyVectorPtrOp<Op, Result, Args...>(op, result, args...);
555  tbb::parallel_for(range, apply_vector_ptr_op, tbb::auto_partitioner());
556 #else
557  vector_ptr_op_serial(op, n, result, args...);
558 #endif
559 }
560 
561 template <typename Op, typename Result, typename... Args>
562 void reduce_op_serial(Op&& op, const std::size_t n, Result& result,
563  const Args* const... args) {
564  std::size_t i = 0ul;
565 
566  // Compute block iteration limit
567  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
568  const std::size_t nx = n & index_mask;
569 
570  for (; i < nx; i += TILEDARRAY_LOOP_UNWIND) {
571  Result temp = result;
572  reduce_block(op, temp, Block<Args>(args + i)...);
573  result = temp;
574  }
575 
576  reduce_block_n(op, n - i, result, (args + i)...);
577 }
578 
579 #ifdef HAVE_INTEL_TBB
580 template <typename ReduceOp, typename JoinOp, typename Result, typename... Args>
583 class ApplyReduceOp {
584  public:
585  ApplyReduceOp(ReduceOp& reduce_op, JoinOp& join_op, const Result& identity,
586  const Result& result, const Args* const... args)
587  : reduce_op_(reduce_op),
588  join_op_(join_op),
589  identity_(identity),
590  result_(result),
591  args_(args...) {}
592 
593  ApplyReduceOp(ApplyReduceOp& rhs, tbb::split)
594  : reduce_op_(rhs.reduce_op_),
595  join_op_(rhs.join_op_),
596  identity_(rhs.identity_),
597  result_(rhs.identity_),
598  args_(rhs.args_) {}
599 
600  ~ApplyReduceOp() {}
601 
602  template <std::size_t... Is>
603  void helper(SizeTRange& range, const std::index_sequence<Is...>&) {
604  std::size_t offset = range.begin();
605  std::size_t n_range = range.size();
606  reduce_op_serial(reduce_op_, n_range, result_,
607  (std::get<Is>(args_) + offset)...);
608  }
609 
610  void operator()(SizeTRange& range) {
611  helper(range, std::make_index_sequence<sizeof...(Args)>());
612  }
613 
614  void join(const ApplyReduceOp& rhs) { join_op_(result_, rhs.result_); }
615 
616  const Result result() const { return result_; }
617 
618  private:
619  ReduceOp& reduce_op_;
620  JoinOp& join_op_;
621  const Result identity_;
622  Result result_;
623  std::tuple<const Args* const...> args_;
624 };
625 #endif
626 
627 template <typename ReduceOp, typename JoinOp, typename Result, typename... Args>
628 void reduce_op(ReduceOp&& reduce_op, JoinOp&& join_op, const Result& identity,
629  const std::size_t n, Result& result, const Args* const... args) {
630  // TODO implement reduce operation with TBB
631 #ifdef HAVE_INTEL_TBB
632  SizeTRange range(0, n);
633 
634  auto apply_reduce_op = ApplyReduceOp<ReduceOp, JoinOp, Result, Args...>(
635  reduce_op, join_op, identity, result, args...);
636 
637  tbb::parallel_reduce(range, apply_reduce_op, tbb::auto_partitioner());
638 
639  result = apply_reduce_op.result();
640 #else
641  reduce_op_serial(reduce_op, n, result, args...);
642 #endif
643 }
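// Usage sketch (illustrative only; data is a hypothetical double buffer of
// length n): sum-reduce a vector. The first functor folds one element into a
// partial result, the second joins two partial results, and 0.0 is the
// identity used to seed split TBB bodies:
//
//   double sum = 0.0;
//   reduce_op([](double& acc, const double xi) { acc += xi; },
//             [](double& acc, const double partial) { acc += partial; },
//             0.0, n, sum, data);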
644 
645 template <typename Arg, typename Result>
646 typename std::enable_if<!(std::is_same<Arg, Result>::value &&
647  detail::is_scalar_v<Arg>)>::type
648 copy_vector(const std::size_t n, const Arg* const arg, Result* const result) {
649  std::size_t i = 0ul;
650 
651  // Compute block iteration limit
652  constexpr std::size_t index_mask = ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul);
653  const std::size_t nx = n & index_mask;
654 
655  for (; i < nx; i += TILEDARRAY_LOOP_UNWIND) copy_block(result + i, arg + i);
656  copy_block_n(n - i, result + i, arg + i);
657 }
658 
659 template <typename T>
660 inline typename std::enable_if<detail::is_scalar_v<T>>::type copy_vector(
661  const std::size_t n, const T* const arg, T* const result) {
662  std::memcpy(result, arg, n * sizeof(T));
663 }
664 
665 template <typename Arg, typename Result>
666 void fill_vector(const std::size_t n, const Arg& arg, Result* const result) {
667  auto fill_op = [arg](Result& res) { res = arg; };
668  inplace_vector_op(fill_op, n, result);
669 }
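// Usage sketch (illustrative only; v and w are hypothetical double buffers of
// length n):
//
//   fill_vector(n, 1.0, v);  // v[i] = 1.0 via the vectorized in-place op
//   copy_vector(n, v, w);    // w[i] = v[i]; std::memcpy fast path for scalars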
670 
671 template <typename Arg, typename Result>
672 typename std::enable_if<!(detail::is_scalar_v<Arg> &&
673  detail::is_scalar_v<Result>)>::type
674 uninitialized_copy_vector(const std::size_t n, const Arg* const arg,
675  Result* const result) {
676  auto op = [](Result* const res, param_type<Arg> a) { new (res) Result(a); };
677  vector_ptr_op(op, n, result, arg);
678 }
679 
680 template <typename Arg, typename Result>
681 inline typename std::enable_if<detail::is_scalar_v<Arg> &&
682  detail::is_scalar_v<Result>>::type
683 uninitialized_copy_vector(const std::size_t n, const Arg* const arg,
684  Result* const result) {
685  copy_vector(n, arg, result);
686 }
687 
688 template <typename Arg, typename Result>
689 typename std::enable_if<!(detail::is_scalar_v<Arg> &&
690  detail::is_scalar_v<Result>)>::type
691 uninitialized_fill_vector(const std::size_t n, const Arg& arg,
692  Result* const result) {
693  auto op = [arg](Result* const res) { new (res) Result(arg); };
694  vector_ptr_op(op, n, result);
695 }
696 
697 template <typename Arg, typename Result>
698 inline typename std::enable_if<detail::is_scalar_v<Arg> &&
699  detail::is_scalar_v<Result>>::type
700 uninitialized_fill_vector(const std::size_t n, const Arg& arg,
701  Result* const result) {
702  fill_vector(n, arg, result);
703 }
704 
705 template <typename Arg>
706 typename std::enable_if<!detail::is_scalar_v<Arg>>::type destroy_vector(
707  const std::size_t n, Arg* const arg) {
708  auto op = [](Arg* const a) { a->~Arg(); };
709  vector_ptr_op(op, n, arg);
710 }
711 
712 template <typename Arg>
713 inline typename std::enable_if<detail::is_scalar_v<Arg>>::type destroy_vector(
714  const std::size_t, const Arg* const) {}
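// Lifecycle sketch for the uninitialized_*/destroy helpers (illustrative only;
// T, n, raw, and buf are hypothetical): construct elements in raw storage with
// placement new, then destroy them explicitly before releasing the storage:
//
//   void* raw = std::malloc(n * sizeof(T));
//   T* buf = static_cast<T*>(raw);
//   uninitialized_fill_vector(n, T{}, buf);  // placement-new each element
//   // ... use buf ...
//   destroy_vector(n, buf);                  // run ~T() on each element
//   std::free(raw);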
715 
716 template <typename Arg, typename Result, typename Op>
717 typename std::enable_if<!(detail::is_scalar_v<Arg> &&
718  detail::is_scalar_v<Result>)>::type
719 uninitialized_unary_vector_op(const std::size_t n, const Arg* const arg,
720  Result* const result, Op&& op) {
721  auto wrapper_op = [&op](Result* const res, param_type<Arg> a) {
722  new (res) Result(op(a));
723  };
724  vector_ptr_op(wrapper_op, n, result, arg);
725 }
726 
727 template <typename Arg, typename Result, typename Op>
728 inline typename std::enable_if<detail::is_scalar_v<Arg> &&
729  detail::is_scalar_v<Result>>::type
730 uninitialized_unary_vector_op(const std::size_t n, const Arg* const arg,
731  Result* const result, Op&& op) {
732  vector_op(op, n, result, arg);
733 }
734 
735 template <typename Left, typename Right, typename Result, typename Op>
736 typename std::enable_if<!(detail::is_scalar_v<Left> &&
737  detail::is_scalar_v<Right> &&
738  detail::is_scalar_v<Result>)>::type
739 uninitialized_binary_vector_op(const std::size_t n, const Left* const left,
740  const Right* const right, Result* const result,
741  Op&& op) {
742  auto wrapper_op = [&op](Result* const res, param_type<Left> l,
743  param_type<Right> r) { new (res) Result(op(l, r)); };
744 
745  vector_ptr_op(wrapper_op, n, result, left, right);
746 }
747 
748 template <typename Left, typename Right, typename Result, typename Op>
749 typename std::enable_if<detail::is_scalar_v<Left> &&
750  detail::is_scalar_v<Right> &&
751  detail::is_scalar_v<Result>>::type
752 uninitialized_binary_vector_op(const std::size_t n, const Left* const left,
753  const Right* const right, Result* const result,
754  Op&& op) {
755  vector_op(op, n, result, left, right);
756 }
757 
758 } // namespace math
759 } // namespace TiledArray
760 
761 #endif // TILEDARRAY_MATH_VECTOR_OP_H__INCLUDED