26 #ifndef TILEDARRAY_MATH_VECTOR_OP_H__INCLUDED
27 #define TILEDARRAY_MATH_VECTOR_OP_H__INCLUDED
29 #include <TiledArray/config.h>
33 #include <tbb/parallel_for.h>
34 #include <tbb/partitioner.h>
35 #include <tbb/tbb_stddef.h>
40 #define TILEDARRAY_LOOP_UNWIND ::TiledArray::math::LoopUnwind::value
49 typedef std::integral_constant<std::size_t,
50 TILEDARRAY_CACHELINE_SIZE /
sizeof(double)>
52 typedef std::integral_constant<std::size_t,
56 template <std::
size_t>
67 template <
typename Op,
typename Result,
typename... Args>
69 Op&& op, Result* MADNESS_RESTRICT
const result,
70 const Args* MADNESS_RESTRICT
const... args) {
74 template <
typename Op,
typename Result,
typename... Args>
76 Op&& op, Result* MADNESS_RESTRICT
const result,
77 const Args* MADNESS_RESTRICT
const... args) {
81 template <
typename Op,
typename Result,
typename... Args>
82 static TILEDARRAY_FORCE_INLINE
void reduce(
83 Op&& op, Result& MADNESS_RESTRICT result,
84 const Args* MADNESS_RESTRICT
const... args) {
85 op(result, args[
offset]...);
88 template <
typename Result,
typename Arg>
89 static TILEDARRAY_FORCE_INLINE
void scatter(
90 Result* MADNESS_RESTRICT
const result,
91 const Arg* MADNESS_RESTRICT
const arg,
96 template <
typename Result,
typename Arg>
97 static TILEDARRAY_FORCE_INLINE
void gather(
98 Result* MADNESS_RESTRICT
const result,
99 const Arg* MADNESS_RESTRICT
const arg, std::size_t ) {
109 template <std::
size_t N>
115 template <
typename Op,
typename Result,
typename... Args>
117 Op&& op, Result* MADNESS_RESTRICT
const result,
118 const Args* MADNESS_RESTRICT
const... args) {
123 template <
typename Op,
typename Result,
typename... Args>
125 Op&& op, Result* MADNESS_RESTRICT
const result,
126 const Args* MADNESS_RESTRICT
const... args) {
131 template <
typename Op,
typename Result,
typename... Args>
132 static TILEDARRAY_FORCE_INLINE
void reduce(
133 Op&& op, Result& MADNESS_RESTRICT result,
134 const Args* MADNESS_RESTRICT
const... args) {
135 op(result, args[
offset]...);
139 template <
typename Result,
typename Arg>
141 Result* MADNESS_RESTRICT
const result,
142 const Arg* MADNESS_RESTRICT
const arg,
const std::size_t result_stride) {
147 template <
typename Result,
typename Arg>
148 static TILEDARRAY_FORCE_INLINE
void gather(
149 Result* MADNESS_RESTRICT
const result,
150 const Arg* MADNESS_RESTRICT
const arg, std::size_t arg_stride) {
161 template <
typename Op,
typename Result,
typename... Args>
163 const Args*
const... args) {
167 template <
typename Op,
typename Result,
typename... Args>
173 template <
typename Op,
typename Result,
typename... Args>
175 Op&& op,
const std::size_t n, Result* MADNESS_RESTRICT
const result,
176 const Args* MADNESS_RESTRICT
const... args) {
177 for (std::size_t i = 0ul; i < n; ++i) op(result[i], args[i]...);
180 template <
typename Op,
typename Result,
typename... Args>
181 TILEDARRAY_FORCE_INLINE
typename std::enable_if<(
sizeof...(Args) >= 0)>::type
186 template <
typename Op,
typename Result,
typename... Args>
187 TILEDARRAY_FORCE_INLINE
typename std::enable_if<(
sizeof...(Args) > 0)>::type
192 template <
typename Op,
typename Result,
typename... Args>
194 Op&& op,
const std::size_t n, Result* MADNESS_RESTRICT
const result,
195 const Args* MADNESS_RESTRICT
const... args) {
196 for (std::size_t i = 0ul; i < n; ++i) op(result + i, args[i]...);
199 template <
typename Op,
typename Result,
typename... Args>
201 const Args*
const... args) {
205 template <
typename Op,
typename Result,
typename... Args>
211 template <
typename Op,
typename Result,
typename... Args>
213 Op&& op,
const std::size_t n, Result& MADNESS_RESTRICT result,
214 const Args* MADNESS_RESTRICT
const... args) {
215 for (std::size_t i = 0ul; i < n; ++i) op(result, args[i]...);
218 template <
typename Result,
typename Arg>
219 TILEDARRAY_FORCE_INLINE
void copy_block(Result*
const result,
220 const Arg*
const arg) {
221 for_each_block([](Result& lhs, param_type<Arg> rhs) { lhs = rhs; }, result,
225 template <
typename Arg,
typename Result>
226 TILEDARRAY_FORCE_INLINE
void copy_block_n(std::size_t n, Result*
const result,
227 const Arg*
const arg) {
232 template <
typename Arg,
typename Result>
234 const std::size_t stride,
235 const Arg*
const arg) {
239 template <
typename Result,
typename Arg>
242 const std::size_t stride,
243 const Arg*
const arg) {
244 for (std::size_t i = 0; i < n; ++i, result += stride) *result = arg[i];
247 template <
typename Result,
typename Arg>
249 const Arg*
const arg,
250 const std::size_t stride) {
254 template <
typename Arg,
typename Result>
256 Result*
const result,
257 const Arg*
const arg,
258 const std::size_t stride) {
259 for (std::size_t i = 0; i < n; ++i, arg += stride) result[i] = *arg;
262 template <
typename T>
284 const T*
data()
const {
return block_; }
288 #ifdef HAVE_INTEL_TBB
296 static constexpr std::size_t GRAIN_SIZE = 8ul;
301 SizeTRange(
const size_t start,
const size_t end) : lower(start), upper(end) {}
306 SizeTRange() =
default;
307 SizeTRange(
const SizeTRange& r) =
default;
315 bool empty()
const {
return lower > upper; }
317 bool is_divisible()
const {
return size() >= 2 * GRAIN_SIZE; }
319 size_t begin()
const {
return lower; }
321 size_t end()
const {
return upper; }
323 size_t size()
const {
return upper - lower; }
325 SizeTRange(SizeTRange& r, tbb::split) {
326 size_t nblock = (r.upper - r.lower) / block_size;
327 nblock = (nblock + 1) / 2;
328 lower = r.lower + nblock * block_size;
338 template <
typename Op,
typename Result,
typename... Args,
339 typename std::enable_if<std::is_void<
typename std::result_of<
340 Op(Result&, Args...)>::type>::value>::type* =
nullptr>
342 Result*
const result,
const Args*
const... args) {
352 result_block.
store(result + i);
358 #ifdef HAVE_INTEL_TBB
360 template <
typename Op,
typename Result,
typename... Args>
361 class ApplyInplaceVectorOp {
363 ApplyInplaceVectorOp(
Op& op, Result*
const result,
const Args*
const... args)
364 : op_(op), result_(result), args_(args...) {}
366 ~ApplyInplaceVectorOp() {}
368 template <std::size_t... Is>
369 void helper(SizeTRange& range,
const std::index_sequence<Is...>&)
const {
370 std::size_t offset = range.begin();
371 std::size_t n_range = range.size();
373 (std::get<Is>(args_) + offset)...);
376 void operator()(SizeTRange& range)
const {
377 helper(range, std::make_index_sequence<
sizeof...(Args)>());
382 Result*
const result_;
383 std::tuple<
const Args*
const...> args_;
388 template <
typename Op,
typename Result,
typename... Args,
389 typename std::enable_if<std::is_void<
typename std::result_of<
390 Op(Result&, Args...)>::type>::value>::type* =
nullptr>
392 const Args*
const... args) {
393 #ifdef HAVE_INTEL_TBB
395 SizeTRange range(0, n);
406 auto apply_inplace_vector_op =
407 ApplyInplaceVectorOp<
Op, Result, Args...>(op, result, args...);
409 tbb::parallel_for(range, apply_inplace_vector_op, tbb::auto_partitioner());
415 template <
typename Op,
typename Result,
typename... Args,
416 typename std::enable_if<!std::is_void<
typename std::result_of<
417 Op(Args...)>::type>::value>::type* =
nullptr>
419 const Args*
const... args) {
420 auto wrapper_op = [&op](Result& res, param_type<Args>... a) {
433 result_block.
store(result + i);
439 #ifdef HAVE_INTEL_TBB
441 template <
typename Op,
typename Result,
typename... Args>
442 class ApplyVectorOp {
444 ApplyVectorOp(
Op& op, Result*
const result,
const Args*
const... args)
445 : op_(op), result_(result), args_(args...) {}
449 template <std::size_t... Is>
450 void helper(SizeTRange& range,
const std::index_sequence<Is...>&)
const {
451 std::size_t offset = range.begin();
452 std::size_t n_range = range.size();
454 (std::get<Is>(args_) + offset)...);
457 void operator()(SizeTRange& range)
const {
458 helper(range, std::make_index_sequence<
sizeof...(Args)>());
463 Result*
const result_;
464 std::tuple<
const Args*
const...> args_;
469 template <
typename Op,
typename Result,
typename... Args,
470 typename std::enable_if<!std::is_void<
typename std::result_of<
471 Op(Args...)>::type>::value>::type* =
nullptr>
472 void vector_op(
Op&& op,
const std::size_t n, Result*
const result,
473 const Args*
const... args) {
474 #ifdef HAVE_INTEL_TBB
476 SizeTRange range(0, n);
486 auto apply_vector_op =
487 ApplyVectorOp<
Op, Result, Args...>(op, result, args...);
489 tbb::parallel_for(range, apply_vector_op, tbb::auto_partitioner());
495 template <
typename Op,
typename Result,
typename... Args>
497 const Args*
const... args) {
509 #ifdef HAVE_INTEL_TBB
510 template <
typename Op,
typename Result,
typename... Args>
511 class ApplyVectorPtrOp {
513 ApplyVectorPtrOp(
Op& op, Result*
const result,
const Args*
const... args)
514 : op_(op), result_(result), args_(args...) {}
516 ~ApplyVectorPtrOp() {}
518 template <std::size_t... Is>
519 void helper(SizeTRange& range,
const std::index_sequence<Is...>&)
const {
520 std::size_t offset = range.begin();
521 std::size_t n_range = range.size();
523 (std::get<Is>(args_) + offset)...);
526 void operator()(SizeTRange& range)
const {
527 helper(range, std::make_index_sequence<
sizeof...(Args)>());
532 Result*
const result_;
533 std::tuple<
const Args*
const...> args_;
537 template <
typename Op,
typename Result,
typename... Args>
539 const Args*
const... args) {
540 #ifdef HAVE_INTEL_TBB
542 SizeTRange range(0, n);
553 auto apply_vector_ptr_op =
554 ApplyVectorPtrOp<
Op, Result, Args...>(op, result, args...);
555 tbb::parallel_for(range, apply_vector_ptr_op, tbb::auto_partitioner());
561 template <
typename Op,
typename Result,
typename... Args>
563 const Args*
const... args) {
571 Result temp = result;
579 #ifdef HAVE_INTEL_TBB
580 template <
typename ReduceOp,
typename JoinOp,
typename Result,
typename... Args>
583 class ApplyReduceOp {
586 const Result& result,
const Args*
const... args)
593 ApplyReduceOp(ApplyReduceOp& rhs, tbb::split)
594 : reduce_op_(rhs.reduce_op_),
595 join_op_(rhs.join_op_),
596 identity_(rhs.identity_),
597 result_(rhs.identity_),
602 template <std::size_t... Is>
603 void helper(SizeTRange& range,
const std::index_sequence<Is...>&) {
604 std::size_t offset = range.begin();
605 std::size_t n_range = range.size();
607 (std::get<Is>(args_) + offset)...);
610 void operator()(SizeTRange& range) {
611 helper(range, std::make_index_sequence<
sizeof...(Args)>());
614 void join(
const ApplyReduceOp& rhs) { join_op_(result_, rhs.result_); }
616 const Result result()
const {
return result_; }
619 ReduceOp& reduce_op_;
621 const Result identity_;
623 std::tuple<
const Args*
const...> args_;
627 template <
typename ReduceOp,
typename JoinOp,
typename Result,
typename... Args>
629 const std::size_t n, Result& result,
const Args*
const... args) {
631 #ifdef HAVE_INTEL_TBB
632 SizeTRange range(0, n);
634 auto apply_reduce_op = ApplyReduceOp<ReduceOp, JoinOp, Result, Args...>(
637 tbb::parallel_reduce(range, apply_reduce_op, tbb::auto_partitioner());
639 result = apply_reduce_op.result();
645 template <
typename Arg,
typename Result>
646 typename std::enable_if<!(std::is_same<Arg, Result>::value &&
647 detail::is_scalar_v<Arg>)>::type
648 copy_vector(
const std::size_t n,
const Arg*
const arg, Result*
const result) {
659 template <
typename T>
660 inline typename std::enable_if<detail::is_scalar_v<T>>::type
copy_vector(
661 const std::size_t n,
const T*
const arg, T*
const result) {
662 std::memcpy(result, arg, n *
sizeof(T));
665 template <
typename Arg,
typename Result>
666 void fill_vector(
const std::size_t n,
const Arg& arg, Result*
const result) {
667 auto fill_op = [arg](Result& res) { res = arg; };
671 template <
typename Arg,
typename Result>
672 typename std::enable_if<!(detail::is_scalar_v<Arg> &&
673 detail::is_scalar_v<Result>)>::type
675 Result*
const result) {
676 auto op = [](Result*
const res, param_type<Arg> a) {
new (res) Result(a); };
680 template <
typename Arg,
typename Result>
681 inline typename std::enable_if<detail::is_scalar_v<Arg> &&
682 detail::is_scalar_v<Result>>::type
684 Result*
const result) {
688 template <
typename Arg,
typename Result>
689 typename std::enable_if<!(detail::is_scalar_v<Arg> &&
690 detail::is_scalar_v<Result>)>::type
692 Result*
const result) {
693 auto op = [arg](Result*
const res) {
new (res) Result(arg); };
697 template <
typename Arg,
typename Result>
698 inline typename std::enable_if<detail::is_scalar_v<Arg> &&
699 detail::is_scalar_v<Result>>::type
701 Result*
const result) {
705 template <
typename Arg>
707 const std::size_t n, Arg*
const arg) {
708 auto op = [](Arg*
const a) { a->~Arg(); };
712 template <
typename Arg>
714 const std::size_t,
const Arg*
const) {}
716 template <
typename Arg,
typename Result,
typename Op>
717 typename std::enable_if<!(detail::is_scalar_v<Arg> &&
718 detail::is_scalar_v<Result>)>::type
720 Result*
const result,
Op&& op) {
721 auto wrapper_op = [&op](Result*
const res, param_type<Arg> a) {
722 new (res) Result(op(a));
727 template <
typename Arg,
typename Result,
typename Op>
728 inline typename std::enable_if<detail::is_scalar_v<Arg> &&
729 detail::is_scalar_v<Result>>::type
731 Result*
const result,
Op&& op) {
735 template <
typename Left,
typename Right,
typename Result,
typename Op>
736 typename std::enable_if<!(detail::is_scalar_v<Left> &&
737 detail::is_scalar_v<Right> &&
738 detail::is_scalar_v<Result>)>::type
740 const Right*
const right, Result*
const result,
742 auto wrapper_op = [&op](Result*
const res, param_type<Left> l,
743 param_type<Right> r) {
new (res) Result(op(l, r)); };
748 template <
typename Left,
typename Right,
typename Result,
typename Op>
749 typename std::enable_if<detail::is_scalar_v<Left> &&
750 detail::is_scalar_v<Right> &&
751 detail::is_scalar_v<Result>>::type
753 const Right*
const right, Result*
const result,
761 #endif // TILEDARRAY_MATH_VECTOR_OP_H__INCLUDED