contraction_eval.h
/*
 * This file is a part of TiledArray.
 * Copyright (C) 2013 Virginia Tech
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED
#define TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED

#include <algorithm>
#include <cstdlib>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <TiledArray/config.h>
#include <TiledArray/dist_eval/dist_eval.h>
#include <TiledArray/proc_grid.h>
#include <TiledArray/reduce_task.h>
#include <TiledArray/shape.h>
#include <TiledArray/type_traits.h>

//#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1
//#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1
//#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1
//#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1
//#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1

namespace TiledArray {
namespace detail {

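// A note on the algorithm (a summary of the implementation below): this
// evaluator performs a task-based SUMMA (Scalable Universal Matrix
// Multiplication Algorithm) contraction over a 2-D process grid. For each
// index k of the contraction dimension, the tiles of column k of the
// left-hand argument are broadcast within process rows and the tiles of
// row k of the right-hand argument are broadcast within process columns;
// each process then accumulates left*right tile products into its local
// result tiles via ReducePairTask objects. Several values of k are kept in
// flight simultaneously (the lookahead "depth" chosen in internal_eval()).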
/// Distributed contraction evaluator implementation

/// Computes the tile-wise contraction of two argument tensors using a
/// 2-D SUMMA algorithm.
/// \tparam Left The left-hand argument evaluator type
/// \tparam Right The right-hand argument evaluator type
/// \tparam Op The contraction/reduction operation type
/// \tparam Policy The tensor policy class
template <typename Left, typename Right, typename Op, typename Policy>
class Summa
    : public DistEvalImpl<typename Op::result_type, Policy>,
      public std::enable_shared_from_this<Summa<Left, Right, Op, Policy>> {
 public:
  typedef Summa<Left, Right, Op, Policy> Summa_;  ///< This object type
  typedef DistEvalImpl<typename Op::result_type, Policy>
      DistEvalImpl_;  ///< The base class type
  typedef typename DistEvalImpl_::TensorImpl_
      TensorImpl_;           ///< The base, base class type
  typedef Left left_type;    ///< The left-hand argument type
  typedef Right right_type;  ///< The right-hand argument type
  typedef typename DistEvalImpl_::ordinal_type ordinal_type;  ///< Ordinal type
  typedef typename DistEvalImpl_::range_type range_type;      ///< Range type
  typedef typename DistEvalImpl_::shape_type shape_type;      ///< Shape type
  typedef typename DistEvalImpl_::pmap_interface
      pmap_interface;  ///< Process map interface type
  typedef
      typename DistEvalImpl_::trange_type trange_type;  ///< Tiled range type
  typedef typename DistEvalImpl_::value_type value_type;  ///< Tile type
  typedef
      typename DistEvalImpl_::eval_type eval_type;  ///< Tile evaluation type
  typedef Op op_type;  ///< Tile evaluation operator type

 private:
  static ordinal_type max_memory_;  ///< Maximum memory available to SUMMA
                                    ///< (bytes); set via TA_SUMMA_MAX_MEMORY
  static ordinal_type max_depth_;   ///< Maximum number of concurrent SUMMA
                                    ///< iterations; set via TA_SUMMA_MAX_DEPTH

  // Arguments and operation
  left_type left_;
  right_type right_;
  op_type op_;

  // Broadcast groups for dense arguments (empty for non-dense arguments)
  madness::Group row_group_;
  madness::Group col_group_;

  // Dimension information
  const ordinal_type k_;
  const ProcGrid proc_grid_;

  // Contraction results
  ReducePairTask<op_type>* reduce_tasks_;

  // Constants used to iterate over columns and rows of left_ and right_,
  // respectively.
  const ordinal_type left_start_local_;
  const ordinal_type left_end_;
  const ordinal_type left_stride_;
  const ordinal_type left_stride_local_;
  const ordinal_type right_stride_;
  const ordinal_type right_stride_local_;

  typedef Future<typename right_type::eval_type>
      right_future;  ///< Future to a right-hand argument tile
  typedef Future<typename left_type::eval_type>
      left_future;  ///< Future to a left-hand argument tile
  typedef std::pair<ordinal_type, right_future>
      row_datum;  ///< Datum element type for a right-hand argument row
  typedef std::pair<ordinal_type, left_future>
      col_datum;  ///< Datum element type for a left-hand argument column

  static constexpr const bool trace_tasks =
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      true
#else
      false
#endif
      ;

 protected:
  // Import base class functions
  using std::enable_shared_from_this<Summa_>::shared_from_this;

 private:
  // Static variable initialization ----------------------------------------

  /// Initialize max_memory_ from the TA_SUMMA_MAX_MEMORY environment
  /// variable; returns 0 (i.e. no bound) if the variable is not set
  static ordinal_type init_max_memory() {
    const char* max_memory = std::getenv("TA_SUMMA_MAX_MEMORY");
    if (max_memory) {
      // Convert the string into bytes
      std::stringstream ss(max_memory);
      double memory = 0.0;
      if (ss >> memory) {
        if (memory > 0.0) {
          std::string unit;
          if (ss >> unit) {  // Failure == assume bytes
            if (unit == "KB" || unit == "kB") {
              memory *= 1000.0;
            } else if (unit == "KiB" || unit == "kiB") {
              memory *= 1024.0;
            } else if (unit == "MB") {
              memory *= 1000000.0;
            } else if (unit == "MiB") {
              memory *= 1048576.0;
            } else if (unit == "GB") {
              memory *= 1000000000.0;
            } else if (unit == "GiB") {
              memory *= 1073741824.0;
            }
          }
        }
      }

      memory = std::max(memory, 104857600.0);  // Minimum 100 MiB
      return memory;
    }

    return 0ul;
  }

  /// Initialize max_depth_ from the TA_SUMMA_MAX_DEPTH environment
  /// variable; returns 0 (i.e. no bound) if the variable is not set
  static ordinal_type init_max_depth() {
    const char* max_depth = std::getenv("TA_SUMMA_MAX_DEPTH");
    if (max_depth) return std::stoul(max_depth);
    return 0ul;
  }

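  // For example (hypothetical values; the parsing rules are exactly those
  // of init_max_memory() and init_max_depth() above):
  //
  //   export TA_SUMMA_MAX_MEMORY="2 GiB"  # a number with an optional
  //                                       # KB/KiB/MB/MiB/GB/GiB suffix
  //   export TA_SUMMA_MAX_DEPTH=4         # parsed with std::stoul
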
  // Process groups --------------------------------------------------------

  /// Construct a broadcast group from the processes with non-zero tiles

  /// \param shape The shape of the argument tensor being broadcast
  /// \param process_mask Masks out processes that hold no non-zero result
  /// tiles for this iteration
  /// \param index The ordinal of the first tile in the row/column
  /// \param end One past the ordinal of the last tile in the row/column
  /// \param stride The stride between tile ordinals
  /// \param max_group_size The number of processes in the row/column
  /// \param k The SUMMA iteration index
  /// \param key_offset The offset used to make the group id unique
  /// \param proc_map Maps a row/column position to a world process id
  template <typename Shape, typename ProcMap>
  madness::Group make_group(const Shape& shape,
                            const std::vector<bool>& process_mask,
                            ordinal_type index, const ordinal_type end,
                            const ordinal_type stride,
                            const ordinal_type max_group_size,
                            const ordinal_type k, const ordinal_type key_offset,
                            const ProcMap& proc_map) const {
    // Generate the list of processes in rank_row
    std::vector<ProcessID> proc_list(max_group_size, -1);

    // Flag the root process of the broadcast, which may not be included
    // by shape.
    ordinal_type p = k % max_group_size;
    proc_list[p] = proc_map(p);
    ordinal_type count = 1ul;

    // Flag all processes that have non-zero tiles
    for (p = 0ul; (index < end) && (count < max_group_size);
         index += stride, p = (p + 1u) % max_group_size) {
      if ((proc_list[p] != -1) || (shape.is_zero(index)) || !process_mask.at(p))
        continue;

      proc_list[p] = proc_map(p);
      ++count;
    }

    // Remove processes from the list that will not be in the group
    for (ordinal_type x = 0ul, q = 0ul; x < count; ++q) {
      if (proc_list[q] == -1) continue;
      proc_list[x++] = proc_list[q];
    }

    // Truncate invalid process id's
    proc_list.resize(count);

    return madness::Group(
        TensorImpl_::world(), proc_list,
        madness::DistributedID(DistEvalImpl_::id(), k + key_offset));
  }
242 
  /// Construct the row group for SUMMA iteration \c k

  /// The group contains the processes in this process's row of the process
  /// grid that participate in iteration \c k.
  madness::Group make_row_group(const ordinal_type k) const {
    // Construct the sparse broadcast group
    const ordinal_type right_begin_k = k * proc_grid_.cols();
    const ordinal_type right_end_k = right_begin_k + proc_grid_.cols();
    // make the row mask; using the same mask for all tiles avoids having to
    // compute a mask for every tile and the use of masked broadcasts
    auto result_row_mask_k = make_row_mask(k);

    // return an empty group if I am not in this group, otherwise make a group
    if (result_row_mask_k[proc_grid_.rank_col()])
      return make_group(right_.shape(), result_row_mask_k, right_begin_k,
                        right_end_k, right_stride_, proc_grid_.proc_cols(), k,
                        k_, [&](const ProcGrid::size_type col) {
                          return proc_grid_.map_col(col);
                        });
    else
      return madness::Group();
  }
265 
  /// Construct the column group for SUMMA iteration \c k

  /// The group contains the processes in this process's column of the
  /// process grid that participate in iteration \c k.
  madness::Group make_col_group(const ordinal_type k) const {
    // make the column mask; using the same mask for all tiles avoids having
    // to compute a mask for every tile and the use of masked broadcasts
    auto result_col_mask_k = make_col_mask(k);

    // return an empty group if I am not in this group, otherwise make a group
    if (result_col_mask_k[proc_grid_.rank_row()])
      return make_group(
          left_.shape(), result_col_mask_k, k, left_end_, left_stride_,
          proc_grid_.proc_rows(), k, 0ul,
          [&](const ordinal_type row) { return proc_grid_.map_row(row); });
    else
      return madness::Group();
  }
284 
  /// Compute the mask of process columns that need tiles of column \c k of
  /// left_

  /// "Local" A[i][k] (i.e. for all i assigned to my row of processes) will
  /// produce C[i][*]; for each process in my row of the process grid,
  /// determine whether any non-zero C[i][*] is located on that node.
  std::vector<bool> make_row_mask(const ordinal_type k) const {
    const auto nproc_cols = proc_grid_.proc_cols();
    const auto my_proc_row = proc_grid_.rank_row();

    // result shape
    const auto& result_shape = TensorImpl_::shape();

    // if result is dense, include all processors
    if (result_shape.is_dense()) return std::vector<bool>(nproc_cols, true);

    // initialize the mask
    std::vector<bool> mask(nproc_cols, false);

    // number of tiles in the col dimension of the result
    const auto nj = proc_grid_.cols();
    // number of tiles in contraction dim
    const auto nk = k_;

    // for each i assigned to my row of processes ...
    ordinal_type i_start, i_fence, i_stride;
    std::tie(i_start, i_fence, i_stride) = result_row_range(my_proc_row);
    const auto ik_stride = i_stride * nk;
    for (ordinal_type i = i_start, ik = i_start * nk + k; i < i_fence;
         i += i_stride, ik += ik_stride) {
      // ... such that A[i][k] exists ...
      if (!left_.shape().is_zero(ik)) {
        // ... the owner of A[i][k] is always in the group ...
        const auto k_proc_col = k % nproc_cols;
        mask[k_proc_col] = true;
        // ... loop over processes in my row ...
        for (ordinal_type proc_col = 0; proc_col != nproc_cols; ++proc_col) {
          // ... that are not the owner of A[i][k] ...
          if (proc_col != k_proc_col) {
            // ... loop over all C[i][j] tiles that belong to this process ...
            ordinal_type j_start, j_fence, j_stride;
            std::tie(j_start, j_fence, j_stride) = result_col_range(proc_col);
            const auto ij_stride = j_stride;
            for (ordinal_type j = j_start, ij = i * nj + j_start; j < j_fence;
                 j += j_stride, ij += ij_stride) {
              // ... if any such C[i][j] exists, update the mask, and move
              // on to the next process
              if (!result_shape.is_zero(
                      DistEvalImpl_::perm_index_to_target(ij))) {
                mask[proc_col] = true;
                break;
              }
            }
          }
        }
      }
    }

    return mask;
  }
349 
  /// Compute the mask of process rows that need tiles of row \c k of right_

  /// "Local" B[k][j] (i.e. for all j assigned to my column of processes)
  /// will produce C[*][j]; for each process in my column of the process
  /// grid, determine whether any non-zero C[*][j] is located on that node.
  std::vector<bool> make_col_mask(const ordinal_type k) const {
    const auto nproc_rows = proc_grid_.proc_rows();
    const auto my_proc_col = proc_grid_.rank_col();

    // result shape
    const auto& result_shape = TensorImpl_::shape();

    // if result is dense, include all processors
    if (result_shape.is_dense()) return std::vector<bool>(nproc_rows, true);

    // initialize the mask
    std::vector<bool> mask(nproc_rows, false);

    // number of tiles in the col dimension of the result
    const auto nj = proc_grid_.cols();

    // for each j assigned to my column of processes ...
    ordinal_type j_start, j_fence, j_stride;
    std::tie(j_start, j_fence, j_stride) = result_col_range(my_proc_col);
    const auto kj_stride = j_stride;
    for (ordinal_type j = j_start, kj = k * nj + j_start; j < j_fence;
         j += j_stride, kj += kj_stride) {
      // ... such that B[k][j] exists ...
      if (!right_.shape().is_zero(kj)) {
        // ... the owner of B[k][j] is always in the group ...
        const auto k_proc_row = k % nproc_rows;
        mask[k_proc_row] = true;
        // ... loop over processes in my col ...
        for (ordinal_type proc_row = 0; proc_row != nproc_rows; ++proc_row) {
          // ... that are not the owner of B[k][j] ...
          if (proc_row != k_proc_row) {
            // ... loop over all C[i][j] tiles that belong to this process ...
            ordinal_type i_start, i_fence, i_stride;
            std::tie(i_start, i_fence, i_stride) = result_row_range(proc_row);
            const auto ij_stride = i_stride * nj;
            for (ordinal_type i = i_start, ij = i_start * nj + j; i < i_fence;
                 i += i_stride, ij += ij_stride) {
              // ... if any such C[i][j] exists, update the mask, and move
              // on to the next process
              if (!result_shape.is_zero(
                      DistEvalImpl_::perm_index_to_target(ij))) {
                mask[proc_row] = true;
                break;
              }
            }
          }
        }
      }
    }

    return mask;
  }
414 
  /// The range of result rows assigned to process row \c proc_row

  /// \return A {start, fence, stride} tuple: process row \c proc_row owns
  /// result tile rows \c start, \c start+stride, ... up to \c fence
  inline std::tuple<ordinal_type, ordinal_type, ordinal_type> result_row_range(
      ordinal_type proc_row) const {
    const ordinal_type start = proc_row;
    const ordinal_type fence = proc_grid_.rows();
    const ordinal_type stride = proc_grid_.proc_rows();
    return std::make_tuple(start, fence, stride);
  }

  /// The range of result columns assigned to process column \c proc_col

  /// \return A {start, fence, stride} tuple: process column \c proc_col
  /// owns result tile columns \c start, \c start+stride, ... up to \c fence
  std::tuple<ordinal_type, ordinal_type, ordinal_type> result_col_range(
      ordinal_type proc_col) const {
    const ordinal_type start = proc_col;
    const ordinal_type fence = proc_grid_.cols();
    const ordinal_type stride = proc_grid_.proc_cols();
    return std::make_tuple(start, fence, stride);
  }
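
  // For example (illustrative values): with proc_cols() == 3 and
  // cols() == 8, result_col_range(1) returns {1, 8, 3}, i.e. process
  // column 1 owns result columns 1, 4, and 7 of the block-cyclic
  // distribution.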
442 
  // Broadcast kernels -----------------------------------------------------

  /// Convert a lazy tile to its evaluated type

  /// \param tile The lazy tile to be converted
  /// \return The evaluated tile
  template <typename Tile>
  static auto convert_tile(const Tile& tile) {
    Cast<typename eval_trait<Tile>::type, Tile> cast;
    return cast(tile);
  }
455 
  /// Get tile \c index of \c arg (non-lazy tile case)

  /// \param arg The argument tensor
  /// \param index The ordinal index of the tile
  /// \return A future to the tile
  template <typename Arg>
  static typename std::enable_if<!is_lazy_tile<typename Arg::value_type>::value,
                                 Future<typename Arg::eval_type>>::type
  get_tile(Arg& arg, const typename Arg::ordinal_type index) {
    return arg.get(index);
  }

  /// Get tile \c index of \c arg (lazy tile case)

  /// A high-priority task is spawned to convert the lazy tile to its
  /// evaluated type.
  /// \param arg The argument tensor
  /// \param index The ordinal index of the tile
  /// \return A future to the evaluated tile
  template <typename Arg>
  static typename std::enable_if<
      is_lazy_tile<typename Arg::value_type>::value
#ifdef TILEDARRAY_HAS_CUDA
          && !detail::is_cuda_tile_v<typename Arg::value_type>
#endif
      ,
      Future<typename Arg::eval_type>>::type
  get_tile(Arg& arg, const typename Arg::ordinal_type index) {
    auto convert_tile_fn =
        &Summa_::template convert_tile<typename Arg::value_type>;
    return arg.world().taskq.add(convert_tile_fn, arg.get(index),
                                 madness::TaskAttributes::hipri());
  }
492 
#ifdef TILEDARRAY_HAS_CUDA
  /// Get tile \c index of \c arg (lazy CUDA tile case)

  /// The conversion is scheduled as a CUDA task.
  /// \param arg The argument tensor
  /// \param index The ordinal index of the tile
  /// \return A future to the evaluated tile
  template <typename Arg>
  static typename std::enable_if<
      is_lazy_tile<typename Arg::value_type>::value &&
          detail::is_cuda_tile_v<typename Arg::value_type>,
      Future<typename Arg::eval_type>>::type
  get_tile(Arg& arg, const typename Arg::ordinal_type index) {
    auto convert_tile_fn =
        &Summa_::template convert_tile<typename Arg::value_type>;
    return madness::add_cuda_task(arg.world(), convert_tile_fn, arg.get(index),
                                  madness::TaskAttributes::hipri());
  }
#endif
514 
  /// Collect the non-zero tiles of an argument row or column into \c vec

  /// Local tiles are fetched (and converted, if lazy); remote tiles are
  /// represented by default-constructed futures that will be filled by a
  /// matching broadcast.
  /// \param arg The argument tensor
  /// \param index The ordinal of the first tile
  /// \param end One past the ordinal of the last tile
  /// \param stride The stride between tile ordinals
  /// \param vec The output vector of (position, future) pairs
  template <typename Arg, typename Datum>
  void get_vector(Arg& arg, ordinal_type index, const ordinal_type end,
                  const ordinal_type stride, std::vector<Datum>& vec) const {
    TA_ASSERT(vec.size() == 0ul);

    // Iterate over the vector of tiles
    if (arg.is_local(index)) {
      for (ordinal_type i = 0ul; index < end; ++i, index += stride) {
        if (arg.shape().is_zero(index)) continue;
        vec.emplace_back(i, get_tile(arg, index));
      }
    } else {
      for (ordinal_type i = 0ul; index < end; ++i, index += stride) {
        if (arg.shape().is_zero(index)) continue;
        vec.emplace_back(i, Future<typename Arg::eval_type>());
      }
    }

    TA_ASSERT(vec.size() > 0ul);
  }
544 
  /// Collect the tiles of column \c k of left_ into \c col
  void get_col(const ordinal_type k, std::vector<col_datum>& col) const {
    col.reserve(proc_grid_.local_rows());
    get_vector(left_, left_start_local_ + k, left_end_, left_stride_local_,
               col);
  }

  /// Collect the tiles of row \c k of right_ into \c row
  void get_row(const ordinal_type k, std::vector<row_datum>& row) const {
    row.reserve(proc_grid_.local_cols());

    // Compute local iteration limits for row k of right_.
    ordinal_type begin = k * proc_grid_.cols();
    const ordinal_type end = begin + proc_grid_.cols();
    begin += proc_grid_.rank_col();

    get_vector(right_, begin, end, right_stride_local_, row);
  }
569 
  /// Broadcast the tiles in \c vec within \c group

  /// \param start The ordinal of the first tile in the row/column
  /// \param stride The stride between tile ordinals
  /// \param group The broadcast group
  /// \param group_root The group rank of the broadcast root
  /// \param key_offset The offset applied to the broadcast keys
  /// \param vec The tiles to be broadcast
  template <typename Datum>
  void bcast(const ordinal_type start, const ordinal_type stride,
             const madness::Group& group, const ProcessID group_root,
             const ordinal_type key_offset, std::vector<Datum>& vec) const {
    TA_ASSERT(vec.size() != 0ul);
    TA_ASSERT(group.size() > 0);
    TA_ASSERT(group_root < group.size());

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
    std::stringstream ss;
    ss << "bcast: rank=" << TensorImpl_::world().rank()
       << " root=" << group.world_rank(group_root) << " groupid=("
       << group.id().first << "," << group.id().second
       << ") keyoffset=" << key_offset << " group={ ";
    for (ProcessID group_proc = 0; group_proc < group.size(); ++group_proc)
      ss << group.world_rank(group_proc) << " ";
    ss << "} tiles={ ";
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST

    // Iterate over tiles to be broadcast
    for (typename std::vector<Datum>::iterator it = vec.begin();
         it != vec.end(); ++it) {
      const ordinal_type index = it->first * stride + start;

      // Broadcast the tile
      const madness::DistributedID key(DistEvalImpl_::id(), index + key_offset);
      TensorImpl_::world().gop.bcast(key, it->second, group_root, group);

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
      ss << index << " ";
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
    }

    TA_ASSERT(vec.size() > 0ul);

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
    ss << "}\n";
    printf("%s", ss.str().c_str());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
  }
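
  // Note: this is the sending side of the tile exchange. Remote consumers
  // constructed empty futures for these tiles in get_vector(); since both
  // sides build the same madness::DistributedID (the evaluator id combined
  // with the tile ordinal plus key_offset), the broadcast delivers each
  // tile into the matching future. The key offsets (0ul for left tiles,
  // left_.size() for right tiles; see bcast_col()/bcast_row() below) keep
  // the left and right key spaces disjoint.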
618 
  // Broadcast specialization for left and right arguments -----------------

  /// Compute the group rank of the root for the column-k broadcast within
  /// \c row_group
  ProcessID get_row_group_root(const ordinal_type k,
                               const madness::Group& row_group) const {
    ProcessID group_root = k % proc_grid_.proc_cols();
    if (!right_.shape().is_dense() &&
        row_group.size() < static_cast<ProcessID>(proc_grid_.proc_cols())) {
      const ProcessID world_root =
          proc_grid_.rank_row() * proc_grid_.proc_cols() + group_root;
      group_root = row_group.rank(world_root);
    }
    return group_root;
  }

  /// Compute the group rank of the root for the row-k broadcast within
  /// \c col_group
  ProcessID get_col_group_root(const ordinal_type k,
                               const madness::Group& col_group) const {
    ProcessID group_root = k % proc_grid_.proc_rows();
    if (!left_.shape().is_dense() &&
        col_group.size() < static_cast<ProcessID>(proc_grid_.proc_rows())) {
      const ProcessID world_root =
          group_root * proc_grid_.proc_cols() + proc_grid_.rank_col();
      group_root = col_group.rank(world_root);
    }
    return group_root;
  }
644 
  /// Broadcast column \c k of left_ within \c row_group
  void bcast_col(const ordinal_type k, std::vector<col_datum>& col,
                 const madness::Group& row_group) const {
    // broadcast if I'm part of the broadcast group
    if (!row_group.empty()) {
      // Broadcast column k of left_.
      ProcessID group_root = get_row_group_root(k, row_group);
      bcast(left_start_local_ + k, left_stride_local_, row_group, group_root,
            0ul, col);
    }
  }

  /// Broadcast row \c k of right_ within \c col_group
  void bcast_row(const ordinal_type k, std::vector<row_datum>& row,
                 const madness::Group& col_group) const {
    // broadcast if I'm part of the broadcast group
    if (!col_group.empty()) {
      // Compute the group root process.
      ProcessID group_root = get_col_group_root(k, col_group);

      // Broadcast row k of right_.
      bcast(k * proc_grid_.cols() + proc_grid_.rank_col(), right_stride_local_,
            col_group, group_root, left_.size(), row);
    }
  }
676 
  /// Broadcast or discard local tiles of left_ for the skipped iterations
  /// [k, end)
  void bcast_col_range_task(ordinal_type k, const ordinal_type end) const {
    // Compute the first k in [k, end) whose column of left is held by this
    // process's column of the process grid
    const ordinal_type Pcols = proc_grid_.proc_cols();
    k += (Pcols - ((k + Pcols - proc_grid_.rank_col()) % Pcols)) % Pcols;

    for (; k < end; k += Pcols) {
      // Compute local iteration limits for column k of left_.
      ordinal_type index = left_start_local_ + k;

      // will create broadcast group only if needed
      bool have_group = false;
      madness::Group row_group;
      ProcessID group_root = -1;
      bool do_broadcast = false;

      // Search column k of left for non-zero tiles
      for (; index < left_end_; index += left_stride_local_) {
        if (left_.shape().is_zero(index)) continue;

        // Construct broadcast group, if needed
        if (!have_group) {
          have_group = true;
          row_group = make_row_group(k);
          // broadcast if I am in this group and this group has others
          do_broadcast = !row_group.empty() && row_group.size() > 1;
          if (do_broadcast) group_root = get_row_group_root(k, row_group);
        }

        if (do_broadcast) {
          // Broadcast the tile
          const madness::DistributedID key(DistEvalImpl_::id(), index);
          auto tile = get_tile(left_, index);
          TensorImpl_::world().gop.bcast(key, tile, group_root, row_group);
        } else {
          // Discard the tile
          left_.discard(index);
        }
      }
    }
  }

  /// Broadcast or discard local tiles of right_ for the skipped iterations
  /// [k, end)
  void bcast_row_range_task(ordinal_type k, const ordinal_type end) const {
    // Compute the first k in [k, end) whose row of right is held by this
    // process's row of the process grid
    const ordinal_type Prows = proc_grid_.proc_rows();
    k += (Prows - ((k + Prows - proc_grid_.rank_row()) % Prows)) % Prows;

    for (; k < end; k += Prows) {
      // Compute local iteration limits for row k of right_.
      ordinal_type index = k * proc_grid_.cols();
      const ordinal_type row_end = index + proc_grid_.cols();
      index += proc_grid_.rank_col();

      // will create broadcast group only if needed
      bool have_group = false;
      madness::Group col_group;
      ProcessID group_root = -1;
      bool do_broadcast = false;

      // Search for and broadcast non-zero rows
      for (; index < row_end; index += right_stride_local_) {
        if (right_.shape().is_zero(index)) continue;

        // Construct broadcast group, if needed
        if (!have_group) {
          have_group = true;
          col_group = make_col_group(k);
          // broadcast if I am in this group and this group has others
          do_broadcast = !col_group.empty() && col_group.size() > 1;
          if (do_broadcast) group_root = get_col_group_root(k, col_group);
        }

        if (do_broadcast) {
          // Broadcast the tile
          const madness::DistributedID key(DistEvalImpl_::id(),
                                           index + left_.size());
          auto tile = get_tile(right_, index);
          TensorImpl_::world().gop.bcast(key, tile, group_root, col_group);
        } else {
          // Discard the tile
          right_.discard(index);
        }
      }
    }
  }
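
  // Note on the two range tasks above: iterate_sparse() lets each process
  // skip iterations with no local contribution, but a skipped iteration
  // may still own argument tiles that other group members expect to
  // receive. For each skipped k, the owner therefore either broadcasts its
  // non-zero tiles (when the broadcast group has other members) or
  // discards them so that the argument evaluators can release the cached
  // data.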
761 
  // Row and column iteration functions ------------------------------------

  /// Find the next row of right_ with non-zero local tiles

  /// \param k The first row to test
  /// \return The smallest k' >= k such that row k' of right_ has a
  /// non-zero local tile, or k_ if no such row remains
  ordinal_type iterate_row(ordinal_type k) const {
    // Iterate over k's until a non-zero tile is found or the end of the
    // matrix is reached.
    ordinal_type end = k * proc_grid_.cols();
    for (; k < k_; ++k) {
      // Search for non-zero tiles in row k of right
      ordinal_type i = end + proc_grid_.rank_col();
      end += proc_grid_.cols();
      for (; i < end; i += right_stride_local_)
        if (!right_.shape().is_zero(i)) return k;
    }

    return k;
  }

  /// Find the next column of left_ with non-zero local tiles

  /// \param k The first column to test
  /// \return The smallest k' >= k such that column k' of left_ has a
  /// non-zero local tile, or k_ if no such column remains
  ordinal_type iterate_col(ordinal_type k) const {
    // Iterate over k's until a non-zero tile is found or the end of the
    // matrix is reached.
    for (; k < k_; ++k)
      // Search column k for non-zero tiles
      for (ordinal_type i = left_start_local_ + k; i < left_end_;
           i += left_stride_local_)
        if (!left_.shape().is_zero(i)) return k;

    return k;
  }
806 
  /// Find the next SUMMA iteration with non-zero local contributions

  /// Columns of left_ and rows of right_ that are skipped in the process
  /// are broadcast (or discarded) by separately spawned range tasks.
  /// \param k The first iteration to test
  /// \return The smallest k' >= k for which both column k' of left_ and
  /// row k' of right_ contain non-zero local tiles, or k_ if none remains
  ordinal_type iterate_sparse(const ordinal_type k) const {
    // Initial step for k_col and k_row.
    ordinal_type k_col = iterate_col(k);
    ordinal_type k_row = iterate_row(k_col);

    // Search for a row and column that both have non-zero tiles
    while (k_col != k_row) {
      if (k_col < k_row) {
        k_col = iterate_col(k_row);
      } else {
        k_row = iterate_row(k_col);
      }
    }

    if (k < k_row) {
      // Spawn a task to broadcast any local columns of left that were skipped
      TensorImpl_::world().taskq.add(shared_from_this(),
                                     &Summa_::bcast_col_range_task, k, k_row,
                                     madness::TaskAttributes::hipri());

      // Spawn a task to broadcast any local rows of right that were skipped
      TensorImpl_::world().taskq.add(shared_from_this(),
                                     &Summa_::bcast_row_range_task, k, k_col,
                                     madness::TaskAttributes::hipri());
    }

    return k_col;
  }
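
  // For example (illustrative shapes): if this process has non-zero left
  // columns at k = {2, 5} and non-zero right rows at k = {3, 5}, then
  // starting from k == 0 the search runs iterate_col(0) -> 2,
  // iterate_row(2) -> 3, iterate_col(3) -> 5, iterate_row(5) -> 5 and
  // returns 5; the skipped range [0, 5) is handed to the broadcast range
  // tasks above.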
846 
  /// Find the next SUMMA iteration

  /// For dense arguments every iteration contributes; for sparse arguments
  /// skip to the next iteration with non-zero local contributions.
  ordinal_type iterate(const ordinal_type k) const {
    return (left_.shape().is_dense() && right_.shape().is_dense()
                ? k
                : iterate_sparse(k));
  }
863 
  // Initialization functions ----------------------------------------------

  /// Initialize the reduce tasks and construct the broadcast groups
  /// (dense-result case)
  ordinal_type initialize(const DenseShape&) {
    // Construct static broadcast groups for dense arguments
    const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
    col_group_ = proc_grid_.make_col_group(col_did);
    const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
    row_group_ = proc_grid_.make_row_group(row_did);

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    std::stringstream ss;
    ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=("
       << col_did.first << ", " << col_did.second << ") { ";
    for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
      ss << col_group_.world_rank(gproc) << " ";
    ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second
       << ") { ";
    for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
      ss << row_group_.world_rank(gproc) << " ";
    ss << "}\n";
    printf("%s", ss.str().c_str());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    // Allocate memory for the reduce pair tasks.
    std::allocator<ReducePairTask<op_type>> alloc;
    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());

    // Iterate over all local tiles
    const ordinal_type n = proc_grid_.local_size();
    for (ordinal_type t = 0ul; t < n; ++t) {
      // Initialize the reduction task
      ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
          reduce_tasks_ + t;
      new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_);
    }

    return proc_grid_.local_size();
  }
903 
  /// Initialize the reduce tasks for the non-zero result tiles
  /// (non-dense-result case)

  /// \return The number of non-zero result tiles on this process
  template <typename Shape>
  ordinal_type initialize(const Shape& shape) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    std::stringstream ss;
    ss << " initialize rank=" << TensorImpl_::world().rank() << " tiles={ ";
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    // Allocate memory for the reduce pair tasks.
    std::allocator<ReducePairTask<op_type>> alloc;
    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());

    // Initialize iteration variables
    ordinal_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
    ordinal_type row_end = row_start + proc_grid_.cols();
    row_start += proc_grid_.rank_col();
    const ordinal_type col_stride =  // The stride to iterate down a column
        proc_grid_.proc_rows() * proc_grid_.cols();
    const ordinal_type row_stride =  // The stride to iterate across a row
        proc_grid_.proc_cols();
    const ordinal_type end = TensorImpl_::size();

    // Iterate over all local tiles
    ordinal_type tile_count = 0ul;
    ReducePairTask<op_type>* MADNESS_RESTRICT reduce_task = reduce_tasks_;
    // this loops over result tiles arranged in block-cyclic order
    // index = tile index (row major)
    for (; row_start < end; row_start += col_stride, row_end += col_stride) {
      for (ordinal_type index = row_start; index < row_end;
           index += row_stride, ++reduce_task) {
        // Initialize the reduction task

        // Skip zero tiles
        if (!shape.is_zero(DistEvalImpl_::perm_index_to_target(index))) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
          ss << index << " ";
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

          new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_);
          ++tile_count;
        } else {
          // Construct an empty task to represent zero tiles.
          new (reduce_task) ReducePairTask<op_type>();
        }
      }
    }

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    ss << "}\n";
    printf("%s", ss.str().c_str());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    return tile_count;
  }
958 
  /// Initialize the reduce tasks, dispatching on the result shape type
  ordinal_type initialize() {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    printf("init: start rank=%i\n", TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    const ordinal_type result = initialize(TensorImpl_::shape());

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    printf("init: finish rank=%i\n", TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    return result;
  }
972 
  // Finalize functions ----------------------------------------------------

  /// Submit the reduce tasks and set the result tiles (dense-result case)
  void finalize(const DenseShape&) {
    // Initialize iteration variables
    ordinal_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
    ordinal_type row_end = row_start + proc_grid_.cols();
    row_start += proc_grid_.rank_col();
    const ordinal_type col_stride =  // The stride to iterate down a column
        proc_grid_.proc_rows() * proc_grid_.cols();
    const ordinal_type row_stride =  // The stride to iterate across a row
        proc_grid_.proc_cols();
    const ordinal_type end = TensorImpl_::size();

    // Iterate over all local tiles
    for (ReducePairTask<op_type>* reduce_task = reduce_tasks_; row_start < end;
         row_start += col_stride, row_end += col_stride) {
      for (ordinal_type index = row_start; index < row_end;
           index += row_stride, ++reduce_task) {
        // Set the result tile
        DistEvalImpl_::set_tile(DistEvalImpl_::perm_index_to_target(index),
                                reduce_task->submit());

        // Destroy the reduce task
        reduce_task->~ReducePairTask<op_type>();
      }
    }

    // Deallocate the memory for the reduce pair tasks.
    std::allocator<ReducePairTask<op_type>>().deallocate(
        reduce_tasks_, proc_grid_.local_size());
  }
1005 
  /// Submit the reduce tasks and set the non-zero result tiles
  /// (non-dense-result case)
  template <typename Shape>
  void finalize(const Shape& shape) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    std::stringstream ss;
    ss << " finalize rank=" << TensorImpl_::world().rank() << " tiles={ ";
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE

    // Initialize iteration variables
    ordinal_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
    ordinal_type row_end = row_start + proc_grid_.cols();
    row_start += proc_grid_.rank_col();
    const ordinal_type col_stride =  // The stride to iterate down a column
        proc_grid_.proc_rows() * proc_grid_.cols();
    const ordinal_type row_stride =  // The stride to iterate across a row
        proc_grid_.proc_cols();
    const ordinal_type end = TensorImpl_::size();

    // Iterate over all local tiles
    for (ReducePairTask<op_type>* reduce_task = reduce_tasks_; row_start < end;
         row_start += col_stride, row_end += col_stride) {
      for (ordinal_type index = row_start; index < row_end;
           index += row_stride, ++reduce_task) {
        // Compute the permuted index
        const ordinal_type perm_index =
            DistEvalImpl_::perm_index_to_target(index);

        // Skip zero tiles
        if (!shape.is_zero(perm_index)) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
          ss << index << " ";
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE

          // Set the result tile
          DistEvalImpl_::set_tile(perm_index, reduce_task->submit());
        }

        // Destroy the reduce task
        reduce_task->~ReducePairTask<op_type>();
      }
    }
    // Deallocate the memory for the reduce pair tasks.
    std::allocator<ReducePairTask<op_type>>().deallocate(
        reduce_tasks_, proc_grid_.local_size());

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    ss << "}\n";
    printf("%s", ss.str().c_str());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
  }
1056 
  /// Finalize the result, dispatching on the result shape type
  void finalize() {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    printf("finalize: start rank=%i\n", TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE

    finalize(TensorImpl_::shape());

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    printf("finalize: finish rank=%i\n", TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
  }
1068 
  /// The task that runs Summa_::finalize() once all contributions to the
  /// local result tiles have been scheduled
  class FinalizeTask : public madness::TaskInterface {
   private:
    std::shared_ptr<Summa_> owner_;  ///< The parent object of this task

   public:
    FinalizeTask(const std::shared_ptr<Summa_>& owner, const int ndep)
        : madness::TaskInterface(ndep, madness::TaskAttributes::hipri()),
          owner_(owner) {}

    virtual ~FinalizeTask() {}

    virtual void run(const madness::TaskThreadEnv&) { owner_->finalize(); }

  };  // class FinalizeTask
1086 
  // Contraction functions -------------------------------------------------

  /// Schedule the local contraction tasks for the \c col and \c row tile
  /// pairs (dense-result case)
  void contract(const DenseShape&, const ordinal_type,
                const std::vector<col_datum>& col,
                const std::vector<row_datum>& row,
                madness::TaskInterface* const task) {
    // Iterate over the row
    for (ordinal_type i = 0ul; i < col.size(); ++i) {
      // Compute the local, result-tile offset
      const ordinal_type reduce_task_offset =
          col[i].first * proc_grid_.local_cols();

      // Iterate over columns
      for (ordinal_type j = 0ul; j < row.size(); ++j) {
        const ordinal_type reduce_task_index =
            reduce_task_offset + row[j].first;

        // Schedule task for contraction pairs
        if (task) task->inc();
        const left_future left = col[i].second;
        const right_future right = row[j].second;
        reduce_tasks_[reduce_task_index].add(left, right, task);
      }
    }
  }
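
  // Note: reduce_tasks_ is laid out in row-major order over this process's
  // local grid of result tiles, so the task for local tile (i, j) lives at
  // i * proc_grid_.local_cols() + j; col[i].first and row[j].first are
  // exactly those local row and column offsets.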
1120 
  /// Schedule the local contraction tasks for the \c col and \c row tile
  /// pairs, skipping pairs whose result tile is zero (non-dense case)
  template <typename Shape>
  void contract(const Shape&, const ordinal_type,
                const std::vector<col_datum>& col,
                const std::vector<row_datum>& row,
                madness::TaskInterface* const task) {
    // Iterate over the row
    for (ordinal_type i = 0ul; i < col.size(); ++i) {
      // Compute the local, result-tile offset
      const ordinal_type reduce_task_offset =
          col[i].first * proc_grid_.local_cols();

      // Iterate over columns
      for (ordinal_type j = 0ul; j < row.size(); ++j) {
        const ordinal_type reduce_task_index =
            reduce_task_offset + row[j].first;

        // Skip zero tiles
        if (!reduce_tasks_[reduce_task_index]) continue;

        // Schedule task for contraction pairs
        if (task) {
          if (trace_tasks)
            task->inc_debug("destroy(*ReduceObject)");
          else
            task->inc();
        }
        const left_future left = col[i].second;
        const right_future right = row[j].second;
        reduce_tasks_[reduce_task_index].add(left, right, task);
      }
    }
  }
1161 
// Note: tile-norm filtering of contractions is currently disabled by this
// definition.
#define TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
#ifndef TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
  /// Schedule the local contraction tasks, filtering tile pairs whose
  /// product norm falls below the shape threshold (sparse-shape case)
  template <typename T>
  typename std::enable_if<std::is_floating_point<T>::value>::type contract(
      const SparseShape<T>&, const ordinal_type k,
      const std::vector<col_datum>& col, const std::vector<row_datum>& row,
      madness::TaskInterface* const task) {
    // Cache row shape data.
    std::vector<typename SparseShape<T>::value_type> row_shape_values;
    row_shape_values.reserve(row.size());
    const ordinal_type row_start =
        k * proc_grid_.cols() + proc_grid_.rank_col();
    for (ordinal_type j = 0ul; j < row.size(); ++j)
      row_shape_values.push_back(
          right_.shape()[row_start + (row[j].first * right_stride_local_)]);

    const ordinal_type col_start = left_start_local_ + k;
    const float threshold_k = TensorImpl_::shape().threshold() /
                              typename SparseShape<T>::value_type(k_);
    // Iterate over the row
    for (ordinal_type i = 0ul; i != col.size(); ++i) {
      // Compute the local, result-tile offset
      const ordinal_type offset = col[i].first * proc_grid_.local_cols();

      // Get the shape data for the column tile
      const typename SparseShape<T>::value_type col_shape_value =
          left_.shape()[col_start + (col[i].first * left_stride_local_)];

      // Iterate over columns
      for (ordinal_type j = 0ul; j < row.size(); ++j) {
        if ((col_shape_value * row_shape_values[j]) < threshold_k) continue;

        const ordinal_type reduce_task_index = offset + row[j].first;

        // Skip zero tiles
        if (!reduce_tasks_[reduce_task_index]) continue;

        if (task) task->inc();
        reduce_tasks_[reduce_task_index].add(col[i].second, row[j].second,
                                             task);
      }
    }
  }
#endif  // TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER

  /// Schedule the local contraction tasks, dispatching on the result shape
  /// type
  void contract(const ordinal_type k, const std::vector<col_datum>& col,
                const std::vector<row_datum>& row,
                madness::TaskInterface* const task) {
    contract(TensorImpl_::shape(), k, col, row, task);
  }
1224 
  // SUMMA step task -------------------------------------------------------

  /// One step (one k iteration) of the SUMMA algorithm; subclasses
  /// specialize the dense- and sparse-shape cases
  class StepTask : public madness::TaskInterface {
   protected:
    // Member variables
    std::shared_ptr<Summa_> owner_;
    World& world_;
    std::vector<col_datum> col_{};
    std::vector<row_datum> row_{};
    FinalizeTask* finalize_task_;
    StepTask* next_step_task_ = nullptr;
    StepTask* tail_step_task_ = nullptr;

    void get_col(const ordinal_type k) {
      owner_->get_col(k, col_);
      if (trace_tasks)
        this->notify_debug("StepTask::spawn_col");
      else
        this->notify();
    }

    void get_row(const ordinal_type k) {
      owner_->get_row(k, row_);
      if (trace_tasks)
        this->notify_debug("StepTask::spawn_row");
      else
        this->notify();
    }

   public:
    StepTask(const std::shared_ptr<Summa_>& owner, int finalize_ndep)
        :
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
          madness::TaskInterface(0ul, "StepTask 1st ctor",
                                 madness::TaskAttributes::hipri()),
#else
          madness::TaskInterface(0ul, madness::TaskAttributes::hipri()),
#endif
          owner_(owner),
          world_(owner->world()),
          finalize_task_(new FinalizeTask(owner, finalize_ndep)) {
      TA_ASSERT(owner_);
      owner_->world().taskq.add(finalize_task_);
    }

    /// Construct the task for the next step

    /// \param parent The previous step task
    /// \param ndep The number of dependencies of this task
    StepTask(StepTask* const parent, const int ndep)
        :
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
          madness::TaskInterface(ndep, "StepTask nth ctor",
                                 madness::TaskAttributes::hipri()),
#else
          madness::TaskInterface(ndep, madness::TaskAttributes::hipri()),
#endif
          owner_(parent->owner_),
          world_(parent->world_),
          finalize_task_(parent->finalize_task_) {
      TA_ASSERT(parent);
      parent->next_step_task_ = this;
    }

    virtual ~StepTask() {}

    void spawn_get_row_col_tasks(const ordinal_type k) {
      // Submit the task to collect column tiles of left for iteration k
      if (trace_tasks)
        madness::DependencyInterface::inc_debug("StepTask::spawn_col");
      else
        madness::DependencyInterface::inc();
      world_.taskq.add(this, &StepTask::get_col, k,
                       madness::TaskAttributes::hipri());

      // Submit the task to collect row tiles of right for iteration k
      if (trace_tasks)
        madness::DependencyInterface::inc_debug("StepTask::spawn_row");
      else
        madness::DependencyInterface::inc();
      world_.taskq.add(this, &StepTask::get_row, k,
                       madness::TaskAttributes::hipri());
    }

    template <typename Derived>
    void make_next_step_tasks(Derived* task, ordinal_type depth) {
      TA_ASSERT(depth > 0);
      // Set the depth to be no greater than the maximum number of steps
      if (depth > owner_->k_) depth = owner_->k_;

      // Spawn n=depth step tasks
      for (; depth > 0ul; --depth) {
        // Set the dep count of the tail task to 1; it will not start until
        // this task commands
        Derived* const next = new Derived(task, depth == 1 ? 1 : 0);
        task = next;
      }

      // Keep track of the tail ptr
      tail_step_task_ = task;
    }
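
    // Note: the chain built by make_next_step_tasks() acts as a sliding
    // window over the SUMMA iterations: broadcasts and contractions for up
    // to `depth` values of k may be in flight at once, while the nonzero
    // dependency count of the window's tail task (released in run() and by
    // the contraction callbacks) keeps further iterations from starting
    // early, bounding the memory held by argument tiles.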
1331 
    template <typename Derived, typename GroupType>
    void run(const ordinal_type k, const GroupType& row_group,
             const GroupType& col_group) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
      printf("step: start rank=%i k=%lu\n", owner_->world().rank(), k);
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP

      if (k < owner_->k_) {
        // Initialize next tail task and submit next task
        TA_ASSERT(next_step_task_);
        next_step_task_->tail_step_task_ = new Derived(
            static_cast<Derived*>(tail_step_task_),
            1);  // <- ndep=1; its scheduling is controlled by this task
        // submit the next step task ... even if it's the same as
        // tail_step_task_ it is safe to submit because its ndep > 0 (see
        // StepTask::make_next_step_tasks)
        TA_ASSERT(tail_step_task_->ndep() > 0);
        world_.taskq.add(next_step_task_);
        next_step_task_ = nullptr;

        // Start broadcast of column and row tiles for this step
        world_.taskq.add(owner_, &Summa_::bcast_col, k, col_, row_group,
                         madness::TaskAttributes::hipri());
        world_.taskq.add(owner_, &Summa_::bcast_row, k, row_, col_group,
                         madness::TaskAttributes::hipri());

        // Submit tasks for the contraction of col and row tiles.
        owner_->contract(k, col_, row_, tail_step_task_);

        // Notify task dependencies
        TA_ASSERT(tail_step_task_);
        if (trace_tasks)
          tail_step_task_->notify_debug("StepTask nth ctor");
        else
          tail_step_task_->notify();
        finalize_task_->notify();

      } else if (finalize_task_) {
        // Signal the finalize task so it can run after all non-zero step
        // tasks have completed.
        finalize_task_->notify();

        // Cleanup any remaining step tasks
        StepTask* step_task = next_step_task_;
        while (step_task) {
          StepTask* const next_step_task = step_task->next_step_task_;
          step_task->next_step_task_ = nullptr;
          step_task->finalize_task_ = nullptr;
          world_.taskq.add(step_task);
          step_task = next_step_task;
        }

        if (trace_tasks)
          tail_step_task_->notify_debug("StepTask nth ctor");
        else
          tail_step_task_->notify();
      }

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
      printf("step: finish rank=%i k=%lu\n", owner_->world().rank(), k);
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
    }

  };  // class StepTask
1396 
  /// The dense-shape SUMMA step task
  class DenseStepTask : public StepTask {
   protected:
    const ordinal_type k_;  ///< The SUMMA iteration of this step
    using StepTask::owner_;

   public:
    DenseStepTask(const std::shared_ptr<Summa_>& owner,
                  const ordinal_type depth)
        : StepTask(owner, owner->k_ + 1ul), k_(0) {
      StepTask::make_next_step_tasks(this, depth);
      StepTask::spawn_get_row_col_tasks(k_);
    }

    DenseStepTask(DenseStepTask* const parent, const int ndep)
        : StepTask(parent, ndep), k_(parent->k_ + 1ul) {
      // Spawn tasks to get k-th row and column tiles
      if (k_ < owner_->k_) StepTask::spawn_get_row_col_tasks(k_);
    }

    virtual ~DenseStepTask() {}

    virtual void run(const madness::TaskThreadEnv&) {
      StepTask::template run<DenseStepTask>(k_, owner_->row_group_,
                                            owner_->col_group_);
    }
  };  // class DenseStepTask
1423 
  /// The sparse-shape SUMMA step task
  class SparseStepTask : public StepTask {
   protected:
    Future<ordinal_type> k_{};
    Future<madness::Group> row_group_{};
    Future<madness::Group> col_group_{};
    using StepTask::finalize_task_;
    using StepTask::next_step_task_;
    using StepTask::owner_;
    using StepTask::world_;

   private:
    /// Find the next non-zero iteration, then spawn the row/column fetch
    /// and group-construction tasks for it
    void iterate_task(ordinal_type k, const ordinal_type offset) {
      // Search for the next non-zero row and column
      k = owner_->iterate_sparse(k + offset);
      k_.set(k);

      if (k < owner_->k_) {
        // NOTE: The order of task submissions is dependent on the order in
        // which we want the tasks to complete.

        // Spawn tasks to get k-th row and column tiles
        StepTask::spawn_get_row_col_tasks(k);

        // Spawn tasks to construct the row and column broadcast groups
        row_group_ = world_.taskq.add(owner_, &Summa_::make_row_group, k,
                                      madness::TaskAttributes::hipri());
        col_group_ = world_.taskq.add(owner_, &Summa_::make_col_group, k,
                                      madness::TaskAttributes::hipri());

        // Increment the finalize task dependency counter, which indicates
        // that this task is not the terminating step task.
        TA_ASSERT(finalize_task_);
        finalize_task_->inc();
      }

      if (trace_tasks)
        madness::DependencyInterface::notify_debug("SparseStepTask ctor");
      else
        madness::DependencyInterface::notify();
    }

   public:
    SparseStepTask(const std::shared_ptr<Summa_>& owner, ordinal_type depth)
        : StepTask(owner, 1ul) {
      StepTask::make_next_step_tasks(this, depth);

      // Spawn a task to find the next non-zero iteration
      if (trace_tasks)
        madness::DependencyInterface::inc_debug("SparseStepTask ctor");
      else
        madness::DependencyInterface::inc();
      world_.taskq.add(this, &SparseStepTask::iterate_task, 0ul, 0ul,
                       madness::TaskAttributes::hipri());
    }

    SparseStepTask(SparseStepTask* const parent, const int ndep)
        : StepTask(parent, ndep) {
      if (parent->k_.probe() && (parent->k_.get() >= owner_->k_)) {
        // Avoid running extra tasks if not needed.
        k_.set(parent->k_.get());
        TA_ASSERT(ndep ==
                  1);  // ensure that this does not get executed immediately
      } else {
        // Spawn a task to find the next non-zero iteration
        if (trace_tasks)
          madness::DependencyInterface::inc_debug("SparseStepTask ctor");
        else
          madness::DependencyInterface::inc();
        world_.taskq.add(this, &SparseStepTask::iterate_task, parent->k_, 1ul,
                         madness::TaskAttributes::hipri());
      }
    }

    virtual ~SparseStepTask() {}

    virtual void run(const madness::TaskThreadEnv&) {
      StepTask::template run<SparseStepTask>(k_, row_group_, col_group_);
    }
  };  // class SparseStepTask
1504 
 public:
  /// Constructor

  /// \param left The left-hand argument evaluator
  /// \param right The right-hand argument evaluator
  /// \param world The world where this evaluator will live
  /// \param trange The tiled range of the result tensor
  /// \param shape The shape of the result tensor
  /// \param pmap The process map for the result tensor tiles
  /// \param perm The permutation applied to the result
  /// \param op The tile contraction/reduction operation
  /// \param k The number of tiles in the contraction dimension
  /// \param proc_grid The 2-D process grid for this contraction
  template <typename Perm, typename = std::enable_if_t<
                               TiledArray::detail::is_permutation_v<Perm>>>
  Summa(const left_type& left, const right_type& right, World& world,
        const trange_type trange, const shape_type& shape,
        const std::shared_ptr<pmap_interface>& pmap, const Perm& perm,
        const op_type& op, const ordinal_type k, const ProcGrid& proc_grid)
      : DistEvalImpl_(world, trange, shape, pmap, outer(perm)),
        left_(left),
        right_(right),
        op_(op),
        row_group_(),
        col_group_(),
        k_(k),
        proc_grid_(proc_grid),
        reduce_tasks_(nullptr),
        left_start_local_(proc_grid_.rank_row() * k),
        left_end_(left.size()),
        left_stride_(k),
        left_stride_local_(proc_grid.proc_rows() * k),
        right_stride_(1ul),
        right_stride_local_(proc_grid.proc_cols()) {}
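
  // Note on the iteration constants initialized above: argument tiles are
  // indexed in row-major order, so tile (i, k') of the m-by-k left
  // argument has ordinal i * k + k'. Column k' of left_ is therefore
  // traversed with stride k (left_stride_), and this process owns rows
  // rank_row(), rank_row() + proc_rows(), ..., giving the local start
  // rank_row() * k and local stride proc_rows() * k. Row k' of the
  // row-major right argument is contiguous (right_stride_ == 1ul), and
  // this process visits every proc_cols()-th tile of it.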
1543 
  virtual ~Summa() {}

  /// Get tile at index \c i

  /// \param i The ordinal index of the tile
  /// \return A \c Future to the tile at index \c i
  virtual Future<value_type> get_tile(ordinal_type i) const {
    const ordinal_type source_index = DistEvalImpl_::perm_index_to_source(i);

    // Compute tile coordinate in tile grid
    const ordinal_type tile_row = source_index / proc_grid_.cols();
    const ordinal_type tile_col = source_index % proc_grid_.cols();
    // Compute process coordinate of tile in the process grid
    const ordinal_type proc_row = tile_row % proc_grid_.proc_rows();
    const ordinal_type proc_col = tile_col % proc_grid_.proc_cols();
    // Compute the process that owns tile
    const ProcessID source = proc_row * proc_grid_.proc_cols() + proc_col;

    const madness::DistributedID key(DistEvalImpl_::id(), i);
    return TensorImpl_::world().gop.template recv<value_type>(source, key);
  }

  /// Discard a tile that is not needed

  /// This function handles the cleanup for tiles that are not needed in
  /// subsequent computation.
  virtual void discard_tile(ordinal_type i) const { get_tile(i); }
1577 
 private:
  /// Compute the memory-bounded SUMMA depth

  /// \param depth The optimal depth
  /// \param left_sparsity The fraction of zero tiles in left_
  /// \param right_sparsity The fraction of zero tiles in right_
  /// \return The depth, reduced if necessary to fit within max_memory_
  ordinal_type mem_bound_depth(ordinal_type depth, const float left_sparsity,
                               const float right_sparsity) {
    // Check if a memory bound has been set
    const ordinal_type available_memory = max_memory_;
    if (available_memory) {
      // Compute the average memory requirement per iteration of this process
      const std::size_t local_memory_per_iter_left =
          (left_.trange().elements_range().volume() /
           left_.trange().tiles_range().volume()) *
          sizeof(typename numeric_type<left_type>::type) *
          proc_grid_.local_rows() * (1.0f - left_sparsity);
      const std::size_t local_memory_per_iter_right =
          (right_.trange().elements_range().volume() /
           right_.trange().tiles_range().volume()) *
          sizeof(typename numeric_type<right_type>::type) *
          proc_grid_.local_cols() * (1.0f - right_sparsity);

      // Compute the maximum number of iterations based on available memory
      const ordinal_type mem_bound_depth =
          available_memory /
          (local_memory_per_iter_left + local_memory_per_iter_right);

      // Check if the memory bounded depth is less than the optimal depth
      if (depth > mem_bound_depth) {
        // Adjust the depth based on the available memory
        switch (mem_bound_depth) {
          case 0:
            // There is not enough memory for even a single iteration
            TA_EXCEPTION("Insufficient memory available for SUMMA");
            break;
          case 1:
            if (TensorImpl_::world().rank() == 0)
              printf(
                  "!! WARNING TiledArray: Memory constraints limit the SUMMA "
                  "depth to 1.\n"
                  "!! WARNING TiledArray: Performance may be slow.\n");
            [[fallthrough]];
          default:
            depth = mem_bound_depth;
        }
      }
    }

    return depth;
  }
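
  // For example (illustrative numbers, assuming the element-size factor
  // above): with an average tile of 10^6 double-precision elements (8 MB),
  // 4 local rows and 4 local columns of dense arguments, one iteration
  // holds about 32 MB of left tiles plus 32 MB of right tiles, so
  // TA_SUMMA_MAX_MEMORY="256 MB" caps the depth at 256/64 = 4 concurrent
  // iterations.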
1631 
  /// Evaluate the tiles of this tensor

  /// This function will evaluate the child tensors of this object, then
  /// evaluate the tiles of this tensor by submitting the SUMMA step tasks.
  /// \return The number of tiles that will be set by this process
  virtual int internal_eval() {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    printf("eval: start eval children rank=%i\n", TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL

    // Start evaluating child tensors
    left_.eval();
    right_.eval();

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    printf("eval: finished eval children rank=%i\n",
           TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL

    ordinal_type tile_count = 0ul;
    if (proc_grid_.local_size() > 0ul) {
      tile_count = initialize();

      // depth controls the number of simultaneous SUMMA iterations
      // that are scheduled.

      // The optimal depth is equal to the smallest dimension of the process
      // grid, but no less than 2
      ordinal_type depth = std::max<ordinal_type>(
          2ul, std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));

      // Construct the first SUMMA iteration task
      if (TensorImpl_::shape().is_dense()) {
        // We cannot have more iterations than there are blocks in the k
        // dimension
        if (depth > k_) depth = k_;

        // Modify the number of concurrent iterations based on the available
        // memory.
        depth = mem_bound_depth(depth, 0.0f, 0.0f);

        // Enforce the user-defined depth bound
        if (max_depth_) depth = std::min(depth, max_depth_);

        TensorImpl_::world().taskq.add(
            new DenseStepTask(shared_from_this(), depth));
      } else {
        // Increase the depth based on the amount of sparsity in an iteration.

        // Get the sparsity fractions for the left- and right-hand arguments.
        const float left_sparsity = left_.shape().sparsity();
        const float right_sparsity = right_.shape().sparsity();

        // Compute the fraction of non-zero result tiles in a single SUMMA
        // iteration.
        const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
                                    (1.0f - std::min(right_sparsity, 0.9f));

        // Compute the new depth based on the sparsity of the arguments
        depth =
            float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
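
        // For example: frac_non_zero == 0.25 scales the depth by
        // 1 - 1.35638 * log2(0.25) = 1 + 2.71 ~= 3.7, so sparser arguments
        // get proportionally more concurrent iterations before the memory
        // and user-defined bounds below are applied.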

        // We cannot have more iterations than there are blocks in the k
        // dimension
        if (depth > k_) depth = k_;

        // Modify the number of concurrent iterations based on the available
        // memory and sparsity of the argument tensors.
        depth = mem_bound_depth(depth, left_sparsity, right_sparsity);

        // Enforce the user-defined depth bound
        if (max_depth_) depth = std::min(depth, max_depth_);

        TensorImpl_::world().taskq.add(
            new SparseStepTask(shared_from_this(), depth));
      }
    }

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL

    // Wait for child tensors to be evaluated, and process tasks while waiting.
    left_.wait();
    right_.wait();

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    printf("eval: finished wait children rank=%i\n",
           TensorImpl_::world().rank());
#endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL

    return tile_count;
  }
};  // class Summa

// Initialize static member variables for Summa

template <typename Left, typename Right, typename Op, typename Policy>
typename Summa<Left, Right, Op, Policy>::ordinal_type
    Summa<Left, Right, Op, Policy>::max_depth_ =
        Summa<Left, Right, Op, Policy>::init_max_depth();

template <typename Left, typename Right, typename Op, typename Policy>
typename Summa<Left, Right, Op, Policy>::ordinal_type
    Summa<Left, Right, Op, Policy>::max_memory_ =
        Summa<Left, Right, Op, Policy>::init_max_memory();

}  // namespace detail
}  // namespace TiledArray

#endif  // TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED