TiledArray  0.7.0
contraction_eval.h
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2013 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #ifndef TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED
21 #define TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED
22 
23 #include <vector>
24 
25 #include <TiledArray/config.h>
26 #include <TiledArray/dist_eval/dist_eval.h>
27 #include <TiledArray/proc_grid.h>
28 #include <TiledArray/reduce_task.h>
29 #include <TiledArray/type_traits.h>
30 #include <TiledArray/shape.h>
31 
32 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1
33 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1
34 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1
35 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1
36 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1
37 
38 namespace TiledArray {
39  namespace detail {
40 
42 
52  template <typename Left, typename Right, typename Op, typename Policy>
53  class Summa :
54  public DistEvalImpl<typename Op::result_type, Policy>,
55  public std::enable_shared_from_this<Summa<Left, Right, Op, Policy> >
56  {
57  public:
 58  typedef Summa<Left, Right, Op, Policy> Summa_;
 59  typedef DistEvalImpl<typename Op::result_type, Policy> DistEvalImpl_;
 60  typedef typename DistEvalImpl_::TensorImpl_ TensorImpl_;
 61  typedef Left left_type;
 62  typedef Right right_type;
 63  typedef typename DistEvalImpl_::size_type size_type;
 64  typedef typename DistEvalImpl_::range_type range_type;
 65  typedef typename DistEvalImpl_::shape_type shape_type;
 66  typedef typename DistEvalImpl_::pmap_interface pmap_interface;
 67  typedef typename DistEvalImpl_::trange_type trange_type;
 68  typedef typename DistEvalImpl_::value_type value_type;
 69  typedef typename DistEvalImpl_::eval_type eval_type;
 70  typedef Op op_type;
71 
72  private:
73  static size_type max_memory_;
74  static size_type max_depth_;
75 
76  // Arguments and operation
77  left_type left_;
78  right_type right_;
79  op_type op_;
80 
81  // Broadcast groups for dense arguments (empty for non-dense arguments)
82  madness::Group row_group_;
83  madness::Group col_group_;
84 
85  // Dimension information
86  const size_type k_;
87  const ProcGrid proc_grid_;
88 
89  // Contraction results
90  ReducePairTask<op_type>* reduce_tasks_;
91 
92  // Constants used to iterate over columns and rows of left_ and right_, respectively.
93  const size_type left_start_local_;
94  const size_type left_end_;
95  const size_type left_stride_;
96  const size_type left_stride_local_;
97  const size_type right_stride_;
98  const size_type right_stride_local_;
99 
100 
101  typedef Future<typename right_type::eval_type> right_future;
102  typedef Future<typename left_type::eval_type> left_future;
103  typedef std::pair<size_type, right_future> row_datum;
104  typedef std::pair<size_type, left_future> col_datum;
105 
106  static constexpr const bool trace_tasks =
107 #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
108  true
109 #else
110  false
111 #endif
112  ;
113 
114  protected:
115 
116  // Import base class functions
117  using std::enable_shared_from_this<Summa_>::shared_from_this;
118 
119  private:
120 
121  // Static variable initialization ----------------------------------------
122 
123 
125  static size_type init_max_memory() {
126  const char* max_memory = getenv("TA_SUMMA_MAX_MEMORY");
127  if(max_memory) {
128  // Convert the string into bytes
129  std::stringstream ss(max_memory);
130  double memory = 0.0;
131  if(ss >> memory) {
132  if(memory > 0.0) {
133  std::string unit;
134  if(ss >> unit) { // Failure == assume bytes
135  if(unit == "KB" || unit == "kB") {
136  memory *= 1000.0;
137  } else if(unit == "KiB" || unit == "kiB") {
138  memory *= 1024.0;
139  } else if(unit == "MB") {
140  memory *= 1000000.0;
141  } else if(unit == "MiB") {
142  memory *= 1048576.0;
143  } else if(unit == "GB") {
144  memory *= 1000000000.0;
145  } else if(unit == "GiB") {
146  memory *= 1073741824.0;
147  }
148  }
149  }
150  }
151 
152  memory = std::max(memory, 104857600.0); // Minimum 100 MiB
153  return memory;
154  }
155 
156  return 0ul;
157  }
158 
159 
160  static size_type init_max_depth() {
161  const char* max_depth = getenv("TA_SUMMA_MAX_DEPTH");
162  if(max_depth)
163  return std::stoul(max_depth);
164  return 0ul;
165  }
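  // --- Illustrative usage note (not part of the original file) ----------
  // The two environment variables read above are typically set before the
  // application is launched, e.g.
  //
  //   export TA_SUMMA_MAX_MEMORY="2 GiB"   # parsed as 2 * 1073741824 bytes
  //   export TA_SUMMA_MAX_DEPTH=4          # at most 4 concurrent SUMMA steps
  //
  // Any positive memory value is clamped to a minimum of 100 MiB; an unset
  // or unparsable value yields 0, which the code below treats as "no limit".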
166 
167 
168  // Process groups --------------------------------------------------------
169 
171 
193  template <typename Shape, typename ProcMap>
194  madness::Group make_group(const Shape& shape, const std::vector<bool>& process_mask, size_type index,
195  const size_type end, const size_type stride, const size_type max_group_size,
196  const size_type k, const size_type key_offset, const ProcMap& proc_map) const
197  {
198  // Generate the list of processes in rank_row
199  std::vector<ProcessID> proc_list(max_group_size, -1);
200 
201  // Flag the root processes of the broadcast, which may not be included
202  // by shape.
203  size_type p = k % max_group_size;
204  proc_list[p] = proc_map(p);
205  size_type count = 1ul;
206 
207  // Flag all processes that have non-zero tiles
208  for(p = 0ul; (index < end) && (count < max_group_size); index += stride,
209  p = (p + 1u) % max_group_size)
210  {
211  if((proc_list[p] != -1) || (shape.is_zero(index)) || !process_mask.at(p)) continue;
212 
213  proc_list[p] = proc_map(p);
214  ++count;
215  }
216 
217  // Remove processes from the list that will not be in the group
218  for(size_type x = 0ul, p = 0ul; x < count; ++p) {
219  if(proc_list[p] == -1) continue;
220  proc_list[x++] = proc_list[p];
221  }
222 
223  // Truncate invalid process id's
224  proc_list.resize(count);
225 
226  return madness::Group(TensorImpl_::world(), proc_list,
227  madness::DistributedID(DistEvalImpl_::id(), k + key_offset));
228  }
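  // --- Worked example (illustrative, not part of the original file) -----
  // Suppose max_group_size = 4, k = 6, process_mask is all true, and the
  // shape marks only tile slots {0, 2} of this row/column as non-zero.
  // The root p = 6 % 4 = 2 is flagged first (it is always included), the
  // scan then flags p = 0 for the non-zero tile in slot 0, and compaction
  // yields proc_list = { proc_map(0), proc_map(2) }, so the group is built
  // from exactly those two world ranks.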
229 
231 
234  madness::Group make_row_group(const size_type k) const {
235  // Construct the sparse broadcast group
236  const size_type right_begin_k = k * proc_grid_.cols();
237  const size_type right_end_k = right_begin_k + proc_grid_.cols();
238  // make the row mask; using the same mask for all tiles avoids having to compute mask
239  // for every tile and use of masked broadcasts
240  auto result_row_mask_k = make_row_mask(k);
241 
242  // return empty group if I am not in this group, otherwise make a group
243  if (result_row_mask_k[proc_grid_.rank_col()])
244  return make_group(right_.shape(), result_row_mask_k, right_begin_k, right_end_k,
245  right_stride_, proc_grid_.proc_cols(), k, k_,
246  [&](const ProcGrid::size_type col) { return proc_grid_.map_col(col); });
247  else
248  return madness::Group();
249  }
250 
251 
253 
256  madness::Group make_col_group(const size_type k) const {
257 
258  // make the column mask; using the same mask for all tiles avoids having to compute mask
259  // for every tile and use of masked broadcasts
260  auto result_col_mask_k = make_col_mask(k);
261 
262  // return empty group if I am not in this group, otherwise make a group
263  if (result_col_mask_k[proc_grid_.rank_row()])
264  return make_group(left_.shape(), result_col_mask_k, k, left_end_, left_stride_,
265  proc_grid_.proc_rows(), k, 0ul,
266  [&](const ProcGrid::size_type row) { return proc_grid_.map_row(row); });
267  else
268  return madness::Group();
269  }
270 
272 
276  std::vector<bool> make_row_mask(const size_type k) const {
277 
278  // "local" A[i][k] (i.e. for all i assigned to my row of processes) will produce C[i][*]
279  // for each process in my row of the process grid determine whether there are any
280  // nonzero C[i][*] located on that node
281 
282  const auto nproc_cols = proc_grid_.proc_cols();
283  const auto my_proc_row = proc_grid_.rank_row();
284 
285  // result shape
286  const auto& result_shape = TensorImpl_::shape();
287 
288  // if result is dense, include all processors
289  if (result_shape.is_dense())
290  return std::vector<bool>(nproc_cols, true);
291 
292  // initialize the mask
293  std::vector<bool> mask(nproc_cols, false);
294 
295  // number of tiles in the col dimension of the result
296  const auto nj = proc_grid_.cols();
297  // number of tiles in contraction dim
298  const auto nk = k_;
299 
300  // for each i assigned to my row of processes ...
301  size_type i_start, i_fence, i_stride;
302  std::tie(i_start, i_fence, i_stride) =
303  result_row_range(my_proc_row);
304  const auto ik_stride = i_stride * nk;
305  for (size_type i = i_start, ik = i_start * nk + k; i < i_fence;
306  i += i_stride, ik += ik_stride) {
307  // ... such that A[i][k] exists ...
308  if (!left_.shape().is_zero(ik)) {
309  // ... the owner of А[i][k] is always in the group ...
310  const auto k_proc_col = k % nproc_cols;
311  mask[k_proc_col] = true;
312  // ... loop over processes in my row ...
313  for (size_type proc_col = 0; proc_col != nproc_cols; ++proc_col) {
314  // ... that are not the owner of A[i][k] ...
315  if (proc_col != k_proc_col) {
316  // ... loop over all C[i][j] tiles that belong to this process ...
317  size_type j_start, j_fence, j_stride;
318  std::tie(j_start, j_fence, j_stride) =
319  result_col_range(proc_col);
320  const auto ij_stride = j_stride;
321  for (size_type j = j_start, ij = i * nj + j_start; j < j_fence;
322  j += j_stride, ij += ij_stride) {
323  // ... if any such C[i][j] exists, update the mask, and move
324  // on to next process
325  if (!result_shape.is_zero(DistEvalImpl_::perm_index_to_target(ij))) {
326  mask[proc_col] = true;
327  break;
328  }
329  }
330  }
331  }
332  }
333  }
334 
335  return mask;
336  }
337 
339 
344  std::vector<bool> make_col_mask(const size_type k) const {
345  // "local" B[k][j] (i.e. for all j assigned to my column of processes)
346  // will produce C[*][j]
347  // for each process in my column of the process grid determine whether
348  // there are any
349  // nonzero C[*][j] located on that node
350 
351  const auto nproc_rows = proc_grid_.proc_rows();
352  const auto my_proc_col = proc_grid_.rank_col();
353 
354  // result shape
355  const auto& result_shape = TensorImpl_::shape();
356 
357  // if result is dense, include all processors
358  if (result_shape.is_dense())
359  return std::vector<bool>(nproc_rows, true);
360 
361  // initialize the mask
362  std::vector<bool> mask(nproc_rows, false);
363 
364  // number of tiles in col dim of the result
365  const auto nj = proc_grid_.cols();
366 
367  // for each j assigned to my column of processes ...
368  size_type j_start, j_fence, j_stride;
369  std::tie(j_start, j_fence, j_stride) = result_col_range(my_proc_col);
370  const auto kj_stride = j_stride;
371  for (size_type j = j_start, kj = k * nj + j_start; j < j_fence;
372  j += j_stride, kj += kj_stride) {
373  // ... such that B[k][j] exists ...
374  if (!right_.shape().is_zero(kj)) {
375  // ... the owner of B[k][j] is always in the group ...
376  auto k_proc_row = k % nproc_rows;
377  mask[k_proc_row] = true;
378  // ... loop over processes in my col ...
379  for (size_type proc_row = 0; proc_row != nproc_rows; ++proc_row) {
380  // ... that are not the owner of B[k][j] ...
381  if (proc_row != k_proc_row) {
382  // ... loop over all C[i][j] tiles that belong to this process
383  size_type i_start, i_fence, i_stride;
384  std::tie(i_start, i_fence, i_stride) =
385  result_row_range(proc_row);
386  const auto ij_stride = i_stride * nj;
387  for (size_type i = i_start, ij = i_start * nj + j; i < i_fence;
388  i += i_stride, ij += ij_stride) {
389  // ... if any such C[i][j] exists, update the mask, and move
390  // on to next process
 391  if (!result_shape.is_zero(
 392  DistEvalImpl_::perm_index_to_target(ij))) {
 393  mask[proc_row] = true;
394  break;
395  }
396  }
397  }
398  }
399  }
400  }
401 
402  return mask;
403  }
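  // --- Illustrative note (not part of the original file) ----------------
  // The two masks above restrict the step-k broadcasts to processes that can
  // actually use the data.  For example, if my process row owns tile-rows
  // i = {1, 4} and only A[4][k] is non-zero, a process column enters the row
  // mask only if it owns at least one non-zero C[4][j]; the column holding
  // the broadcast root (k % proc_cols) is always included, and a dense
  // result masks in every process.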
404 
406 
411  inline std::tuple<size_type, size_type, size_type> result_row_range(
412  size_type proc_row) const {
413  const size_type start = proc_row;
414  const size_type fence = proc_grid_.rows();
415  const size_type stride = proc_grid_.proc_rows();
416  return std::make_tuple(start, fence, stride);
417  }
418 
420 
425  std::tuple<size_type, size_type, size_type> result_col_range(
426  size_type proc_col) const {
427  const size_type start = proc_col;
428  const size_type fence = proc_grid_.cols();
429  const size_type stride = proc_grid_.proc_cols();
430  return std::make_tuple(start, fence, stride);
431  }
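  // --- Example (illustrative, not part of the original file) ------------
  // With proc_grid_.rows() == 10 result tile-rows distributed over
  // proc_grid_.proc_rows() == 3 process rows, result_row_range(1) returns
  // (start = 1, fence = 10, stride = 3), i.e. process row 1 owns tile-rows
  // {1, 4, 7}.  result_col_range() works the same way along the columns.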
432 
433  // Broadcast kernels -----------------------------------------------------
434 
436 
440  template <typename Tile>
 441  static auto convert_tile(const Tile& tile) {
 442  Cast<typename eval_trait<Tile>::type, Tile> cast;
 443  return cast(tile);
444  }
445 
447 
453  template <typename Arg>
454  static typename std::enable_if<
455  ! is_lazy_tile<typename Arg::value_type>::value,
456  Future<typename Arg::eval_type> >::type
457  get_tile(Arg& arg, const typename Arg::size_type index) { return arg.get(index); }
458 
459 
461 
468  template <typename Arg>
469  static typename std::enable_if<
470  is_lazy_tile<typename Arg::value_type>::value,
471  Future<typename Arg::eval_type> >::type
472  get_tile(Arg& arg, const typename Arg::size_type index) {
473  auto convert_tile_fn =
474  &Summa_::template convert_tile<typename Arg::value_type>;
475  return arg.world().taskq.add(convert_tile_fn, arg.get(index),
476  madness::TaskAttributes::hipri());
477  }
478 
479 
481 
489  template <typename Arg, typename Datum>
490  void get_vector(Arg& arg, size_type index, const size_type end,
491  const size_type stride, std::vector<Datum>& vec) const
492  {
493  TA_ASSERT(vec.size() == 0ul);
494 
495  // Iterate over vector of tiles
496  if(arg.is_local(index)) {
497  for(size_type i = 0ul; index < end; ++i, index += stride) {
498  if(arg.shape().is_zero(index)) continue;
499  vec.emplace_back(i, get_tile(arg, index));
500  }
501  } else {
502  for(size_type i = 0ul; index < end; ++i, index += stride) {
503  if(arg.shape().is_zero(index)) continue;
504  vec.emplace_back(i, Future<typename Arg::eval_type>());
505  }
506  }
507 
508  TA_ASSERT(vec.size() > 0ul);
509  }
510 
512 
515  void get_col(const size_type k, std::vector<col_datum>& col) const {
516  col.reserve(proc_grid_.local_rows());
517  get_vector(left_, left_start_local_ + k, left_end_, left_stride_local_, col);
518  }
519 
521 
524  void get_row(const size_type k, std::vector<row_datum>& row) const {
525  row.reserve(proc_grid_.local_cols());
526 
527  // Compute local iteration limits for row k of right_.
528  size_type begin = k * proc_grid_.cols();
529  const size_type end = begin + proc_grid_.cols();
530  begin += proc_grid_.rank_col();
531 
532  get_vector(right_, begin, end, right_stride_local_, row);
533  }
534 
536 
543  template <typename Datum>
544  void bcast(const size_type start, const size_type stride,
545  const madness::Group& group, const ProcessID group_root,
546  const size_type key_offset, std::vector<Datum>& vec) const
547  {
548  TA_ASSERT(vec.size() != 0ul);
549  TA_ASSERT(group.size() > 0);
550  TA_ASSERT(group_root < group.size());
551 
552 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
553  std::stringstream ss;
554  ss << "bcast: rank=" << TensorImpl_::world().rank()
555  << " root=" << group.world_rank(group_root)
556  << " groupid=(" << group.id().first << "," << group.id().second
557  << ") keyoffset=" << key_offset << " group={ ";
558  for(ProcessID group_proc = 0; group_proc < group.size(); ++group_proc)
559  ss << group.world_rank(group_proc) << " ";
560  ss << "} tiles={ ";
561 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
562 
563  // Iterate over tiles to be broadcast
564  for(typename std::vector<Datum>::iterator it = vec.begin(); it != vec.end(); ++it) {
565  const size_type index = it->first * stride + start;
566 
567  // Broadcast the tile
568  const madness::DistributedID key(DistEvalImpl_::id(), index + key_offset);
569  TensorImpl_::world().gop.bcast(key, it->second, group_root, group);
570 
571 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
572  ss << index << " ";
573 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
574  }
575 
576  TA_ASSERT(vec.size() > 0ul);
577 
578 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
579  ss << "}\n";
 580  printf("%s", ss.str().c_str());
581 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
582  }
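  // --- Keying note (illustrative, not part of the original file) --------
  // Every broadcast above is tagged with DistributedID(id, index + key_offset).
  // Column (left) tiles are sent with key_offset = 0 and row (right) tiles
  // with key_offset = left_.size(), so the two tile streams of a SUMMA step
  // never collide in the key space.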
583 
584  // Broadcast specialization for left and right arguments -----------------
585 
586 
587  ProcessID get_row_group_root(const size_type k, const madness::Group& row_group) const {
588  ProcessID group_root = k % proc_grid_.proc_cols();
589  if(! right_.shape().is_dense() && row_group.size() < static_cast<ProcessID>(proc_grid_.proc_cols())) {
590  const ProcessID world_root = proc_grid_.rank_row() * proc_grid_.proc_cols() + group_root;
591  group_root = row_group.rank(world_root);
592  }
593  return group_root;
594  }
595 
596  ProcessID get_col_group_root(const size_type k, const madness::Group& col_group) const {
597  ProcessID group_root = k % proc_grid_.proc_rows();
598  if(! left_.shape().is_dense() && col_group.size() < static_cast<ProcessID>(proc_grid_.proc_rows())) {
599  const ProcessID world_root = group_root * proc_grid_.proc_cols() + proc_grid_.rank_col();
600  group_root = col_group.rank(world_root);
601  }
602  return group_root;
603  }
604 
606 
609  void bcast_col(const size_type k, std::vector<col_datum>& col, const madness::Group& row_group) const {
610  // broadcast if I'm part of the broadcast group
611  if (!row_group.empty()) {
612  // Broadcast column k of left_.
613  ProcessID group_root = get_row_group_root(k, row_group);
614  bcast(left_start_local_ + k, left_stride_local_, row_group, group_root, 0ul, col);
615  }
616  }
617 
619 
622  void bcast_row(const size_type k, std::vector<row_datum>& row, const madness::Group& col_group) const {
623  // broadcast if I'm part of the broadcast group
624  if (!col_group.empty()) {
625  // Compute the group root process.
626  ProcessID group_root = get_col_group_root(k, col_group);
627 
628  // Broadcast row k of right_.
629  bcast(k * proc_grid_.cols() + proc_grid_.rank_col(),
630  right_stride_local_, col_group, group_root, left_.size(), row);
631  }
632  }
633 
634  void bcast_col_range_task(size_type k, const size_type end) const {
 635  // Compute the first column of left for which this process column is the broadcast root
636  const size_type Pcols = proc_grid_.proc_cols();
637  k += (Pcols - ((k + Pcols - proc_grid_.rank_col()) % Pcols)) % Pcols;
638 
639  for(; k < end; k += Pcols) {
640 
641  // Compute local iteration limits for column k of left_.
642  size_type index = left_start_local_ + k;
643 
644  // will create broadcast group only if needed
645  bool have_group = false;
646  madness::Group row_group;
647  ProcessID group_root;
648  bool do_broadcast;
649 
650  // Search column k of left for non-zero tiles
651  for(; index < left_end_; index += left_stride_local_) {
652  if(left_.shape().is_zero(index)) continue;
653 
654  // Construct broadcast group, if needed
655  if (!have_group) {
656  have_group = true;
657  row_group = make_row_group(k);
658  // broadcast if I am in this group and this group has others
659  do_broadcast = !row_group.empty() && row_group.size() > 1;
660  if (do_broadcast)
661  group_root = get_row_group_root(k, row_group);
662  }
663 
664  if(do_broadcast) {
665  // Broadcast the tile
666  const madness::DistributedID key(DistEvalImpl_::id(), index);
667  auto tile = get_tile(left_, index);
668  TensorImpl_::world().gop.bcast(key, tile, group_root, row_group);
669  } else {
670  // Discard the tile
671  left_.discard(index);
672  }
673  }
674  }
675  }
676 
677  void bcast_row_range_task(size_type k, const size_type end) const {
678  // Compute the first local row of right
679  const size_type Prows = proc_grid_.proc_rows();
680  k += (Prows - ((k + Prows - proc_grid_.rank_row()) % Prows)) % Prows;
681 
682  for(; k < end; k += Prows) {
683 
684  // Compute local iteration limits for row k of right_.
685  size_type index = k * proc_grid_.cols();
686  const size_type row_end = index + proc_grid_.cols();
687  index += proc_grid_.rank_col();
688 
689  // will create broadcast group only if needed
690  bool have_group = false;
691  madness::Group col_group;
692  ProcessID group_root;
693  bool do_broadcast;
694 
695  // Search for and broadcast non-zero row
696  for(; index < row_end; index += right_stride_local_) {
697  if(right_.shape().is_zero(index)) continue;
698 
699  // Construct broadcast group
700  if (!have_group) {
701  have_group = true;
702  col_group = make_col_group(k);
703  // broadcast if I am in this group and this group has others
704  do_broadcast = !col_group.empty() && col_group.size() > 1;
705  if (do_broadcast)
706  group_root = get_col_group_root(k, col_group);
707  }
708 
709  if(do_broadcast) {
710  // Broadcast the tile
711  const madness::DistributedID key(DistEvalImpl_::id(), index + left_.size());
712  auto tile = get_tile(right_, index);
713  TensorImpl_::world().gop.bcast(key, tile, group_root, col_group);
714  } else {
715  // Discard the tile
716  right_.discard(index);
717  }
718  }
719  }
720  }
721 
722 
723  // Row and column iteration functions ------------------------------------
724 
726 
733  size_type iterate_row(size_type k) const {
734  // Iterate over k's until a non-zero tile is found or the end of the
735  // matrix is reached.
736  size_type end = k * proc_grid_.cols();
737  for(; k < k_; ++k) {
738  // Search for non-zero tiles in row k of right
739  size_type i = end + proc_grid_.rank_col();
740  end += proc_grid_.cols();
741  for(; i < end; i += right_stride_local_)
742  if(! right_.shape().is_zero(i))
743  return k;
744  }
745 
746  return k;
747  }
748 
750 
757  size_type iterate_col(size_type k) const {
758  // Iterate over k's until a non-zero tile is found or the end of the
759  // matrix is reached.
760  for(; k < k_; ++k)
 761  // Search column k of left for non-zero tiles
762  for(size_type i = left_start_local_ + k; i < left_end_; i += left_stride_local_)
763  if(! left_.shape().is_zero(i))
764  return k;
765 
766  return k;
767  }
768 
769 
771 
780  size_type iterate_sparse(const size_type k) const {
781  // Initial step for k_col and k_row.
782  size_type k_col = iterate_col(k);
783  size_type k_row = iterate_row(k_col);
784 
785  // Search for a row and column that both have non-zero tiles
786  while(k_col != k_row) {
787  if(k_col < k_row) {
788  k_col = iterate_col(k_row);
789  } else {
790  k_row = iterate_row(k_col);
791  }
792  }
793 
794  if(k < k_row) {
795  // Spawn a task to broadcast any local columns of left that were skipped
796  TensorImpl_::world().taskq.add(shared_from_this(),
797  & Summa_::bcast_col_range_task, k, k_row,
798  madness::TaskAttributes::hipri());
799 
800  // Spawn a task to broadcast any local rows of right that were skipped
801  TensorImpl_::world().taskq.add(shared_from_this(),
802  & Summa_::bcast_row_range_task, k, k_col,
803  madness::TaskAttributes::hipri());
804  }
805 
806  return k_col;
807  }
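  // --- Worked example (illustrative, not part of the original file) -----
  // iterate_sparse() alternates the two searches until they agree on a k.
  // With non-zero local columns of left at k = {1, 4, 6} and non-zero local
  // rows of right at k = {2, 4, 7}, starting from k = 0 the sequence is
  //   iterate_col(0) -> 1, iterate_row(1) -> 2, iterate_col(2) -> 4,
  //   iterate_row(4) -> 4,
  // so the next SUMMA step runs at k = 4, and the skipped contributions for
  // k = 0..3 are broadcast or discarded by the range tasks spawned above.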
808 
809 
811 
820  size_type iterate(const size_type k) const {
821  return (left_.shape().is_dense() && right_.shape().is_dense() ?
822  k : iterate_sparse(k));
823  }
824 
825 
826  // Initialization functions ----------------------------------------------
827 
829  size_type initialize(const DenseShape&) {
830  // Construct static broadcast groups for dense arguments
831  const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
832  col_group_ = proc_grid_.make_col_group(col_did);
833  const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
834  row_group_ = proc_grid_.make_row_group(row_did);
835 
836 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
837  std::stringstream ss;
838  ss << "init: rank=" << TensorImpl_::world().rank()
839  << "\n col_group_=(" << col_did.first << ", " << col_did.second << ") { ";
840  for(ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
841  ss << col_group_.world_rank(gproc) << " ";
842  ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second << ") { ";
843  for(ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
844  ss << row_group_.world_rank(gproc) << " ";
845  ss << "}\n";
 846  printf("%s", ss.str().c_str());
847 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
848 
849  // Allocate memory for the reduce pair tasks.
850  std::allocator<ReducePairTask<op_type> > alloc;
851  reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
852 
853  // Iterate over all local tiles
854  const size_type n = proc_grid_.local_size();
855  for(size_type t = 0ul; t < n; ++t) {
856  // Initialize the reduction task
857  ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task = reduce_tasks_ + t;
858  new(reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_);
859  }
860 
861  return proc_grid_.local_size();
862  }
863 
865  template <typename Shape>
866  size_type initialize(const Shape& shape) {
867 
868 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
869  std::stringstream ss;
870  ss << " initialize rank=" << TensorImpl_::world().rank() << " tiles={ ";
871 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
872 
873  // Allocate memory for the reduce pair tasks.
874  std::allocator<ReducePairTask<op_type> > alloc;
875  reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
876 
877  // Initialize iteration variables
878  size_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
879  size_type row_end = row_start + proc_grid_.cols();
880  row_start += proc_grid_.rank_col();
881  const size_type col_stride = // The stride to iterate down a column
882  proc_grid_.proc_rows() * proc_grid_.cols();
883  const size_type row_stride = // The stride to iterate across a row
884  proc_grid_.proc_cols();
885  const size_type end = TensorImpl_::size();
886 
887  // Iterate over all local tiles
888  size_type tile_count = 0ul;
889  ReducePairTask<op_type>* MADNESS_RESTRICT reduce_task = reduce_tasks_;
890  // this loops over result tiles arranged in block-cyclic order
891  // index = tile index (row major)
892  for(; row_start < end; row_start += col_stride, row_end += col_stride) {
893  for(size_type index = row_start; index < row_end; index += row_stride, ++reduce_task) {
894 
895  // Initialize the reduction task
896 
897  // Skip zero tiles
898  if(! shape.is_zero(DistEvalImpl_::perm_index_to_target(index))) {
899 
900 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
901  ss << index << " ";
902 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
903 
904  new(reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_);
905  ++tile_count;
906  } else {
907  // Construct an empty task to represent zero tiles.
908  new(reduce_task) ReducePairTask<op_type>();
909  }
910  }
911  }
912 
913 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
914  ss << "}\n";
 915  printf("%s", ss.str().c_str());
916 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
917 
918  return tile_count;
919  }
920 
921  size_type initialize() {
922 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
923  printf("init: start rank=%i\n", TensorImpl_::world().rank());
924 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
925 
926  const size_type result = initialize(TensorImpl_::shape());
927 
928 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
929  printf("init: finish rank=%i\n", TensorImpl_::world().rank());
930 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
931 
932  return result;
933  }
934 
935 
936  // Finalize functions ----------------------------------------------------
937 
939  void finalize(const DenseShape&) {
940  // Initialize iteration variables
941  size_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
942  size_type row_end = row_start + proc_grid_.cols();
943  row_start += proc_grid_.rank_col();
944  const size_type col_stride = // The stride to iterate down a column
945  proc_grid_.proc_rows() * proc_grid_.cols();
946  const size_type row_stride = // The stride to iterate across a row
947  proc_grid_.proc_cols();
948  const size_type end = TensorImpl_::size();
949 
950  // Iterate over all local tiles
951  for(ReducePairTask<op_type>* reduce_task = reduce_tasks_;
952  row_start < end; row_start += col_stride, row_end += col_stride) {
953  for(size_type index = row_start; index < row_end; index += row_stride, ++reduce_task) {
954 
955 
 956  // Set the result tile
 957  DistEvalImpl_::set_tile(DistEvalImpl_::perm_index_to_target(index),
 958  reduce_task->submit());
959 
960  // Destroy the reduce task
961  reduce_task->~ReducePairTask<op_type>();
962  }
963  }
964 
965  // Deallocate the memory for the reduce pair tasks.
966  std::allocator<ReducePairTask<op_type> >().deallocate(reduce_tasks_,
967  proc_grid_.local_size());
968  }
969 
971  template <typename Shape>
972  void finalize(const Shape& shape) {
973 
974 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
975  std::stringstream ss;
976  ss << " finalize rank=" << TensorImpl_::world().rank() << " tiles={ ";
977 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
978 
979  // Initialize iteration variables
980  size_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
981  size_type row_end = row_start + proc_grid_.cols();
982  row_start += proc_grid_.rank_col();
983  const size_type col_stride = // The stride to iterate down a column
984  proc_grid_.proc_rows() * proc_grid_.cols();
985  const size_type row_stride = // The stride to iterate across a row
986  proc_grid_.proc_cols();
987  const size_type end = TensorImpl_::size();
988 
989  // Iterate over all local tiles
990  for(ReducePairTask<op_type>* reduce_task = reduce_tasks_;
991  row_start < end; row_start += col_stride, row_end += col_stride) {
992  for(size_type index = row_start; index < row_end; index += row_stride, ++reduce_task) {
993  // Compute the permuted index
994  const size_type perm_index = DistEvalImpl_::perm_index_to_target(index);
995 
996  // Skip zero tiles
997  if(! shape.is_zero(perm_index)) {
998 
999 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1000  ss << index << " ";
1001 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1002 
1003  // Set the result tile
1004  DistEvalImpl_::set_tile(perm_index, reduce_task->submit());
1005  }
1006 
1007  // Destroy the reduce task
1008  reduce_task->~ReducePairTask<op_type>();
1009  }
1010  }
1011 
1012  // Deallocate the memory for the reduce pair tasks.
1013  std::allocator<ReducePairTask<op_type> >().deallocate(reduce_tasks_,
1014  proc_grid_.local_size());
1015 
1016 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1017  ss << "}\n";
 1018  printf("%s", ss.str().c_str());
1019 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1020  }
1021 
1022  void finalize() {
1023 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1024  printf("finalize: start rank=%i\n", TensorImpl_::world().rank());
1025 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1026 
1027  finalize(TensorImpl_::shape());
1028 
1029 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1030  printf("finalize: finish rank=%i\n", TensorImpl_::world().rank());
1031 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1032  }
1033 
1035 
1037  class FinalizeTask : public madness::TaskInterface {
1038  private:
1039  std::shared_ptr<Summa_> owner_;
1040 
1041  public:
1042  FinalizeTask(const std::shared_ptr<Summa_>& owner, const int ndep) :
1043  madness::TaskInterface(ndep, madness::TaskAttributes::hipri()),
1044  owner_(owner)
1045  { }
1046 
1047  virtual ~FinalizeTask() { }
1048 
1049  virtual void run(const madness::TaskThreadEnv&) { owner_->finalize(); }
1050 
1051  }; // class FinalizeTask
1052 
1053 
1054  // Contraction functions -------------------------------------------------
1055 
1057 
1064  void contract(const DenseShape&, const size_type,
1065  const std::vector<col_datum>& col, const std::vector<row_datum>& row,
1066  madness::TaskInterface* const task)
1067  {
1068  // Iterate over the row
1069  for(size_type i = 0ul; i < col.size(); ++i) {
1070  // Compute the local, result-tile offset
1071  const size_type reduce_task_offset = col[i].first * proc_grid_.local_cols();
1072 
1073  // Iterate over columns
1074  for(size_type j = 0ul; j < row.size(); ++j) {
1075  const size_type reduce_task_index = reduce_task_offset + row[j].first;
1076 
1077  // Schedule task for contraction pairs
1078  if(task)
1079  task->inc();
1080  const left_future left = col[i].second;
1081  const right_future right = row[j].second;
1082  reduce_tasks_[reduce_task_index].add(left, right, task);
1083  }
1084  }
1085  }
1086 
1088 
1095  template <typename Shape>
1096  void contract(const Shape&, const size_type,
1097  const std::vector<col_datum>& col, const std::vector<row_datum>& row,
1098  madness::TaskInterface* const task)
1099  {
1100  // Iterate over the row
1101  for(size_type i = 0ul; i < col.size(); ++i) {
1102  // Compute the local, result-tile offset
1103  const size_type reduce_task_offset = col[i].first * proc_grid_.local_cols();
1104 
1105  // Iterate over columns
1106  for(size_type j = 0ul; j < row.size(); ++j) {
1107  const size_type reduce_task_index = reduce_task_offset + row[j].first;
1108 
1109  // Skip zero tiles
1110  if(! reduce_tasks_[reduce_task_index])
1111  continue;
1112 
1113  // Schedule task for contraction pairs
1114  if(task) {
1115  if (trace_tasks)
1116  task->inc_debug("destroy(*ReduceObject)");
1117  else
1118  task->inc();
1119  }
1120  const left_future left = col[i].second;
1121  const right_future right = row[j].second;
1122  reduce_tasks_[reduce_task_index].add(left, right, task);
1123  }
1124  }
1125  }
1126 
1127 #define TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
1128 #ifndef TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
1129 
1141  template <typename T>
1142  typename std::enable_if<std::is_floating_point<T>::value>::type
1143  contract(const SparseShape<T>&, const size_type k,
1144  const std::vector<col_datum>& col, const std::vector<row_datum>& row,
1145  madness::TaskInterface* const task)
1146  {
1147  // Cache row shape data.
1148  std::vector<typename SparseShape<T>::value_type> row_shape_values;
1149  row_shape_values.reserve(row.size());
1150  const size_type row_start = k * proc_grid_.cols() + proc_grid_.rank_col();
1151  for(size_type j = 0ul; j < row.size(); ++j)
1152  row_shape_values.push_back(right_.shape()[row_start + (row[j].first * right_stride_local_)]);
1153 
1154  const size_type col_start = left_start_local_ + k;
1155  const float threshold_k = TensorImpl_::shape().threshold() / typename SparseShape<T>::value_type(k_);
1156  // Iterate over the row
1157  for(size_type i = 0ul; i != col.size(); ++i) {
1158  // Compute the local, result-tile offset
1159  const size_type offset = col[i].first * proc_grid_.local_cols();
1160 
1161  // Get the shape data for col_it tile
1162  const typename SparseShape<T>::value_type col_shape_value =
1163  left_.shape()[col_start + (col[i].first * left_stride_local_)];
1164 
1165  // Iterate over columns
1166  for(size_type j = 0ul; j < row.size(); ++j) {
1167  if((col_shape_value * row_shape_values[j]) < threshold_k)
1168  continue;
1169 
1170  const size_type reduce_task_index = offset + row[j].first;
1171 
1172  // Skip zero tiles
1173  if(! reduce_tasks_[reduce_task_index])
1174  continue;
1175 
1176  if(task)
1177  task->inc();
1178  reduce_tasks_[reduce_task_index].add(col[i].second, row[j].second, task);
1179  }
1180  }
1181  }
1182 #endif // TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
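  // --- Screening example (illustrative, not part of the original file) --
  // When the filter above is compiled in (it is disabled by the #define
  // preceding it), a tile pair is skipped when the product of its shape
  // norms falls below threshold / k_.  E.g. with a shape threshold of 1e-8
  // and k_ = 100, the cutoff is 1e-10: norms 1e-4 and 1e-7 (product 1e-11)
  // are dropped, while norms 1e-4 and 1e-6 (product 1e-10) are kept.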
1183 
1184  void contract(const size_type k, const std::vector<col_datum>& col,
1185  const std::vector<row_datum>& row, madness::TaskInterface* const task)
1186  { contract(TensorImpl_::shape(), k, col, row, task); }
1187 
1188 
1189  // SUMMA step task -------------------------------------------------------
1190 
1191 
1193 
1196  class StepTask : public madness::TaskInterface {
1197  protected:
1198  // Member variables
1199  std::shared_ptr<Summa_> owner_;
1200  World& world_;
1201  std::vector<col_datum> col_{};
1202  std::vector<row_datum> row_{};
1203  FinalizeTask* finalize_task_;
1204  StepTask* next_step_task_ = nullptr;
1205  StepTask* tail_step_task_ = nullptr;
1206 
1207  void get_col(const size_type k) {
1208  owner_->get_col(k, col_);
1209  if (trace_tasks)
1210  this->notify_debug("StepTask::spawn_col");
1211  else
1212  this->notify();
1213  }
1214 
1215  void get_row(const size_type k) {
1216  owner_->get_row(k, row_);
1217  if (trace_tasks)
1218  this->notify_debug("StepTask::spawn_row");
1219  else
1220  this->notify();
1221  }
1222 
1223  public:
1224 
1225  StepTask(const std::shared_ptr<Summa_>& owner, int finalize_ndep) :
1226 #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
1227  madness::TaskInterface(0ul, "StepTask 1st ctor", madness::TaskAttributes::hipri()),
1228 #else
1229  madness::TaskInterface(0ul, madness::TaskAttributes::hipri()),
1230 #endif
1231  owner_(owner), world_(owner->world()),
1232  finalize_task_(new FinalizeTask(owner, finalize_ndep))
1233  {
1234  TA_ASSERT(owner_);
1235  owner_->world().taskq.add(finalize_task_);
1236  }
1237 
1239 
1242  StepTask(StepTask* const parent, const int ndep) :
1243 #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
1244  madness::TaskInterface(ndep, "StepTask nth ctor", madness::TaskAttributes::hipri()),
1245 #else
1246  madness::TaskInterface(ndep, madness::TaskAttributes::hipri()),
1247 #endif
1248  owner_(parent->owner_), world_(parent->world_),
1249  finalize_task_(parent->finalize_task_)
1250  {
1251  TA_ASSERT(parent);
1252  parent->next_step_task_ = this;
1253  }
1254 
1255  virtual ~StepTask() { }
1256 
1257  void spawn_get_row_col_tasks(const size_type k) {
1258  // Submit the task to collect column tiles of left for iteration k
1259  if (trace_tasks)
1260  madness::DependencyInterface::inc_debug("StepTask::spawn_col");
1261  else
1262  madness::DependencyInterface::inc();
1263  world_.taskq.add(this, & StepTask::get_col, k, madness::TaskAttributes::hipri());
1264 
1265  // Submit the task to collect row tiles of right for iteration k
1266  if (trace_tasks)
1267  madness::DependencyInterface::inc_debug("StepTask::spawn_row");
1268  else
1269  madness::DependencyInterface::inc();
1270  world_.taskq.add(this, & StepTask::get_row, k, madness::TaskAttributes::hipri());
1271  }
1272 
1273  template <typename Derived>
1274  void make_next_step_tasks(Derived* task, size_type depth) {
1275  TA_ASSERT(depth > 0);
1276  // Set the depth to be no greater than the maximum number steps
1277  if(depth > owner_->k_)
1278  depth = owner_->k_;
1279 
1280  // Spawn n=depth step tasks
1281  for(; depth > 0ul; --depth) {
1282  // Set dep count of the tail task to 1, it will not start until this task commands
1283  Derived* const next = new Derived(task, depth == 1 ? 1 : 0);
1284  task = next;
1285  }
1286 
1287  // Keep track of the tail ptr
1288  tail_step_task_ = task;
1289  }
1290 
1291  template <typename Derived, typename GroupType>
1292  void run(const size_type k, const GroupType& row_group, const GroupType& col_group) {
1293 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1294  printf("step: start rank=%i k=%lu\n", owner_->world().rank(), k);
1295 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1296 
1297  if(k < owner_->k_) {
1298  // Initialize next tail task and submit next task
1299  TA_ASSERT(next_step_task_);
1300  next_step_task_->tail_step_task_ =
1301  new Derived(static_cast<Derived*>(tail_step_task_), 1); // <- ndep=1, will control its scheduling by this task
1302  // submit next step task ... even if it's same as tail_step_task_ it is safe to submit
1303  // because its ndep > 0 (see StepTask::make_next_step_tasks)
1304  TA_ASSERT(tail_step_task_->ndep() > 0);
1305  world_.taskq.add(next_step_task_);
1306  next_step_task_ = nullptr;
1307 
1308  // Start broadcast of column and row tiles for this step
1309  world_.taskq.add(owner_, & Summa_::bcast_col, k, col_, row_group,
1310  madness::TaskAttributes::hipri());
1311  world_.taskq.add(owner_, & Summa_::bcast_row, k, row_, col_group,
1312  madness::TaskAttributes::hipri());
1313 
1314  // Submit tasks for the contraction of col and row tiles.
1315  owner_->contract(k, col_, row_, tail_step_task_);
1316 
1317  // Notify task dependencies
1318  TA_ASSERT(tail_step_task_);
1319  if (trace_tasks)
1320  tail_step_task_->notify_debug("StepTask nth ctor");
1321  else
1322  tail_step_task_->notify();
1323  finalize_task_->notify();
1324 
1325  } else if(finalize_task_) {
1326  // Signal the finalize task so it can run after all non-zero step
1327  // tasks have completed.
1328  finalize_task_->notify();
1329 
1330  // Cleanup any remaining step tasks
1331  StepTask* step_task = next_step_task_;
1332  while(step_task) {
1333  StepTask* const next_step_task = step_task->next_step_task_;
1334  step_task->next_step_task_ = nullptr;
1335  step_task->finalize_task_ = nullptr;
1336  world_.taskq.add(step_task);
1337  step_task = next_step_task;
1338  }
1339 
1340  if (trace_tasks)
1341  tail_step_task_->notify_debug("StepTask nth ctor");
1342  else
1343  tail_step_task_->notify();
1344  }
1345 
1346 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1347  printf("step: finish rank=%i k=%lu\n", owner_->world().rank(), k);
1348 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1349  }
1350 
1351  }; // class StepTask
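  // --- Pipeline note (illustrative, not part of the original file) ------
  // make_next_step_tasks() pre-builds `depth` StepTask objects chained via
  // next_step_task_, and each running step appends a fresh task behind the
  // current tail.  The contraction tasks spawned at step k register the step
  // task roughly `depth` iterations ahead as their completion callback, so
  // that task cannot run until step k's contributions are finished; this is
  // what limits the schedule to at most `depth` concurrent SUMMA iterations.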
1352 
1353  class DenseStepTask : public StepTask {
1354  protected:
1355  const size_type k_;
1356  using StepTask::owner_;
1357 
1358  public:
1359  DenseStepTask(const std::shared_ptr<Summa_>& owner, const size_type depth) :
1360  StepTask(owner, owner->k_ + 1ul), k_(0)
1361  {
1362  StepTask::make_next_step_tasks(this, depth);
1363  StepTask::spawn_get_row_col_tasks(k_);
1364  }
1365 
1366  DenseStepTask(DenseStepTask* const parent, const int ndep) :
1367  StepTask(parent, ndep), k_(parent->k_ + 1ul)
1368  {
1369  // Spawn tasks to get k-th row and column tiles
1370  if(k_ < owner_->k_)
1371  StepTask::spawn_get_row_col_tasks(k_);
1372  }
1373 
1374  virtual ~DenseStepTask() { }
1375 
1376  virtual void run(const madness::TaskThreadEnv&) {
1377  StepTask::template run<DenseStepTask>(k_, owner_->row_group_, owner_->col_group_);
1378  }
1379  }; // class DenseStepTask
1380 
1381  class SparseStepTask : public StepTask {
1382  protected:
1383  Future<size_type> k_{};
1384  Future<madness::Group> row_group_{};
1385  Future<madness::Group> col_group_{};
1386  using StepTask::owner_;
1387  using StepTask::world_;
1388  using StepTask::finalize_task_;
1389  using StepTask::next_step_task_;
1390 
1391  private:
1392 
1394  void iterate_task(size_type k, const size_type offset) {
1395  // Search for the next non-zero row and column
1396  k = owner_->iterate_sparse(k + offset);
1397  k_.set(k);
1398 
1399  if(k < owner_->k_) {
1400  // NOTE: The order of task submissions is dependent on the order in
1401  // which we want the tasks to complete.
1402 
1403  // Spawn tasks to get k-th row and column tiles
1404  StepTask::spawn_get_row_col_tasks(k);
1405 
1406  // Spawn tasks to construct the row and column broadcast group
1407  row_group_ = world_.taskq.add(owner_, & Summa_::make_row_group, k,
1408  madness::TaskAttributes::hipri());
1409  col_group_ = world_.taskq.add(owner_, & Summa_::make_col_group, k,
1410  madness::TaskAttributes::hipri());
1411 
1412  // Increment the finalize task dependency counter, which indicates
1413  // that this task is not the terminating step task.
1414  TA_ASSERT(finalize_task_);
1415  finalize_task_->inc();
1416  }
1417 
1418  if (trace_tasks)
1419  madness::DependencyInterface::notify_debug("SparseStepTask ctor");
1420  else
1421  madness::DependencyInterface::notify();
1422  }
1423 
1424  public:
1425 
1426  SparseStepTask(const std::shared_ptr<Summa_>& owner, size_type depth) :
1427  StepTask(owner, 1ul)
1428  {
1429  StepTask::make_next_step_tasks(this, depth);
1430 
1431  // Spawn a task to find the next non-zero iteration
1432  if (trace_tasks)
1433  madness::DependencyInterface::inc_debug("SparseStepTask ctor");
1434  else
1435  madness::DependencyInterface::inc();
1436  world_.taskq.add(this, & SparseStepTask::iterate_task,
1437  0ul, 0ul, madness::TaskAttributes::hipri());
1438  }
1439 
1440  SparseStepTask(SparseStepTask* const parent, const int ndep) :
1441  StepTask(parent, ndep)
1442  {
1443  if(parent->k_.probe() && (parent->k_.get() >= owner_->k_)) {
1444  // Avoid running extra tasks if not needed.
1445  k_.set(parent->k_.get());
1446  MADNESS_ASSERT(ndep == 1); // ensure that this does not get executed immediately
1447  } else {
1448  // Spawn a task to find the next non-zero iteration
1449  if (trace_tasks)
1450  madness::DependencyInterface::inc_debug("SparseStepTask ctor");
1451  else
1452  madness::DependencyInterface::inc();
1453  world_.taskq.add(this, & SparseStepTask::iterate_task,
1454  parent->k_, 1ul, madness::TaskAttributes::hipri());
1455  }
1456  }
1457 
1458  virtual ~SparseStepTask() { }
1459 
1460  virtual void run(const madness::TaskThreadEnv&) {
1461  StepTask::template run<SparseStepTask>(k_, row_group_, col_group_);
1462  }
1463  }; // class SparseStepTask
1464 
1465  public:
1466 
1468 
1483  Summa(const left_type& left, const right_type& right,
1484  World& world, const trange_type trange, const shape_type& shape,
1485  const std::shared_ptr<pmap_interface>& pmap, const Permutation& perm,
1486  const op_type& op, const size_type k, const ProcGrid& proc_grid) :
1487  DistEvalImpl_(world, trange, shape, pmap, perm),
1488  left_(left), right_(right), op_(op),
1489  row_group_(), col_group_(),
1490  k_(k), proc_grid_(proc_grid),
1491  reduce_tasks_(NULL),
1492  left_start_local_(proc_grid_.rank_row() * k),
1493  left_end_(left.size()),
1494  left_stride_(k),
1495  left_stride_local_(proc_grid.proc_rows() * k),
1496  right_stride_(1ul),
1497  right_stride_local_(proc_grid.proc_cols())
1498  { }
1499 
1500  virtual ~Summa() { }
1501 
1503 
1508  virtual Future<value_type> get_tile(size_type i) const {
1511 
1512  const size_type source_index = DistEvalImpl_::perm_index_to_source(i);
1513 
1514  // Compute tile coordinate in tile grid
1515  const size_type tile_row = source_index / proc_grid_.cols();
1516  const size_type tile_col = source_index % proc_grid_.cols();
1517  // Compute process coordinate of tile in the process grid
1518  const size_type proc_row = tile_row % proc_grid_.proc_rows();
1519  const size_type proc_col = tile_col % proc_grid_.proc_cols();
1520  // Compute the process that owns tile
1521  const ProcessID source = proc_row * proc_grid_.proc_cols() + proc_col;
1522 
1523  const madness::DistributedID key(DistEvalImpl_::id(), i);
1524  return TensorImpl_::world().gop.template recv<value_type>(source, key);
1525  }
1526 
1527 
1529 
1533  virtual void discard_tile(size_type i) const { get_tile(i); }
1534 
1535  private:
1536 
1538 
1545  size_type mem_bound_depth(size_type depth, const float left_sparsity, const float right_sparsity) {
1546 
1547  // Check if a memory bound has been set
1548  const size_type available_memory = max_memory_;
1549  if(available_memory) {
1550 
1551  // Compute the average memory requirement per iteration of this process
1552  const std::size_t local_memory_per_iter_left =
1553  (left_.trange().elements_range().volume() / left_.trange().tiles_range().volume()) *
1555  proc_grid_.local_rows() * (1.0f - left_sparsity);
1556  const std::size_t local_memory_per_iter_right =
1557  (right_.trange().elements_range().volume() / right_.trange().tiles_range().volume()) *
1559  proc_grid_.local_cols() * (1.0f - right_sparsity);
1560 
1561  // Compute the maximum number of iterations based on available memory
1562  const size_type mem_bound_depth =
 1563  (available_memory /
 1564  (local_memory_per_iter_left + local_memory_per_iter_right));
1565 
1566  // Check if the memory bounded depth is less than the optimal depth
1567  if(depth > mem_bound_depth) {
1568 
1569  // Adjust the depth based on the available memory
1570  switch(mem_bound_depth) {
1571  case 0:
 1572  // When the memory bound depth is 0, not even one iteration fits in memory
1573  TA_EXCEPTION("Insufficient memory available for SUMMA");
1574  break;
1575  case 1:
1576  if(TensorImpl_::world().rank() == 0)
 1577  printf("!! WARNING TiledArray: Memory constraints limit the SUMMA depth to 1.\n"
1578  "!! WARNING TiledArray: Performance may be slow.\n");
1579  default:
1580  depth = mem_bound_depth;
1581  }
1582  }
1583  }
1584 
1585  return depth;
1586  }
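  // --- Arithmetic example (illustrative, not part of the original file) -
  // If 2 GiB are available and one concurrent iteration keeps roughly
  // 300 MiB of left tiles and 200 MiB of right tiles resident on this
  // process, the memory-bounded depth is 2048 / (300 + 200) ~= 4, so a
  // requested depth of 8 would be reduced to 4; a bound of 0 raises the
  // exception above, and a bound of 1 prints the warning and runs with
  // depth 1.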
1587 
1589 
1595  virtual int internal_eval() {
1596 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1597  printf("eval: start eval children rank=%i\n", TensorImpl_::world().rank());
1598 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1599 
1600  // Start evaluate child tensors
1601  left_.eval();
1602  right_.eval();
1603 
1604 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1605  printf("eval: finished eval children rank=%i\n", TensorImpl_::world().rank());
1606 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1607 
1608  size_type tile_count = 0ul;
1609  if(proc_grid_.local_size() > 0ul) {
1610  tile_count = initialize();
1611 
1612  // depth controls the number of simultaneous SUMMA iterations
1613  // that are scheduled.
1614 
1615  // The optimal depth is equal to the smallest dimension of the process
1616  // grid, but no less than 2
1617  size_type depth =
1618  std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));
1619 
1620  // Construct the first SUMMA iteration task
1621  if(TensorImpl_::shape().is_dense()) {
1622  // We cannot have more iterations than there are blocks in the k
1623  // dimension
1624  if(depth > k_) depth = k_;
1625 
1626  // Modify the number of concurrent iterations based on the available
1627  // memory.
1628  depth = mem_bound_depth(depth, 0.0f, 0.0f);
1629 
1630  // Enforce user defined depth bound
 1631  if(max_depth_) depth = std::min(depth, max_depth_);
1632 
1633  TensorImpl_::world().taskq.add(new DenseStepTask(shared_from_this(),
1634  depth));
1635  } else {
1636  // Increase the depth based on the amount of sparsity in an iteration.
1637 
1638  // Get the sparsity fractions for the left- and right-hand arguments.
1639  const float left_sparsity = left_.shape().sparsity();
1640  const float right_sparsity = right_.shape().sparsity();
1641 
1642  // Compute the fraction of non-zero result tiles in a single SUMMA iteration.
1643  const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f))
1644  * (1.0f - std::min(right_sparsity, 0.9f));
1645 
1646  // Compute the new depth based on sparsity of the arguments
1647  depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
1648 
1649  // We cannot have more iterations than there are blocks in the k
1650  // dimension
1651  if(depth > k_) depth = k_;
1652 
1653  // Modify the number of concurrent iterations based on the available
1654  // memory and sparsity of the argument tensors.
1655  depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
1656 
1657  // Enforce user defined depth bound
 1658  if(max_depth_) depth = std::min(depth, max_depth_);
1659 
1660  TensorImpl_::world().taskq.add(new SparseStepTask(shared_from_this(),
1661  depth));
1662  }
1663  }
1664 
1665 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1666  printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank());
1667 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1668 
1669  // Wait for child tensors to be evaluated, and process tasks while waiting.
1670  left_.wait();
1671  right_.wait();
1672 
1673 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1674  printf("eval: finished wait children rank=%i\n", TensorImpl_::world().rank());
1675 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1676 
1677  return tile_count;
1678  }
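  // --- Depth heuristic example (illustrative, not part of the original
  // file), referring to the sparse branch of internal_eval() above: with
  // left and right sparsities of 0.5 each, frac_non_zero = 0.25 and
  // log2(0.25) = -2, so the scale factor is 1 + 1.35638 * 2 ~= 3.71; a base
  // depth of 2 becomes floor(2 * 3.71 + 0.5) = 7 before the k_, memory, and
  // TA_SUMMA_MAX_DEPTH bounds are applied.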
1679 
1680  }; // class Summa
1681 
1682 
1683  // Initialize static member variables for Summa
1684 
1685  template <typename Left, typename Right, typename Op, typename Policy>
 1686  typename Summa<Left, Right, Op, Policy>::size_type
 1687  Summa<Left, Right, Op, Policy>::max_depth_ =
1688  Summa<Left, Right, Op, Policy>::init_max_depth();
1689 
1690  template <typename Left, typename Right, typename Op, typename Policy>
 1691  typename Summa<Left, Right, Op, Policy>::size_type
 1692  Summa<Left, Right, Op, Policy>::max_memory_ =
1693  Summa<Left, Right, Op, Policy>::init_max_memory();
1694  } // namespace detail
1695 } // namespace TiledArray
1696 
1697 #endif // TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED