TiledArray  0.7.0
contraction_eval.h
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2013 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #ifndef TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED
21 #define TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED
22 
23 #include <vector>
24 
25 #include <TiledArray/config.h>
26 #include <TiledArray/dist_eval/dist_eval.h>
27 #include <TiledArray/proc_grid.h>
28 #include <TiledArray/reduce_task.h>
29 #include <TiledArray/type_traits.h>
30 #include <TiledArray/shape.h>
31 
32 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1
33 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1
34 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1
35 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1
36 //#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1
37 
38 namespace TiledArray {
39  namespace detail {
40 
42 
52  template <typename Left, typename Right, typename Op, typename Policy>
53  class Summa :
54  public DistEvalImpl<typename Op::result_type, Policy>,
55  public std::enable_shared_from_this<Summa<Left, Right, Op, Policy> >
56  {
57  public:
 58  typedef Summa<Left, Right, Op, Policy> Summa_;
 59  typedef DistEvalImpl<typename Op::result_type, Policy> DistEvalImpl_;
 60  typedef typename DistEvalImpl_::TensorImpl_ TensorImpl_;
 61  typedef Left left_type;
 62  typedef Right right_type;
 63  typedef typename DistEvalImpl_::size_type size_type;
 64  typedef typename DistEvalImpl_::range_type range_type;
 65  typedef typename DistEvalImpl_::shape_type shape_type;
 66  typedef typename DistEvalImpl_::pmap_interface pmap_interface;
 67  typedef typename DistEvalImpl_::trange_type trange_type;
 68  typedef typename DistEvalImpl_::value_type value_type;
 69  typedef typename DistEvalImpl_::eval_type eval_type;
 70  typedef Op op_type;
71 
72  private:
73  static size_type max_memory_;
74  static size_type max_depth_;
75 
76  // Arguments and operation
77  left_type left_;
78  right_type right_;
79  op_type op_;
80 
81  // Broadcast groups for dense arguments (empty for non-dense arguments)
82  madness::Group row_group_;
83  madness::Group col_group_;
84 
85  // Dimension information
86  const size_type k_;
87  const ProcGrid proc_grid_;
88 
89  // Contraction results
90  ReducePairTask<op_type>* reduce_tasks_;
91 
92  // Constants used to iterate over columns and rows of left_ and right_, respectively.
93  const size_type left_start_local_;
94  const size_type left_end_;
95  const size_type left_stride_;
96  const size_type left_stride_local_;
97  const size_type right_stride_;
98  const size_type right_stride_local_;
99 
100 
101  typedef Future<typename right_type::eval_type> right_future;
102  typedef Future<typename left_type::eval_type> left_future;
103  typedef std::pair<size_type, right_future> row_datum;
104  typedef std::pair<size_type, left_future> col_datum;
105 
106  static constexpr const bool trace_tasks =
107 #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
108  true
109 #else
110  false
111 #endif
112  ;
113 
114  protected:
115 
116  // Import base class functions
117  using std::enable_shared_from_this<Summa_>::shared_from_this;
118 
119  private:
120 
121  // Static variable initialization ----------------------------------------
122 
123 
125  static size_type init_max_memory() {
126  const char* max_memory = getenv("TA_SUMMA_MAX_MEMORY");
127  if(max_memory) {
128  // Convert the string into bytes
129  std::stringstream ss(max_memory);
130  double memory = 0.0;
131  if(ss >> memory) {
132  if(memory > 0.0) {
133  std::string unit;
134  if(ss >> unit) { // Failure == assume bytes
135  if(unit == "KB" || unit == "kB") {
136  memory *= 1000.0;
137  } else if(unit == "KiB" || unit == "kiB") {
138  memory *= 1024.0;
139  } else if(unit == "MB") {
140  memory *= 1000000.0;
141  } else if(unit == "MiB") {
142  memory *= 1048576.0;
143  } else if(unit == "GB") {
144  memory *= 1000000000.0;
145  } else if(unit == "GiB") {
146  memory *= 1073741824.0;
147  }
148  }
149  }
150  }
151 
152  memory = std::max(memory, 104857600.0); // Minimum 100 MiB
153  return memory;
154  }
155 
156  return 0ul;
157  }
158 
159 
160  static size_type init_max_depth() {
161  const char* max_depth = getenv("TA_SUMMA_MAX_DEPTH");
162  if(max_depth)
163  return std::stoul(max_depth);
164  return 0ul;
165  }
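  // --- Illustrative usage note (not part of the original file) ----------
  // The two environment variables read above are typically set before the
  // application is launched, e.g.
  //
  //   export TA_SUMMA_MAX_MEMORY="2 GiB"   # parsed as 2 * 1073741824 bytes
  //   export TA_SUMMA_MAX_DEPTH=4          # at most 4 concurrent SUMMA steps
  //
  // Any positive memory value is clamped to a minimum of 100 MiB; an unset
  // or unparsable value yields 0, which the code below treats as "no limit".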
166 
167 
168  // Process groups --------------------------------------------------------
169 
171 
193  template <typename Shape, typename ProcMap>
194  madness::Group make_group(const Shape& shape, const std::vector<bool>& process_mask, size_type index,
195  const size_type end, const size_type stride, const size_type max_group_size,
196  const size_type k, const size_type key_offset, const ProcMap& proc_map) const
197  {
198  // Generate the list of processes in rank_row
199  std::vector<ProcessID> proc_list(max_group_size, -1);
200 
201  // Flag the root processes of the broadcast, which may not be included
202  // by shape.
203  size_type p = k % max_group_size;
204  proc_list[p] = proc_map(p);
205  size_type count = 1ul;
206 
207  // Flag all processes that have non-zero tiles
208  for(p = 0ul; (index < end) && (count < max_group_size); index += stride,
209  p = (p + 1u) % max_group_size)
210  {
211  if((proc_list[p] != -1) || (shape.is_zero(index)) || !process_mask.at(p)) continue;
212 
213  proc_list[p] = proc_map(p);
214  ++count;
215  }
216 
217  // Remove processes from the list that will not be in the group
218  for(size_type x = 0ul, p = 0ul; x < count; ++p) {
219  if(proc_list[p] == -1) continue;
220  proc_list[x++] = proc_list[p];
221  }
222 
223  // Truncate invalid process id's
224  proc_list.resize(count);
225 
226  return madness::Group(TensorImpl_::world(), proc_list,
227  madness::DistributedID(DistEvalImpl_::id(), k + key_offset));
228  }
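  // --- Worked example (illustrative, not part of the original file) -----
  // Suppose max_group_size = 4, k = 6, process_mask is all true, and the
  // shape marks only tile slots {0, 2} of this row/column as non-zero.
  // The root p = 6 % 4 = 2 is flagged first (it is always included), the
  // scan then flags p = 0 for the non-zero tile in slot 0, and compaction
  // yields proc_list = { proc_map(0), proc_map(2) }, so the group is built
  // from exactly those two world ranks.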
229 
231 
234  madness::Group make_row_group(const size_type k) const {
235  // Construct the sparse broadcast group
236  const size_type right_begin_k = k * proc_grid_.cols();
237  const size_type right_end_k = right_begin_k + proc_grid_.cols();
238  // make the row mask; using the same mask for all tiles avoids having to compute mask
239  // for every tile and use of masked broadcasts
240  auto result_row_mask_k = make_row_mask(k);
241 
242  // return empty group if I am not in this group, otherwise make a group
243  if (result_row_mask_k[proc_grid_.rank_col()])
244  return make_group(right_.shape(), result_row_mask_k, right_begin_k, right_end_k,
245  right_stride_, proc_grid_.proc_cols(), k, k_,
246  [&](const ProcGrid::size_type col) { return proc_grid_.map_col(col); });
247  else
248  return madness::Group();
249  }
250 
251 
253 
256  madness::Group make_col_group(const size_type k) const {
257 
258  // make the column mask; using the same mask for all tiles avoids having to compute mask
259  // for every tile and use of masked broadcasts
260  auto result_col_mask_k = make_col_mask(k);
261 
262  // return empty group if I am not in this group, otherwise make a group
263  if (result_col_mask_k[proc_grid_.rank_row()])
264  return make_group(left_.shape(), result_col_mask_k, k, left_end_, left_stride_,
265  proc_grid_.proc_rows(), k, 0ul,
266  [&](const ProcGrid::size_type row) { return proc_grid_.map_row(row); });
267  else
268  return madness::Group();
269  }
270 
272 
276  std::vector<bool> make_row_mask(const size_type k) const {
277 
278  // "local" A[i][k] (i.e. for all i assigned to my row of processes) will produce C[i][*]
279  // for each process in my row of the process grid determine whether there are any
280  // nonzero C[i][*] located on that node
281 
282  const auto nproc_cols = proc_grid_.proc_cols();
283  const auto my_proc_row = proc_grid_.rank_row();
284 
285  // result shape
286  const auto& result_shape = TensorImpl_::shape();
287 
288  // if result is dense, include all processors
289  if (result_shape.is_dense())
290  return std::vector<bool>(nproc_cols, true);
291 
292  // initialize the mask
293  std::vector<bool> mask(nproc_cols, false);
294 
295  // number of tiles in the col dimension of the result
296  const auto nj = proc_grid_.cols();
297  // number of tiles in contraction dim
298  const auto nk = k_;
299 
300  // for each i assigned to my row of processes ...
301  size_type i_start, i_fence, i_stride;
302  std::tie(i_start, i_fence, i_stride) =
303  result_row_range(my_proc_row);
304  const auto ik_stride = i_stride * nk;
305  for (size_type i = i_start, ik = i_start * nk + k; i < i_fence;
306  i += i_stride, ik += ik_stride) {
307  // ... such that A[i][k] exists ...
308  if (!left_.shape().is_zero(ik)) {
309  // ... the owner of А[i][k] is always in the group ...
310  const auto k_proc_col = k % nproc_cols;
311  mask[k_proc_col] = true;
312  // ... loop over processes in my row ...
313  for (size_type proc_col = 0; proc_col != nproc_cols; ++proc_col) {
314  // ... that are not the owner of A[i][k] ...
315  if (proc_col != k_proc_col) {
316  // ... loop over all C[i][j] tiles that belong to this process ...
317  size_type j_start, j_fence, j_stride;
318  std::tie(j_start, j_fence, j_stride) =
319  result_col_range(proc_col);
320  const auto ij_stride = j_stride;
321  for (size_type j = j_start, ij = i * nj + j_start; j < j_fence;
322  j += j_stride, ij += ij_stride) {
323  // ... if any such C[i][j] exists, update the mask, and move
324  // on to next process
325  if (!result_shape.is_zero(DistEvalImpl_::perm_index_to_target(ij))) {
326  mask[proc_col] = true;
327  break;
328  }
329  }
330  }
331  }
332  }
333  }
334 
335  return mask;
336  }
337 
339 
344  std::vector<bool> make_col_mask(const size_type k) const {
345  // "local" B[k][j] (i.e. for all j assigned to my column of processes)
346  // will produce C[*][j]
347  // for each process in my column of the process grid determine whether
348  // there are any
349  // nonzero C[*][j] located on that node
350 
351  const auto nproc_rows = proc_grid_.proc_rows();
352  const auto my_proc_col = proc_grid_.rank_col();
353 
354  // result shape
355  const auto& result_shape = TensorImpl_::shape();
356 
357  // if result is dense, include all processors
358  if (result_shape.is_dense())
359  return std::vector<bool>(nproc_rows, true);
360 
361  // initialize the mask
362  std::vector<bool> mask(nproc_rows, false);
363 
364  // number of tiles in col dim of the result
365  const auto nj = proc_grid_.cols();
366 
367  // for each j assigned to my column of processes ...
368  size_type j_start, j_fence, j_stride;
369  std::tie(j_start, j_fence, j_stride) = result_col_range(my_proc_col);
370  const auto kj_stride = j_stride;
371  for (size_type j = j_start, kj = k * nj + j_start; j < j_fence;
372  j += j_stride, kj += kj_stride) {
373  // ... such that B[k][j] exists ...
374  if (!right_.shape().is_zero(kj)) {
375  // ... the owner of B[k][j] is always in the group ...
376  auto k_proc_row = k % nproc_rows;
377  mask[k_proc_row] = true;
378  // ... loop over processes in my col ...
379  for (size_type proc_row = 0; proc_row != nproc_rows; ++proc_row) {
380  // ... that are not the owner of B[k][j] ...
381  if (proc_row != k_proc_row) {
382  // ... loop over all C[i][j] tiles that belong to this process
383  size_type i_start, i_fence, i_stride;
384  std::tie(i_start, i_fence, i_stride) =
385  result_row_range(proc_row);
386  const auto ij_stride = i_stride * nj;
387  for (size_type i = i_start, ij = i_start * nj + j; i < i_fence;
388  i += i_stride, ij += ij_stride) {
389  // ... if any such C[i][j] exists, update the mask, and move
390  // on to next process
 391  if (!result_shape.is_zero(
 392  DistEvalImpl_::perm_index_to_target(ij))) {
 393  mask[proc_row] = true;
394  break;
395  }
396  }
397  }
398  }
399  }
400  }
401 
402  return mask;
403  }
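  // --- Illustrative note (not part of the original file) ----------------
  // The two masks above restrict the step-k broadcasts to processes that can
  // actually use the data.  For example, if my process row owns tile-rows
  // i = {1, 4} and only A[4][k] is non-zero, a process column enters the row
  // mask only if it owns at least one non-zero C[4][j]; the column holding
  // the broadcast root (k % proc_cols) is always included, and a dense
  // result masks in every process.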
404 
406 
411  inline std::tuple<size_type, size_type, size_type> result_row_range(
412  size_type proc_row) const {
413  const size_type start = proc_row;
414  const size_type fence = proc_grid_.rows();
415  const size_type stride = proc_grid_.proc_rows();
416  return std::make_tuple(start, fence, stride);
417  }
418 
420 
425  std::tuple<size_type, size_type, size_type> result_col_range(
426  size_type proc_col) const {
427  const size_type start = proc_col;
428  const size_type fence = proc_grid_.cols();
429  const size_type stride = proc_grid_.proc_cols();
430  return std::make_tuple(start, fence, stride);
431  }
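  // --- Example (illustrative, not part of the original file) ------------
  // With proc_grid_.rows() == 10 result tile-rows distributed over
  // proc_grid_.proc_rows() == 3 process rows, result_row_range(1) returns
  // (start = 1, fence = 10, stride = 3), i.e. process row 1 owns tile-rows
  // {1, 4, 7}.  result_col_range() works the same way along the columns.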
432 
433  // Broadcast kernels -----------------------------------------------------
434 
436 
440  template <typename Tile>
 441  static auto convert_tile(const Tile& tile) {
 442  Cast<typename eval_trait<Tile>::type, Tile> cast;
 443  return cast(tile);
444  }
445 
447 
453  template <typename Arg>
454  static typename std::enable_if<
455  ! is_lazy_tile<typename Arg::value_type>::value,
456  Future<typename Arg::eval_type> >::type
457  get_tile(Arg& arg, const typename Arg::size_type index) { return arg.get(index); }
458 
459 
461 
468  template <typename Arg>
469  static typename std::enable_if<
470  is_lazy_tile<typename Arg::value_type>::value,
471  Future<typename Arg::eval_type> >::type
472  get_tile(Arg& arg, const typename Arg::size_type index) {
473  auto convert_tile_fn =
474  &Summa_::template convert_tile<typename Arg::value_type>;
475  return arg.world().taskq.add(convert_tile_fn, arg.get(index),
476  madness::TaskAttributes::hipri());
477  }
478 
479 
481 
489  template <typename Arg, typename Datum>
490  void get_vector(Arg& arg, size_type index, const size_type end,
491  const size_type stride, std::vector<Datum>& vec) const
492  {
493  TA_ASSERT(vec.size() == 0ul);
494 
495  // Iterate over vector of tiles
496  if(arg.is_local(index)) {
497  for(size_type i = 0ul; index < end; ++i, index += stride) {
498  if(arg.shape().is_zero(index)) continue;
499  vec.emplace_back(i, get_tile(arg, index));
500  }
501  } else {
502  for(size_type i = 0ul; index < end; ++i, index += stride) {
503  if(arg.shape().is_zero(index)) continue;
504  vec.emplace_back(i, Future<typename Arg::eval_type>());
505  }
506  }
507 
508  TA_ASSERT(vec.size() > 0ul);
509  }
510 
512 
515  void get_col(const size_type k, std::vector<col_datum>& col) const {
516  col.reserve(proc_grid_.local_rows());
517  get_vector(left_, left_start_local_ + k, left_end_, left_stride_local_, col);
518  }
519 
521 
524  void get_row(const size_type k, std::vector<row_datum>& row) const {
525  row.reserve(proc_grid_.local_cols());
526 
527  // Compute local iteration limits for row k of right_.
528  size_type begin = k * proc_grid_.cols();
529  const size_type end = begin + proc_grid_.cols();
530  begin += proc_grid_.rank_col();
531 
532  get_vector(right_, begin, end, right_stride_local_, row);
533  }
534 
536 
543  template <typename Datum>
544  void bcast(const size_type start, const size_type stride,
545  const madness::Group& group, const ProcessID group_root,
546  const size_type key_offset, std::vector<Datum>& vec) const
547  {
548  TA_ASSERT(vec.size() != 0ul);
549  TA_ASSERT(group.size() > 0);
550  TA_ASSERT(group_root < group.size());
551 
552 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
553  std::stringstream ss;
554  ss << "bcast: rank=" << TensorImpl_::world().rank()
555  << " root=" << group.world_rank(group_root)
556  << " groupid=(" << group.id().first << "," << group.id().second
557  << ") keyoffset=" << key_offset << " group={ ";
558  for(ProcessID group_proc = 0; group_proc < group.size(); ++group_proc)
559  ss << group.world_rank(group_proc) << " ";
560  ss << "} tiles={ ";
561 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
562 
563  // Iterate over tiles to be broadcast
564  for(typename std::vector<Datum>::iterator it = vec.begin(); it != vec.end(); ++it) {
565  const size_type index = it->first * stride + start;
566 
567  // Broadcast the tile
568  const madness::DistributedID key(DistEvalImpl_::id(), index + key_offset);
569  TensorImpl_::world().gop.bcast(key, it->second, group_root, group);
570 
571 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
572  ss << index << " ";
573 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
574  }
575 
576  TA_ASSERT(vec.size() > 0ul);
577 
578 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
579  ss << "}\n";
 580  printf("%s", ss.str().c_str());
581 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
582  }
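  // --- Keying note (illustrative, not part of the original file) --------
  // Every broadcast above is tagged with DistributedID(id, index + key_offset).
  // Column (left) tiles are sent with key_offset = 0 and row (right) tiles
  // with key_offset = left_.size(), so the two tile streams of a SUMMA step
  // never collide in the key space.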
583 
584  // Broadcast specialization for left and right arguments -----------------
585 
586 
587  ProcessID get_row_group_root(const size_type k, const madness::Group& row_group) const {
588  ProcessID group_root = k % proc_grid_.proc_cols();
589  if(! right_.shape().is_dense() && row_group.size() < static_cast<ProcessID>(proc_grid_.proc_cols())) {
590  const ProcessID world_root = proc_grid_.rank_row() * proc_grid_.proc_cols() + group_root;
591  group_root = row_group.rank(world_root);
592  }
593  return group_root;
594  }
595 
596  ProcessID get_col_group_root(const size_type k, const madness::Group& col_group) const {
597  ProcessID group_root = k % proc_grid_.proc_rows();
598  if(! left_.shape().is_dense() && col_group.size() < static_cast<ProcessID>(proc_grid_.proc_rows())) {
599  const ProcessID world_root = group_root * proc_grid_.proc_cols() + proc_grid_.rank_col();
600  group_root = col_group.rank(world_root);
601  }
602  return group_root;
603  }
604 
606 
609  void bcast_col(const size_type k, std::vector<col_datum>& col, const madness::Group& row_group) const {
610  // broadcast if I'm part of the broadcast group
611  if (!row_group.empty()) {
612  // Broadcast column k of left_.
613  ProcessID group_root = get_row_group_root(k, row_group);
614  bcast(left_start_local_ + k, left_stride_local_, row_group, group_root, 0ul, col);
615  }
616  }
617 
619 
622  void bcast_row(const size_type k, std::vector<row_datum>& row, const madness::Group& col_group) const {
623  // broadcast if I'm part of the broadcast group
624  if (!col_group.empty()) {
625  // Compute the group root process.
626  ProcessID group_root = get_col_group_root(k, col_group);
627 
628  // Broadcast row k of right_.
629  bcast(k * proc_grid_.cols() + proc_grid_.rank_col(),
630  right_stride_local_, col_group, group_root, left_.size(), row);
631  }
632  }
633 
634  void bcast_col_range_task(size_type k, const size_type end) const {
 635  // Compute the first column of left for which this process column is the broadcast root
636  const size_type Pcols = proc_grid_.proc_cols();
637  k += (Pcols - ((k + Pcols - proc_grid_.rank_col()) % Pcols)) % Pcols;
638 
639  for(; k < end; k += Pcols) {
640 
641  // Compute local iteration limits for column k of left_.
642  size_type index = left_start_local_ + k;
643 
644  // will create broadcast group only if needed
645  bool have_group = false;
646  madness::Group row_group;
647  ProcessID group_root;
648  bool do_broadcast;
649 
650  // Search column k of left for non-zero tiles
651  for(; index < left_end_; index += left_stride_local_) {
652  if(left_.shape().is_zero(index)) continue;
653 
654  // Construct broadcast group, if needed
655  if (!have_group) {
656  have_group = true;
657  row_group = make_row_group(k);
658  // broadcast if I am in this group and this group has others
659  do_broadcast = !row_group.empty() && row_group.size() > 1;
660  if (do_broadcast)
661  group_root = get_row_group_root(k, row_group);
662  }
663 
664  if(do_broadcast) {
665  // Broadcast the tile
666  const madness::DistributedID key(DistEvalImpl_::id(), index);
667  auto tile = get_tile(left_, index);
668  TensorImpl_::world().gop.bcast(key, tile, group_root, row_group);
669  } else {
670  // Discard the tile
671  left_.discard(index);
672  }
673  }
674  }
675  }
676 
677  void bcast_row_range_task(size_type k, const size_type end) const {
678  // Compute the first local row of right
679  const size_type Prows = proc_grid_.proc_rows();
680  k += (Prows - ((k + Prows - proc_grid_.rank_row()) % Prows)) % Prows;
681 
682  for(; k < end; k += Prows) {
683 
684  // Compute local iteration limits for row k of right_.
685  size_type index = k * proc_grid_.cols();
686  const size_type row_end = index + proc_grid_.cols();
687  index += proc_grid_.rank_col();
688 
689  // will create broadcast group only if needed
690  bool have_group = false;
691  madness::Group col_group;
692  ProcessID group_root;
693  bool do_broadcast;
694 
695  // Search for and broadcast non-zero row
696  for(; index < row_end; index += right_stride_local_) {
697  if(right_.shape().is_zero(index)) continue;
698 
699  // Construct broadcast group
700  if (!have_group) {
701  have_group = true;
702  col_group = make_col_group(k);
703  // broadcast if I am in this group and this group has others
704  do_broadcast = !col_group.empty() && col_group.size() > 1;
705  if (do_broadcast)
706  group_root = get_col_group_root(k, col_group);
707  }
708 
709  if(do_broadcast) {
710  // Broadcast the tile
711  const madness::DistributedID key(DistEvalImpl_::id(), index + left_.size());
712  auto tile = get_tile(right_, index);
713  TensorImpl_::world().gop.bcast(key, tile, group_root, col_group);
714  } else {
715  // Discard the tile
716  right_.discard(index);
717  }
718  }
719  }
720  }
721 
722 
723  // Row and column iteration functions ------------------------------------
724 
726 
733  size_type iterate_row(size_type k) const {
734  // Iterate over k's until a non-zero tile is found or the end of the
735  // matrix is reached.
736  size_type end = k * proc_grid_.cols();
737  for(; k < k_; ++k) {
738  // Search for non-zero tiles in row k of right
739  size_type i = end + proc_grid_.rank_col();
740  end += proc_grid_.cols();
741  for(; i < end; i += right_stride_local_)
742  if(! right_.shape().is_zero(i))
743  return k;
744  }
745 
746  return k;
747  }
748 
750 
757  size_type iterate_col(size_type k) const {
758  // Iterate over k's until a non-zero tile is found or the end of the
759  // matrix is reached.
760  for(; k < k_; ++k)
 761  // Search column k of left for non-zero tiles
762  for(size_type i = left_start_local_ + k; i < left_end_; i += left_stride_local_)
763  if(! left_.shape().is_zero(i))
764  return k;
765 
766  return k;
767  }
768 
769 
771 
780  size_type iterate_sparse(const size_type k) const {
781  // Initial step for k_col and k_row.
782  size_type k_col = iterate_col(k);
783  size_type k_row = iterate_row(k_col);
784 
785  // Search for a row and column that both have non-zero tiles
786  while(k_col != k_row) {
787  if(k_col < k_row) {
788  k_col = iterate_col(k_row);
789  } else {
790  k_row = iterate_row(k_col);
791  }
792  }
793 
794  if(k < k_row) {
795  // Spawn a task to broadcast any local columns of left that were skipped
796  TensorImpl_::world().taskq.add(shared_from_this(),
797  & Summa_::bcast_col_range_task, k, k_row,
798  madness::TaskAttributes::hipri());
799 
800  // Spawn a task to broadcast any local rows of right that were skipped
801  TensorImpl_::world().taskq.add(shared_from_this(),
802  & Summa_::bcast_row_range_task, k, k_col,
803  madness::TaskAttributes::hipri());
804  }
805 
806  return k_col;
807  }
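  // --- Worked example (illustrative, not part of the original file) -----
  // iterate_sparse() alternates the two searches until they agree on a k.
  // With non-zero local columns of left at k = {1, 4, 6} and non-zero local
  // rows of right at k = {2, 4, 7}, starting from k = 0 the sequence is
  //   iterate_col(0) -> 1, iterate_row(1) -> 2, iterate_col(2) -> 4,
  //   iterate_row(4) -> 4,
  // so the next SUMMA step runs at k = 4, and the skipped contributions for
  // k = 0..3 are broadcast or discarded by the range tasks spawned above.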
808 
809 
811 
820  size_type iterate(const size_type k) const {
821  return (left_.shape().is_dense() && right_.shape().is_dense() ?
822  k : iterate_sparse(k));
823  }
824 
825 
826  // Initialization functions ----------------------------------------------
827 
829  size_type initialize(const DenseShape&) {
830  // Construct static broadcast groups for dense arguments
831  const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
832  col_group_ = proc_grid_.make_col_group(col_did);
833  const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
834  row_group_ = proc_grid_.make_row_group(row_did);
835 
836 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
837  std::stringstream ss;
838  ss << "init: rank=" << TensorImpl_::world().rank()
839  << "\n col_group_=(" << col_did.first << ", " << col_did.second << ") { ";
840  for(ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
841  ss << col_group_.world_rank(gproc) << " ";
842  ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second << ") { ";
843  for(ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
844  ss << row_group_.world_rank(gproc) << " ";
845  ss << "}\n";
 846  printf("%s", ss.str().c_str());
847 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
848 
849  // Allocate memory for the reduce pair tasks.
850  std::allocator<ReducePairTask<op_type> > alloc;
851  reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
852 
853  // Iterate over all local tiles
854  const size_type n = proc_grid_.local_size();
855  for(size_type t = 0ul; t < n; ++t) {
856  // Initialize the reduction task
857  ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task = reduce_tasks_ + t;
858  new(reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_);
859  }
860 
861  return proc_grid_.local_size();
862  }
863 
865  template <typename Shape>
866  size_type initialize(const Shape& shape) {
867 
868 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
869  std::stringstream ss;
870  ss << " initialize rank=" << TensorImpl_::world().rank() << " tiles={ ";
871 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
872 
873  // Allocate memory for the reduce pair tasks.
874  std::allocator<ReducePairTask<op_type> > alloc;
875  reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
876 
877  // Initialize iteration variables
878  size_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
879  size_type row_end = row_start + proc_grid_.cols();
880  row_start += proc_grid_.rank_col();
881  const size_type col_stride = // The stride to iterate down a column
882  proc_grid_.proc_rows() * proc_grid_.cols();
883  const size_type row_stride = // The stride to iterate across a row
884  proc_grid_.proc_cols();
885  const size_type end = TensorImpl_::size();
886 
887  // Iterate over all local tiles
888  size_type tile_count = 0ul;
889  ReducePairTask<op_type>* MADNESS_RESTRICT reduce_task = reduce_tasks_;
890  // this loops over result tiles arranged in block-cyclic order
891  // index = tile index (row major)
892  for(; row_start < end; row_start += col_stride, row_end += col_stride) {
893  for(size_type index = row_start; index < row_end; index += row_stride, ++reduce_task) {
894 
895  // Initialize the reduction task
896 
897  // Skip zero tiles
898  if(! shape.is_zero(DistEvalImpl_::perm_index_to_target(index))) {
899 
900 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
901  ss << index << " ";
902 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
903 
904  new(reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_);
905  ++tile_count;
906  } else {
907  // Construct an empty task to represent zero tiles.
908  new(reduce_task) ReducePairTask<op_type>();
909  }
910  }
911  }
912 
913 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
914  ss << "}\n";
 915  printf("%s", ss.str().c_str());
916 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
917 
918  return tile_count;
919  }
920 
921  size_type initialize() {
922 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
923  printf("init: start rank=%i\n", TensorImpl_::world().rank());
924 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
925 
926  const size_type result = initialize(TensorImpl_::shape());
927 
928 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
929  printf("init: finish rank=%i\n", TensorImpl_::world().rank());
930 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
931 
932  return result;
933  }
934 
935 
936  // Finalize functions ----------------------------------------------------
937 
939  void finalize(const DenseShape&) {
940  // Initialize iteration variables
941  size_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
942  size_type row_end = row_start + proc_grid_.cols();
943  row_start += proc_grid_.rank_col();
944  const size_type col_stride = // The stride to iterate down a column
945  proc_grid_.proc_rows() * proc_grid_.cols();
946  const size_type row_stride = // The stride to iterate across a row
947  proc_grid_.proc_cols();
948  const size_type end = TensorImpl_::size();
949 
950  // Iterate over all local tiles
951  for(ReducePairTask<op_type>* reduce_task = reduce_tasks_;
952  row_start < end; row_start += col_stride, row_end += col_stride) {
953  for(size_type index = row_start; index < row_end; index += row_stride, ++reduce_task) {
954 
955 
 956  // Set the result tile
 957  DistEvalImpl_::set_tile(DistEvalImpl_::perm_index_to_target(index),
 958  reduce_task->submit());
959 
960  // Destroy the reduce task
961  reduce_task->~ReducePairTask<op_type>();
962  }
963  }
964 
965  // Deallocate the memory for the reduce pair tasks.
966  std::allocator<ReducePairTask<op_type> >().deallocate(reduce_tasks_,
967  proc_grid_.local_size());
968  }
969 
971  template <typename Shape>
972  void finalize(const Shape& shape) {
973 
974 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
975  std::stringstream ss;
976  ss << " finalize rank=" << TensorImpl_::world().rank() << " tiles={ ";
977 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
978 
979  // Initialize iteration variables
980  size_type row_start = proc_grid_.rank_row() * proc_grid_.cols();
981  size_type row_end = row_start + proc_grid_.cols();
982  row_start += proc_grid_.rank_col();
983  const size_type col_stride = // The stride to iterate down a column
984  proc_grid_.proc_rows() * proc_grid_.cols();
985  const size_type row_stride = // The stride to iterate across a row
986  proc_grid_.proc_cols();
987  const size_type end = TensorImpl_::size();
988 
989  // Iterate over all local tiles
990  for(ReducePairTask<op_type>* reduce_task = reduce_tasks_;
991  row_start < end; row_start += col_stride, row_end += col_stride) {
992  for(size_type index = row_start; index < row_end; index += row_stride, ++reduce_task) {
993  // Compute the permuted index
994  const size_type perm_index = DistEvalImpl_::perm_index_to_target(index);
995 
996  // Skip zero tiles
997  if(! shape.is_zero(perm_index)) {
998 
999 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1000  ss << index << " ";
1001 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1002 
1003  // Set the result tile
1004  DistEvalImpl_::set_tile(perm_index, reduce_task->submit());
1005  }
1006 
1007  // Destroy the reduce task
1008  reduce_task->~ReducePairTask<op_type>();
1009  }
1010  }
1011 
1012  // Deallocate the memory for the reduce pair tasks.
1013  std::allocator<ReducePairTask<op_type> >().deallocate(reduce_tasks_,
1014  proc_grid_.local_size());
1015 
1016 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1017  ss << "}\n";
 1018  printf("%s", ss.str().c_str());
1019 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1020  }
1021 
1022  void finalize() {
1023 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1024  printf("finalize: start rank=%i\n", TensorImpl_::world().rank());
1025 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1026 
1027  finalize(TensorImpl_::shape());
1028 
1029 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1030  printf("finalize: finish rank=%i\n", TensorImpl_::world().rank());
1031 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
1032  }
1033 
1035 
1037  class FinalizeTask : public madness::TaskInterface {
1038  private:
1039  std::shared_ptr<Summa_> owner_;
1040 
1041  public:
1042  FinalizeTask(const std::shared_ptr<Summa_>& owner, const int ndep) :
1043  madness::TaskInterface(ndep, madness::TaskAttributes::hipri()),
1044  owner_(owner)
1045  { }
1046 
1047  virtual ~FinalizeTask() { }
1048 
1049  virtual void run(const madness::TaskThreadEnv&) { owner_->finalize(); }
1050 
1051  }; // class FinalizeTask
1052 
1053 
1054  // Contraction functions -------------------------------------------------
1055 
1057 
1064  void contract(const DenseShape&, const size_type,
1065  const std::vector<col_datum>& col, const std::vector<row_datum>& row,
1066  madness::TaskInterface* const task)
1067  {
1068  // Iterate over the row
1069  for(size_type i = 0ul; i < col.size(); ++i) {
1070  // Compute the local, result-tile offset
1071  const size_type reduce_task_offset = col[i].first * proc_grid_.local_cols();
1072 
1073  // Iterate over columns
1074  for(size_type j = 0ul; j < row.size(); ++j) {
1075  const size_type reduce_task_index = reduce_task_offset + row[j].first;
1076 
1077  // Schedule task for contraction pairs
1078  if(task)
1079  task->inc();
1080  const left_future left = col[i].second;
1081  const right_future right = row[j].second;
1082  reduce_tasks_[reduce_task_index].add(left, right, task);
1083  }
1084  }
1085  }
1086 
1088 
1095  template <typename Shape>
1096  void contract(const Shape&, const size_type,
1097  const std::vector<col_datum>& col, const std::vector<row_datum>& row,
1098  madness::TaskInterface* const task)
1099  {
1100  // Iterate over the row
1101  for(size_type i = 0ul; i < col.size(); ++i) {
1102  // Compute the local, result-tile offset
1103  const size_type reduce_task_offset = col[i].first * proc_grid_.local_cols();
1104 
1105  // Iterate over columns
1106  for(size_type j = 0ul; j < row.size(); ++j) {
1107  const size_type reduce_task_index = reduce_task_offset + row[j].first;
1108 
1109  // Skip zero tiles
1110  if(! reduce_tasks_[reduce_task_index])
1111  continue;
1112 
1113  // Schedule task for contraction pairs
1114  if(task) {
1115  if (trace_tasks)
1116  task->inc_debug("destroy(*ReduceObject)");
1117  else
1118  task->inc();
1119  }
1120  const left_future left = col[i].second;
1121  const right_future right = row[j].second;
1122  reduce_tasks_[reduce_task_index].add(left, right, task);
1123  }
1124  }
1125  }
1126 
1127 #define TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
1128 #ifndef TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
1129 
1141  template <typename T>
1142  typename std::enable_if<std::is_floating_point<T>::value>::type
1143  contract(const SparseShape<T>&, const size_type k,
1144  const std::vector<col_datum>& col, const std::vector<row_datum>& row,
1145  madness::TaskInterface* const task)
1146  {
1147  // Cache row shape data.
1148  std::vector<typename SparseShape<T>::value_type> row_shape_values;
1149  row_shape_values.reserve(row.size());
1150  const size_type row_start = k * proc_grid_.cols() + proc_grid_.rank_col();
1151  for(size_type j = 0ul; j < row.size(); ++j)
1152  row_shape_values.push_back(right_.shape()[row_start + (row[j].first * right_stride_local_)]);
1153 
1154  const size_type col_start = left_start_local_ + k;
1155  const float threshold_k = TensorImpl_::shape().threshold() / typename SparseShape<T>::value_type(k_);
1156  // Iterate over the row
1157  for(size_type i = 0ul; i != col.size(); ++i) {
1158  // Compute the local, result-tile offset
1159  const size_type offset = col[i].first * proc_grid_.local_cols();
1160 
1161  // Get the shape data for col_it tile
1162  const typename SparseShape<T>::value_type col_shape_value =
1163  left_.shape()[col_start + (col[i].first * left_stride_local_)];
1164 
1165  // Iterate over columns
1166  for(size_type j = 0ul; j < row.size(); ++j) {
1167  if((col_shape_value * row_shape_values[j]) < threshold_k)
1168  continue;
1169 
1170  const size_type reduce_task_index = offset + row[j].first;
1171 
1172  // Skip zero tiles
1173  if(! reduce_tasks_[reduce_task_index])
1174  continue;
1175 
1176  if(task)
1177  task->inc();
1178  reduce_tasks_[reduce_task_index].add(col[i].second, row[j].second, task);
1179  }
1180  }
1181  }
1182 #endif // TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
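  // --- Screening example (illustrative, not part of the original file) --
  // When the filter above is compiled in (it is disabled by the #define
  // preceding it), a tile pair is skipped when the product of its shape
  // norms falls below threshold / k_.  E.g. with a shape threshold of 1e-8
  // and k_ = 100, the cutoff is 1e-10: norms 1e-4 and 1e-7 (product 1e-11)
  // are dropped, while norms 1e-4 and 1e-6 (product 1e-10) are kept.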
1183 
1184  void contract(const size_type k, const std::vector<col_datum>& col,
1185  const std::vector<row_datum>& row, madness::TaskInterface* const task)
1186  { contract(TensorImpl_::shape(), k, col, row, task); }
1187 
1188 
1189  // SUMMA step task -------------------------------------------------------
1190 
1191 
1193 
1196  class StepTask : public madness::TaskInterface {
1197  protected:
1198  // Member variables
1199  std::shared_ptr<Summa_> owner_;
1200  World& world_;
1201  std::vector<col_datum> col_{};
1202  std::vector<row_datum> row_{};
1203  FinalizeTask* finalize_task_;
1204  StepTask* next_step_task_ = nullptr;
1205  StepTask* tail_step_task_ = nullptr;
1206 
1207  void get_col(const size_type k) {
1208  owner_->get_col(k, col_);
1209  if (trace_tasks)
1210  this->notify_debug("StepTask::spawn_col");
1211  else
1212  this->notify();
1213  }
1214 
1215  void get_row(const size_type k) {
1216  owner_->get_row(k, row_);
1217  if (trace_tasks)
1218  this->notify_debug("StepTask::spawn_row");
1219  else
1220  this->notify();
1221  }
1222 
1223  public:
1224 
1225  StepTask(const std::shared_ptr<Summa_>& owner, int finalize_ndep) :
1226 #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
1227  madness::TaskInterface(0ul, "StepTask 1st ctor", madness::TaskAttributes::hipri()),
1228 #else
1229  madness::TaskInterface(0ul, madness::TaskAttributes::hipri()),
1230 #endif
1231  owner_(owner), world_(owner->world()),
1232  finalize_task_(new FinalizeTask(owner, finalize_ndep))
1233  {
1234  TA_ASSERT(owner_);
1235  owner_->world().taskq.add(finalize_task_);
1236  }
1237 
1239 
1242  StepTask(StepTask* const parent, const int ndep) :
1243 #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
1244  madness::TaskInterface(ndep, "StepTask nth ctor", madness::TaskAttributes::hipri()),
1245 #else
1246  madness::TaskInterface(ndep, madness::TaskAttributes::hipri()),
1247 #endif
1248  owner_(parent->owner_), world_(parent->world_),
1249  finalize_task_(parent->finalize_task_)
1250  {
1251  TA_ASSERT(parent);
1252  parent->next_step_task_ = this;
1253  }
1254 
1255  virtual ~StepTask() { }
1256 
1257  void spawn_get_row_col_tasks(const size_type k) {
1258  // Submit the task to collect column tiles of left for iteration k
1259  if (trace_tasks)
1260  madness::DependencyInterface::inc_debug("StepTask::spawn_col");
1261  else
1262  madness::DependencyInterface::inc();
1263  world_.taskq.add(this, & StepTask::get_col, k, madness::TaskAttributes::hipri());
1264 
1265  // Submit the task to collect row tiles of right for iteration k
1266  if (trace_tasks)
1267  madness::DependencyInterface::inc_debug("StepTask::spawn_row");
1268  else
1269  madness::DependencyInterface::inc();
1270  world_.taskq.add(this, & StepTask::get_row, k, madness::TaskAttributes::hipri());
1271  }
1272 
1273  template <typename Derived>
1274  void make_next_step_tasks(Derived* task, size_type depth) {
1275  TA_ASSERT(depth > 0);
1276  // Set the depth to be no greater than the maximum number steps
1277  if(depth > owner_->k_)
1278  depth = owner_->k_;
1279 
1280  // Spawn n=depth step tasks
1281  for(; depth > 0ul; --depth) {
1282  // Set dep count of the tail task to 1, it will not start until this task commands
1283  Derived* const next = new Derived(task, depth == 1 ? 1 : 0);
1284  task = next;
1285  }
1286 
1287  // Keep track of the tail ptr
1288  tail_step_task_ = task;
1289  }
1290 
1291  template <typename Derived, typename GroupType>
1292  void run(const size_type k, const GroupType& row_group, const GroupType& col_group) {
1293 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1294  printf("step: start rank=%i k=%lu\n", owner_->world().rank(), k);
1295 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1296 
1297  if(k < owner_->k_) {
1298  // Initialize next tail task and submit next task
1299  TA_ASSERT(next_step_task_);
1300  next_step_task_->tail_step_task_ =
1301  new Derived(static_cast<Derived*>(tail_step_task_), 1); // <- ndep=1, will control its scheduling by this task
1302  // submit next step task ... even if it's same as tail_step_task_ it is safe to submit
1303  // because its ndep > 0 (see StepTask::make_next_step_tasks)
1304  TA_ASSERT(tail_step_task_->ndep() > 0);
1305  world_.taskq.add(next_step_task_);
1306  next_step_task_ = nullptr;
1307 
1308  // Start broadcast of column and row tiles for this step
1309  world_.taskq.add(owner_, & Summa_::bcast_col, k, col_, row_group,
1310  madness::TaskAttributes::hipri());
1311  world_.taskq.add(owner_, & Summa_::bcast_row, k, row_, col_group,
1312  madness::TaskAttributes::hipri());
1313 
1314  // Submit tasks for the contraction of col and row tiles.
1315  owner_->contract(k, col_, row_, tail_step_task_);
1316 
1317  // Notify task dependencies
1318  TA_ASSERT(tail_step_task_);
1319  if (trace_tasks)
1320  tail_step_task_->notify_debug("StepTask nth ctor");
1321  else
1322  tail_step_task_->notify();
1323  finalize_task_->notify();
1324 
1325  } else if(finalize_task_) {
1326  // Signal the finalize task so it can run after all non-zero step
1327  // tasks have completed.
1328  finalize_task_->notify();
1329 
1330  // Cleanup any remaining step tasks
1331  StepTask* step_task = next_step_task_;
1332  while(step_task) {
1333  StepTask* const next_step_task = step_task->next_step_task_;
1334  step_task->next_step_task_ = nullptr;
1335  step_task->finalize_task_ = nullptr;
1336  world_.taskq.add(step_task);
1337  step_task = next_step_task;
1338  }
1339 
1340  if (trace_tasks)
1341  tail_step_task_->notify_debug("StepTask nth ctor");
1342  else
1343  tail_step_task_->notify();
1344  }
1345 
1346 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1347  printf("step: finish rank=%i k=%lu\n", owner_->world().rank(), k);
1348 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
1349  }
1350 
1351  }; // class StepTask
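  // --- Pipeline note (illustrative, not part of the original file) ------
  // make_next_step_tasks() pre-builds `depth` StepTask objects chained via
  // next_step_task_, and each running step appends a fresh task behind the
  // current tail.  The contraction tasks spawned at step k register the step
  // task roughly `depth` iterations ahead as their completion callback, so
  // that task cannot run until step k's contributions are finished; this is
  // what limits the schedule to at most `depth` concurrent SUMMA iterations.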
1352 
1353  class DenseStepTask : public StepTask {
1354  protected:
1355  const size_type k_;
1356  using StepTask::owner_;
1357 
1358  public:
1359  DenseStepTask(const std::shared_ptr<Summa_>& owner, const size_type depth) :
1360  StepTask(owner, owner->k_ + 1ul), k_(0)
1361  {
1362  StepTask::make_next_step_tasks(this, depth);
1363  StepTask::spawn_get_row_col_tasks(k_);
1364  }
1365 
1366  DenseStepTask(DenseStepTask* const parent, const int ndep) :
1367  StepTask(parent, ndep), k_(parent->k_ + 1ul)
1368  {
1369  // Spawn tasks to get k-th row and column tiles
1370  if(k_ < owner_->k_)
1371  StepTask::spawn_get_row_col_tasks(k_);
1372  }
1373 
1374  virtual ~DenseStepTask() { }
1375 
1376  virtual void run(const madness::TaskThreadEnv&) {
1377  StepTask::template run<DenseStepTask>(k_, owner_->row_group_, owner_->col_group_);
1378  }
1379  }; // class DenseStepTask
1380 
1381  class SparseStepTask : public StepTask {
1382  protected:
1383  Future<size_type> k_{};
1384  Future<madness::Group> row_group_{};
1385  Future<madness::Group> col_group_{};
1386  using StepTask::owner_;
1387  using StepTask::world_;
1388  using StepTask::finalize_task_;
1389  using StepTask::next_step_task_;
1390 
1391  private:
1392 
1394  void iterate_task(size_type k, const size_type offset) {
1395  // Search for the next non-zero row and column
1396  k = owner_->iterate_sparse(k + offset);
1397  k_.set(k);
1398 
1399  if(k < owner_->k_) {
1400  // NOTE: The order of task submissions is dependent on the order in
1401  // which we want the tasks to complete.
1402 
1403  // Spawn tasks to get k-th row and column tiles
1404  StepTask::spawn_get_row_col_tasks(k);
1405 
1406  // Spawn tasks to construct the row and column broadcast group
1407  row_group_ = world_.taskq.add(owner_, & Summa_::make_row_group, k,
1408  madness::TaskAttributes::hipri());
1409  col_group_ = world_.taskq.add(owner_, & Summa_::make_col_group, k,
1410  madness::TaskAttributes::hipri());
1411 
1412  // Increment the finalize task dependency counter, which indicates
1413  // that this task is not the terminating step task.
1414  TA_ASSERT(finalize_task_);
1415  finalize_task_->inc();
1416  }
1417 
1418  if (trace_tasks)
1419  madness::DependencyInterface::notify_debug("SparseStepTask ctor");
1420  else
1421  madness::DependencyInterface::notify();
1422  }
1423 
1424  public:
1425 
1426  SparseStepTask(const std::shared_ptr<Summa_>& owner, size_type depth) :
1427  StepTask(owner, 1ul)
1428  {
1429  StepTask::make_next_step_tasks(this, depth);
1430 
1431  // Spawn a task to find the next non-zero iteration
1432  if (trace_tasks)
1433  madness::DependencyInterface::inc_debug("SparseStepTask ctor");
1434  else
1435  madness::DependencyInterface::inc();
1436  world_.taskq.add(this, & SparseStepTask::iterate_task,
1437  0ul, 0ul, madness::TaskAttributes::hipri());
1438  }
1439 
1440  SparseStepTask(SparseStepTask* const parent, const int ndep) :
1441  StepTask(parent, ndep)
1442  {
1443  if(parent->k_.probe() && (parent->k_.get() >= owner_->k_)) {
1444  // Avoid running extra tasks if not needed.
1445  k_.set(parent->k_.get());
1446  MADNESS_ASSERT(ndep == 1); // ensure that this does not get executed immediately
1447  } else {
1448  // Spawn a task to find the next non-zero iteration
1449  if (trace_tasks)
1450  madness::DependencyInterface::inc_debug("SparseStepTask ctor");
1451  else
1452  madness::DependencyInterface::inc();
1453  world_.taskq.add(this, & SparseStepTask::iterate_task,
1454  parent->k_, 1ul, madness::TaskAttributes::hipri());
1455  }
1456  }
1457 
1458  virtual ~SparseStepTask() { }
1459 
1460  virtual void run(const madness::TaskThreadEnv&) {
1461  StepTask::template run<SparseStepTask>(k_, row_group_, col_group_);
1462  }
1463  }; // class SparseStepTask
1464 
1465  public:
1466 
1468 
1483  Summa(const left_type& left, const right_type& right,
1484  World& world, const trange_type trange, const shape_type& shape,
1485  const std::shared_ptr<pmap_interface>& pmap, const Permutation& perm,
1486  const op_type& op, const size_type k, const ProcGrid& proc_grid) :
1487  DistEvalImpl_(world, trange, shape, pmap, perm),
1488  left_(left), right_(right), op_(op),
1489  row_group_(), col_group_(),
1490  k_(k), proc_grid_(proc_grid),
1491  reduce_tasks_(NULL),
1492  left_start_local_(proc_grid_.rank_row() * k),
1493  left_end_(left.size()),
1494  left_stride_(k),
1495  left_stride_local_(proc_grid.proc_rows() * k),
1496  right_stride_(1ul),
1497  right_stride_local_(proc_grid.proc_cols())
1498  { }
1499 
1500  virtual ~Summa() { }
1501 
1503 
1508  virtual Future<value_type> get_tile(size_type i) const {
1511 
1512  const size_type source_index = DistEvalImpl_::perm_index_to_source(i);
1513 
1514  // Compute tile coordinate in tile grid
1515  const size_type tile_row = source_index / proc_grid_.cols();
1516  const size_type tile_col = source_index % proc_grid_.cols();
1517  // Compute process coordinate of tile in the process grid
1518  const size_type proc_row = tile_row % proc_grid_.proc_rows();
1519  const size_type proc_col = tile_col % proc_grid_.proc_cols();
1520  // Compute the process that owns tile
1521  const ProcessID source = proc_row * proc_grid_.proc_cols() + proc_col;
1522 
1523  const madness::DistributedID key(DistEvalImpl_::id(), i);
1524  return TensorImpl_::world().gop.template recv<value_type>(source, key);
1525  }
1526 
1527 
1529 
1533  virtual void discard_tile(size_type i) const { get_tile(i); }
1534 
1535  private:
1536 
1538 
1545  size_type mem_bound_depth(size_type depth, const float left_sparsity, const float right_sparsity) {
1546 
1547  // Check if a memory bound has been set
1548  const size_type available_memory = max_memory_;
1549  if(available_memory) {
1550 
1551  // Compute the average memory requirement per iteration of this process
1552  const std::size_t local_memory_per_iter_left =
1553  (left_.trange().elements_range().volume() / left_.trange().tiles_range().volume()) *
1555  proc_grid_.local_rows() * (1.0f - left_sparsity);
1556  const std::size_t local_memory_per_iter_right =
1557  (right_.trange().elements_range().volume() / right_.trange().tiles_range().volume()) *
1559  proc_grid_.local_cols() * (1.0f - right_sparsity);
1560 
1561  // Compute the maximum number of iterations based on available memory
1562  const size_type mem_bound_depth =
 1563  (available_memory /
 1564  (local_memory_per_iter_left + local_memory_per_iter_right));
1565 
1566  // Check if the memory bounded depth is less than the optimal depth
1567  if(depth > mem_bound_depth) {
1568 
1569  // Adjust the depth based on the available memory
1570  switch(mem_bound_depth) {
1571  case 0:
 1572  // When the memory bound depth is 0, not even one iteration fits in memory
1573  TA_EXCEPTION("Insufficient memory available for SUMMA");
1574  break;
1575  case 1:
1576  if(TensorImpl_::world().rank() == 0)
 1577  printf("!! WARNING TiledArray: Memory constraints limit the SUMMA depth to 1.\n"
1578  "!! WARNING TiledArray: Performance may be slow.\n");
1579  default:
1580  depth = mem_bound_depth;
1581  }
1582  }
1583  }
1584 
1585  return depth;
1586  }
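  // --- Arithmetic example (illustrative, not part of the original file) -
  // If 2 GiB are available and one concurrent iteration keeps roughly
  // 300 MiB of left tiles and 200 MiB of right tiles resident on this
  // process, the memory-bounded depth is 2048 / (300 + 200) ~= 4, so a
  // requested depth of 8 would be reduced to 4; a bound of 0 raises the
  // exception above, and a bound of 1 prints the warning and runs with
  // depth 1.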
1587 
1589 
1595  virtual int internal_eval() {
1596 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1597  printf("eval: start eval children rank=%i\n", TensorImpl_::world().rank());
1598 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1599 
1600  // Start evaluate child tensors
1601  left_.eval();
1602  right_.eval();
1603 
1604 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1605  printf("eval: finished eval children rank=%i\n", TensorImpl_::world().rank());
1606 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1607 
1608  size_type tile_count = 0ul;
1609  if(proc_grid_.local_size() > 0ul) {
1610  tile_count = initialize();
1611 
1612  // depth controls the number of simultaneous SUMMA iterations
1613  // that are scheduled.
1614 
1615  // The optimal depth is equal to the smallest dimension of the process
1616  // grid, but no less than 2
1617  size_type depth =
1618  std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));
1619 
1620  // Construct the first SUMMA iteration task
1621  if(TensorImpl_::shape().is_dense()) {
1622  // We cannot have more iterations than there are blocks in the k
1623  // dimension
1624  if(depth > k_) depth = k_;
1625 
1626  // Modify the number of concurrent iterations based on the available
1627  // memory.
1628  depth = mem_bound_depth(depth, 0.0f, 0.0f);
1629 
1630  // Enforce user defined depth bound
 1631  if(max_depth_) depth = std::min(depth, max_depth_);
1632 
1633  TensorImpl_::world().taskq.add(new DenseStepTask(shared_from_this(),
1634  depth));
1635  } else {
1636  // Increase the depth based on the amount of sparsity in an iteration.
1637 
1638  // Get the sparsity fractions for the left- and right-hand arguments.
1639  const float left_sparsity = left_.shape().sparsity();
1640  const float right_sparsity = right_.shape().sparsity();
1641 
1642  // Compute the fraction of non-zero result tiles in a single SUMMA iteration.
1643  const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f))
1644  * (1.0f - std::min(right_sparsity, 0.9f));
1645 
1646  // Compute the new depth based on sparsity of the arguments
1647  depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
1648 
1649  // We cannot have more iterations than there are blocks in the k
1650  // dimension
1651  if(depth > k_) depth = k_;
1652 
1653  // Modify the number of concurrent iterations based on the available
1654  // memory and sparsity of the argument tensors.
1655  depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
1656 
1657  // Enforce user defined depth bound
 1658  if(max_depth_) depth = std::min(depth, max_depth_);
1659 
1660  TensorImpl_::world().taskq.add(new SparseStepTask(shared_from_this(),
1661  depth));
1662  }
1663  }
1664 
1665 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1666  printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank());
1667 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1668 
1669  // Wait for child tensors to be evaluated, and process tasks while waiting.
1670  left_.wait();
1671  right_.wait();
1672 
1673 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1674  printf("eval: finished wait children rank=%i\n", TensorImpl_::world().rank());
1675 #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
1676 
1677  return tile_count;
1678  }
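  // --- Depth heuristic example (illustrative, not part of the original
  // file), referring to the sparse branch of internal_eval() above: with
  // left and right sparsities of 0.5 each, frac_non_zero = 0.25 and
  // log2(0.25) = -2, so the scale factor is 1 + 1.35638 * 2 ~= 3.71; a base
  // depth of 2 becomes floor(2 * 3.71 + 0.5) = 7 before the k_, memory, and
  // TA_SUMMA_MAX_DEPTH bounds are applied.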
1679 
1680  }; // class Summa
1681 
1682 
1683  // Initialize static member variables for Summa
1684 
1685  template <typename Left, typename Right, typename Op, typename Policy>
 1686  typename Summa<Left, Right, Op, Policy>::size_type
 1687  Summa<Left, Right, Op, Policy>::max_depth_ =
1688  Summa<Left, Right, Op, Policy>::init_max_depth();
1689 
1690  template <typename Left, typename Right, typename Op, typename Policy>
 1691  typename Summa<Left, Right, Op, Policy>::size_type
 1692  Summa<Left, Right, Op, Policy>::max_memory_ =
1693  Summa<Left, Right, Op, Policy>::init_max_memory();
1694  } // namespace detail
1695 } // namespace TiledArray
1696 
1697 #endif // TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED