#ifndef TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED
#define TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED

#include <TiledArray/config.h>
// ...

/// Distributed contraction evaluator implementation.
template <typename Left, typename Right, typename Op, typename Policy>
class Summa : public DistEvalImpl<typename Op::result_type, Policy>,
              public std::enable_shared_from_this<Summa<Left, Right, Op, Policy> > {
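  // High-level sketch (as suggested by the members below): the result is
  // blocked over a 2-D process grid.  For each inner index k, a column of the
  // left-hand argument is broadcast along process rows and a row of the
  // right-hand argument along process columns, and every rank folds the
  // broadcast tile pairs into per-result-tile ReducePairTask reductions.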
  madness::Group row_group_;  ///< Process group for this rank's row of the process grid
  madness::Group col_group_;  ///< Process group for this rank's column of the process grid
  // ...

  typedef Future<typename right_type::eval_type> right_future;  ///< Future to a right-hand argument tile
  typedef Future<typename left_type::eval_type> left_future;    ///< Future to a left-hand argument tile
  typedef std::pair<size_type, right_future> row_datum;  ///< Datum element for a right-hand argument row
  typedef std::pair<size_type, left_future> col_datum;   ///< Datum element for a left-hand argument column
  static constexpr const bool trace_tasks =
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      true
#else
      false
#endif
      ;
  // ...

  using std::enable_shared_from_this<Summa_>::shared_from_this;
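  // Two optional environment variables configure the evaluator at static
  // initialization time: TA_SUMMA_MAX_MEMORY (a value with an optional
  // KB/KiB/MB/MiB/GB/GiB suffix) caps the memory used by in-flight broadcast
  // tiles, and TA_SUMMA_MAX_DEPTH caps the number of concurrently active
  // SUMMA iterations.  The excerpts below are from init_max_memory() and
  // init_max_depth().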
  // init_max_memory(): parse the TA_SUMMA_MAX_MEMORY environment variable,
  // a number with an optional unit suffix, into a byte count.
    const char* max_memory = getenv("TA_SUMMA_MAX_MEMORY");
    // ...
      std::stringstream ss(max_memory);
      // ... (read the numeric value and the unit string) ...
      if(unit == "KB" || unit == "kB") {
        memory *= 1000.0;
      } else if(unit == "KiB" || unit == "kiB") {
        memory *= 1024.0;
      } else if(unit == "MB") {
        memory *= 1000000.0;
      } else if(unit == "MiB") {
        memory *= 1048576.0;
      } else if(unit == "GB") {
        memory *= 1000000000.0;
      } else if(unit == "GiB") {
        memory *= 1073741824.0;
      }
      // ...
      // Enforce a floor of 100 MiB.
      memory = std::max(memory, 104857600.0);
    // ...

  // init_max_depth(): read the SUMMA pipeline-depth limit from the
  // TA_SUMMA_MAX_DEPTH environment variable.
    const char* max_depth = getenv("TA_SUMMA_MAX_DEPTH");
    // ...
      return std::stoul(max_depth);
    // ...
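  // make_group() builds a MADNESS process group for one broadcast: it walks
  // the given shape with the given stride, skips processes whose tiles are
  // all zero or that are excluded by the process mask, and compacts the
  // surviving ranks into the final process list.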
  template <typename Shape, typename ProcMap>
  madness::Group make_group(const Shape& shape,
      const std::vector<bool>& process_mask, size_type index /* , ... */) const {
    // ...
    std::vector<ProcessID> proc_list(max_group_size, -1);
    // ...
        proc_list[p] = proc_map(p);
    // ...

    // Collect the processes that hold non-zero, unmasked tiles.
    for(p = 0ul; (index < end) && (count < max_group_size);
        index += stride, p = (p + 1u) % max_group_size) {
      if((proc_list[p] != -1) || shape.is_zero(index) || !process_mask.at(p))
        continue;
      proc_list[p] = proc_map(p);
      // ...
    }

    // Compact the list by removing the unused (-1) entries.
    for(size_type x = 0ul, p = 0ul; x < count; ++p) {
      if(proc_list[p] == -1)
        continue;
      proc_list[x++] = proc_list[p];
    }
    proc_list.resize(count);
    // ...
  }
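  // make_row_group(k) / make_col_group(k) build the subset of this rank's
  // process row (respectively process column) that participates in iteration
  // k, based on the argument shapes and the row/column masks computed below.
  // An empty group means this rank has no broadcast to perform for that k.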
  madness::Group make_row_group(const size_type k) const {
    // ...
    const size_type right_end_k = right_begin_k + proc_grid_.cols();

    auto result_row_mask_k = make_row_mask(k);

    if (result_row_mask_k[proc_grid_.rank_col()])
      return make_group(right_.shape(), result_row_mask_k, right_begin_k,
          right_end_k, right_stride_, proc_grid_.proc_cols(), k, k_ /* , ... */);

    return madness::Group();
  }
  madness::Group make_col_group(const size_type k) const {
    // ...
    auto result_col_mask_k = make_col_mask(k);

    if (result_col_mask_k[proc_grid_.rank_row()])
      return make_group(left_.shape(), result_col_mask_k, k, left_end_,
          left_stride_ /* , ... */);

    return madness::Group();
  }
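  // make_row_mask(k) / make_col_mask(k) determine which process columns
  // (respectively process rows) need data for iteration k, by combining the
  // left/right argument shapes with the result shape.  For a dense result
  // every process is marked.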
  std::vector<bool> make_row_mask(const size_type k) const {
    // ...
    const auto nproc_cols = proc_grid_.proc_cols();
    const auto my_proc_row = proc_grid_.rank_row();
    // ...
    if (result_shape.is_dense())
      return std::vector<bool>(nproc_cols, true);

    std::vector<bool> mask(nproc_cols, false);
    // ...
    const auto nj = proc_grid_.cols();
    // ...
    std::tie(i_start, i_fence, i_stride) = result_row_range(my_proc_row);
    const auto ik_stride = i_stride * nk;
    for (size_type i = i_start, ik = i_start * nk + k; i < i_fence;
         i += i_stride, ik += ik_stride) {
      // ...
      if (!left_.shape().is_zero(ik)) {
        // ...
        const auto k_proc_col = k % nproc_cols;
        mask[k_proc_col] = true;
        // ...
        for (size_type proc_col = 0; proc_col != nproc_cols; ++proc_col) {
          // ...
          if (proc_col != k_proc_col) {
            // ...
            std::tie(j_start, j_fence, j_stride) = result_col_range(proc_col);
            const auto ij_stride = j_stride;
            for (size_type j = j_start, ij = i * nj + j_start; j < j_fence;
                 j += j_stride, ij += ij_stride) {
              // ...
              mask[proc_col] = true;
              // ...
            }
          }
        }
      }
    }
    // ...
    return mask;
  }
  std::vector<bool> make_col_mask(const size_type k) const {
    // ...
    const auto nproc_rows = proc_grid_.proc_rows();
    const auto my_proc_col = proc_grid_.rank_col();
    // ...
    if (result_shape.is_dense())
      return std::vector<bool>(nproc_rows, true);

    std::vector<bool> mask(nproc_rows, false);
    // ...
    const auto nj = proc_grid_.cols();
    // ...
    std::tie(j_start, j_fence, j_stride) = result_col_range(my_proc_col);
    const auto kj_stride = j_stride;
    for (size_type j = j_start, kj = k * nj + j_start; j < j_fence;
         j += j_stride, kj += kj_stride) {
      // ...
      if (!right_.shape().is_zero(kj)) {
        // ...
        auto k_proc_row = k % nproc_rows;
        mask[k_proc_row] = true;
        // ...
        for (size_type proc_row = 0; proc_row != nproc_rows; ++proc_row) {
          // ...
          if (proc_row != k_proc_row) {
            // ...
            std::tie(i_start, i_fence, i_stride) = result_row_range(proc_row);
            const auto ij_stride = i_stride * nj;
            for (size_type i = i_start, ij = i_start * nj + j; i < i_fence;
                 i += i_stride, ij += ij_stride) {
              // ...
              if (!result_shape.is_zero(/* ... */)) {
                mask[proc_row] = true;
                // ...
              }
            }
          }
        }
      }
    }
    // ...
    return mask;
  }
  inline std::tuple<size_type, size_type, size_type> result_row_range(/* ... */) const {
    // ...
    return std::make_tuple(start, fence, stride);
  }

  std::tuple<size_type, size_type, size_type> result_col_range(/* ... */) const {
    // ...
    return std::make_tuple(start, fence, stride);
  }
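  // Tile access helpers: convert_tile() evaluates a lazy tile, and the two
  // get_tile() overloads either return the argument tile future directly or,
  // for lazy tile types, schedule a high-priority conversion task.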
  template <typename Tile>
  static auto convert_tile(const Tile& tile) {
    // ...
  }

  template <typename Arg>
  static typename std::enable_if<
      ! is_lazy_tile<typename Arg::value_type>::value,
      Future<typename Arg::eval_type> >::type
  get_tile(Arg& arg, const typename Arg::size_type index) { return arg.get(index); }

  template <typename Arg>
  static typename std::enable_if<
      is_lazy_tile<typename Arg::value_type>::value,
      Future<typename Arg::eval_type> >::type
  get_tile(Arg& arg, const typename Arg::size_type index) {
    auto convert_tile_fn =
        &Summa_::template convert_tile<typename Arg::value_type>;
    return arg.world().taskq.add(convert_tile_fn, arg.get(index),
        madness::TaskAttributes::hipri());
  }
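  // get_vector() gathers one column (or row) of argument tiles for an
  // iteration: when the tiles are local, the non-zero ones are fetched
  // immediately; otherwise empty futures are stored, to be set later by the
  // broadcast from the owning rank.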
  template <typename Arg, typename Datum>
  void get_vector(Arg& arg, size_type index, const size_type end,
      const size_type stride, std::vector<Datum>& vec) const {
    // ...
    if(arg.is_local(index)) {
      for(size_type i = 0ul; index < end; ++i, index += stride) {
        if(arg.shape().is_zero(index)) continue;
        vec.emplace_back(i, get_tile(arg, index));
      }
    } else {
      for(size_type i = 0ul; index < end; ++i, index += stride) {
        if(arg.shape().is_zero(index)) continue;
        vec.emplace_back(i, Future<typename Arg::eval_type>());
      }
    }
    // ...
  }
  void get_col(const size_type k, std::vector<col_datum>& col) const {
    // ...
    get_vector(left_, left_start_local_ + k, left_end_, left_stride_local_, col);
    // ...
  }

  void get_row(const size_type k, std::vector<row_datum>& row) const {
    // ...
    get_vector(right_, begin, end, right_stride_local_, row);
    // ...
  }
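  // bcast() broadcasts every tile future in vec within the given process
  // group; the key for each broadcast is derived from the source tile index
  // (it->first * stride + start) and the key_offset argument.  The optional
  // tracing block prints the group membership.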
  template <typename Datum>
  void bcast(const size_type start, const size_type stride,
      const madness::Group& group, const ProcessID group_root,
      const size_type key_offset, std::vector<Datum>& vec) const {
    // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
    std::stringstream ss;
    ss /* ... */ << " root=" << group.world_rank(group_root)
       << " groupid=(" << group.id().first << "," << group.id().second
       << ") keyoffset=" << key_offset << " group={ ";
    for(ProcessID group_proc = 0; group_proc < group.size(); ++group_proc)
      ss << group.world_rank(group_proc) << " ";
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST

    // ...
    for(typename std::vector<Datum>::iterator it = vec.begin(); it != vec.end(); ++it) {
      const size_type index = it->first * stride + start;
      // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
      // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
      // ...
    }

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
    // ...
    printf(ss.str().c_str());
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST
  }
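  // The group-root helpers map the natural root for iteration k (k modulo the
  // number of process columns or rows) to the corresponding rank inside the
  // group, which may be smaller than a full grid row/column when the
  // arguments are sparse.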
  ProcessID get_row_group_root(const size_type k, const madness::Group& row_group) const {
    ProcessID group_root = k % proc_grid_.proc_cols();
    if(! right_.shape().is_dense() &&
       row_group.size() < static_cast<ProcessID>(proc_grid_.proc_cols())) {
      const ProcessID world_root =
          proc_grid_.rank_row() * proc_grid_.proc_cols() + group_root;
      group_root = row_group.rank(world_root);
    }
    return group_root;
  }

  ProcessID get_col_group_root(const size_type k, const madness::Group& col_group) const {
    ProcessID group_root = k % proc_grid_.proc_rows();
    if(! left_.shape().is_dense() &&
       col_group.size() < static_cast<ProcessID>(proc_grid_.proc_rows())) {
      const ProcessID world_root =
          group_root * proc_grid_.proc_cols() + proc_grid_.rank_col();
      group_root = col_group.rank(world_root);
    }
    return group_root;
  }
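  // bcast_col() / bcast_row() broadcast this rank's column of the left-hand
  // argument and row of the right-hand argument for iteration k, skipping the
  // broadcast entirely when the corresponding group is empty.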
  void bcast_col(const size_type k, std::vector<col_datum>& col,
      const madness::Group& row_group) const {
    // ...
    if (!row_group.empty()) {
      // ...
      ProcessID group_root = get_row_group_root(k, row_group);
      bcast(left_start_local_ + k, left_stride_local_, row_group, group_root,
          0ul, col);
    }
  }

  void bcast_row(const size_type k, std::vector<row_datum>& row,
      const madness::Group& col_group) const {
    // ...
    if (!col_group.empty()) {
      // ...
      ProcessID group_root = get_col_group_root(k, col_group);
      bcast(/* ... */,
          right_stride_local_, col_group, group_root, left_.size(), row);
    }
  }
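  // The excerpt below is from bcast_col_range_task(k, end): for every
  // iteration in [k, end) that belongs to this process column, it fetches the
  // non-zero local left-argument tiles, broadcasts them to the row group when
  // that group has more than one member, and then discards the local copies.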
    // Advance k to the first iteration in [k, end) owned by this process column.
    k += (Pcols - ((k + Pcols - proc_grid_.rank_col()) % Pcols)) % Pcols;
    // ...
    for(; k < end; k += Pcols) {
      // ...
      bool have_group = false;
      madness::Group row_group;
      ProcessID group_root;
      // ...
      for(; index < left_end_; index += left_stride_local_) {
        if(left_.shape().is_zero(index))
          continue;
        // The have_group flag guards one-time construction of the group and
        // broadcast root for this k.
        // ...
        row_group = make_row_group(k);
        // ...
        do_broadcast = !row_group.empty() && row_group.size() > 1;
        // ...
        group_root = get_row_group_root(k, row_group);
        // ...
        auto tile = get_tile(left_, index);
        // ...
        left_.discard(index);
        // ...
      }
    }
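  // The excerpt below is the symmetric bcast_row_range_task(k, end): for
  // every iteration in [k, end) that belongs to this process row, it
  // broadcasts the non-zero local right-argument tiles to the column group
  // and discards the local copies.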
    // Advance k to the first iteration in [k, end) owned by this process row.
    k += (Prows - ((k + Prows - proc_grid_.rank_row()) % Prows)) % Prows;
    // ...
    for(; k < end; k += Prows) {
      // ...
      bool have_group = false;
      madness::Group col_group;
      ProcessID group_root;
      // ...
      for(; index < row_end; index += right_stride_local_) {
        if(right_.shape().is_zero(index))
          continue;
        // ...
        col_group = make_col_group(k);
        // ...
        do_broadcast = !col_group.empty() && col_group.size() > 1;
        // ...
        group_root = get_col_group_root(k, col_group);
        // ...
        auto tile = get_tile(right_, index);
        // ...
        right_.discard(index);
        // ...
      }
    }
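  // The excerpts below are from the k-iteration helpers used by the sparse
  // driver: two search loops that find the next k with non-zero local work in
  // the right-hand (row) and left-hand (column) arguments, the
  // iterate_sparse() reconciliation loop, and the iterate() fast path for
  // dense arguments.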
    // Search the current row of the right-hand argument for a non-zero local tile.
    // ...
    end += proc_grid_.cols();
    for(; i < end; i += right_stride_local_)
      if(! right_.shape().is_zero(i))
        // ...

    // Search column k of the left-hand argument for a non-zero local tile.
    for(size_type i = left_start_local_ + k; i < left_end_; i += left_stride_local_)
      if(! left_.shape().is_zero(i))
        // ...

    // iterate_sparse(): advance the row and column searches until they agree
    // on the next k that has work on both sides, then broadcast the rows and
    // columns that were skipped, since other ranks may still need this rank's
    // tiles for those iterations.
    while(k_col != k_row) {
      // ...
      k_col = iterate_col(k_row);
      // ...
      k_row = iterate_row(k_col);
    }
    // ...
    /* ... */ & Summa_::bcast_col_range_task, k, k_row,
        madness::TaskAttributes::hipri());
    // ...
    /* ... */ & Summa_::bcast_row_range_task, k, k_col,
        madness::TaskAttributes::hipri());
    // ...

    // iterate(): the dense fast path skips the sparse search entirely.
    return (left_.shape().is_dense() && right_.shape().is_dense() ?
        k : iterate_sparse(k));
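  // The two initialize() overloads allocate the per-result-tile
  // ReducePairTask array: the DenseShape version constructs a task for every
  // local tile, while the generic overload appears to construct tasks only
  // for non-zero result tiles (see the checks in contract() and finalize()).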
  size_type initialize(const DenseShape&) {
    // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    std::stringstream ss;
    ss /* ... */
       << "\n col_group_=(" << col_did.first << ", " << col_did.second << ") { ";
    for(ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
      ss << col_group_.world_rank(gproc) << " ";
    ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second << ") { ";
    for(ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
      ss << row_group_.world_rank(gproc) << " ";
    // ...
    printf(ss.str().c_str());
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    // Allocate and construct a reduce task for every local result tile.
    std::allocator<ReducePairTask<op_type> > alloc;
    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
    // ...
    ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task = reduce_tasks_ + t;
    // ...
  }
  template <typename Shape>
  size_type initialize(const Shape& shape) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    std::stringstream ss;
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE

    // Allocate storage for the reduce tasks, then construct tasks for the
    // non-zero local result tiles.
    std::allocator<ReducePairTask<op_type> > alloc;
    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
    // ...
    ReducePairTask<op_type>* MADNESS_RESTRICT reduce_task = reduce_tasks_;
    // ...
    for(; row_start < end; row_start += col_stride, row_end += col_stride) {
      for(size_type index = row_start; index < row_end;
          index += row_stride, ++reduce_task) {
        // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
        // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
        // ...
        new(reduce_task) ReducePairTask<op_type>();
        // ...
      }
    }
    // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    // ...
    printf(ss.str().c_str());
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
    // ...
  }
  // ...
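  // The finalize() overloads submit each remaining reduction (producing the
  // corresponding result tile), destroy the ReducePairTask objects, and
  // release their storage; the generic overload skips result tiles that are
  // zero in the target shape.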
  void finalize(const DenseShape&) {
    // ...
    for(ReducePairTask<op_type>* reduce_task = reduce_tasks_;
        row_start < end; row_start += col_stride, row_end += col_stride) {
      for(size_type index = row_start; index < row_end;
          index += row_stride, ++reduce_task) {
        // Submit the reduction for this result tile, then destroy the task.
        /* ... */ reduce_task->submit());
        // ...
        reduce_task->~ReducePairTask<op_type>();
      }
    }
    // ...
    std::allocator<ReducePairTask<op_type> >().deallocate(reduce_tasks_,
        /* ... */);
    // ...
  }
  template <typename Shape>
  void finalize(const Shape& shape) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    std::stringstream ss;
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    // ...
    for(ReducePairTask<op_type>* reduce_task = reduce_tasks_;
        row_start < end; row_start += col_stride, row_end += col_stride) {
      for(size_type index = row_start; index < row_end;
          index += row_stride, ++reduce_task) {
        // ...
        // Only non-zero result tiles have a live reduce task to submit.
        if(! shape.is_zero(perm_index)) {
          // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
          // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
          // ...
          reduce_task->~ReducePairTask<op_type>();
        }
      }
    }
    // ...
    std::allocator<ReducePairTask<op_type> >().deallocate(reduce_tasks_,
        /* ... */);

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    // ...
    printf(ss.str().c_str());
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
    // ...
  }
  // ...
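  // FinalizeTask defers Summa_::finalize() until all contraction
  // contributions have been scheduled; its dependency count is managed by the
  // step tasks below.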
  class FinalizeTask : public madness::TaskInterface {
    // ...
    std::shared_ptr<Summa_> owner_;
    // ...
    FinalizeTask(const std::shared_ptr<Summa_>& owner, const int ndep) :
        // ...
    { /* ... */ }

    virtual ~FinalizeTask() { }

    virtual void run(const madness::TaskThreadEnv&) { owner_->finalize(); }
    // ...
  };
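  // The contract() overloads pair each broadcast column tile with each
  // broadcast row tile for iteration k and add the pair to the ReducePairTask
  // of the corresponding result tile; the generic overload skips result tiles
  // without a live reduce task.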
  void contract(const DenseShape&, const size_type,
      const std::vector<col_datum>& col, const std::vector<row_datum>& row,
      madness::TaskInterface* const task) {
    // ...
    for(size_type i = 0ul; i < col.size(); ++i) {
      // ...
      for(size_type j = 0ul; j < row.size(); ++j) {
        const size_type reduce_task_index = reduce_task_offset + row[j].first;
        // ...
        const left_future left = col[i].second;
        const right_future right = row[j].second;
        reduce_tasks_[reduce_task_index].add(left, right, task);
      }
    }
    // ...
  }
  template <typename Shape>
  void contract(const Shape&, const size_type,
      const std::vector<col_datum>& col, const std::vector<row_datum>& row,
      madness::TaskInterface* const task) {
    // ...
    for(size_type i = 0ul; i < col.size(); ++i) {
      // ...
      for(size_type j = 0ul; j < row.size(); ++j) {
        const size_type reduce_task_index = reduce_task_offset + row[j].first;
        // ...
        // Skip result tiles that have no reduce task (zero in the target shape).
        if(! reduce_tasks_[reduce_task_index])
          continue;
        // ...
        task->inc_debug("destroy(*ReduceObject)");
        // ...
        const left_future left = col[i].second;
        const right_future right = row[j].second;
        reduce_tasks_[reduce_task_index].add(left, right, task);
      }
    }
    // ...
  }
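  // The overload below additionally filters out tile pairs whose estimated
  // contribution, the product of the left and right shape norms, falls below
  // the per-iteration threshold (threshold_k).  The filter can be toggled at
  // compile time via the TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER guard.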
#define TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
#ifndef TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER
  template <typename T>
  typename std::enable_if<std::is_floating_point<T>::value>::type
  contract(const SparseShape<T>&, const size_type k,
      const std::vector<col_datum>& col, const std::vector<row_datum>& row,
      madness::TaskInterface* const task) {
    // Cache the shape norms of the row tiles.
    std::vector<typename SparseShape<T>::value_type> row_shape_values;
    row_shape_values.reserve(row.size());
    // ...
    for(size_type j = 0ul; j < row.size(); ++j)
      row_shape_values.push_back(
          right_.shape()[row_start + (row[j].first * right_stride_local_)]);

    const size_type col_start = left_start_local_ + k;
    // ...
    for(size_type i = 0ul; i != col.size(); ++i) {
      // ...
      const auto col_shape_value =
          left_.shape()[col_start + (col[i].first * left_stride_local_)];
      // ...
      for(size_type j = 0ul; j < row.size(); ++j) {
        if((col_shape_value * row_shape_values[j]) < threshold_k)
          continue;
        // ...
        const size_type reduce_task_index = offset + row[j].first;
        // ...
        if(! reduce_tasks_[reduce_task_index])
          continue;
        // ...
        reduce_tasks_[reduce_task_index].add(col[i].second, row[j].second, task);
      }
    }
    // ...
  }
#endif // TILEDARRAY_DISABLE_TILE_CONTRACTION_FILTER

  // Schedule the local contractions for iteration k; this presumably
  // dispatches on the result shape to one of the overloads above.
  void contract(const size_type k, const std::vector<col_datum>& col,
      const std::vector<row_datum>& row, madness::TaskInterface* const task) {
    // ...
  }
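  // StepTask and its Dense/Sparse specializations drive the SUMMA pipeline:
  // each step task gathers the local column and row data for its iteration k,
  // broadcasts it, schedules the local contractions, and hands control to the
  // next step task in the chain, while FinalizeTask waits for all steps.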
  class StepTask : public madness::TaskInterface {
    // ...
    std::shared_ptr<Summa_> owner_;
    World& world_;
    std::vector<col_datum> col_{};
    std::vector<row_datum> row_{};
    FinalizeTask* finalize_task_;
    StepTask* next_step_task_ = nullptr;
    StepTask* tail_step_task_ = nullptr;

    // Fetch the local column (row) tiles for iteration k, then notify this
    // task that its input is ready.
    void get_col(const size_type k) {
      owner_->get_col(k, col_);
      // ...
      this->notify_debug("StepTask::spawn_col");
      // ...
    }

    void get_row(const size_type k) {
      owner_->get_row(k, row_);
      // ...
      this->notify_debug("StepTask::spawn_row");
      // ...
    }
    // First-step constructor: creates the FinalizeTask and submits it.
    StepTask(const std::shared_ptr<Summa_>& owner, int finalize_ndep) :
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        madness::TaskInterface(0ul, "StepTask 1st ctor", madness::TaskAttributes::hipri()),
#else
        // ...
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        // ...
        finalize_task_(new FinalizeTask(owner, finalize_ndep))
    {
      // ...
      owner_->world().taskq.add(finalize_task_);
    }

    // Nth-step constructor: registers this task as the parent's next step.
    StepTask(StepTask* const parent, const int ndep) :
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        madness::TaskInterface(ndep, "StepTask nth ctor", madness::TaskAttributes::hipri()),
#else
        // ...
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        owner_(parent->owner_), world_(parent->world_),
        finalize_task_(parent->finalize_task_)
    {
      // ...
      parent->next_step_task_ = this;
    }

    virtual ~StepTask() { }
    // Spawn high-priority tasks to fetch this step's column and row data; the
    // dependency count is bumped once per spawned task.
    void spawn_get_row_col_tasks(const size_type k) {
      // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      madness::DependencyInterface::inc_debug("StepTask::spawn_col");
#else
      madness::DependencyInterface::inc();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      world_.taskq.add(this, & StepTask::get_col, k, madness::TaskAttributes::hipri());

      // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      madness::DependencyInterface::inc_debug("StepTask::spawn_row");
#else
      madness::DependencyInterface::inc();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      world_.taskq.add(this, & StepTask::get_row, k, madness::TaskAttributes::hipri());
    }
    // Build the chain of pipelined step tasks, up to the requested depth.
    template <typename Derived>
    void make_next_step_tasks(Derived* task, size_type depth) {
      // ...
      // Never pipeline more steps than there are SUMMA iterations.
      if(depth > owner_->k_)
        depth = owner_->k_;
      // ...
      for(; depth > 0ul; --depth) {
        // ...
        Derived* const next = new Derived(task, depth == 1 ? 1 : 0);
        // ...
        task = next;
      }
      // ...
      tail_step_task_ = task;
    }
    // Run one SUMMA step: extend and submit the next step in the chain,
    // broadcast this step's column and row data, and schedule the local
    // contractions; once the iterations are exhausted, release the finalize
    // task and flush any leftover step tasks.
    template <typename Derived, typename GroupType>
    void run(const size_type k, const GroupType& row_group, const GroupType& col_group) {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
      printf("step: start rank=%i k=%lu\n", owner_->world().rank(), k);
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP

      if(k < owner_->k_) {
        // ...
        next_step_task_->tail_step_task_ =
            new Derived(static_cast<Derived*>(tail_step_task_), 1);
        // ...
        world_.taskq.add(next_step_task_);
        next_step_task_ = nullptr;
        // ...
        world_.taskq.add(owner_, & Summa_::bcast_col, k, col_, row_group,
            madness::TaskAttributes::hipri());
        world_.taskq.add(owner_, & Summa_::bcast_row, k, row_, col_group,
            madness::TaskAttributes::hipri());
        // ...
        owner_->contract(k, col_, row_, tail_step_task_);
        // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        tail_step_task_->notify_debug("StepTask nth ctor");
#else
        tail_step_task_->notify();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        finalize_task_->notify();
        // ...
      } else if(finalize_task_) {
        // ...
        finalize_task_->notify();
        // ...
        StepTask* step_task = next_step_task_;
        while(step_task) {
          StepTask* const next_step_task = step_task->next_step_task_;
          step_task->next_step_task_ = nullptr;
          step_task->finalize_task_ = nullptr;
          world_.taskq.add(step_task);
          step_task = next_step_task;
        }
        // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        tail_step_task_->notify_debug("StepTask nth ctor");
#else
        tail_step_task_->notify();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        // ...
      }

#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
      printf("step: finish rank=%i k=%lu\n", owner_->world().rank(), k);
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_STEP
    }
  }; // class StepTask
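  // DenseStepTask advances k by one per step, while SparseStepTask uses
  // iterate_sparse() to skip iterations without non-zero work and resolves
  // its k and its row/column groups through futures.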
  class DenseStepTask : public StepTask {
    // ...
    using StepTask::owner_;
    // ...
    DenseStepTask(const std::shared_ptr<Summa_>& owner, const size_type depth) :
        // ...
    {
      StepTask::make_next_step_tasks(this, depth);
      StepTask::spawn_get_row_col_tasks(k_);
    }

    DenseStepTask(DenseStepTask* const parent, const int ndep) :
        StepTask(parent, ndep), k_(parent->k_ + 1ul)
    {
      // ...
      StepTask::spawn_get_row_col_tasks(k_);
    }

    virtual ~DenseStepTask() { }

    virtual void run(const madness::TaskThreadEnv&) {
      StepTask::template run<DenseStepTask>(k_, owner_->row_group_, owner_->col_group_);
    }
    // ...
  };
  class SparseStepTask : public StepTask {
    // ...
    Future<size_type> k_{};
    Future<madness::Group> row_group_{};
    Future<madness::Group> col_group_{};
    using StepTask::owner_;
    using StepTask::world_;
    using StepTask::finalize_task_;
    using StepTask::next_step_task_;
    // ...

    // Find the next non-zero iteration; if work remains, request this step's
    // column/row data and its broadcast groups, then notify the task.
    void iterate_task(size_type k, const size_type offset) {
      // ...
      k = owner_->iterate_sparse(k + offset);
      // ...
      if(k < owner_->k_) {
        // ...
        StepTask::spawn_get_row_col_tasks(k);
        // ...
        row_group_ = world_.taskq.add(owner_, & Summa_::make_row_group, k,
            madness::TaskAttributes::hipri());
        col_group_ = world_.taskq.add(owner_, & Summa_::make_col_group, k,
            madness::TaskAttributes::hipri());
        // ...
        finalize_task_->inc();
        // ...
      }
      // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      madness::DependencyInterface::notify_debug("SparseStepTask ctor");
#else
      madness::DependencyInterface::notify();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
    }
    // ...

    SparseStepTask(const std::shared_ptr<Summa_>& owner, size_type depth) :
        StepTask(owner, 1ul)
    {
      // ...
      StepTask::make_next_step_tasks(this, depth);
      // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      madness::DependencyInterface::inc_debug("SparseStepTask ctor");
#else
      madness::DependencyInterface::inc();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
      world_.taskq.add(this, & SparseStepTask::iterate_task,
          0ul, 0ul, madness::TaskAttributes::hipri());
    }

    SparseStepTask(SparseStepTask* const parent, const int ndep) :
        StepTask(parent, ndep)
    {
      // If the parent already ran out of iterations, finish immediately;
      // otherwise schedule the search for this step's k.
      if(parent->k_.probe() && (parent->k_.get() >= owner_->k_)) {
        // ...
        k_.set(parent->k_.get());
        MADNESS_ASSERT(ndep == 1);
        // ...
      } else {
        // ...
#ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        madness::DependencyInterface::inc_debug("SparseStepTask ctor");
#else
        madness::DependencyInterface::inc();
#endif // TILEDARRAY_ENABLE_TASK_DEBUG_TRACE
        world_.taskq.add(this, & SparseStepTask::iterate_task,
            parent->k_, 1ul, madness::TaskAttributes::hipri());
      }
    }

    virtual ~SparseStepTask() { }

    virtual void run(const madness::TaskThreadEnv&) {
      StepTask::template run<SparseStepTask>(k_, row_group_, col_group_);
    }
    // ...
  };
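  // Constructor: the initializer list records the arguments, the inner
  // dimension k, and the process grid, and precomputes the local offsets and
  // strides used by the iteration helpers above.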
  Summa(const left_type& left, const right_type& right, World& world,
      const trange_type trange, const shape_type& shape,
      const std::shared_ptr<pmap_interface>& pmap, const Permutation& perm,
      const op_type& op, const size_type k, const ProcGrid& proc_grid) :
    // ...
    left_(left), right_(right), op_(op),
    row_group_(), col_group_(),
    k_(k), proc_grid_(proc_grid),
    reduce_tasks_(NULL),
    left_start_local_(proc_grid_.rank_row() * k),
    left_end_(left.size()),
    // ...
    left_stride_local_(proc_grid.proc_rows() * k),
    // ...
    right_stride_local_(proc_grid.proc_cols())
  { /* ... */ }

  // Excerpt from the tile accessor: map a result-tile index (source_index) to
  // the rank that owns it in the 2-D process grid.
    const size_type tile_row = source_index / proc_grid_.cols();
    const size_type tile_col = source_index % proc_grid_.cols();
    // ...
    const ProcessID source = proc_row * proc_grid_.proc_cols() + proc_col;
    // ...
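  // mem_bound_depth() estimates the local memory consumed per in-flight SUMMA
  // iteration from the average tile volume and the argument sparsities, and
  // shrinks the requested pipeline depth so that the estimate fits within
  // max_memory_: a bound of 0 is a hard error, and a bound of 1 triggers the
  // performance warning below.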
  size_type mem_bound_depth(size_type depth, const float left_sparsity,
      const float right_sparsity) {
    // ...
    const size_type available_memory = max_memory_;
    if(available_memory) {
      // Average memory per iteration for the broadcast left and right tiles.
      const std::size_t local_memory_per_iter_left =
          (left_.trange().elements_range().volume() /
           left_.trange().tiles_range().volume()) * /* ... */
          proc_grid_.local_rows() * (1.0f - left_sparsity);
      const std::size_t local_memory_per_iter_right =
          (right_.trange().elements_range().volume() /
           right_.trange().tiles_range().volume()) * /* ... */
          proc_grid_.local_cols() * (1.0f - right_sparsity);
      // ...
      const size_type mem_bound_depth = /* ... */
          ((local_memory_per_iter_left + local_memory_per_iter_right) /
           /* ... */);
      // ...
      if(depth > mem_bound_depth) {
        // ...
        switch(mem_bound_depth) {
          case 0:
            TA_EXCEPTION("Insufficient memory available for SUMMA");
          case 1:
            printf("!! WARNING TiledArray: Memory constraints limit the SUMMA depth to 1.\n"
                   "!! WARNING TiledArray: Performance may be slow.\n");
          // ...
        }
        depth = mem_bound_depth;
      }
    }
    // ...
    return depth;
  }
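  // internal_eval() is the SUMMA driver: it builds the reduce tasks via
  // initialize() and chooses a pipeline depth, widened when the arguments are
  // sparse and then clamped by k_, by the memory bound, and by max_depth_.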
  virtual int internal_eval() {
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
    tile_count = initialize();
    // ...
    // Dense arguments: clamp the depth by k and by the memory bound.
      if(depth > k_) depth = k_;
      // ...
      depth = mem_bound_depth(depth, 0.0f, 0.0f);
      // ...
      if(max_depth_) depth = std::min(depth, max_depth_);
    // ...
    // Sparse arguments: widen the depth based on the fraction of non-zero
    // tiles, then apply the same clamps.
      const float left_sparsity = left_.shape().sparsity();
      const float right_sparsity = right_.shape().sparsity();
      // ...
      const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f))
                                * (1.0f - std::min(right_sparsity, 0.9f));
      // ...
      depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
      // ...
      if(depth > k_) depth = k_;
      // ...
      depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
      // ...
      if(max_depth_) depth = std::min(depth, max_depth_);
    // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
#ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
#endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
    // ...
  }

}; // class Summa

// Static configuration members, initialized once from the environment
// helpers above.
template <typename Left, typename Right, typename Op, typename Policy>
/* ... */ Summa<Left, Right, Op, Policy>::max_depth_ =
    Summa<Left, Right, Op, Policy>::init_max_depth();

template <typename Left, typename Right, typename Op, typename Policy>
/* ... */ Summa<Left, Right, Op, Policy>::max_memory_ =
    Summa<Left, Right, Op, Policy>::init_max_memory();
#endif // TILEDARRAY_DIST_EVAL_CONTRACTION_EVAL_H__INCLUDED