TiledArray  0.7.0
proc_grid.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2013 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * proc_grid.h
22  * Nov 6, 2013
23  *
24  */
25 
26 #ifndef TILEDARRAY_GRID_H__INCLUDED
27 #define TILEDARRAY_GRID_H__INCLUDED
28 
30 #include <TiledArray/math/eigen.h>
31 
32 namespace TiledArray {
33  namespace detail {
34 
36 
59  class ProcGrid {
60  public:
61  typedef uint_fast32_t size_type;
62 
63  private:
64  World* world_;
65  size_type rows_;
66  size_type cols_;
67  size_type size_;
68  size_type proc_rows_;
69  size_type proc_cols_;
70  size_type proc_size_;
71  ProcessID rank_row_;
73  ProcessID rank_col_;
74  size_type local_rows_;
75  size_type local_cols_;
76  size_type local_size_;
77 
78 
80 
87  static size_type optimal_proc_row(const double nprocs, const double Mm,
88  const double Nn)
89  {
90  // Compute the initial guess for P_row. This is the optimal guess when
91  // Mm is equal to Nn, and the ideal solution.
92  double P_row_estimate = std::sqrt(nprocs);
93 
94  // Here we want to find the positive, real root of the polynomial:
95  // Nn(2x^4 - x^3) + Mm(Px - 2P^2) = 0
96  // using a Newton-Raphson algorithm.
97 
98  // Precompute some constants
99  const double PMm = nprocs * Mm;
100  const double two_P = nprocs + nprocs;
101 
102  const unsigned int max_it = 21u;
103  unsigned int it = 0u;
104  double r = 0.0;
105  do {
106  // Precompute P_row squared
107  const double P_row2 = P_row_estimate * P_row_estimate;
108  const double NnP_row2 = Nn * P_row2;
109 
110  // Compute the value of f(P_row_estimate) and df(P_row_estimate)
111  const double f = NnP_row2 * ( 2.0 * P_row2 - P_row_estimate)
112  + PMm * ( P_row_estimate - two_P);
113  const double df = NnP_row2 * ( 8.0 * P_row_estimate - 3.0) + PMm;
114 
115  // Compute a new guess for P_row
116  const double P_row_n1 = P_row_estimate - (f / df);
117 
118  // Compute the residual for this iteration
119  r = std::abs(P_row_n1 - P_row_estimate);
120 
121  // Update the guess
122  P_row_estimate = P_row_n1;
123 
124  } while((r > 0.1) && ((++it) < max_it));
125 
126  return P_row_estimate + 0.5;
127  }
128 
130 
141  void minimize_unused_procs(size_type& x, size_type& y,
142  const size_type nprocs, const size_type min_x, const size_type max_x)
143  {
144  // Check for the quick exit
145  size_type unused = x * y;
146  if(unused == 0u)
147  return;
148 
149  // Compute the range of values for x to be tested.
150  const size_type delta = std::max<size_type>(1ul, std::log2(nprocs));
151 
152  const size_type optimal_x = x;
153  size_type diff = 0ul;
154  const size_type min_test_x = std::max<int_fast32_t>(min_x, int_fast32_t(x) - delta);
155  size_type test_x = std::min(x + delta, max_x);
156 
157  for(; test_x >= min_test_x; --test_x) {
158  const size_type test_y = nprocs / test_x;
159  const size_type test_unused = nprocs - test_x * test_y;
160  const size_type test_diff = std::abs(long(optimal_x) - long(test_x));
161 
162  if((test_unused < unused) || ((test_unused == unused) && (test_diff < diff))) {
163  x = test_x;
164  y = test_y;
165  unused = test_unused;
166  diff = test_diff;
167  }
168  }
169  }
170 
172 
175  void init(const size_type rank, const size_type nprocs,
176  const std::size_t row_size, const std::size_t col_size)
177  {
178  // Check for the simple cases first ...
179  if(nprocs == 1u) { // Only one process
180 
181  // Set process grid sizes
182  proc_rows_ = 1u;
183  proc_cols_ = 1u;
184  proc_size_ = 1u;
185 
186  // Set this process rank
187  rank_row_ = 0;
188  rank_col_ = 0;
189 
190  // Set local counts
191  local_rows_ = rows_;
192  local_cols_ = cols_;
193  local_size_ = size_;
194 
195  } else if(size_ <= nprocs) { // Max one tile per process
196 
197  // Set process grid sizes
198  proc_rows_ = rows_;
199  proc_cols_ = cols_;
200  proc_size_ = size_;
201 
202  if(rank < proc_size_) {
203  // Set this process rank
204  rank_row_ = rank / proc_cols_;
205  rank_col_ = rank % proc_cols_;
206 
207  // Set local counts
208  local_rows_ = 1u;
209  local_cols_ = 1u;
210  local_size_ = 1u;
211  }
212 
213  } else { // The not so simple case
214 
215  // Compute the limits for process rows
216  const size_type min_proc_rows =
217  std::max<size_type>(((nprocs + cols_ - 1ul) / cols_), 1ul);
218  const size_type max_proc_rows = std::min<size_type>(nprocs, rows_);
219 
220  // Compute optimal the number of process rows and columns in terms of
221  // communication time.
222  proc_rows_ = std::max<size_type>(min_proc_rows,
223  std::min<size_type>(optimal_proc_row(nprocs, row_size, col_size),
224  max_proc_rows));
225  proc_cols_ = nprocs / proc_rows_;
226 
227  if((proc_rows_ > min_proc_rows) && (proc_rows_ < max_proc_rows)) {
228  // Search for the values of proc_rows_ and proc_cols_ that minimizes
229  // the number of unused processes in the process grid.
230  minimize_unused_procs(proc_rows_, proc_cols_, nprocs,
231  min_proc_rows, max_proc_rows);
232  }
233 
234  proc_size_ = proc_rows_ * proc_cols_;
235 
236  if(rank < proc_size_) {
237  // Set this process rank
238  rank_row_ = rank / proc_cols_;
239  rank_col_ = rank % proc_cols_;
240 
241  // Set local counts
242  local_rows_ = (rows_ / proc_rows_) + (size_type(rank_row_) < (rows_ % proc_rows_) ? 1u : 0u);
243  local_cols_ = (cols_ / proc_cols_) + (size_type(rank_col_) < (cols_ % proc_cols_) ? 1u : 0u);
244  local_size_ = local_rows_ * local_cols_;
245  }
246  }
247  }
248 
249  public:
251 
254  world_(NULL), rows_(0u), cols_(0u), size_(0u), proc_rows_(0u),
255  proc_cols_(0u), proc_size_(0u), rank_row_(0), rank_col_(0),
256  local_rows_(0u), local_cols_(0u), local_size_(0u)
257  { }
258 
260 
261  // This constructor makes a rough estimate of the optimal process
262  // dimensions. The goal is for the ratios of proc_rows/proc_cols and
263  // rows/cols to be approximately equal.
269  ProcGrid(World& world, const size_type rows, const size_type cols,
270  const std::size_t row_size, const std::size_t col_size) :
271  world_(&world), rows_(rows), cols_(cols), size_(rows_ * cols_),
272  proc_rows_(0ul), proc_cols_(0ul), proc_size_(0ul),
273  rank_row_(-1), rank_col_(-1),
274  local_rows_(0ul), local_cols_(0ul), local_size_(0ul)
275  {
276  // Check for non-zero sizes
277  TA_ASSERT(rows_ >= 1u);
278  TA_ASSERT(cols_ >= 1u);
279  TA_ASSERT(row_size >= 1ul);
280  TA_ASSERT(col_size >= 1ul);
281 
282  init(world_->rank(), world_->size(), row_size, col_size);
283  }
284 
285 #ifdef TILEDARRAY_ENABLE_TEST_PROC_GRID
286  // Note: The following function is here for testing purposes only. It
287  // has the same functionality as the default constructor above, except the
288  // rank and number of processes can be specified.
289 
290 
292 
293  // This constructor makes a rough estimate of the optimal process
294  // dimensions. The goal is for the ratios of proc_rows/proc_cols and
295  // rows/cols to be approximately equal.
303  ProcGrid(World& world, const size_type test_rank, size_type test_nprocs,
304  const size_type rows, const size_type cols,
305  const std::size_t row_size, const std::size_t col_size) :
306  world_(&world), rows_(rows), cols_(cols), size_(rows_ * cols_),
307  proc_rows_(0u), proc_cols_(0u), proc_size_(0u), rank_row_(-1),
308  rank_col_(-1), local_rows_(0u), local_cols_(0u), local_size_(0u)
309  {
310  // Check for non-zero sizes
311  TA_ASSERT(rows >= 1u);
312  TA_ASSERT(cols >= 1u);
313  TA_ASSERT(row_size >= 1u);
314  TA_ASSERT(col_size >= 1u);
315  TA_ASSERT(test_rank < test_nprocs);
316 
317  init(test_rank, test_nprocs, row_size, col_size);
318  }
319 #endif // TILEDARRAY_ENABLE_TEST_PROC_GRID
320 
322 
323  // This constructor makes a rough estimate of the optimal process
324  // dimensions. The goal is for the ratios of proc_rows/proc_cols and
325  // rows/cols to be approximately equal.
327  ProcGrid(const ProcGrid& other) :
328  world_(other.world_), rows_(other.rows_), cols_(other.cols_),
329  size_(other.size_), proc_rows_(other.proc_rows_),
330  proc_cols_(other.proc_cols_), proc_size_(other.proc_size_),
331  rank_row_(other.rank_row_), rank_col_(other.rank_col_),
332  local_rows_(other.local_rows_), local_cols_(other.local_cols_),
333  local_size_(other.local_size_)
334  { }
335 
337 
339  ProcGrid& operator=(const ProcGrid& other) {
340  world_ = other.world_;
341  rows_ = other.rows_;
342  cols_ = other.cols_;
343  size_ = other.size_;
344  proc_rows_ = other.proc_rows_;
345  proc_cols_ = other.proc_cols_;
346  proc_size_ = other.proc_size_;
347  rank_row_ = other.rank_row_;
348  rank_col_ = other.rank_col_;
349  local_rows_ = other.local_rows_;
350  local_cols_ = other.local_cols_;
351  local_size_ = other.local_size_;
352 
353  return *this;
354  }
355 
357 
359  size_type rows() const { return rows_; }
360 
362 
364  size_type cols() const { return cols_; }
365 
367 
369  size_type size() const { return size_; }
370 
372 
374  size_type local_rows() const { return local_rows_; }
375 
377 
379  size_type local_cols() const { return local_cols_; }
380 
382 
384  size_type local_size() const { return local_size_; }
385 
387 
389  ProcessID rank_row() const { return rank_row_; }
390 
392 
394  ProcessID rank_col() const { return rank_col_; }
395 
397 
399  size_type proc_rows() const { return proc_rows_; }
400 
402 
404  size_type proc_cols() const { return proc_cols_; }
405 
407 
410  size_type proc_size() const { return proc_size_; }
411 
412 
414 
417  madness::Group make_row_group(const madness::DistributedID& did) const {
418  TA_ASSERT(world_);
419 
420  madness::Group group;
421 
422  if(local_size_ != 0u) {
423  // Construct a vector to hold the
424  std::vector<ProcessID> proc_list;
425  proc_list.reserve(proc_cols_);
426 
427  // Populate the row process list
428  size_type p = rank_row_ * proc_cols_;
429  const size_type row_end = p + proc_cols_;
430  for(; p < row_end; ++p)
431  proc_list.push_back(p);
432 
433  // Construct the group
434  group = madness::Group(*world_, proc_list, did);
435  }
436 
437  return group;
438  }
439 
441 
444  madness::Group make_col_group(const madness::DistributedID& did) const {
445  TA_ASSERT(world_);
446 
447  madness::Group group;
448 
449  if(local_size_ != 0u) {
450  // Generate the list of processes in rank_row
451  std::vector<ProcessID> proc_list;
452  proc_list.reserve(proc_rows_);
453 
454  // Populate the column process list
455  for(size_type p = rank_col_; p < proc_size_; p += proc_cols_)
456  proc_list.push_back(p);
457 
458  // Construct the group
459  if(proc_list.size() != 0)
460  group = madness::Group(*world_, proc_list, did);
461  }
462 
463  return group;
464  }
465 
467 
470  ProcessID map_row(const size_type row) const {
471  TA_ASSERT(row < proc_rows_);
472  return rank_col_ + row * proc_cols_;
473  }
474 
476 
479  ProcessID map_col(const size_type col) const {
480  TA_ASSERT(col < proc_cols_);
481  return rank_row_ * proc_cols_ + col;
482  }
483 
485 
488  std::shared_ptr<Pmap> make_pmap() const {
489  TA_ASSERT(world_);
490 
491  return std::make_shared<CyclicPmap>(*world_, rows_, cols_, proc_rows_, proc_cols_);
492  }
493 
495 
500  std::shared_ptr<Pmap> make_col_phase_pmap(const size_type rows) const {
501  TA_ASSERT(world_);
502 
503  return std::make_shared<CyclicPmap>(*world_, rows, cols_, proc_rows_, proc_cols_);
504  }
505 
507 
512  std::shared_ptr<Pmap> make_row_phase_pmap(const size_type cols) const {
513  TA_ASSERT(world_);
514 
515  return std::make_shared<CyclicPmap>(*world_, rows_, cols, proc_rows_, proc_cols_);
516  }
517  }; // class Grid
518 
519  } // namespace detail
520 } // namespace TiledArray
521 
522 #endif // TILEDARRAY_GRID_H__INCLUDED
ProcessID rank_row() const
Rank row accessor.
Definition: proc_grid.h:389
ProcessID rank_col() const
Rank row accessor.
Definition: proc_grid.h:394
madness::Group make_row_group(const madness::DistributedID &did) const
Construct a row group.
Definition: proc_grid.h:417
ProcessID map_col(const size_type col) const
Map a column to the process in this process&#39;s row.
Definition: proc_grid.h:479
ProcessID map_row(const size_type row) const
Map a row to the process in this process&#39;s column.
Definition: proc_grid.h:470
size_type cols() const
Element column count accessor.
Definition: proc_grid.h:364
auto abs(const ComplexConjugate< T > &a)
Definition: complex.h:247
std::shared_ptr< Pmap > make_pmap() const
Construct a cyclic process.
Definition: proc_grid.h:488
ProcGrid(const ProcGrid &other)
Copy constructor.
Definition: proc_grid.h:327
ProcGrid(World &world, const size_type rows, const size_type cols, const std::size_t row_size, const std::size_t col_size)
Construct a process grid.
Definition: proc_grid.h:269
ProcGrid & operator=(const ProcGrid &other)
Copy assignment operator.
Definition: proc_grid.h:339
madness::Group make_col_group(const madness::DistributedID &did) const
Construct a column group.
Definition: proc_grid.h:444
#define TA_ASSERT(a)
Definition: error.h:107
ProcGrid()
Default constructor.
Definition: proc_grid.h:253
size_type rows() const
Element row count accessor.
Definition: proc_grid.h:359
size_type proc_cols() const
Process column count accessor.
Definition: proc_grid.h:404
size_type local_rows() const
Local element row count accessor.
Definition: proc_grid.h:374
size_type size() const
Element count accessor.
Definition: proc_grid.h:369
size_type proc_rows() const
Process row count accessor.
Definition: proc_grid.h:399
size_type local_cols() const
Local element column count accessor.
Definition: proc_grid.h:379
size_type local_size() const
Local element count accessor.
Definition: proc_grid.h:384
std::shared_ptr< Pmap > make_row_phase_pmap(const size_type cols) const
Construct row phased a cyclic process.
Definition: proc_grid.h:512
std::shared_ptr< Pmap > make_col_phase_pmap(const size_type rows) const
Construct column phased a cyclic process.
Definition: proc_grid.h:500
size_type proc_size() const
Process grid size accessor.
Definition: proc_grid.h:410
KroneckerDeltaTile< _N >::numeric_type min(const KroneckerDeltaTile< _N > &arg)
A 2D processor grid.
Definition: proc_grid.h:59