proc_grid.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2013 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Justus Calvin
19  * Department of Chemistry, Virginia Tech
20  *
21  * proc_grid.h
22  * Nov 6, 2013
23  *
24  */
25 
26 #ifndef TILEDARRAY_GRID_H__INCLUDED
27 #define TILEDARRAY_GRID_H__INCLUDED
28 
30 
31 namespace TiledArray {
32 namespace detail {
33 
35 
58 class ProcGrid {
59  public:
60  typedef uint_fast32_t size_type;
61 
62  private:
63  World* world_;
64  size_type rows_;
65  size_type cols_;
66  size_type size_;
67  size_type proc_rows_;
68  size_type proc_cols_;
69  size_type
70  proc_size_;
71  ProcessID rank_row_;
73  ProcessID rank_col_;
74  size_type local_rows_;
75  size_type local_cols_;
76  size_type local_size_;
77 
79 
86  static size_type optimal_proc_row(const double nprocs, const double Mm,
87  const double Nn) {
88  // Compute the initial guess for P_row. This is the optimal guess when
89  // Mm is equal to Nn, and the ideal solution.
90  double P_row_estimate = std::sqrt(nprocs);
91 
92  // Here we want to find the positive, real root of the polynomial:
93  // Nn(2x^4 - x^3) + Mm(Px - 2P^2) = 0
94  // using a Newton-Raphson algorithm.
95 
96  // Precompute some constants
97  const double PMm = nprocs * Mm;
98  const double two_P = nprocs + nprocs;
99 
100  const unsigned int max_it = 21u;
101  unsigned int it = 0u;
102  double r = 0.0;
103  do {
104  // Precompute P_row squared
105  const double P_row2 = P_row_estimate * P_row_estimate;
106  const double NnP_row2 = Nn * P_row2;
107 
108  // Compute the value of f(P_row_estimate) and df(P_row_estimate)
109  const double f = NnP_row2 * (2.0 * P_row2 - P_row_estimate) +
110  PMm * (P_row_estimate - two_P);
111  const double df = NnP_row2 * (8.0 * P_row_estimate - 3.0) + PMm;
112 
113  // Compute a new guess for P_row
114  const double P_row_n1 = P_row_estimate - (f / df);
115 
116  // Compute the residual for this iteration
117  r = std::abs(P_row_n1 - P_row_estimate);
118 
119  // Update the guess
120  P_row_estimate = P_row_n1;
121 
122  } while ((r > 0.1) && ((++it) < max_it));
123 
124  return P_row_estimate + 0.5;
125  }
126 
128 
139  void minimize_unused_procs(size_type& x, size_type& y, const size_type nprocs,
140  const size_type min_x, const size_type max_x) {
141  // Check for the quick exit
142  size_type unused = x * y;
143  if (unused == 0u) return;
144 
145  // Compute the range of values for x to be tested.
146  const size_type delta = std::max<size_type>(1ul, std::log2(nprocs));
147 
148  const size_type optimal_x = x;
149  size_type diff = 0ul;
150  const size_type min_test_x =
151  std::max<int_fast32_t>(min_x, int_fast32_t(x) - delta);
152  size_type test_x = std::min(x + delta, max_x);
153 
154  for (; test_x >= min_test_x; --test_x) {
155  const size_type test_y = nprocs / test_x;
156  const size_type test_unused = nprocs - test_x * test_y;
157  const size_type test_diff = std::abs(long(optimal_x) - long(test_x));
158 
159  if ((test_unused < unused) ||
160  ((test_unused == unused) && (test_diff < diff))) {
161  x = test_x;
162  y = test_y;
163  unused = test_unused;
164  diff = test_diff;
165  }
166  }
167  }
168 
170 
173  void init(const size_type rank, const size_type nprocs,
174  const std::size_t row_size, const std::size_t col_size) {
175  // Check for the simple cases first ...
176  if (nprocs == 1u) { // Only one process
177 
178  // Set process grid sizes
179  proc_rows_ = 1u;
180  proc_cols_ = 1u;
181  proc_size_ = 1u;
182 
183  // Set this process rank
184  rank_row_ = 0;
185  rank_col_ = 0;
186 
187  // Set local counts
188  local_rows_ = rows_;
189  local_cols_ = cols_;
190  local_size_ = size_;
191 
192  } else if (size_ <= nprocs) { // Max one tile per process
193 
194  // Set process grid sizes
195  proc_rows_ = rows_;
196  proc_cols_ = cols_;
197  proc_size_ = size_;
198 
199  if (rank < proc_size_) {
200  // Set this process rank
201  rank_row_ = rank / proc_cols_;
202  rank_col_ = rank % proc_cols_;
203 
204  // Set local counts
205  local_rows_ = 1u;
206  local_cols_ = 1u;
207  local_size_ = 1u;
208  }
209 
210  } else { // The not so simple case
211 
212  // Compute the limits for process rows
213  const size_type min_proc_rows =
214  std::max<size_type>(((nprocs + cols_ - 1ul) / cols_), 1ul);
215  const size_type max_proc_rows = std::min<size_type>(nprocs, rows_);
216 
217  // Compute optimal the number of process rows and columns in terms of
218  // communication time.
219  proc_rows_ = std::max<size_type>(
220  min_proc_rows,
221  std::min<size_type>(optimal_proc_row(nprocs, row_size, col_size),
222  max_proc_rows));
223  proc_cols_ = nprocs / proc_rows_;
224 
225  if ((proc_rows_ > min_proc_rows) && (proc_rows_ < max_proc_rows)) {
226  // Search for the values of proc_rows_ and proc_cols_ that minimizes
227  // the number of unused processes in the process grid.
228  minimize_unused_procs(proc_rows_, proc_cols_, nprocs, min_proc_rows,
229  max_proc_rows);
230  }
231 
232  proc_size_ = proc_rows_ * proc_cols_;
233 
234  if (rank < proc_size_) {
235  // Set this process rank
236  rank_row_ = rank / proc_cols_;
237  rank_col_ = rank % proc_cols_;
238 
239  // Set local counts
240  local_rows_ = (rows_ / proc_rows_) +
241  (size_type(rank_row_) < (rows_ % proc_rows_) ? 1u : 0u);
242  local_cols_ = (cols_ / proc_cols_) +
243  (size_type(rank_col_) < (cols_ % proc_cols_) ? 1u : 0u);
244  local_size_ = local_rows_ * local_cols_;
245  }
246  }
247  }
248 
249  public:
251 
254  : world_(NULL),
255  rows_(0u),
256  cols_(0u),
257  size_(0u),
258  proc_rows_(0u),
259  proc_cols_(0u),
260  proc_size_(0u),
261  rank_row_(0),
262  rank_col_(0),
263  local_rows_(0u),
264  local_cols_(0u),
265  local_size_(0u) {}
266 
268 
269  // This constructor makes a rough estimate of the optimal process
270  // dimensions. The goal is for the ratios of proc_rows/proc_cols and
271  // rows/cols to be approximately equal.
277  ProcGrid(World& world, const size_type rows, const size_type cols,
278  const std::size_t row_size, const std::size_t col_size)
279  : world_(&world),
280  rows_(rows),
281  cols_(cols),
282  size_(rows_ * cols_),
283  proc_rows_(0ul),
284  proc_cols_(0ul),
285  proc_size_(0ul),
286  rank_row_(-1),
287  rank_col_(-1),
288  local_rows_(0ul),
289  local_cols_(0ul),
290  local_size_(0ul) {
291  // Check for non-zero sizes
292  TA_ASSERT(rows_ >= 1u);
293  TA_ASSERT(cols_ >= 1u);
294  TA_ASSERT(row_size >= 1ul);
295  TA_ASSERT(col_size >= 1ul);
296 
297  init(world_->rank(), world_->size(), row_size, col_size);
298  }
299 
300 #ifdef TILEDARRAY_ENABLE_TEST_PROC_GRID
301  // Note: The following function is here for testing purposes only. It
302  // has the same functionality as the default constructor above, except the
303  // rank and number of processes can be specified.
304 
306 
307  // This constructor makes a rough estimate of the optimal process
308  // dimensions. The goal is for the ratios of proc_rows/proc_cols and
309  // rows/cols to be approximately equal.
317  ProcGrid(World& world, const size_type test_rank, size_type test_nprocs,
318  const size_type rows, const size_type cols,
319  const std::size_t row_size, const std::size_t col_size)
320  : world_(&world),
321  rows_(rows),
322  cols_(cols),
323  size_(rows_ * cols_),
324  proc_rows_(0u),
325  proc_cols_(0u),
326  proc_size_(0u),
327  rank_row_(-1),
328  rank_col_(-1),
329  local_rows_(0u),
330  local_cols_(0u),
331  local_size_(0u) {
332  // Check for non-zero sizes
333  TA_ASSERT(rows >= 1u);
334  TA_ASSERT(cols >= 1u);
335  TA_ASSERT(row_size >= 1u);
336  TA_ASSERT(col_size >= 1u);
337  TA_ASSERT(test_rank < test_nprocs);
338 
339  init(test_rank, test_nprocs, row_size, col_size);
340  }
341 #endif // TILEDARRAY_ENABLE_TEST_PROC_GRID
342 
344 
345  // This constructor makes a rough estimate of the optimal process
346  // dimensions. The goal is for the ratios of proc_rows/proc_cols and
347  // rows/cols to be approximately equal.
349  ProcGrid(const ProcGrid& other)
350  : world_(other.world_),
351  rows_(other.rows_),
352  cols_(other.cols_),
353  size_(other.size_),
354  proc_rows_(other.proc_rows_),
355  proc_cols_(other.proc_cols_),
356  proc_size_(other.proc_size_),
357  rank_row_(other.rank_row_),
358  rank_col_(other.rank_col_),
359  local_rows_(other.local_rows_),
360  local_cols_(other.local_cols_),
361  local_size_(other.local_size_) {}
362 
364 
366  ProcGrid& operator=(const ProcGrid& other) {
367  world_ = other.world_;
368  rows_ = other.rows_;
369  cols_ = other.cols_;
370  size_ = other.size_;
371  proc_rows_ = other.proc_rows_;
372  proc_cols_ = other.proc_cols_;
373  proc_size_ = other.proc_size_;
374  rank_row_ = other.rank_row_;
375  rank_col_ = other.rank_col_;
376  local_rows_ = other.local_rows_;
377  local_cols_ = other.local_cols_;
378  local_size_ = other.local_size_;
379 
380  return *this;
381  }
382 
384 
386  size_type rows() const { return rows_; }
387 
389 
391  size_type cols() const { return cols_; }
392 
394 
396  size_type size() const { return size_; }
397 
399 
401  size_type local_rows() const { return local_rows_; }
402 
404 
406  size_type local_cols() const { return local_cols_; }
407 
409 
411  size_type local_size() const { return local_size_; }
412 
414 
416  ProcessID rank_row() const { return rank_row_; }
417 
419 
421  ProcessID rank_col() const { return rank_col_; }
422 
424 
426  size_type proc_rows() const { return proc_rows_; }
427 
429 
431  size_type proc_cols() const { return proc_cols_; }
432 
434 
437  size_type proc_size() const { return proc_size_; }
438 
440 
443  madness::Group make_row_group(const madness::DistributedID& did) const {
444  TA_ASSERT(world_);
445 
446  madness::Group group;
447 
448  if (local_size_ != 0u) {
449  // Construct a vector to hold the
450  std::vector<ProcessID> proc_list;
451  proc_list.reserve(proc_cols_);
452 
453  // Populate the row process list
454  size_type p = rank_row_ * proc_cols_;
455  const size_type row_end = p + proc_cols_;
456  for (; p < row_end; ++p) proc_list.push_back(p);
457 
458  // Construct the group
459  group = madness::Group(*world_, proc_list, did);
460  }
461 
462  return group;
463  }
464 
466 
469  madness::Group make_col_group(const madness::DistributedID& did) const {
470  TA_ASSERT(world_);
471 
472  madness::Group group;
473 
474  if (local_size_ != 0u) {
475  // Generate the list of processes in rank_row
476  std::vector<ProcessID> proc_list;
477  proc_list.reserve(proc_rows_);
478 
479  // Populate the column process list
480  for (size_type p = rank_col_; p < proc_size_; p += proc_cols_)
481  proc_list.push_back(p);
482 
483  // Construct the group
484  if (proc_list.size() != 0)
485  group = madness::Group(*world_, proc_list, did);
486  }
487 
488  return group;
489  }
490 
492 
496  ProcessID map_row(const size_type row) const {
497  TA_ASSERT(row < proc_rows_);
498  return rank_col_ + row * proc_cols_;
499  }
500 
502 
506  ProcessID map_col(const size_type col) const {
507  TA_ASSERT(col < proc_cols_);
508  return rank_row_ * proc_cols_ + col;
509  }
510 
512 
515  std::shared_ptr<Pmap> make_pmap() const {
516  TA_ASSERT(world_);
517 
518  return std::make_shared<CyclicPmap>(*world_, rows_, cols_, proc_rows_,
519  proc_cols_);
520  }
521 
523 
528  std::shared_ptr<Pmap> make_col_phase_pmap(const size_type rows) const {
529  TA_ASSERT(world_);
530 
531  return std::make_shared<CyclicPmap>(*world_, rows, cols_, proc_rows_,
532  proc_cols_);
533  }
534 
536 
541  std::shared_ptr<Pmap> make_row_phase_pmap(const size_type cols) const {
542  TA_ASSERT(world_);
543 
544  return std::make_shared<CyclicPmap>(*world_, rows_, cols, proc_rows_,
545  proc_cols_);
546  }
547 }; // class Grid
548 
549 } // namespace detail
550 } // namespace TiledArray
551 
552 #endif // TILEDARRAY_GRID_H__INCLUDED
ProcGrid(World &world, const size_type rows, const size_type cols, const std::size_t row_size, const std::size_t col_size)
Construct a process grid.
Definition: proc_grid.h:277
A 2D processor grid.
Definition: proc_grid.h:58
size_type proc_rows() const
Process row count accessor.
Definition: proc_grid.h:426
std::shared_ptr< Pmap > make_pmap() const
Construct a cyclic process.
Definition: proc_grid.h:515
ProcessID rank_row() const
Rank row accessor.
Definition: proc_grid.h:416
KroneckerDeltaTile< _N >::numeric_type min(const KroneckerDeltaTile< _N > &arg)
ProcessID map_row(const size_type row) const
Map a row to the process in this process's column.
Definition: proc_grid.h:496
auto rank(const DistArray< Tile, Policy > &a)
Definition: dist_array.h:1617
madness::Group make_col_group(const madness::DistributedID &did) const
Construct a column group.
Definition: proc_grid.h:469
size_type proc_size() const
Process grid size accessor.
Definition: proc_grid.h:437
size_type local_rows() const
Local element row count accessor.
Definition: proc_grid.h:401
ProcGrid(const ProcGrid &other)
Copy constructor.
Definition: proc_grid.h:349
madness::Group make_row_group(const madness::DistributedID &did) const
Construct a row group.
Definition: proc_grid.h:443
#define TA_ASSERT(EXPR,...)
Definition: error.h:39
std::shared_ptr< Pmap > make_col_phase_pmap(const size_type rows) const
Construct column phased a cyclic process.
Definition: proc_grid.h:528
size_type local_cols() const
Local element column count accessor.
Definition: proc_grid.h:406
size_type proc_cols() const
Process column count accessor.
Definition: proc_grid.h:431
size_type local_size() const
Local element count accessor.
Definition: proc_grid.h:411
ProcGrid()
Default constructor.
Definition: proc_grid.h:253
ProcessID map_col(const size_type col) const
Map a column to the process in this process's row.
Definition: proc_grid.h:506
size_type size() const
Element count accessor.
Definition: proc_grid.h:396
ProcessID rank_col() const
Rank row accessor.
Definition: proc_grid.h:421
auto abs(const ComplexConjugate< T > &a)
Definition: complex.h:270
std::shared_ptr< Pmap > make_row_phase_pmap(const size_type cols) const
Construct row phased a cyclic process.
Definition: proc_grid.h:541
size_type rows() const
Element row count accessor.
Definition: proc_grid.h:386
ProcGrid & operator=(const ProcGrid &other)
Copy assignment operator.
Definition: proc_grid.h:366
size_type cols() const
Element column count accessor.
Definition: proc_grid.h:391