MPQC  3.0.0-alpha
parallel.h
1 
2 /*
3  * Copyright 2009 Sandia Corporation. Under the terms of Contract
4  * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government
5  * retains certain rights in this software.
6  *
7  * This file is a part of the MPQC LMP2 library.
8  *
9  * The MPQC LMP2 library is free software: you can redistribute it
10  * and/or modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation, either
12  * version 3 of the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful, but
15  * WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this program. If not, see
21  * <http://www.gnu.org/licenses/>.
22  *
23  */
24 
25 #ifndef _chemistry_qc_lmp2_parallel_h
26 #define _chemistry_qc_lmp2_parallel_h
27 
28 #ifdef HAVE_MPI
29 #define OMPI_SKIP_MPICXX
30 #define MPICH_SKIP_MPICXX
31 #include <mpi.h>
32 #endif
33 
34 #include <memory>
35 #include <algorithm>
36 #include <mpqc_config.h>
37 #include <util/misc/scint.h>
38 #include <util/misc/regtime.h>
39 
40 #include <chemistry/qc/lmp2/collective.h>
41 
42 #define NEW_DIST_DATA
43 
44 namespace sc {
45 
46 namespace sma2 {
47 
48 template <int N>
49 void
50 Array<N>::parallel_accumulate(const sc::Ref<sc::MessageGrp> &grp) {
51  sc::Timer tim;
52  tim.enter("accum");
53  //tim_enter("sum");
54  //grp->sum(data_->data(), data_->ndata());
55  //tim_exit("sum");
56 #ifndef HAVE_MPI
57  tim.enter("sum0");
58  grp->sum(data_->data(), data_->ndata(), 0, 0);
59  tim.exit("sum0");
60  tim.enter("bcast");
61  grp->bcast(data_->data(), data_->ndata());
62  tim.exit("bcast");
63 #else
64  allsum(data_->data(), data_->ndata());
65 #endif
66  tim.enter("bounds");
67  compute_bounds();
68  tim.exit("bounds");
69  tim.exit("accum");
70 }
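/* Illustrative sketch, not part of parallel.h: the reduce-broadcast above
   sums every node's copy of the block data element-wise and leaves the
   result on all nodes. Assuming MPI is available, the same effect can be
   written as an in-place all-reduce; the function name below is
   illustrative only.

     #include <mpi.h>

     void sum_on_all_nodes(double *data, int ndata)
     {
       // each rank contributes its local values and receives the global sum
       MPI_Allreduce(MPI_IN_PLACE, data, ndata, MPI_DOUBLE, MPI_SUM,
                     MPI_COMM_WORLD);
     }
*/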
71 
72 template <int N>
73 void
74 Array<N>::parallel_union(const sc::Ref<sc::MessageGrp> &msg)
75 {
76  sc::Timer tim;
77  int me = msg->me();
78  int nproc = msg->n();
79 
80  // fetch the block map entries from all the other nodes
81  tim.enter("parallel_union");
82  int nblock_local = n_block();
83  int nblock_max = n_block();
84  msg->max(nblock_max);
85  int *local_blocks = new int[1 + nblock_local*N];
86  int *remote_blocks = new int[1 + nblock_max*N];
87  int iarray = 0;
88  local_blocks[iarray++] = nblock_local;
89  for (typename blockmap_t::iterator i = blocks_.begin();
90  i != blocks_.end();
91  i++) {
92  for (int j=0; j<N; j++) {
93  local_blocks[iarray++] = i->first.block(j);
94  }
95  }
96  int source = (me + nproc - 1)%nproc; // me - 1 mod nproc
97  for (int next_node = (me+1)%nproc;
98  next_node != me;
99  next_node = (next_node+1)%nproc) {
100 #ifdef HAVE_MPI
101  MPI_Request send_req, recv_req;
102  MPI_Irecv(remote_blocks,1+nblock_max*N,MPI_INT,source,
103  100+source,MPI_COMM_WORLD,&recv_req);
104  MPI_Isend(local_blocks,1+nblock_local*N,MPI_INT,next_node,
105  100+me,MPI_COMM_WORLD,&send_req);
106  MPI_Status status;
107  MPI_Wait(&recv_req,&status); // wait for the recv to complete
108  int iarray = 0;
109  int nblock = remote_blocks[iarray++];
110  for (int i=0; i<nblock; i++) {
111  BlockInfo<N> bi;
112  for (int j=0; j<N; j++) bi.block(j) = remote_blocks[iarray++];
113  add_unallocated_block(bi);
114  }
115  MPI_Wait(&send_req,&status); // wait for the send to complete
116  source = (source + nproc - 1)%nproc; // source - 1 mod nproc
117 #else
118  throw std::runtime_error("Array<N,C>::replicated_from_distributed requires MPI");
119 #endif
120  }
121  delete[] local_blocks;
122  delete[] remote_blocks;
123  tim.exit("parallel_union");
124 }
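/* Illustrative sketch, not part of parallel.h: the loop above is a ring
   exchange. At step k each rank sends its own block list to rank
   (me+k)%nproc and receives, directly from rank (me-k+nproc)%nproc, that
   rank's list, so after nproc-1 steps every rank has seen every other
   rank's blocks. The same pattern, stripped down to a single int payload
   per rank (me and nproc as in the routine above):

     std::vector<int> seen(1, me);          // ranks whose payload we have
     int payload = me;
     int src = (me + nproc - 1) % nproc;
     for (int dst = (me + 1) % nproc; dst != me; dst = (dst + 1) % nproc) {
       int incoming;
       MPI_Request sreq, rreq;
       MPI_Irecv(&incoming, 1, MPI_INT, src, 0, MPI_COMM_WORLD, &rreq);
       MPI_Isend(&payload, 1, MPI_INT, dst, 0, MPI_COMM_WORLD, &sreq);
       MPI_Status st;
       MPI_Wait(&rreq, &st);
       seen.push_back(incoming);
       MPI_Wait(&sreq, &st);
       src = (src + nproc - 1) % nproc;
     }
*/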
125 
126 /** Provides information about how blocks are distributed
127     onto processes. */
128 template <int N>
129 class BlockDistrib: public sc::RefCount {
130  public:
131  /// Given a block, returns the node on which it resides.
132  virtual int block_to_node(const BlockInfo<N> &b) const = 0;
133  virtual ~BlockDistrib() {}
134 };
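/* Illustrative sketch, not part of parallel.h: BlockDistrib is the policy
   interface that maps a block to its owning node. A minimal concrete
   policy (the class name is hypothetical) could assign ownership from the
   first block index alone:

     template <int N>
     class FirstIndexDistrib: public BlockDistrib<N> {
         int nproc_;
       public:
         FirstIndexDistrib(int nproc): nproc_(nproc) {}
         int block_to_node(const BlockInfo<N> &b) const {
           // round-robin on the first block index
           return b.block(0) % nproc_;
         }
     };
*/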
135 
136 /** Distributes pairs of indices among
137     the processes. */
138 class PairMapping: public sc::RefCount {
139  std::map<std::pair<int,int>,int> nodemap_;
140  int nproc_;
141  int me_;
142  public:
143  // Distributes two indices among the nodes. The passed
144  // indexset must be identical on all nodes.
145  PairMapping(const sc::Ref<sc::MessageGrp> &grp,
146  const std::set<std::pair<int,int> > &indexset,
147  bool randomize = false) {
148  nproc_ = grp->n();
149  me_= grp->me();
150  std::vector<int> nodemap(indexset.size());
151  for (int i=0; i<nodemap.size(); i++) nodemap[i] = i%nproc_;
152  if (randomize) {
153  std::random_shuffle(nodemap.begin(), nodemap.end());
154  }
155  int node=0;
156  for (std::set<std::pair<int,int> >::const_iterator i = indexset.begin();
157  i != indexset.end();
158  i++) {
159  nodemap_[*i] = nodemap[node++];
160  }
161  }
162  // Distributes pairs of indices among the nodes. The passed
163  // indexmap must be identical on all nodes.
164  PairMapping(const sc::Ref<sc::MessageGrp> &grp,
165  const std::map<int,std::set<int> > &indexmap) {
166  nproc_ = grp->n();
167  me_= grp->me();
168  int node=0;
169  for (std::map<int,std::set<int> >::const_iterator i = indexmap.begin();
170  i != indexmap.end();
171  i++) {
172  for (std::set<int>::const_iterator j = i->second.begin();
173  j != i->second.end();
174  j++) {
175  nodemap_[std::make_pair(i->first,*j)] = node++%nproc_;
176  }
177  }
178  }
179  int pair_to_node(int i, int j) const {
180  std::map<std::pair<int,int>,int>::const_iterator
181  found = nodemap_.find(std::make_pair(i,j));
182  if (found == nodemap_.end()) {
183  throw std::runtime_error("PairMapping: requested non-existent block");
184  }
185  return found->second;
186  }
187  int me() const { return me_; }
188  int nproc() const { return nproc_; }
189  void local_pairs(std::set<std::pair<int,int> > &pairs) const {
190  pairs.clear();
191  for (std::map<std::pair<int,int>,int>::const_iterator i=nodemap_.begin();
192  i != nodemap_.end();
193  i++) {
194  if (i->second == me_) pairs.insert(i->first);
195  }
196  }
197  void print() const {
198  for (std::map<std::pair<int,int>,int>::const_iterator i=nodemap_.begin();
199  i != nodemap_.end();
200  i++) {
201  sc::ExEnv::out0() << i->first.first
202  << " " << i->first.second
203  << " -> " << i->second
204  << std::endl;
205  }
206  }
207 };
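/* Illustrative sketch, not part of parallel.h: a PairMapping is built from
   a set of (i,j) pairs that must be identical on every node, and then
   answers which node owns each pair. Hypothetical usage, where grp is an
   existing sc::Ref<sc::MessageGrp> and npair_index is an illustrative
   bound:

     std::set<std::pair<int,int> > pairs;
     for (int i=0; i<npair_index; i++)
       for (int j=0; j<=i; j++)
         pairs.insert(std::make_pair(i,j));

     sc::Ref<sc::sma2::PairMapping> mapping
         = new sc::sma2::PairMapping(grp, pairs);
     int owner = mapping->pair_to_node(3, 1);  // same result on every node
     std::set<std::pair<int,int> > mine;
     mapping->local_pairs(mine);               // pairs owned by this node
*/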
208 
209 /** An implementation of BlockDistrib
210     using PairMapping. */
211 template <int N>
212 class PairBlockDistrib: public BlockDistrib<N> {
213  int i0_, i1_;
214  sc::Ref<PairMapping> mapping_;
215  public:
216  PairBlockDistrib(const sc::Ref<sc::MessageGrp> &grp,
217  int i0, int i1,
218  const std::set<std::pair<int,int> > &indexset,
219  bool randomize = false) {
220  i0_ = i0; i1_ = i1;
221  mapping_ = new PairMapping(grp,indexset,randomize);
222  }
223  PairBlockDistrib(const sc::Ref<sc::MessageGrp> &grp,
224  int i0, int i1,
225  const std::map<int,std::set<int> > &indexmap) {
226  i0_ = i0; i1_ = i1;
227  mapping_ = new PairMapping(grp,indexmap);
228  }
229  // Distributes indices i0 and i1 among the nodes.
230  // The PairMapping object describes the distribution.
231  PairBlockDistrib(int i0, int i1,
232  const sc::Ref<PairMapping> &mapping) {
233  i0_ = i0; i1_ = i1;
234  mapping_ = mapping;
235  }
236  int block_to_node(const BlockInfo<N> &b) const {
237  return mapping_->pair_to_node(b.block(i0_),b.block(i1_));
238  }
239  void local_pairs(std::set<std::pair<int,int> > &pairs) const {
240  mapping_->local_pairs(pairs);
241  }
242 };
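/* Illustrative sketch, not part of parallel.h: PairBlockDistrib adapts a
   PairMapping to the BlockDistrib interface by naming which two of the N
   tensor indices carry the distributed pair. Hypothetical usage for a
   4-index array whose first two indices form the pair, reusing the
   mapping from the previous sketch:

     sc::sma2::PairBlockDistrib<4> distrib(0, 1, mapping);
     // for any block bi, distrib.block_to_node(bi) equals
     // mapping->pair_to_node(bi.block(0), bi.block(1))
*/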
243 
244 /** Distribute blocks round-robin among
245     processes using one or more
246     index values. */
247 template <int N>
248 class CompleteBlockDistrib: public BlockDistrib<N> {
249  int nindex_;
250  int indices_[N];
251  sc::sc_uint64_t size_[N];
252  int nindex_i_[N];
253  int nproc_;
254  int me_;
255  void init_sizes(const Array<N> &a) {
256  size_[0] = 1;
257  for (int i=1; i<N; i++) {
258  size_[i] = size_[i-1] * a.index(i).nindex();
259  }
260  for (int i=0; i<N; i++) {
261  nindex_i_[i] = a.index(i).nindex();
262  }
263  }
264  public:
265  CompleteBlockDistrib(const Array<N> &a, const sc::Ref<sc::MessageGrp> &grp,
266  int i0): nindex_(1) {
267  me_ = grp->me();
268  indices_[0]=i0;
269  nproc_ = grp->n();
270  init_sizes(a);
271  }
272  CompleteBlockDistrib(const Array<N> &a, const sc::Ref<sc::MessageGrp> &grp,
273  int i0, int i1): nindex_(2) {
274  me_ = grp->me();
275  indices_[0]=i0; indices_[1]=i1;
276  nproc_ = grp->n();
277  init_sizes(a);
278  }
279  CompleteBlockDistrib(const Array<N> &a, const sc::Ref<sc::MessageGrp> &grp,
280  int i0, int i1, int i2): nindex_(3) {
281  me_ = grp->me();
282  indices_[0]=i0; indices_[1]=i1; indices_[2]=i2;
283  nproc_ = grp->n();
284  init_sizes(a);
285  }
286  int block_to_node(const BlockInfo<N> &b) const {
287  sc::sc_uint64_t loc = 0;
288  for (int i=0; i<nindex_; i++) {
289  loc += size_[i] * b.block(indices_[i]);
290  }
291  return loc%nproc_;
292  }
293  // Only valid for a single index!
294  int nlocalindex(int node = -1) {
295  if (node == -1) node = me_;
296  int r = nindex_i_[0] / nproc_;
297  if (node < nindex_i_[0] % nproc_) r++;
298  return r;
299  }
300  // Only valid for a single index!
301  int localindex(int i, int node = -1) {
302  if (node == -1) node = me_;
303  return node + nproc_*i;
304  }
305 };
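/* Illustrative sketch, not part of parallel.h: CompleteBlockDistrib
   linearizes the chosen block indices and deals the result out
   round-robin. For two distributed indices, with indices_ = {0,1} and
   size_[1] equal to the number of blocks in the array's second range
   (say 4), a block with block indices (i0,i1) gets loc = i0 + 4*i1 and
   lands on node loc % nproc. With nproc = 3, block (2,1) has loc = 6 and
   goes to node 0, while block (3,1) has loc = 7 and goes to node 1.
*/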
306 
307 template <int N>
308 void
309 Array<N>::replicated_from_distributed(const sc::Ref<sc::MessageGrp> &msg,
310  const Array<N> &d)
311 {
312  sc::Timer tim;
313  tim.enter("repl_from_dist");
314 
315  int me = msg->me();
316  int nproc = msg->n();
317 // if (nproc == 1) {
318 // operator = (d);
319 // return;
320 // }
321 
322  // initialize the indices, etc, in the array
323  // and copy the blockmap entries from the local node
324  init_blocks(d, 0.0); // using tol=0.0 causes all blocks to be used
325 #ifdef USE_BOUND
326  bound_ = d.bound_;
327  msg->max(bound_);
328 #endif
329  tolerance_ = d.tolerance_;
330 
331  parallel_union(msg);
332 
333  // allocate and initialize storage
334  zero();
335 
336  // copy the data from the local node
337  tim.enter("local");
338  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
339  i != d.blocks_.end();
340  i++) {
341  int ndat = d.block_size(i->first);
342  typename blockmap_t::iterator loc = blocks_.find(i->first);
343  // must make sure block was found, since init_blocks may
344  // throw out blocks with a bound below the tolerance
345  if (loc != blocks_.end()) {
346  memcpy(loc->second, i->second, ndat*sizeof(double));
347  }
348  }
349  tim.exit("local");
350 
351  // accumulate the data on all nodes
352  parallel_accumulate(msg);
353 
354  tim.exit("repl_from_dist");
355 }
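/* Illustrative sketch, not part of parallel.h: replicated_from_distributed
   takes an array whose blocks are spread across the nodes and rebuilds a
   full copy of it on every node. Hypothetical usage, where msg is an
   existing sc::Ref<sc::MessageGrp>, dist is a distributed Array<2>, and
   repl is assumed to be default-constructible here:

     sc::sma2::Array<2> repl;
     repl.replicated_from_distributed(msg, dist);
     // every node now holds all blocks of dist with identical data
*/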
356 
357 template <int N>
358 void
359 Array<N>::distributed_from_distributed(const sc::Ref<sc::MessageGrp> &msg,
360  const BlockDistrib<N> &distrib,
361  Array<N> &d,
362  bool clear_source_array,
363  bool ignore_block_distrib_throws)
364 {
365  int me = msg->me();
366  int nproc = msg->n();
367 
368  // If MPI is available, the MPI routines are used even on a single
369  // processor. This is done for testing purposes.
370 #ifndef HAVE_MPI
371  if (nproc == 1) {
372  assign_all(d);
373  if (clear_source_array) d.clear();
374  return;
375  }
376  else {
377  throw std::runtime_error("Array<N,C>::distributed_from_distributed requires MPI");
378  }
379 #else // HAVE_MPI
380  // clj debug
381  //d.print(msg,true,std::cout);
382  // clj end debug
383 
384  // initialize the array
385  clear();
386  set_indices(d.indices());
387 
388 #if 1
389  // compute number of blocks and amount of data moving from this node to
390  // each other node
391  int *nblock_send = new int[nproc];
392  int *nblock_recv = new int[nproc];
393  for (int i=0; i<nproc; i++) nblock_send[i] = 0;
394  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
395  i != d.blocks_.end();
396  i++) {
397  int node;
398  if (ignore_block_distrib_throws) {
399  try {
400  node = distrib.block_to_node(i->first);
401  }
402  catch (...) {
403  continue;
404  }
405  }
406  else {
407  node = distrib.block_to_node(i->first);
408  }
409  nblock_send[node]++;
410  }
411 
412  // Exchange the size of buffers that will be sent from each node.
413  MPI_Alltoall(nblock_send, 1, MPI_INT,
414  nblock_recv, 1, MPI_INT,
415  MPI_COMM_WORLD);
416 
417  // clj debug
418 // for (int i=0; i<nproc; i++) {
419 // std::cout << me << "nblock_send[" << i << "] = " << nblock_send[i]
420 // << std::endl;
421 // }
422 // for (int i=0; i<nproc; i++) {
423 // std::cout << me << "nblock_recv[" << i << "] = " << nblock_recv[i]
424 // << std::endl;
425 // }
426  // clj end debug
427 
428  // Find out how much data we'll send and recv and what the offsets are.
429  int *block_send_offset = new int[nproc];
430  int *block_recv_offset = new int[nproc];
431  int *block_send_size = new int[nproc];
432  int *block_recv_size = new int[nproc];
433  for (int i=0; i<nproc; i++) {
434  block_send_size[i] = N*nblock_send[i];
435  block_recv_size[i] = N*nblock_recv[i];
436  if (i==0) {
437  block_send_offset[i] = 0;
438  block_recv_offset[i] = 0;
439  }
440  else {
441  block_send_offset[i] = block_send_size[i-1]
442  + block_send_offset[i-1];
443  block_recv_offset[i] = block_recv_size[i-1]
444  + block_recv_offset[i-1];
445  }
446  }
447  int block_send_size_total = block_send_size[nproc-1]
448  + block_send_offset[nproc-1];
449  int block_recv_size_total = block_recv_size[nproc-1]
450  + block_recv_offset[nproc-1];
451 
452  // Allocate the blockinfo buffers and copy data into them. For each
453  // destination node the buffer holds blockinfo[0], ..., blockinfo[n-1];
454  // the per-node block counts were already exchanged above.
455  int *block_send_buffers = new int[block_send_size_total];
456  std::vector<int> current_send_offset(nproc);
457  std::fill(current_send_offset.begin(), current_send_offset.end(), 0);
458  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
459  i != d.blocks_.end();
460  i++) {
461  int node;
462  if (ignore_block_distrib_throws) {
463  try {
464  node = distrib.block_to_node(i->first);
465  }
466  catch (...) {
467  continue;
468  }
469  }
470  else {
471  node = distrib.block_to_node(i->first);
472  }
473  for (int j=0; j<N; j++)
474  block_send_buffers[block_send_offset[node]
475  + current_send_offset[node]++]
476  = i->first.block(j);
477  }
478 
479  // Exchange the blockinfo buffers.
480  int *block_recv_buffers = new int[block_recv_size_total];
481  custom_alltoallv(block_send_buffers, block_send_size,
482  block_send_offset, MPI_INT,
483  block_recv_buffers, block_recv_size,
484  block_recv_offset, MPI_INT,
485  MPI_COMM_WORLD);
486 
487  // Add the blockinfos to this Array object and allocate the data.
488  int irecv = 0;
489  long ndata_recv_total = 0;
490  int *data_recv_offset = new int[nproc];
491  int *data_recv_size = new int[nproc];
492  for (int i=0; i<nproc; i++) {
493  int nblock = nblock_recv[i];
494  long ndata = 0;
495  for (int j=0; j<nblock; j++) {
496  BlockInfo<N> bi;
497  for (int k=0; k<N; k++) {
498  bi.block(k) = block_recv_buffers[irecv++];
499  }
500  ndata += block_size(bi);
501  add_unallocated_block(bi);
502  }
503  ndata_recv_total += ndata;
504  data_recv_size[i] = ndata;
505  if (i == 0)
506  data_recv_offset[i] = 0;
507  else
508  data_recv_offset[i] = data_recv_offset[i-1] + data_recv_size[i-1];
509  }
510 
511  // Allocate the buffers for the exchange of data and place the data into
512  // the send buffer.
513  double *data_send_buffers = new double[d.n_element_allocated()];
514  int *data_send_offset = new int[nproc];
515  int *data_send_size = new int[nproc];
516  int isend = 0;
517  double *current_send_buffer = data_send_buffers;
518  for (int i=0; i<nproc; i++) {
519  int nblock = nblock_send[i];
520  data_send_size[i] = 0;
521  for (int j=0; j<nblock; j++) {
522  BlockInfo<N> bi;
523  for (int k=0; k<N; k++) {
524  bi.block(k) = block_send_buffers[isend++];
525  }
526  int ndata_in_block = block_size(bi);
527  memcpy(current_send_buffer, d.blocks_.find(bi)->second,
528  ndata_in_block*sizeof(double));
529  current_send_buffer += ndata_in_block;
530  data_send_size[i] += ndata_in_block;
531  }
532  if (i == 0)
533  data_send_offset[i] = 0;
534  else
535  data_send_offset[i] = data_send_offset[i-1] + data_send_size[i-1];
536  }
537  delete[] block_send_buffers;
538  delete[] block_send_offset;
539  delete[] block_send_size;
540  if (clear_source_array) d.clear();
541 
542  // Exchange the data
543  double *data_recv_buffers = new double[ndata_recv_total];
544  custom_alltoallv(data_send_buffers, data_send_size,
545  data_send_offset, MPI_DOUBLE,
546  data_recv_buffers, data_recv_size,
547  data_recv_offset, MPI_DOUBLE,
548  MPI_COMM_WORLD);
549  delete[] data_send_buffers;
550  delete[] data_send_offset;
551  delete[] data_send_size;
552 
553  // Update the array's data.
554  allocate_blocks();
555  irecv = 0;
556  double *current_recv_buffer = data_recv_buffers;
557  for (int i=0; i<nproc; i++) {
558  int nblock = nblock_recv[i];
559  for (int j=0; j<nblock; j++) {
560  BlockInfo<N> bi;
561  for (int k=0; k<N; k++) {
562  bi.block(k) = block_recv_buffers[irecv++];
563  }
564  typename blockmap_t::iterator iblock = blocks_.find(bi);
565  double *data = iblock->second;
566  int ndata_in_block = block_size(bi);
567  memcpy(data, current_recv_buffer, ndata_in_block*sizeof(double));
568  current_recv_buffer += ndata_in_block;
569  }
570  }
571 
572  delete[] block_recv_buffers;
573  delete[] block_recv_offset;
574  delete[] block_recv_size;
575 
576  delete[] data_recv_buffers;
577  delete[] data_recv_offset;
578  delete[] data_recv_size;
579 
580  delete[] nblock_send;
581  delete[] nblock_recv;
582 
583 #else
584 
586  // fetch my block map entries from all the other nodes
587  // and send d's contributions to all the other nodes
588 
589  // compute number of blocks moving from this node to each other node
590  std::vector<int> nblock(nproc);
591  std::fill(nblock.begin(), nblock.end(), 0);
592  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
593  i != d.blocks_.end();
594  i++) {
595  int node = distrib.block_to_node(i->first);
596  nblock[node]++;
597  }
598 
599  // allocate buffers for each node
600  std::vector<int*> buffer(nproc);
601  for (int i=0; i<nproc; i++) {
602  if (i != me) {
603  buffer[i] = new int[1 + N*nblock[i]];
604  buffer[i][0] = nblock[i];
605  }
606  else {
607  buffer[i] = 0;
608  }
609  }
610 
611  // pack the buffer for each node with blockinfo data
612  // local blocks are directly inserted
613  std::vector<int> ndata(nproc);
614  std::fill(ndata.begin(), ndata.end(), 1); // start at 1 since each buffer stores nblock first
615  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
616  i != d.blocks_.end();
617  i++) {
618  int node = distrib.block_to_node(i->first);
619  int &counter = ndata[node];
620  for (int j=0; j<N; j++) {
621  if (node == me) {
622  add_unallocated_block(i->first);
623  }
624  else {
625  buffer[node][counter++] = i->first.block(j);
626  }
627  }
628  }
629 
630  // send the blockinfo data from this node to each other node
631  int max_nblock = *std::max_element(nblock.begin(), nblock.end());
632  msg->max(max_nblock);
633  int *remote_blocks = new int[1 + max_nblock*N];
634  int source = (me + nproc - 1)%nproc; // me - 1 mod nproc
635  for (int next_node = (me+1)%nproc;
636  next_node != me;
637  next_node = (next_node+1)%nproc) {
638 #ifdef HAVE_MPI
639  MPI_Request send_req, recv_req;
640  MPI_Irecv(remote_blocks,1+max_nblock*N,MPI_INT,source,
641  100+source,MPI_COMM_WORLD,&recv_req);
642  MPI_Isend(buffer[next_node],1+nblock[next_node]*N,MPI_INT,next_node,
643  100+me,MPI_COMM_WORLD,&send_req);
644  MPI_Status status;
645  MPI_Wait(&recv_req,&status); // wait for the recv to complete
646  int iarray = 0;
647  int nblock = remote_blocks[iarray++];
648  for (int i=0; i<nblock; i++) {
649  BlockInfo<N> bi;
650  for (int j=0; j<N; j++) bi.block(j) = remote_blocks[iarray++];
651  add_unallocated_block(bi);
652  }
653  MPI_Wait(&send_req,&status); // wait for the send to complete
654  source = (source + nproc - 1)%nproc; // source - 1 mod nproc
655 #else
656  throw std::runtime_error("Array<N,C>::distributed_from_distributed requires MPI");
657 #endif
658  }
659  delete[] remote_blocks;
660  for (int i=0; i<nproc; i++) delete[] buffer[i];
661 
663  // allocate storage
664  allocate_blocks();
665  // find the largest packet size and allocate storage
666  int maxsz = 0;
667 #ifdef HAVE_MPI
668  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
669  i != d.blocks_.end();
670  i++) {
671  int sz = block_size(i->first);
672  if (sz > maxsz) maxsz = sz;
673  }
674  msg->max(maxsz);
675  int maxalloc;
676  MPI_Pack_size(maxsz, MPI_DOUBLE, MPI_COMM_WORLD, &maxalloc);
677  for (int i=0; i<N; i++) {
678  int isz;
679  MPI_Pack_size(1, MPI_INT, MPI_COMM_WORLD, &isz);
680  maxalloc += isz;
681  }
682  char *outgoing_buffer = new char[maxalloc];
683 #endif
684 
686  // send/recv data
687  int n_to_be_received = blocks_.size() - nblock[me];
688 #ifdef HAVE_MPI
689  const int nrecv_req = 20;
690  std::vector<char *> incoming_buffer(nrecv_req);
691  for (int i=0; i<nrecv_req; i++) {
692  incoming_buffer[i] = new char[maxalloc];
693  }
694  MPI_Request recv_req[nrecv_req];
695  for (int i=0; i<nrecv_req; i++) {
696  if (i<n_to_be_received) {
697  MPI_Irecv(incoming_buffer[i], maxalloc, MPI_BYTE, MPI_ANY_SOURCE,
698  99, MPI_COMM_WORLD, &recv_req[i]);
699  }
700  else {
701  recv_req[i] = MPI_REQUEST_NULL;
702  }
703  }
704 #endif
705  for (typename blockmap_t::const_iterator i = d.blocks_.begin();
706  i != d.blocks_.end() || n_to_be_received > 0;) {
707 #ifdef HAVE_MPI
708  MPI_Request send_req;
709 #endif
710  int node;
711 // std::cout << sc::scprintf("%d end = %d nleft = %d\n",
712 // me, int(i == d.blocks_.end()), n_to_be_received)
713 // << std::flush;
714  bool need_wait_for_send = false;
715  if (i != d.blocks_.end()) {
716  node = distrib.block_to_node(i->first);
717  int ndat = d.block_size(i->first);
718  if (node == me) {
719 // std::cout << sc::scprintf(" %d processing local block\n",me)
720 // << std::flush;
721  typename blockmap_t::iterator loc = blocks_.find(i->first);
722  memcpy(loc->second, i->second, ndat*sizeof(double));
723  }
724  else {
725 // std::cout << sc::scprintf(" %d sending block to %d\n",me,node)
726 // << std::flush;
727  // pack up block and send it off
728 #ifdef HAVE_MPI
729  int outgoing_loc = 0;
730  for (int j=0; j<N; j++) {
731  int jval = i->first.block(j);
732  MPI_Pack(&jval, 1, MPI_INT, outgoing_buffer, maxalloc,
733  &outgoing_loc, MPI_COMM_WORLD);
734  }
735  MPI_Pack(i->second, ndat, MPI_DOUBLE, outgoing_buffer, maxalloc,
736  &outgoing_loc, MPI_COMM_WORLD);
737  MPI_Isend(outgoing_buffer,outgoing_loc,MPI_BYTE,node,
738  99,MPI_COMM_WORLD,&send_req);
739  need_wait_for_send = true;
740 #else
741  throw std::runtime_error("LMP2:: cannot send remote block w/o MPI");
742 #endif
743  }
744  i++;
745  }
746 #ifdef HAVE_MPI
747  // See if any data is ready to be recved
748  int nrecvd = 0;
749  for (int irecv=0; irecv<nrecv_req; irecv++) {
750  if (recv_req[irecv] == MPI_REQUEST_NULL) continue;
751  int flag = 1;
752  MPI_Status status;
753  MPI_Test(&recv_req[irecv],&flag,&status);
754  if (flag) {
755 // std::cout << sc::scprintf(" %d got incoming\n",me)
756 // << std::flush;
757  BlockInfo<N> bi;
758  int position = 0;
759  for (int j=0; j<N; j++) {
760  int index;
761  MPI_Unpack(incoming_buffer[irecv], maxalloc, &position, &index,
762  1, MPI_INT, MPI_COMM_WORLD);
763  bi.block(j) = index;
764  }
765  typename blockmap_t::iterator loc = blocks_.find(bi);
766  int block_sz = block_size(bi);
767  MPI_Unpack(incoming_buffer[irecv], maxalloc, &position, loc->second,
768  block_sz, MPI_DOUBLE, MPI_COMM_WORLD);
769  n_to_be_received--;
770  // only repost if more blocks remain than there are receive request slots
771  if (n_to_be_received >= nrecv_req) {
772  MPI_Irecv(incoming_buffer[irecv], maxalloc, MPI_BYTE, MPI_ANY_SOURCE,
773  99, MPI_COMM_WORLD, &recv_req[irecv]);
774  }
775  }
776  }
777 
778  // Wait until data is sent
779  if (need_wait_for_send) {
780  MPI_Status status;
781  MPI_Wait(&send_req,&status);
782  }
783 #endif
784  }
785 #ifdef HAVE_MPI
786  for (int i=0; i<nrecv_req; i++) {
787  delete[] incoming_buffer[i];
788  }
789  delete[] outgoing_buffer;
790 #endif
791  if (clear_source_array) d.clear();
792 #endif
793 #endif // HAVE_MPI
794 
795  compute_bounds();
796 
797  // clj debug
798  //print(msg,true,std::cout);
799  // clj end debug
800 }
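/* Illustrative sketch, not part of parallel.h: distributed_from_distributed
   moves every block of the source array onto the node selected by the
   given BlockDistrib. Hypothetical usage, reusing msg and mapping from the
   earlier sketches, with src a distributed Array<4> whose first two
   indices form the pair:

     sc::sma2::PairBlockDistrib<4> distrib(0, 1, mapping);
     sc::sma2::Array<4> dst;
     dst.distributed_from_distributed(msg, distrib, src, true);
     // the final 'true' asks the routine to clear the source array
*/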
801 
802 }
803 
804 }
805 
806 #endif