TiledArray  0.7.0
parallel_gemm.h
Go to the documentation of this file.
1 /*
2  * This file is a part of TiledArray.
3  * Copyright (C) 2015 Virginia Tech
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * justus
19  * Department of Chemistry, Virginia Tech
20  *
21  * parallel_gemm.h
22  * Apr 29, 2015
23  *
24  */
25 
26 #ifndef TILEDARRAY_PARALLEL_GEMM_H__INCLUDED
27 #define TILEDARRAY_PARALLEL_GEMM_H__INCLUDED
28 
29 #include <TiledArray/madness.h>
30 #include <TiledArray/vector_op.h>
31 #include <TiledArray/blas.h>
32 
33 #define TILEDARRAY_DYNAMIC_BLOCK_SIZE std::numeric_limits<std::size_t>::max();
34 
35 namespace TiledArray {
36  namespace math {
37 
38 //#ifdef HAVE_INTEL_TBB
39 
40 
41  template <typename T, integer Size>
42  class MatrixBlockTask : public tbb::task {
43 
44  const integer rows_;
45  const integer cols_;
46  T* data_;
47  const integer ld_;
48  std::shared_ptr<T> result_;
49 
51 
55  void copy_block(T* result, const T* data, const integer ld) {
56  const T* const block_end = result + (TILEDARRAY_LOOP_UNWIND * Size);
57  for(; result < block_end; result += Size, data += ld)
59  }
60 
62 
68  void copy_block(const integer m, const integer n, T* result,
69  const T* data, const integer ld)
70  {
71  const T* const block_end = result + (m * Size);
72  for(; result < block_end; result += Size, data += ld)
74  }
75 
76  public:
77  MatrixBlockTask(const integer rows, const integer cols,
78  const T* const data, const integer ld) :
79  rows_(rows), cols_(cols), data_(data), ld_(ld)
80  { }
81 
83  virtual tbb::task* execut() {
84 
85  // Compute block iteration limit
86  constexpr integer index_mask = ~integer(TILEDARRAY_LOOP_UNWIND - 1ul);
87  const integer mx = rows_ & index_mask; // = rows - rows % TILEDARRAY_LOOP_UNWIND
88  const integer nx = cols_ & index_mask; // = cols - cols % TILEDARRAY_LOOP_UNWIND
89  const integer m_tail = rows_ - mx;
90  const integer n_tail = cols_ - nx;
91 
92  // Copy data into block_
93  integer i = 0ul;
94  T* result_i = result_.get();
95  const T* data_i = data_;
96  for(; i < mx; i += TILEDARRAY_LOOP_UNWIND, result_i += Size, data_i += ld_) {
97 
98  integer j = 0ul;
99  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND)
100  copy_block(result_i + j, data_i + j);
101 
102  if(n_tail)
103  copy_block(TILEDARRAY_LOOP_UNWIND, n_tail, result_i + j, data_i + j);
104  }
105 
106  if(m_tail) {
107 
108  integer j = 0ul;
109  for(; j < nx; j += TILEDARRAY_LOOP_UNWIND)
110  copy_block(m_tail, TILEDARRAY_LOOP_UNWIND, result_i + j, data_i + j);
111 
112  if(n_tail)
113  copy_block(m_tail, n_tail, result_i + j, data_i + j);
114 
115  }
116 
117  return nullptr;
118  }
119 
120  std::shared_ptr<T> result() {
121  constexpr integer size = Size * Size;
122  constexpr integer bytes = size * sizeof(T);
123 
124  T* result_ptr = nullptr;
125  if(! posix_memalign(result_ptr, TILEARRAY_ALIGNMENT, bytes))
126  throw std::bad_alloc();
127 
128  result_.reset(result_ptr);
129 
130  return result_;
131  }
132 
133  }; // class MatrixBlockTask
134 
135 
136  template <integer Size, typename C, typename A = C, typename B = C, typename Alpha = C, typename Beta = C>
137  class GemmTask : public tbb::task {
138  const madness::cblas::CBLAS_TRANSPOSE op_a_, op_b_;
139  const integer m_, n_, k_;
140  const Alpha alpha_;
141  std::shared_ptr<A> a_;
142  constexpr integer lda_ = Size;
143  std::shared_ptr<B> b_;
144  const Beta beta_;
145  std::shared_ptr<C> c_;
146  const integer ldc_;
147 
148  public:
149 
150 
151  GemmTask(madness::cblas::CBLAS_TRANSPOSE op_a,
152  madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n,
153  const integer k, const Alpha alpha, const std::shared_ptr<A>& a,
154  const std::shared_ptr<B>& b, const Beta beta,
155  const std::shared_ptr<C>& c, const integer ldc) :
156  op_a_(op_a), op_b_(op_b), m_(m), n_(n), k_(k), alpha_(alpha), a_(a),
157  b_(b), beta_(beta), c_(c), ldc_(c)
158  { }
159 
160  virtual tbb::task execute() {
161  gemm(op_a_, op_b_, m_, n_, k_, alpha_, a_.get(), Size, b_.get(), Size, c_, ldc_);
162  }
163 
164  }; // class GemmTask
165 
166 //#endif // HAVE_INTEL_TBB
167 
168  } // namespace math
169 } // namespace TiledArray
170 
171 #endif // TILEDARRAY_PARALLEL_GEMM_H__INCLUDED
auto data(T &t)
Container data pointer accessor.
Definition: utility.h:89
GemmTask(madness::cblas::CBLAS_TRANSPOSE op_a, madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n, const integer k, const Alpha alpha, const std::shared_ptr< A > &a, const std::shared_ptr< B > &b, const Beta beta, const std::shared_ptr< C > &c, const integer ldc)
std::integral_constant< std::size_t, ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)> index_mask
Definition: vector_op.h:44
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
Definition: vector_op.h:220
size_t size(const DistArray< Tile, Policy > &a)
Definition: utils.h:49
virtual tbb::task * execut()
Task body.
Definition: parallel_gemm.h:83
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:33
virtual tbb::task execute()
std::shared_ptr< T > result()
MatrixBlockTask(const integer rows, const integer cols, const T *const data, const integer ld)
Definition: parallel_gemm.h:77
void gemm(madness::cblas::CBLAS_TRANSPOSE op_a, madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n, const integer k, const S1 alpha, const T1 *a, const integer lda, const T2 *b, const integer ldb, const S2 beta, T3 *c, const integer ldc)
Definition: blas.h:39