26 #ifndef TILEDARRAY_PARALLEL_GEMM_H__INCLUDED 27 #define TILEDARRAY_PARALLEL_GEMM_H__INCLUDED 30 #include <TiledArray/vector_op.h> 31 #include <TiledArray/blas.h> 33 #define TILEDARRAY_DYNAMIC_BLOCK_SIZE std::numeric_limits<std::size_t>::max(); 41 template <
typename T,
integer Size>
48 std::shared_ptr<T> result_;
55 void copy_block(T*
result,
const T*
data,
const integer ld) {
68 void copy_block(
const integer m,
const integer n, T*
result,
69 const T*
data,
const integer ld)
71 const T*
const block_end =
result + (m * Size);
78 const T*
const data,
const integer ld) :
79 rows_(rows), cols_(cols), data_(
data), ld_(ld)
89 const integer m_tail = rows_ - mx;
90 const integer n_tail = cols_ - nx;
94 T* result_i = result_.get();
95 const T* data_i = data_;
100 copy_block(result_i + j, data_i + j);
113 copy_block(m_tail, n_tail, result_i + j, data_i + j);
121 constexpr integer
size = Size * Size;
122 constexpr integer bytes =
size *
sizeof(T);
124 T* result_ptr =
nullptr;
125 if(! posix_memalign(result_ptr, TILEARRAY_ALIGNMENT, bytes))
126 throw std::bad_alloc();
128 result_.reset(result_ptr);
136 template <
integer Size,
typename C,
typename A = C,
typename B = C,
typename Alpha = C,
typename Beta = C>
138 const madness::cblas::CBLAS_TRANSPOSE op_a_, op_b_;
139 const integer m_, n_, k_;
141 std::shared_ptr<A> a_;
142 constexpr integer lda_ = Size;
143 std::shared_ptr<B> b_;
145 std::shared_ptr<C> c_;
152 madness::cblas::CBLAS_TRANSPOSE op_b,
const integer m,
const integer n,
153 const integer k,
const Alpha alpha,
const std::shared_ptr<A>& a,
154 const std::shared_ptr<B>& b,
const Beta beta,
155 const std::shared_ptr<C>& c,
const integer ldc) :
156 op_a_(op_a), op_b_(op_b), m_(m), n_(n), k_(k), alpha_(alpha), a_(a),
157 b_(b), beta_(beta), c_(c), ldc_(c)
161 gemm(op_a_, op_b_, m_, n_, k_, alpha_, a_.get(), Size, b_.get(), Size, c_, ldc_);
171 #endif // TILEDARRAY_PARALLEL_GEMM_H__INCLUDED auto data(T &t)
Container data pointer accessor.
GemmTask(madness::cblas::CBLAS_TRANSPOSE op_a, madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n, const integer k, const Alpha alpha, const std::shared_ptr< A > &a, const std::shared_ptr< B > &b, const Beta beta, const std::shared_ptr< C > &c, const integer ldc)
std::integral_constant< std::size_t, ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)> index_mask
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
size_t size(const DistArray< Tile, Policy > &a)
virtual tbb::task * execut()
Task body.
#define TILEDARRAY_LOOP_UNWIND
virtual tbb::task execute()
std::shared_ptr< T > result()
MatrixBlockTask(const integer rows, const integer cols, const T *const data, const integer ld)
void gemm(madness::cblas::CBLAS_TRANSPOSE op_a, madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n, const integer k, const S1 alpha, const T1 *a, const integer lda, const T2 *b, const integer ldb, const S2 beta, T3 *c, const integer ldc)