doc/html/parallel__gemm_8h_source.html

 /*
  *  This file is a part of TiledArray.
  *  Copyright (C) 2015  Virginia Tech
  *
  *  This program is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation, either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  justus
  *  Department of Chemistry, Virginia Tech
  *
  *  parallel_gemm.h
  *  Apr 29, 2015
  *
  */

 #ifndef TILEDARRAY_PARALLEL_GEMM_H__INCLUDED
 #define TILEDARRAY_PARALLEL_GEMM_H__INCLUDED

 #include <TiledArray/madness.h>
 #include <TiledArray/vector_op.h>
 #include <TiledArray/blas.h>

 #define TILEDARRAY_DYNAMIC_BLOCK_SIZE std::numeric_limits<std::size_t>::max();

 namespace TiledArray {
   namespace math {

 //#ifdef HAVE_INTEL_TBB


     template <typename T, integer Size>
     class MatrixBlockTask : public tbb::task {

       const integer rows_;
       const integer cols_;
       T* data_;
       const integer ld_;
       std::shared_ptr<T> result_;


       void copy_block(T* result, const T* data, const integer ld) {
         const T* const block_end = result + (TILEDARRAY_LOOP_UNWIND * Size);
         for(; result < block_end; result += Size, data += ld)
           TiledArray::math::copy_block(result, data);
       }


       void copy_block(const integer m, const integer n, T* result,
           const T* data, const integer ld)
       {
         const T* const block_end = result + (m * Size);
         for(; result < block_end; result += Size, data += ld)
           TiledArray::math::copy_block(n, result, data);
       }

     public:
       MatrixBlockTask(const integer rows, const integer cols,
           const T* const data, const integer ld) :
         rows_(rows), cols_(cols), data_(data), ld_(ld)
       { }

       virtual tbb::task* execut() {

         // Compute block iteration limit
         constexpr integer index_mask = ~integer(TILEDARRAY_LOOP_UNWIND - 1ul);
         const integer mx = rows_ & index_mask; // = rows - rows % TILEDARRAY_LOOP_UNWIND
         const integer nx = cols_ & index_mask; // = cols - cols % TILEDARRAY_LOOP_UNWIND
         const integer m_tail = rows_ - mx;
         const integer n_tail = cols_ - nx;

         // Copy data into block_
         integer i = 0ul;
         T* result_i = result_.get();
         const T* data_i = data_;
         for(; i < mx; i += TILEDARRAY_LOOP_UNWIND, result_i += Size, data_i += ld_) {

           integer j = 0ul;
           for(; j < nx; j += TILEDARRAY_LOOP_UNWIND)
             copy_block(result_i + j, data_i + j);

           if(n_tail)
             copy_block(TILEDARRAY_LOOP_UNWIND, n_tail, result_i + j, data_i + j);
         }

         if(m_tail) {

           integer j = 0ul;
           for(; j < nx; j += TILEDARRAY_LOOP_UNWIND)
             copy_block(m_tail, TILEDARRAY_LOOP_UNWIND, result_i + j, data_i + j);

           if(n_tail)
             copy_block(m_tail, n_tail, result_i + j, data_i + j);

         }

         return nullptr;
       }

       std::shared_ptr<T> result() {
         constexpr integer size = Size * Size;
         constexpr integer bytes = size * sizeof(T);

         T* result_ptr = nullptr;
         if(! posix_memalign(result_ptr, TILEARRAY_ALIGNMENT, bytes))
           throw std::bad_alloc();

         result_.reset(result_ptr);

         return result_;
       }

     }; // class MatrixBlockTask


     template <integer Size, typename C, typename A = C, typename B = C, typename Alpha = C, typename Beta = C>
     class GemmTask : public tbb::task {
       const madness::cblas::CBLAS_TRANSPOSE op_a_, op_b_;
       const integer m_, n_, k_;
       const Alpha alpha_;
       std::shared_ptr<A> a_;
       constexpr integer lda_ = Size;
       std::shared_ptr<B> b_;
       const Beta beta_;
       std::shared_ptr<C> c_;
       const integer ldc_;

     public:


       GemmTask(madness::cblas::CBLAS_TRANSPOSE op_a,
           madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n,
           const integer k, const Alpha alpha, const std::shared_ptr<A>& a,
           const std::shared_ptr<B>& b, const Beta beta,
           const std::shared_ptr<C>& c, const integer ldc) :
             op_a_(op_a), op_b_(op_b), m_(m), n_(n), k_(k), alpha_(alpha), a_(a),
             b_(b), beta_(beta), c_(c), ldc_(c)
       { }

       virtual tbb::task execute() {
         gemm(op_a_, op_b_, m_, n_, k_, alpha_, a_.get(), Size, b_.get(), Size, c_, ldc_);
       }

     }; // class GemmTask

 //#endif // HAVE_INTEL_TBB

   }  // namespace math
 } // namespace TiledArray

 #endif // TILEDARRAY_PARALLEL_GEMM_H__INCLUDED
TiledArray::detail::data
auto data(T &t)
Container data pointer accessor.
Definition: utility.h:89

TiledArray::math::GemmTask::GemmTask
GemmTask(madness::cblas::CBLAS_TRANSPOSE op_a, madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n, const integer k, const Alpha alpha, const std::shared_ptr< A > &a, const std::shared_ptr< B > &b, const Beta beta, const std::shared_ptr< C > &c, const integer ldc)
Definition: parallel_gemm.h:151

TiledArray::math::index_mask
std::integral_constant< std::size_t, ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)> index_mask
Definition: vector_op.h:44

TiledArray::math::copy_block
TILEDARRAY_FORCE_INLINE void copy_block(Result *const result, const Arg *const arg)
Definition: vector_op.h:220

TiledArray::size
size_t size(const DistArray< Tile, Policy > &a)
Definition: utils.h:49

TiledArray::math::MatrixBlockTask::execut
virtual tbb::task * execut()
Task body.
Definition: parallel_gemm.h:83

TiledArray
Definition: conjgrad.h:34

TILEDARRAY_LOOP_UNWIND
#define TILEDARRAY_LOOP_UNWIND
Definition: vector_op.h:33

madness.h

TiledArray::math::GemmTask::execute
virtual tbb::task execute()
Definition: parallel_gemm.h:160

TiledArray::math::MatrixBlockTask::result
std::shared_ptr< T > result()
Definition: parallel_gemm.h:120

TiledArray::math::MatrixBlockTask::MatrixBlockTask
MatrixBlockTask(const integer rows, const integer cols, const T *const data, const integer ld)
Definition: parallel_gemm.h:77

TiledArray::math::gemm
void gemm(madness::cblas::CBLAS_TRANSPOSE op_a, madness::cblas::CBLAS_TRANSPOSE op_b, const integer m, const integer n, const integer k, const S1 alpha, const T1 *a, const integer lda, const T2 *b, const integer ldb, const S2 beta, T3 *c, const integer ldc)
Definition: blas.h:39

TiledArray::math::GemmTask
Definition: parallel_gemm.h:137

TiledArray::math::MatrixBlockTask
Definition: parallel_gemm.h:42