// NOTE(review): this listing is an extraction in which the original source
// line numbers ("26", "27", "46", ...) are fused into the text and many lines
// (function names, bodies, closing braces) are missing. The comments below
// describe only what is visible; anything else is marked as an assumption.
//
// Opening of the include guard for this header (closed by the matching #endif
// at the end of the file), followed by the first function template header.
26 #ifndef TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED 27 #define TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED 46 template <
typename Op,
typename Result,
typename... Args>
// Static, force-inlined function template returning void. Its name and leading
// parameters are not visible; the visible tail takes the argument stride and a
// pack of restrict-qualified, const pointers to const argument data.
// NOTE(review): the parameter tail matches the gather_trans signature listed
// in the cross-reference section at the end of this listing -- confirm.
47 static TILEDARRAY_FORCE_INLINE
void 49 const std::size_t arg_stride,
const Args* MADNESS_RESTRICT
const... args)
// Second static, force-inlined function template returning void, templated on
// the operation and result types only (no argument pack). Name and parameters
// are not visible here.
// NOTE(review): presumably block_scatter, the only listed static function with
// exactly these template parameters -- confirm.
59 template <
typename Op,
typename Result>
60 static TILEDARRAY_FORCE_INLINE
// The trailing "void" belongs to the declaration above; the rest of the next
// line starts the class template below.
void 69 template <std::
size_t N>
// TransposeUnwind<N> publicly derives from TransposeUnwind<N - 1>, so each
// instantiation inherits from the next lower one. The base of this recursion
// is not visible in this listing.
// NOTE(review): the fragments above (original lines 46-60) presumably belong
// to that base specialization -- confirm against the full header.
70 class TransposeUnwind :
public TransposeUnwind<N - 1> {
// Static member template with the same template parameters and visible
// parameter tail (arg_stride, argument pointer pack) as the first fragment.
77 template <
typename Op,
typename Result,
typename... Args>
78 static TILEDARRAY_FORCE_INLINE
void 80 const std::size_t arg_stride,
const Args* MADNESS_RESTRICT
const... args)
// Tail of a call whose head is not visible: it forwards arg_stride unchanged
// and passes every argument pointer advanced by arg_stride elements.
// NOTE(review): presumably the recursive step into the N - 1 level -- confirm.
92 arg_stride, (args + arg_stride)...);
// Static member template on the operation and result types; the only visible
// parameter is the trailing result_stride. Name and body are not visible.
// NOTE(review): matches the tail of the block_scatter signature that names its
// result_stride parameter in the cross-reference section -- confirm.
95 template <
typename Op,
typename Result>
96 static TILEDARRAY_FORCE_INLINE
void 98 const std::size_t result_stride)
// Force-inlined function template over an input operation, an output
// operation, a result element type and a pack of argument element types.
// The function name and its first parameters are not visible in this listing.
// NOTE(review): the visible parameter tail matches the transpose_block
// signature without m/n in the cross-reference section -- confirm.
//
// Visible parameters:
//   result_stride  stride value associated with the result pointer
//   result         const pointer to mutable result data
//   arg_stride     stride value associated with the argument pointers
//   args           pack of const pointers to const argument data
111 template <
typename InputOp,
typename OutputOp,
typename Result,
typename... Args>
112 TILEDARRAY_FORCE_INLINE
void 114 const std::size_t result_stride, Result*
const result,
115 const std::size_t arg_stride,
const Args*
const... args)
// Local scratch array of block_size Result elements, declared with the
// project's aligned-storage macro. block_size is defined on a line that is
// not visible here.
118 TILEDARRAY_ALIGNED_STORAGE Result temp[block_size];
// Tail of a call that receives arg_stride and the unmodified argument pointers.
// NOTE(review): presumably the gather step that fills temp -- confirm.
122 arg_stride, args...);
// Tail of a call that receives the scratch array and result_stride.
// NOTE(review): presumably the scatter step that writes temp to result --
// confirm.
125 temp, result_stride);
// Force-inlined function template with the same template parameters as the
// block above, plus explicit extents m and n. The function name and its first
// parameters are not visible in this listing.
// NOTE(review): presumably the transpose_block overload for partial blocks;
// this overload is not listed in the cross-reference section -- confirm.
//
// Visible parameters:
//   m, n           loop extents used by the two loop nests below
//   result_stride  distance in elements between consecutive result_j rows
//   result         restrict-qualified const pointer to mutable result data
//   arg_stride     multiplier applied to i to obtain the argument offset
//   args           pack of restrict-qualified const pointers to const data
129 template <
typename InputOp,
typename OutputOp,
typename Result,
typename... Args>
130 TILEDARRAY_FORCE_INLINE
void 132 const std::size_t m,
const std::size_t n,
133 const std::size_t result_stride, Result* MADNESS_RESTRICT
const result,
134 const std::size_t arg_stride,
const Args* MADNESS_RESTRICT
const... args)
// Local scratch array of block_size Result elements; block_size is defined on
// a line that is not visible here.
140 TILEDARRAY_ALIGNED_STORAGE Result temp[block_size];
// First loop nest: for each i in [0, m), start at argument offset
// i * arg_stride and call input_op with a reference to temp[x] and the
// element at the current offset from every argument pointer. The inner loop
// header that declares x and advances offset is not visible.
143 for(std::size_t i = 0ul; i < m; ++i) {
144 std::size_t offset = i * arg_stride;
146 input_op(temp[x], args[offset]...);
// Second loop nest: for each j in [0, n), result_j points j * result_stride
// elements into result, and output_op is called for each i in [0, m) with
// the address result_j + i and the value temp_j[i]. The declaration of temp_j
// is not visible.
// NOTE(review): temp_j is presumably a pointer into temp for index j --
// confirm.
150 for(std::size_t j = 0ul; j < n; ++j) {
151 Result* MADNESS_RESTRICT
const result_j = result + (j * result_stride);
153 for(std::size_t i = 0ul; i < m; ++i)
154 output_op(result_j + i, temp_j[i]);
// transpose: function template taking an input operation, an output operation,
// extents m and n, a result pointer with its stride, and a pack of argument
// pointers with a shared stride. Only fragments of the body are visible; the
// comments below describe those fragments.
//
// Parameters:
//   input_op       callable whose return value is assigned to a Result (see
//                  wrapper_input_op below)
//   output_op      callable forwarded to calls whose heads are not visible
//   m, n           extents; reduced by mx / nx to obtain the tail sizes
//   result_stride  multiplier applied to nx to locate result_end
//   result         pointer to mutable result data
//   arg_stride     multiplier applied to mx to compute arg_end
//   args           pack of const pointers to const argument data
175 template <
typename InputOp,
typename OutputOp,
typename Result,
typename... Args>
176 void transpose(InputOp&& input_op, OutputOp&& output_op,
177 const std::size_t m,
const std::size_t n,
178 const std::size_t result_stride, Result* result,
179 const std::size_t arg_stride,
const Args*
const... args)
// Remainders of m and n beyond mx and nx. mx and nx are defined on lines that
// are not visible here.
// NOTE(review): presumably m and n rounded down to a multiple of the block
// width (see index_mask in the cross-reference section) -- confirm.
185 const std::size_t m_tail = m - mx;
186 const std::size_t n_tail = n - nx;
// End markers: arg_end is an element offset into the argument data, and
// result_end is a pointer into the result data.
189 const std::size_t arg_end = mx * arg_stride;
190 const Result* result_end = result + (nx * result_stride);
// Adapter that turns the value-returning input_op into a callable that
// stores into a Result reference, the form input_op is invoked with in the
// partial-block fragment above (input_op(temp[x], ...)). param_type is
// declared elsewhere.
192 const auto wrapper_input_op =
193 [&] (Result& res, param_type<Args>... a) { res = input_op(a...); };
// Starting argument offset; the loop that advances it is not visible.
196 std::size_t arg_start = 0;
// First visible traversal: result_ij walks from result towards result_end in
// steps of result_block_step, and each call receives the argument pointers
// advanced by arg_offset. The remainder of the loop header and the heads of
// both calls are not visible.
198 std::size_t arg_offset = arg_start;
199 Result* result_ij = result;
200 for(; result_ij < result_end; result_ij += result_block_step,
203 arg_stride, (args + arg_offset)...);
// Call tail that additionally passes n_tail.
// NOTE(review): presumably the partial-block call for the leftover n_tail
// part -- confirm.
207 n_tail, result_stride, result_ij, arg_stride, (args + arg_offset)...);
// Second visible traversal with the same shape as the first, restarting from
// arg_start and result.
// NOTE(review): presumably handles the m_tail remainder -- confirm.
211 std::size_t arg_offset = arg_start;
212 Result* result_ij = result;
213 for(; result_ij < result_end; result_ij += result_block_step,
217 (args + arg_offset)...);
221 result_stride, result_ij, arg_stride, (args + arg_offset)...);
228 #endif // TILEDARRAY_MATH_TRANSPOSE_H__INCLUDED
void scatter_to(T *const data, std::size_t stride) const
TILEDARRAY_FORCE_INLINE void for_each_block(Op &&op, Result *const result, const Args *const ... args)
static TILEDARRAY_FORCE_INLINE void gather_trans(Op &&op, Result *MADNESS_RESTRICT const result, const std::size_t arg_stride, const Args *MADNESS_RESTRICT const ... args)
std::integral_constant< std::size_t, ~std::size_t(TILEDARRAY_LOOP_UNWIND - 1ul)> index_mask
TILEDARRAY_FORCE_INLINE void transpose_block(InputOp &&input_op, OutputOp &&output_op, const std::size_t result_stride, Result *const result, const std::size_t arg_stride, const Args *const ... args)
Automatic loop unwinding for the partial transpose algorithm.
#define TILEDARRAY_LOOP_UNWIND
static TILEDARRAY_FORCE_INLINE void block_scatter(Op &&op, Result *const result, const Result *const arg, const std::size_t result_stride)
static TILEDARRAY_FORCE_INLINE void block_scatter(Op &&op, Result *const result, const Result *const arg, const std::size_t)
static TILEDARRAY_FORCE_INLINE void gather_trans(Op &&op, Result *MADNESS_RESTRICT const result, const std::size_t arg_stride, const Args *MADNESS_RESTRICT const ... args)
void transpose(InputOp &&input_op, OutputOp &&output_op, const std::size_t m, const std::size_t n, const std::size_t result_stride, Result *result, const std::size_t arg_stride, const Args *const ... args)
Matrix transpose and initialization.
TILEDARRAY_FORCE_INLINE std::enable_if<(sizeof...(Args) >=0)>::type for_each_block_ptr(Op &&op, Result *const result, const Args *const ... args)
TransposeUnwind< TILEDARRAY_LOOP_UNWIND - 1 > TransposeUnwindN
TransposeUnwind< N - 1 > TransposeUnwindN1
static constexpr std::size_t offset