// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 * This is the SYCL specialization of the reduction operation. A two-phase reduction
 * approach is used because the GPU provides no global synchronization of global memory
 * across different work-groups/thread-blocks. In the first phase, each work-group/thread-block
 * reduces its portion of the data locally and writes its partial result to global memory.
 * In the second phase (global reduction), a single work-group/thread-block reduces the
 * intermediate results into one element.
 * Here is an NVIDIA presentation explaining the optimized two-phase reduction algorithm on GPU:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/

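// Illustrative host-side sketch of the two-phase scheme (comment only, not compiled;
// `T`, `in`, `out`, `n` and the group count `G` are placeholders, and `op` stands for an
// Eigen reducer with the initialize/reduce/finalize interface used throughout this file):
//
//   std::vector<T> partials(G);
//   for (int g = 0; g < G; ++g) {                              // phase 1: one work-group each
//     T acc = op.initialize();
//     for (int i = g; i < n; i += G) op.reduce(in[i], &acc);   // strided grid loop
//     partials[g] = op.finalize(acc);                          // on device: a local tree reduction
//   }
//   T result = op.initialize();                                // phase 2: a single work-group
//   for (int g = 0; g < G; ++g) op.reduce(partials[g], &result);
//   out[0] = op.finalize(result);
//
// The kernels below implement this split: FullReductionKernelFunctor and
// PartialReductionKernel perform phase 1, the SecondStep* functors perform phase 2.
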
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {

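// OpDefiner maps the user-supplied reducer onto the operation that is actually executed
// on the device. In the generic case the reducer is forwarded unchanged and finalise_op
// simply returns the accumulator.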
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

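// For the MeanReducer the accumulation is rewritten as a SumReducer; finalise_op then
// divides the accumulated sum by the number of reduced elements (`scale`), either with a
// scalar quotient (non-vectorized case) or a packet-wise pdiv (vectorized case).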
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

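// Second (global) phase of the full reduction: a single work-group reads the per-group
// partial results produced by the first phase (one element per work-item), tree-reduces
// them in local memory, and writes the final scalar to the output.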
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    // Our empirical research shows that the best performance is achieved when there is
    // only one element per thread to reduce in the second step; the second-step
    // reduction time is then almost negligible. Hence the input size of the second step
    // is fixed to the local size, so each thread reads exactly one element. The
    // algorithm must be changed if the number of elements reduced per thread in the
    // second step becomes greater than 1; otherwise the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI.get_pointer() + localid;
    auto aOutPtr = outAcc.get_pointer();
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full reduction, first phase. In this version vectorization is enabled and the reduction
// accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef
      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                                              PacketReturnType, CoeffReturnType>::type OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }

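  // Vectorized first-phase path: each work-item accumulates a strided, packet-wide grid
  // loop over the input, handles the non-vectorizable tail element-wise, and then the
  // work-group tree-reduces the per-item packets in local memory. One partial result is
  // written per work-group.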
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable part
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non-vectorizable tail
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

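  // Scalar first-phase path, used when packet access is not available: each work-item
  // accumulates a strided grid loop element-wise, then the work-group tree-reduces the
  // per-item accumulators in local memory and writes one partial result per work-group.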
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    // element-wise (non-vectorizable) accumulation
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

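// Fallback partial reducer: one work-item per output coefficient, each serially reducing
// all of its input values via GenericDimReducer. Launched by GenericReducer below
// (e.g. for ArgMax-style partial reductions).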
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    auto output_accessor_ptr = output_accessor.get_pointer();
    /// const cast added as a naive solution to solve the qualifier drop error
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

enum class reduction_dim { inner_most, outer_most };
// Partial reduction: the reduce (R) dimension is collapsed while the preserve (P)
// dimension is kept.
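// The nd-range is decomposed into a 2D grid of work-groups: preserve_elements_num_groups
// along the preserved dimension and reduce_elements_num_groups along the reduced one.
// Each work-item first accumulates a strided slice of the reduced dimension
// (element_wise_reduce), then the work-group tree-reduces along its R threads in local
// memory, producing one partial row of num_coeffs_to_preserve values per reduce-group.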
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }

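  // Maps the linear local id onto (P, R) thread coordinates, accumulates the thread's
  // slice, stores it into the padded local scratch (stride LocalThreadSizeP + BC, where
  // the BC padding is presumably there to avoid local-memory bank conflicts), and then
  // tree-reduces along the R direction.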
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    auto scratchPtr = scratch.get_pointer().get();
    auto outPtr =
        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // For the mean reducer the result has already been divided in the previous
        // reduction step, so there is no need to divide again here.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

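// Second step of the partial reduction, used only when the first step ran with more than
// one reduce-group: each work-item combines the partial results of one preserved
// coefficient (stored in rows num_coeffs_to_preserve apart) into the final output value.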
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor.get_pointer() + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
  }
};  // SecondStepPartialReduction

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

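// Host-side launcher for the partial (inner_most / outer_most) reduction. It sizes the
// work-group panel, decides how many reduce-groups to use, runs PartialReductionKernel,
// and, when more than one reduce-group was needed, combines the partial results with a
// SecondStepPartialReduction pass.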
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this helps us avoid an extra check in the kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // Here we force the code to perform at most a 2-step reduction. Our empirical
    // research shows that if each thread reduces at least 64 elements individually, we
    // get better performance; however, this can change on different platforms. It also
    // shows that for the inner_most dim reducer it is better to have 8 groups in the
    // reduce dimension for sizes > 1024 to achieve the best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups);

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);
    }
    return false;
  }
};

} // namespace internal
} // namespace TensorSycl

namespace internal {

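// Eigen's FullReducer entry point for the SYCL device. It launches the first-phase
// FullReductionKernelFunctor and, when more than one work-group was used, a
// SecondStepFullReducer pass over the per-group partial results.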
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // Here we force the code to perform at most a 2-step reduction: our empirical
    // research shows that if each thread reduces at least 512 elements individually, we
    // get better performance.
    const Index reductionPerThread = 2048;
    // const Index num_work_group =
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    // 1
    //                                       ? local_range
    //                                       : 1);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer);

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer);

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer);
    }
  }
};
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
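// GenericReducer launches one work-item per preserved output coefficient; each item
// serially reduces all of its input values (via GenericNondeterministicReducer above),
// so no optimized implementation is advertised.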
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};

} // namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP