Disable tree reduction for GPU.

For moderately sized inputs, running the tree reduction quickly
exhausts the GPU's per-thread stack space, leading to memory errors.
This was happening in the `cxx11_tensor_complex_gpu` test, for example.
Disabling tree reduction on GPU fixes this.

(cherry picked from commit 24ebb37f38287d65c0e0b60c714e39faffeb5b94)
This commit is contained in:
Antonio Sanchez 2021-10-08 11:38:13 -07:00
parent 89a71f3126
commit 554982beef

View File

@ -166,8 +166,12 @@ struct GenericDimReducer<-1, Self, Op> {
}; };
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess), template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
!Self::ReducerTraits::IsExactlyAssociative)> !Self::ReducerTraits::IsExactlyAssociative &&
// GPU threads can quickly run out of stack space
// for moderately sized inputs.
!Self::RunningOnGPU
)>
struct InnerMostDimReducer { struct InnerMostDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
typename Self::CoeffReturnType accum = reducer.initialize(); typename Self::CoeffReturnType accum = reducer.initialize();
@ -528,6 +532,18 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
// Subset of strides of the input tensor for the non-reduced dimensions. // Subset of strides of the input tensor for the non-reduced dimensions.
// Indexed by output dimensions. // Indexed by output dimensions.
static const int NumPreservedStrides = max_n_1<NumOutputDims>::size; static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
// For full reductions
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static constexpr bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
static const bool RunningOnGPU = false;
#else
static constexpr bool RunningOnGPU = false;
static constexpr bool RunningOnSycl = false;
#endif
enum { enum {
IsAligned = false, IsAligned = false,
@ -950,17 +966,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
// Operation to apply for computing the reduction. // Operation to apply for computing the reduction.
Op m_reducer; Op m_reducer;
// For full reductions
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
static const bool RunningOnGPU = false;
#else
static const bool RunningOnGPU = false;
static const bool RunningOnSycl = false;
#endif
EvaluatorPointerType m_result; EvaluatorPointerType m_result;
const Device EIGEN_DEVICE_REF m_device; const Device EIGEN_DEVICE_REF m_device;