mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-06 19:29:08 +08:00
Simplified the reduction code a little.
This commit is contained in:
parent
75bd2bd32d
commit
4013b8feca
@ -91,8 +91,8 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <>
|
||||||
__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
|
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||||
#if __CUDA_ARCH__ >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
atomicAdd(output, accum);
|
atomicAdd(output, accum);
|
||||||
#else
|
#else
|
||||||
@ -208,9 +208,14 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
template <typename Self, typename Op, bool is_half>
|
template <typename Self, typename Op>
|
||||||
struct Launcher {
|
struct FullReductionLauncher {
|
||||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, typename Self::CoeffReturnType* output, typename Self::Index num_coeffs) {
|
template <typename OutputType>
|
||||||
|
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
|
||||||
|
assert(false && "Should only be called on floats and half floats");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
|
||||||
typedef typename Self::Index Index;
|
typedef typename Self::Index Index;
|
||||||
typedef typename Self::CoeffReturnType Scalar;
|
typedef typename Self::CoeffReturnType Scalar;
|
||||||
const int block_size = 256;
|
const int block_size = 256;
|
||||||
@ -220,18 +225,15 @@ struct Launcher {
|
|||||||
if (num_blocks > 1) {
|
if (num_blocks > 1) {
|
||||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||||
// won't be a race conditions between multiple thread blocks.
|
// won't be a race conditions between multiple thread blocks.
|
||||||
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
|
LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>),
|
||||||
1, 32, 0, device, reducer.initialize(), 1, output);
|
1, 32, 0, device, reducer.initialize(), 1, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
||||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
|
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_CUDA_FP16
|
#ifdef EIGEN_HAS_CUDA_FP16
|
||||||
template <typename Self, typename Op>
|
|
||||||
struct Launcher<Self, Op, true> {
|
|
||||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
|
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
|
||||||
typedef typename Self::Index Index;
|
typedef typename Self::Index Index;
|
||||||
|
|
||||||
@ -255,8 +257,8 @@ struct Launcher<Self, Op, true> {
|
|||||||
1, 1, 0, device, reducer, output, scratch);
|
1, 1, 0, device, reducer, output, scratch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
|
||||||
#endif
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
template <typename Self, typename Op, bool Vectorizable>
|
template <typename Self, typename Op, bool Vectorizable>
|
||||||
@ -282,8 +284,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const bool is_half = internal::is_same<typename Self::CoeffReturnType, half>::value;
|
FullReductionLauncher<Self, Op>::run(self, reducer, device, output, num_coeffs);
|
||||||
Launcher<Self, Op, is_half>::run(self, reducer, device, output, num_coeffs);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user