Properly gate the use of half2.
commit 4ede059de1
parent bf185c3c28
@@ -323,10 +323,12 @@ template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
 
 
+#ifdef EIGEN_HAS_CUDA_FP16
 template <typename S, typename R, typename I>
 __global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+#endif
 
 template <int NPT, typename S, typename R, typename I>
 __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
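For context: the fix moves every declaration that mentions half2 behind EIGEN_HAS_CUDA_FP16, because half2 is only a defined type when CUDA FP16 support is available; an ungated declaration would fail to parse on older toolkits. A minimal sketch of the same gating pattern, with illustrative names that are not part of the patch:

#ifdef EIGEN_HAS_CUDA_FP16
#include <cuda_fp16.h>  // provides the half and half2 types

// Visible only when FP16 support was detected; otherwise the compiler
// never sees the half2 token, which would be an undeclared identifier.
__global__ void FillHalf2(half2* out, half2 value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}
#endif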
@@ -624,8 +626,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+#ifdef EIGEN_HAS_CUDA_FP16
 template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
 template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+#endif
 template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
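The evaluator's friend list has to track the forward declarations one-for-one: each gated kernel also needs a gated friend declaration, or the class would name a nonexistent template on non-FP16 builds. A stripped-down illustration of the pattern (simplified types, not Eigen's actual classes):

#ifdef EIGEN_HAS_CUDA_FP16
#include <cuda_fp16.h>  // needed for half2 in the gated declarations
#endif

namespace internal {
template <typename S> __global__ void PlainKernel(S*);
#ifdef EIGEN_HAS_CUDA_FP16
template <typename S> __global__ void Fp16Kernel(S*, half2*);
#endif
}

class Evaluator {
  // Each kernel that touches private state is befriended, and the FP16
  // friend is gated exactly like its forward declaration above.
  template <typename S> friend void internal::PlainKernel(S*);
#ifdef EIGEN_HAS_CUDA_FP16
  template <typename S> friend void internal::Fp16Kernel(S*, half2*);
#endif
  int private_state_;
};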
@@ -68,6 +68,7 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 }
 
 
+#ifdef EIGEN_HAS_CUDA_FP16
 template <template <typename T> class R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
 #if __CUDA_ARCH__ >= 300
@@ -90,6 +91,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
   assert(0 && "Shouldn't be called on unsupported device");
 #endif
 }
+#endif
 
 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
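The overload closed here is the device-side atomic reduce for a packed pair of halves. CUDA has no native half2 atomics at compute capability 3.0, so reductions of this shape are usually built from a 32-bit atomicCAS loop over the reinterpreted pair. A hedged sketch of that technique, using a hypothetical reducer interface rather than Eigen's:

// Hedged sketch, not the patch's actual body: a half2 atomic reduce
// built from a 32-bit compare-and-swap over the reinterpreted pair.
// The reducer's call operator is a hypothetical interface.
#ifdef EIGEN_HAS_CUDA_FP16
#include <cuda_fp16.h>
#include <assert.h>

template <typename Reducer>
__device__ inline void atomicReduceHalf2Sketch(half2* output, half2 accum,
                                               Reducer& reducer) {
#if __CUDA_ARCH__ >= 300
  unsigned int* address = reinterpret_cast<unsigned int*>(output);
  unsigned int observed = *address;
  unsigned int assumed;
  do {
    assumed = observed;
    // Combine the value currently in memory with our contribution...
    half2 current = *reinterpret_cast<half2*>(&assumed);
    half2 combined = reducer(current, accum);
    unsigned int desired = *reinterpret_cast<unsigned int*>(&combined);
    if (desired == assumed) return;  // memory already holds the result
    // ...and retry until no other thread raced us in between.
    observed = atomicCAS(address, assumed, desired);
  } while (observed != assumed);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#endif

The float specialization that begins at the end of the hunk needs none of this: atomicAdd on 32-bit floats is available natively, so a float sum reduce can forward to it directly.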