Add the necessary CUDA >= 300 checks back

Igor Babuschkin 2016-08-18 17:15:12 +01:00
parent 841e075154
commit 1569a7d7ab

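For context, __CUDA_ARCH__ is defined only during the device-side compilation pass and carries the target compute capability (300 for sm_30), so an "#if __CUDA_ARCH__ >= 300" guard keeps Kepler-only features such as warp shuffles out of builds for older GPUs, while the #else branch leaves a loud run-time assert behind. A minimal sketch of that pattern, assuming a hypothetical warpSum helper that is not part of the Eigen sources (and the pre-CUDA-9 __shfl_down API that code of this era used):

#include <cassert>

// Hypothetical example of the guard pattern this commit reinstates.
// __shfl_down requires compute capability >= 3.0, so the fast path is
// compiled only for those targets; older targets hit the assert instead.
__device__ inline float warpSum(float val) {
#if __CUDA_ARCH__ >= 300
  // Tree reduction within a warp using Kepler warp shuffles;
  // after the loop, lane 0 holds the sum over the warp.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset);
  }
  return val;
#else
  assert(0 && "warpSum requires compute capability >= 3.0");
  return val;
#endif
}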

@@ -23,6 +23,7 @@ namespace internal {
 // updated the content of the output address it will try again.
 template <typename T, typename R>
 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if __CUDA_ARCH__ >= 300
   if (sizeof(T) == 4)
   {
     unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -40,6 +41,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
         return;
       }
     }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
   }
   else if (sizeof(T) == 8) {
     unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
@@ -98,7 +102,11 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if __CUDA_ARCH__ >= 300
   atomicAdd(output, accum);
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
 }
@@ -116,6 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                     typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
   // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
   if (gridDim.x == 1) {
@@ -170,6 +179,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     // Let the last block reset the semaphore
     atomicInc(semaphore, gridDim.x + 1);
   }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
 }
@@ -684,7 +696,6 @@ struct OuterReducer<Self, Op, GpuDevice> {
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                   internal::is_same<typename Self::CoeffReturnType, double>::value);
-
   template <typename Device, typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce doubles or floats on a gpu device");
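The comment preserved at the top of the first hunk describes the lock-free retry loop used for 4-byte types: read the output, merge in the new value, and publish the result with atomicCAS, repeating whenever another thread updated the address in the meantime. A rough sketch of that compare-and-swap idiom, using a hypothetical atomicMaxFloat helper rather than Eigen's templated reducer:

// Hypothetical CAS-based atomic reduction on a 4-byte value. The loop
// rereads and retries until no other thread has modified *addr between
// the read and the compare-and-swap.
__device__ void atomicMaxFloat(float* addr, float value) {
  unsigned int* addr_as_ui = reinterpret_cast<unsigned int*>(addr);
  unsigned int old = *addr_as_ui;
  unsigned int assumed;
  do {
    assumed = old;
    float merged = fmaxf(__uint_as_float(assumed), value);
    if (merged == __uint_as_float(assumed)) {
      return;  // the stored value already dominates; nothing to publish
    }
    old = atomicCAS(addr_as_ui, assumed, __float_as_uint(merged));
  } while (old != assumed);  // lost the race: retry against the new value
}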