Refined the #ifdef __CUDACC__ guard to ensure that trying to compile GPU code with a non-CUDA compiler results in a linking error instead of bogus code.

Benoit Steiner 2015-10-23 09:15:34 -07:00
parent ac99b49249
commit 9ea39ce13c
3 changed files with 54 additions and 37 deletions
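
In short, the commit arranges for every compiler that sees EIGEN_USE_GPU to get the declarations of the GPU code paths, while only nvcc (which defines __CUDACC__) sees their definitions. A host-only build that actually reaches a GPU code path then fails with an undefined reference at link time instead of silently compiling bogus code. Below is a minimal stand-alone sketch of that pattern; the header and function names are purely illustrative and are not part of Eigen.

// gpu_fill.h -- hypothetical header, included by host and device builds alike.
#if defined(EIGEN_USE_GPU)

// Declared for every compiler, so call sites always parse and type-check.
template <typename T>
void gpu_fill(T* dst, T value, int n);

#if defined(__CUDACC__)
// Defined only when nvcc is compiling; the kernel launch below would be
// meaningless to a plain host compiler.  A host-only build that instantiates
// gpu_fill() therefore stops at link time with an undefined reference,
// which is the behaviour the commit message describes.
template <typename T>
__global__ void fill_kernel(T* dst, T value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) dst[i] = value;
}

template <typename T>
void gpu_fill(T* dst, T value, int n) {
  fill_kernel<T><<<(n + 255) / 256, 256>>>(dst, value, n);
}
#endif  // __CUDACC__

#endif  // EIGEN_USE_GPU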

View File

@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU)
template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
{
public:
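With the "&& defined(__CUDACC__)" part of the guard dropped, the TensorDevice<ExpressionType, GpuDevice> specialization that expr.device(gpu) returns is declared for any compiler that defines EIGEN_USE_GPU, not just for nvcc. Roughly, host-side code like the hypothetical helper below (the function name and raw pointers are illustrative, and the pointers are assumed to refer to device memory) now parses with a plain C++ compiler, provided the CUDA headers are on the include path; if nothing nvcc-built supplies the GPU executor, the mistake only surfaces at link time.

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical host-side helper: builds a GPU expression without any
// CUDA-specific syntax, so a plain C++ compiler can parse it.
void add_on_gpu(const Eigen::GpuDevice& gpu,
                float* d_a, float* d_b, float* d_c, int n) {
  Eigen::TensorMap<Eigen::Tensor<float, 1>> a(d_a, n), b(d_b, n), c(d_c, n);
  c.device(gpu) = a + b;   // goes through TensorDevice<..., GpuDevice>
}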

View File

@@ -287,6 +287,7 @@ class StreamInterface {
virtual void deallocate(void* buffer) const = 0;
};
#if defined(__CUDACC__)
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
const cudaStream_t* stream_;
int device_;
};
#endif // __CUDACC__
struct GpuDevice {
// The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef __CUDA_ARCH__
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
assert(err == cudaSuccess);
#else
@@ -477,8 +478,12 @@ struct GpuDevice {
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
#ifdef __CUDACC__
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
return false;
#endif
}
private:
@@ -493,10 +498,12 @@ struct GpuDevice {
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
assert(status == cudaSuccess);
}
#endif
#endif
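
This file follows a complementary pattern: GpuDevice itself stays visible to every compiler, the pieces that only make sense with the CUDA toolkit (the cached cudaDeviceProp data and CudaStreamDevice) move inside a __CUDACC__ block, and member bodies such as synchronize() and ok() fence off their CUDA runtime calls, substituting a harmless fallback where a value has to be returned. A minimal stand-alone sketch of that last idea, not the actual Eigen code:

#ifdef __CUDACC__
#include <cuda_runtime.h>
#endif

// Keep the function visible to every compiler and fence off only the body
// that calls into the CUDA runtime, returning a harmless fallback value for
// host-only builds.
inline bool stream_ok(void* raw_stream) {
#ifdef __CUDACC__
  cudaError_t err = cudaStreamQuery(static_cast<cudaStream_t>(raw_stream));
  return err == cudaSuccess || err == cudaErrorNotReady;
#else
  (void)raw_stream;   // unused without CUDA
  return false;       // without nvcc there is no stream to query
#endif
}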

View File

@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU)
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
public:
typedef typename Expression::Index Index;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
public:
typedef typename Expression::Index Index;
static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
@@ -193,14 +210,10 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
}
}
template<typename Expression>
class TensorExecutor<Expression, GpuDevice, false>
/*static*/
template <typename Expression>
inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const GpuDevice& device)
{
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -211,16 +224,13 @@ class TensorExecutor<Expression, GpuDevice, false>
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
};
}
/*static*/
template<typename Expression>
class TensorExecutor<Expression, GpuDevice, true>
inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const GpuDevice& device)
{
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -231,10 +241,10 @@ class TensorExecutor<Expression, GpuDevice, true>
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
};
}
#endif
#endif // __CUDACC__
#endif // EIGEN_USE_GPU
} // end namespace internal
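
The key restructuring in this last file is that the TensorExecutor partial specializations for GpuDevice now only declare run() where every compiler can see it, while the definitions move out of the class, next to the kernels, inside the __CUDACC__-only region. A sketch of that C++ idiom with illustrative names (Executor and FakeGpuDevice are stand-ins, not Eigen types):

// Primary template, only declared for this sketch.
template <typename Expression, typename Device, bool Vectorizable>
class Executor;

struct FakeGpuDevice {};  // stand-in for GpuDevice

template <typename Expression, bool Vectorizable>
class Executor<Expression, FakeGpuDevice, Vectorizable> {
 public:
  // Declared for every compiler; no CUDA syntax is needed here.
  static void run(const Expression& expr, const FakeGpuDevice& device);
};

#if defined(__CUDACC__)
// Out-of-class definition of the specialization's static member, seen only by
// nvcc, where a kernel launch would be legal.
template <typename Expression, bool Vectorizable>
void Executor<Expression, FakeGpuDevice, Vectorizable>::run(
    const Expression& expr, const FakeGpuDevice& device) {
  // evaluate expr and launch the CUDA kernel here
  (void)expr;
  (void)device;
}
#endif  // __CUDACC__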