Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-08-14 12:46:00 +08:00
Refined the #ifdef __CUDACC__ guard to ensure that trying to compile GPU code with a non-CUDA compiler results in a linking error instead of bogus code.
This commit is contained in:
parent ac99b49249
commit 9ea39ce13c
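The pattern behind the change, as a minimal sketch (GpuKernel and launch are hypothetical names, not Eigen's): the declaration of a GPU entry point stays visible whenever EIGEN_USE_GPU is defined, while its definition is emitted only when the translation unit is compiled by nvcc, i.e. when __CUDACC__ is defined. Host-only code that uses the entry point then still compiles, but linking fails with an undefined reference unless some nvcc-built object file supplies the definition, instead of silently instantiating bogus code.

#if defined(EIGEN_USE_GPU)
struct GpuKernel {
  // Declaration: visible to every compiler, so host code type-checks.
  static void launch(float* data, int n);
};

#if defined(__CUDACC__)
// Definition: only emitted when nvcc compiles this translation unit.
// 'inline' keeps the header ODR-safe if several CUDA TUs include it.
inline void GpuKernel::launch(float* data, int n) {
  // ... actual kernel launch elided ...
}
#endif  // __CUDACC__
#endif  // EIGEN_USE_GPU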
@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
 #endif
 
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
 template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
 {
  public:
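With the relaxed guard above, TensorDevice<ExpressionType, GpuDevice> is declared for plain host compilers as well, so the usual device-assignment syntax type-checks everywhere. Roughly, as a sketch with the stream setup elided (constructing a CudaStreamDevice still requires nvcc):

Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
Eigen::GpuDevice gpu(/* StreamInterface*, built in an nvcc TU */);
a.device(gpu) = b * 2.0f;  // compiles with g++; links only against nvcc-built code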
@@ -287,6 +287,7 @@ class StreamInterface {
   virtual void deallocate(void* buffer) const = 0;
 };
 
+#if defined(__CUDACC__)
 static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;
 
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
   const cudaStream_t* stream_;
   int device_;
 };
-
+#endif  // __CUDACC__
 
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef __CUDA_ARCH__
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
     cudaError_t err = cudaStreamSynchronize(stream_->stream());
     assert(err == cudaSuccess);
 #else
@@ -477,8 +478,12 @@ struct GpuDevice {
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
+#ifdef __CUDACC__
     cudaError_t error = cudaStreamQuery(stream_->stream());
     return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
   }
 
  private:
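The new #else branch makes ok() callable from host-only translation units: without nvcc there is no CUDA runtime to query, so it conservatively reports failure rather than calling cudaStreamQuery. A hypothetical host-side caller (not from Eigen):

void check_device(const Eigen::GpuDevice& d) {
  if (!d.ok()) {
    // Either the stream recorded an error, or this file was not compiled
    // with nvcc, in which case ok() conservatively returns false.
  }
}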
@@ -493,10 +498,12 @@ struct GpuDevice {
 
 
   // FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
   static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
     cudaError_t status = cudaDeviceSetSharedMemConfig(config);
     assert(status == cudaSuccess);
   }
+#endif
 
 #endif
 
@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, false> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, true> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(__CUDACC__)
+
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
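Note the shape of the new block: both TensorExecutor specializations are declared as soon as EIGEN_USE_GPU is set, with run() left undefined, and the definitions follow further down only under __CUDACC__. A host-compiled user of the GPU path therefore compiles but cannot link, which is exactly the diagnostic the commit message promises. An illustrative call site (type names follow the diff; the wrapper itself is hypothetical):

template <typename Expression>
void assign_on_gpu(const Expression& expr, const Eigen::GpuDevice& dev) {
  // Compiles under g++ -DEIGEN_USE_GPU thanks to the declaration above;
  // linking needs an nvcc-built TU providing the definition of run().
  Eigen::internal::TensorExecutor<Expression, Eigen::GpuDevice, false>::run(expr, dev);
}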
@@ -193,13 +210,9 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
   }
 }
 
+/*static*/
 template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, false>
-{
- public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
-  {
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
+{
     TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
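The /*static*/ comment in the new definitions is a reminder: C++ forbids repeating the static keyword on an out-of-class definition of a static member function, so the commit records it in a comment instead. Moving run() out of the class body is what lets the declaration and the implementation sit behind different preprocessor guards, and the inline keyword keeps the header ODR-safe when several nvcc-compiled translation units include it. The same split, reduced to its essentials (Widget is a made-up name):

struct Widget {
  static void run();              // 'static' appears only on the declaration
};
#if defined(__CUDACC__)
/*static*/ inline void Widget::run() {
  // Definition compiled only by nvcc; 'static' must not be repeated here.
}
#endif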
@@ -212,14 +225,11 @@ class TensorExecutor<Expression, GpuDevice, false>
     }
     evaluator.cleanup();
   }
-};
 
 
+/*static*/
 template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, true>
-{
- public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
-  {
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
+{
     TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@@ -232,9 +242,9 @@ class TensorExecutor<Expression, GpuDevice, true>
     }
     evaluator.cleanup();
   }
-};
 
-#endif
+#endif  // __CUDACC__
+#endif  // EIGEN_USE_GPU
 
 } // end namespace internal
 