Refined the #ifdef __CUDACC__ guard so that attempting to compile GPU code with a non-CUDA compiler results in a linking error instead of bogus code.
This commit is contained in:
parent  ac99b49249
commit  9ea39ce13c
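The pattern behind the change: declarations that GPU users must be able to see are guarded only by EIGEN_USE_GPU, while definitions that require the CUDA toolchain stay behind __CUDACC__, which only nvcc defines. A minimal sketch of the idea, using invented names rather than Eigen's actual code:

    #if defined(EIGEN_USE_GPU)

    // Visible to every compiler, so host-only translation units can
    // reference the interface.
    template <typename Expression>
    struct GpuRunner {  // hypothetical stand-in for TensorExecutor
      static void run(const Expression& expr);
    };

    #if defined(__CUDACC__)
    // Compiled only by nvcc. A host-only translation unit that calls
    // GpuRunner<E>::run() compiles against the declaration above, but if
    // no nvcc-built object file supplies this definition, the build stops
    // at link time with an undefined reference -- a loud, early failure
    // instead of silently producing bogus device code.
    template <typename Expression>
    void GpuRunner<Expression>::run(const Expression& expr) {
      // ... compute the launch configuration and launch the kernel ...
    }
    #endif  // __CUDACC__

    #endif  // EIGEN_USE_GPU

Previously both halves sat behind the combined guard, so building GPU code with a non-CUDA compiler simply dropped the GPU path instead of failing. The diff below applies this split throughout the tensor device and executor code.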
@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice>
 #endif
 
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
 template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
 {
   public:
@@ -287,6 +287,7 @@ class StreamInterface {
   virtual void deallocate(void* buffer) const = 0;
 };
 
+#if defined(__CUDACC__)
 static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;
 
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
   const cudaStream_t* stream_;
   int device_;
 };
-
+#endif  // __CUDACC__
 
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef __CUDA_ARCH__
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
     cudaError_t err = cudaStreamSynchronize(stream_->stream());
     assert(err == cudaSuccess);
 #else
@@ -477,8 +478,12 @@ struct GpuDevice {
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
+#ifdef __CUDACC__
     cudaError_t error = cudaStreamQuery(stream_->stream());
     return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
   }
 
  private:
@@ -493,10 +498,12 @@ struct GpuDevice {
 
 
 // FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
 static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
   cudaError_t status = cudaDeviceSetSharedMemConfig(config);
   assert(status == cudaSuccess);
 }
+#endif
 
 #endif
 
@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, false> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, true> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(__CUDACC__)
+
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
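The hunk above is the core of the executor change: both TensorExecutor specializations are now declared whenever EIGEN_USE_GPU is defined, while the __global__ kernels and the run() definitions that follow remain behind __CUDACC__. The intended failure mode for a host-only build, sketched with invented file names and compiler invocations:

    // user.cpp, compiled by a plain host compiler: __CUDACC__ is not
    // defined, so only the run() declarations above are visible.
    //
    //   g++ -DEIGEN_USE_GPU -c user.cpp   // compiles: declarations suffice
    //   g++ user.o -o app                 // link error: undefined reference
    //                                     // to internal::TensorExecutor<...>::run
    //   nvcc -x cu -DEIGEN_USE_GPU ...    // links: nvcc emits the definitions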
@@ -193,14 +210,10 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
   }
 }
 
 
-template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, false>
-{
- public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
-  {
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
+{
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
@@ -211,16 +224,13 @@ class TensorExecutor<Expression, GpuDevice, false>
       LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
-    evaluator.cleanup();
-  }
-};
+  evaluator.cleanup();
+}
 
 
-template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, true>
-{
- public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
-  {
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
+{
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
@@ -231,10 +241,10 @@ class TensorExecutor<Expression, GpuDevice, true>
       LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
-    evaluator.cleanup();
-  }
-};
+  evaluator.cleanup();
+}
 
-#endif
+#endif  // __CUDACC__
+#endif  // EIGEN_USE_GPU
 
 } // end namespace internal