Silenced several compilation warnings triggered by nvcc.

This commit is contained in:
Benoit Steiner 2016-01-11 14:25:43 -08:00
parent 2c3b13eded
commit b523771a24
3 changed files with 46 additions and 22 deletions

View File

@@ -10,7 +10,6 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
namespace Eigen { namespace Eigen {
// This defines an interface that GPUDevice can take to use // This defines an interface that GPUDevice can take to use
@@ -206,20 +205,45 @@ struct GpuDevice {
#endif #endif
} }
inline int getNumCudaMultiProcessors() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount; return stream_->deviceProperties().multiProcessorCount;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
} }
inline int maxCudaThreadsPerBlock() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock; return stream_->deviceProperties().maxThreadsPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
} }
inline int maxCudaThreadsPerMultiProcessor() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor; return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
} }
inline int sharedMemPerBlock() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock; return stream_->deviceProperties().sharedMemPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
} }
inline int majorDeviceVersion() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major; return stream_->deviceProperties().major;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
} }
// This function checks if the CUDA runtime recorded an error for the // This function checks if the CUDA runtime recorded an error for the
@@ -239,13 +263,13 @@ struct GpuDevice {
}; };
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess); assert(cudaGetLastError() == cudaSuccess);
#else #else
#define LAUNCH_CUDA_KERNEL(kernel, ...) \ #define LAUNCH_CUDA_KERNEL(kernel, ...) \
{ static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
#endif #endif
@@ -260,4 +284,4 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

View File

@@ -156,14 +156,14 @@ template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> { class TensorExecutor<Expression, GpuDevice, false> {
public: public:
typedef typename Expression::Index Index; typedef typename Expression::Index Index;
static void run(const Expression& expr, const GpuDevice& device); static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
}; };
template <typename Expression> template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> { class TensorExecutor<Expression, GpuDevice, true> {
public: public:
typedef typename Expression::Index Index; typedef typename Expression::Index Index;
static void run(const Expression& expr, const GpuDevice& device); static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
}; };
#if defined(__CUDACC__) #if defined(__CUDACC__)
@@ -213,7 +213,7 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
/*static*/ /*static*/
template <typename Expression> template <typename Expression>
inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device) EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{ {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@@ -232,7 +232,7 @@ inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression&
/*static*/ /*static*/
template<typename Expression> template<typename Expression>
inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device) EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{ {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

View File

@@ -115,8 +115,8 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
internal::is_same<typename Self::CoeffReturnType, float>::value; internal::is_same<typename Self::CoeffReturnType, float>::value;
template <typename OutputType> template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
eigen_assert(false && "Should only be called on floats"); assert(false && "Should only be called on floats");
} }
static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
@@ -210,11 +210,11 @@ struct InnerReducer<Self, Op, GpuDevice> {
internal::is_same<typename Self::CoeffReturnType, float>::value; internal::is_same<typename Self::CoeffReturnType, float>::value;
template <typename Device, typename OutputType> template <typename Device, typename OutputType>
static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce floats on a gpu device"); assert(false && "Should only be called to reduce floats on a gpu device");
} }
static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index; typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -264,11 +264,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
internal::is_same<typename Self::CoeffReturnType, float>::value; internal::is_same<typename Self::CoeffReturnType, float>::value;
template <typename Device, typename OutputType> template <typename Device, typename OutputType>
static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce floats on a gpu device"); assert(false && "Should only be called to reduce floats on a gpu device");
} }
static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index; typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;