applying EIGEN_DECLARE_TEST to *gpu* tests

Also, a few minor fixes for GPU tests running in HIP mode. 1. Adding an include for hip/hip_runtime.h in the Macros.h file For HIP __host__ and __device__ are macros which are defined in hip headers. Their definitions need to be included before their use in the file. 2. Fixing the compile failure in TensorContractionGpu introduced by the commit to "Fuse computations into the Tensor contractions using output kernel" 3. Fixing a HIP/clang specific compile error by making the struct-member assignment explicit
2025-09-26 08:13:13 +08:00 · 2018-07-17 14:16:48 -04:00 · 2018-07-17 14:16:48 -04:00 · f124f07965
commit f124f07965
parent 82f0ce2726
14 changed files with 24 additions and 22 deletions
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@ -395,8 +395,10 @@
  // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
  #define EIGEN_HIPCC __HIPCC__

-  // We need hip_common.h here because __HIP_DEVICE_COMPILE__ is defined in this header.
-  #include <hip/hip_common.h>
+  // We need to include hip_runtime.h here because it pulls in
+  // ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__ 
+  // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+  #include <hip/hip_runtime.h>

  #if defined(__HIP_DEVICE_COMPILE__)
    // analogous to EIGEN_CUDA_ARCH, but for HIP
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@ -580,7 +580,7 @@ template<typename T> struct smart_memmove_helper<T,false> {

 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
  #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
    #define EIGEN_ALLOCA alloca
  #elif EIGEN_COMP_MSVC
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@ -610,6 +610,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
  static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
  {
    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD_MATH(sqrt);
    Index i0;
    // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
    mat.diagonal().cwiseAbs().maxCoeff(&i0);
@ -620,8 +621,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
    VectorType c0, c1;
    n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
    n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
-    if(n0>n1) res = c0/std::sqrt(n0);
-    else      res = c1/std::sqrt(n1);
+    if(n0>n1) res = c0/sqrt(n0);
+    else      res = c1/sqrt(n1);

    return true;
  }
@ -723,7 +724,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
  EIGEN_DEVICE_FUNC
  static inline void computeRoots(const MatrixType& m, VectorType& roots)
  {
-    using std::sqrt;
+    EIGEN_USING_STD_MATH(sqrt);
    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
    const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
    roots(0) = t1 - t0;
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@ -449,8 +449,7 @@ struct TensorContractionEvaluatorBase
    // tensor dimensions (i, j) into the original tensor dimensions.
    // TODO(ezhulenev): Add parameters required to infer output tensor index for
    // more complex contractions than 2x2 on internal dimension.
-    m_tensor_contraction_params = {
-        /**swapped_arguments=*/static_cast<int>(Layout) == RowMajor};
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@ -1215,16 +1215,16 @@ EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
 }


-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {

  typedef GpuDevice Device;

-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
  typedef TensorContractionEvaluatorBase<Self> Base;

-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@ -242,7 +242,7 @@ void test_gpu_argmin_dim()
  }
 }

-void test_cxx11_tensor_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
 {
  CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
  CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
--- a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@ -72,7 +72,7 @@ void test_fallback_conversion() {
 }


-void test_cxx11_tensor_cast_float16_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
 {
  CALL_SUBTEST(test_gpu_conversion());
  CALL_SUBTEST(test_fallback_conversion());
--- a/unsupported/test/cxx11_tensor_contract_gpu.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@ -193,7 +193,7 @@ void test_gpu_contraction_sizes() {
  }
 }

-void test_cxx11_tensor_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
 {
  CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
  CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@ -389,7 +389,7 @@ void test_gpu() {
 }


-void test_cxx11_tensor_device()
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
 {
  CALL_SUBTEST_1(test_cpu());
  CALL_SUBTEST_2(test_gpu());
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@ -1472,7 +1472,7 @@ void test_gpu_gamma_sample_der_alpha()
  gpuFree(d_out);
 }

-void test_cxx11_tensor_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 {
  CALL_SUBTEST_1(test_gpu_nullary());
  CALL_SUBTEST_1(test_gpu_elementwise_small());
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@ -479,7 +479,7 @@ void test_gpu_forced_evals() {
 #endif


-void test_cxx11_tensor_of_float16_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
 {
  CALL_SUBTEST_1(test_gpu_numext<void>());

--- a/unsupported/test/cxx11_tensor_random_gpu.cu
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu
@ -78,7 +78,7 @@ static void test_complex()
 }


-void test_cxx11_tensor_random_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
 {
  CALL_SUBTEST(test_gpu_random_uniform());
  CALL_SUBTEST(test_gpu_random_normal());
--- a/unsupported/test/cxx11_tensor_reduction_gpu.cu
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu
@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
 }


-void test_cxx11_tensor_reduction_gpu() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
--- a/unsupported/test/cxx11_tensor_scan_gpu.cu
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu
@ -71,7 +71,7 @@ void test_gpu_cumsum(int m_size, int k_size, int n_size)
 }


-void test_cxx11_tensor_scan_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
 {
  CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
  CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));