Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-07-30 16:52:01 +08:00
Add CUDA complex sqrt.
This is to support scalar `sqrt` of complex numbers `std::complex<T>` on device, requested by the TensorFlow folks. Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex<T>)` function only works on the host. Here we create an overload to add back the functionality. Also modified the CMake file to add the `--relaxed-constexpr` (or equivalent) flag for NVCC, to allow calling constexpr functions from device functions, and added support for specifying the compute architecture for NVCC (this was already available for clang).
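For reference, here is a minimal usage sketch (not part of the commit) of what the overload enables: calling `Eigen::numext::sqrt` on a `std::complex<T>` value from a device kernel. The kernel name and launch configuration are illustrative only; the file is assumed to be compiled as CUDA with NVCC's `--expt-relaxed-constexpr` (or with clang-cuda) against an Eigen checkout that contains the `sqrt_impl<std::complex<T>>` specialization added below.

```cpp
// Illustrative only: device-side complex sqrt via Eigen's math-function dispatch.
#include <complex>
#include <Eigen/Core>

__global__ void complex_sqrt_kernel(const std::complex<float>* in,
                                    std::complex<float>* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // std::sqrt(std::complex<T>) is host-only under NVCC; numext::sqrt routes
    // through internal::sqrt_impl, which this commit specializes for
    // std::complex<T> in device code.
    out[i] = Eigen::numext::sqrt(in[i]);
  }
}
```

The CMake changes further down wire the corresponding `--expt-relaxed-constexpr` and `-gencode` flags into the GPU unit tests, driven by the `EIGEN_CUDA_COMPUTE_ARCH` list.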
This commit is contained in:
parent
fdf2ee62c5
commit
070d303d56
@@ -323,6 +323,27 @@ struct abs2_retval
   typedef typename NumTraits<Scalar>::Real type;
 };
 
+/****************************************************************************
+* Implementation of sqrt                                                *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+template<typename Scalar>
+struct sqrt_retval
+{
+  typedef Scalar type;
+};
+
 /****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
@@ -1368,12 +1389,11 @@ inline int log2(int x)
   *
   * It's usage is justified in performance critical functions, like norm/normalize.
   */
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T sqrt(const T &x)
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
 {
-  EIGEN_USING_STD(sqrt);
-  return sqrt(x);
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
 }
 
 // Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
@@ -12,12 +12,12 @@
 
 // clang-format off
 
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+
 namespace Eigen {
 
 namespace internal {
 
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
-
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
 // functions and thus Eigen functors making use of these operators fail to
@@ -94,10 +94,53 @@ template<typename T> struct scalar_quotient_op<const std::complex<T>, const std:
 
 template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
 
+template<typename T>
+struct sqrt_impl<std::complex<T>> {
+  static EIGEN_DEVICE_FUNC std::complex<T> run(const std::complex<T>& z) {
+    // Computes the principal sqrt of the input.
+    //
+    // For the complex square root of the number x + i*y, we want to find real
+    // numbers u and v such that
+    //    (u + i*v)^2 = x + i*y  <=>
+    //    u^2 - v^2 + i*2*u*v = x + i*y.
+    // By equating the real and imaginary parts we get:
+    //    u^2 - v^2 = x
+    //    2*u*v = y.
+    //
+    // For x >= 0, this has the numerically stable solution
+    //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+    //    v = y / (2 * u)
+    // and for x < 0,
+    //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+    //    u = y / (2 * v)
+    //
+    // Letting w = sqrt(0.5 * (|x| + |z|)),
+    //   if x == 0: u = w, v = sign(y) * w
+    //   if x > 0:  u = w, v = y / (2 * w)
+    //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+
+    const T x = numext::real(z);
+    const T y = numext::imag(z);
+    const T zero = T(0);
+    const T cst_half = T(0.5);
+
+    // Special case of isinf(y)
+    if ((numext::isinf)(y)) {
+      const T inf = std::numeric_limits<T>::infinity();
+      return std::complex<T>(inf, y);
+    }
+
+    T w = numext::sqrt(cst_half * (numext::abs(x) + numext::abs(z)));
+    return
+      x == zero ? std::complex<T>(w, y < zero ? -w : w)
+          : x > zero ? std::complex<T>(w, y / (2 * w))
+              : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
+  }
+};
+
+} // namespace internal
+} // namespace Eigen
+
 #endif
 
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_CUDA_H
+#endif // EIGEN_COMPLEX_CUDA_H
@@ -703,8 +703,8 @@ Packet psqrt_complex(const Packet& a) {
   //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
   //    v = 0.5 * (y / u)
   // and for x < 0,
-  //    v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2)))
-  //    u = |0.5 * (y / v)|
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
   //
   // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
   //    l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
@@ -395,6 +395,12 @@ find_package(CUDA 5.0)
 if(CUDA_FOUND)
 
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
@@ -404,7 +410,12 @@ if(CUDA_FOUND)
     foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
       string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
     endforeach()
+  else()
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
+    endforeach()
   endif()
+  string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
   ei_add_test(gpu_basic)
@@ -14,7 +14,6 @@
 #endif
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
 #include "main.h"
@@ -54,6 +53,59 @@ struct coeff_wise {
   }
 };
 
+template<typename T>
+struct complex_sqrt {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_special_inputs = 18;
+
+    if (i == 0) {
+      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+      SpecialInputs special_in;
+      special_in.setZero();
+      int idx = 0;
+      special_in[idx++] = ComplexType(0, 0);
+      special_in[idx++] = ComplexType(-0, 0);
+      special_in[idx++] = ComplexType(0, -0);
+      special_in[idx++] = ComplexType(-0, -0);
+      // GCC's fallback sqrt implementation fails for inf inputs.
+      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+      // clang includes the GCC header (which temporarily disables
+      // _GLIBCXX_USE_C99_COMPLEX)
+      #if !defined(_GLIBCXX_COMPLEX) || \
+        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+      const ValueType inf = std::numeric_limits<ValueType>::infinity();
+      special_in[idx++] = ComplexType(1.0, inf);
+      special_in[idx++] = ComplexType(nan, inf);
+      special_in[idx++] = ComplexType(1.0, -inf);
+      special_in[idx++] = ComplexType(nan, -inf);
+      special_in[idx++] = ComplexType(-inf, 1.0);
+      special_in[idx++] = ComplexType(inf, 1.0);
+      special_in[idx++] = ComplexType(-inf, -1.0);
+      special_in[idx++] = ComplexType(inf, -1.0);
+      special_in[idx++] = ComplexType(-inf, nan);
+      special_in[idx++] = ComplexType(inf, nan);
+      #endif
+      special_in[idx++] = ComplexType(1.0, nan);
+      special_in[idx++] = ComplexType(nan, 1.0);
+      special_in[idx++] = ComplexType(nan, -1.0);
+      special_in[idx++] = ComplexType(nan, nan);
+
+      Map<SpecialInputs> special_out(out);
+      special_out = special_in.cwiseSqrt();
+    }
+
+    T x1(in + i);
+    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+    res = x1.cwiseSqrt();
+  }
+};
+
 template<typename T>
 struct replicate {
   EIGEN_DEVICE_FUNC
@@ -161,17 +213,58 @@ struct matrix_inverse {
   }
 };
 
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  if (a.rows() != b.rows()) {
+    return false;
+  }
+  if (a.cols() != b.cols()) {
+    return false;
+  }
+  for (Index r = 0; r < a.rows(); ++r) {
+    for (Index c = 0; c < a.cols(); ++c) {
+      if (a(r, c) != b(r, c)
+          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
+          && !test_isApprox(a(r, c), b(r, c))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Output out_ref, out_gpu;
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+  out_ref = out_gpu = out;
+#else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+#endif
+  run_on_cpu (ker, n, in, out_ref);
+  run_on_gpu(ker, n, in, out_gpu);
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+  verifyIsApproxWithInfsNans(out_ref, out_gpu);
+#endif
+}
+
 EIGEN_DECLARE_TEST(gpu_basic)
 {
   ei_test_init_gpu();
 
   int nthreads = 100;
   Eigen::VectorXf in, out;
+  Eigen::VectorXcf cfin, cfout;
 
-#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
   int data_size = nthreads * 512;
   in.setRandom(data_size);
-  out.setRandom(data_size);
+  out.setConstant(data_size, -1);
+  cfin.setRandom(data_size);
+  cfout.setConstant(data_size, -1);
 #endif
 
   CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
@@ -204,6 +297,8 @@ EIGEN_DECLARE_TEST(gpu_basic)
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
 
+  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
 #if defined(__NVCC__)
   // FIXME
   // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
 #else
   run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
 #endif
+  // Pre-launch errors.
+  gpuError_t err = gpuGetLastError();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+
+  // Kernel execution errors.
+  err = gpuDeviceSynchronize();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
 
-  gpuDeviceSynchronize();
 
   // check inputs have not been modified
   gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
 {
   Input in_ref, in_gpu;
   Output out_ref, out_gpu;
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   in_ref = in_gpu = in;
   out_ref = out_gpu = out;
   #else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
   #endif
   run_on_cpu (ker, n, in_ref, out_ref);
   run_on_gpu(ker, n, in_gpu, out_gpu);
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   VERIFY_IS_APPROX(in_ref, in_gpu);
   VERIFY_IS_APPROX(out_ref, out_gpu);
   #endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
 
 struct compile_time_device_info {
   EIGEN_DEVICE_FUNC
-  void operator()(int /*i*/, const int* /*in*/, int* info) const
+  void operator()(int i, const int* /*in*/, int* info) const
   {
-    #if defined(__CUDA_ARCH__)
-    info[0] = int(__CUDA_ARCH__ +0);
-    #endif
-    #if defined(EIGEN_HIP_DEVICE_COMPILE)
-    info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
-    #endif
+    if (i == 0) {
+      #if defined(__CUDA_ARCH__)
+      info[0] = int(__CUDA_ARCH__ +0);
+      #endif
+      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+      info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+      #endif
+    }
   }
 };
 
@@ -16,7 +16,7 @@
 // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
 // When compiling such files, gcc will end up trying to pick up the CUDA headers by
 // default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
-// This will obsviously not work when trying to compile tensorflow on a system with no CUDA
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
 // To work around this issue for HIP systems (and leave the default behaviour intact), the
 // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
 // "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
@@ -30,6 +30,9 @@
 #define gpuSuccess hipSuccess
 #define gpuErrorNotReady hipErrorNotReady
 #define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
 #define gpuGetErrorString hipGetErrorString
 #define gpuGetDeviceProperties hipGetDeviceProperties
 #define gpuStreamDefault hipStreamDefault
@@ -57,6 +60,9 @@
 #define gpuSuccess cudaSuccess
 #define gpuErrorNotReady cudaErrorNotReady
 #define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
 #define gpuGetErrorString cudaGetErrorString
 #define gpuGetDeviceProperties cudaGetDeviceProperties
 #define gpuStreamDefault cudaStreamDefault