From bbd97b4095ff9cbe9898d68b3ab7bdff8125f3fb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 01:02:51 +0200 Subject: [PATCH 01/30] Add a EIGEN_NO_CUDA option, and introduce EIGEN_CUDACC and EIGEN_CUDA_ARCH aliases --- Eigen/Core | 18 +++++-- Eigen/src/Core/GeneralProduct.h | 4 +- Eigen/src/Core/GenericPacketMath.h | 4 +- Eigen/src/Core/MathFunctions.h | 48 +++++++++---------- Eigen/src/Core/MatrixBase.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 4 +- Eigen/src/Core/arch/CUDA/Complex.h | 2 +- Eigen/src/Core/arch/CUDA/Half.h | 30 ++++++------ Eigen/src/Core/arch/CUDA/MathFunctions.h | 2 +- Eigen/src/Core/arch/CUDA/PacketMath.h | 10 ++-- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 28 +++++------ Eigen/src/Core/arch/CUDA/TypeCasting.h | 8 ++-- Eigen/src/Core/functors/AssignmentFunctors.h | 2 +- Eigen/src/Core/util/Macros.h | 6 +-- Eigen/src/Core/util/Meta.h | 8 ++-- Eigen/src/SVD/BDCSVD.h | 2 +- Eigen/src/SparseQR/SparseQR.h | 2 +- .../CXX11/src/Tensor/TensorContractionCuda.h | 4 +- .../CXX11/src/Tensor/TensorConvolution.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 12 ++--- .../CXX11/src/Tensor/TensorDeviceDefault.h | 10 ++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12 ++--- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 6 +-- .../CXX11/src/Tensor/TensorReductionCuda.h | 10 ++-- .../Eigen/CXX11/src/Tensor/TensorScan.h | 4 +- .../Eigen/CXX11/src/util/EmulateArray.h | 2 +- .../SpecialFunctions/SpecialFunctionsImpl.h | 4 +- .../arch/CUDA/CudaSpecialFunctions.h | 2 +- 33 files changed, 134 insertions(+), 126 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index a16942e19..d2edbd2bc 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -14,8 +14,16 @@ // first thing Eigen does: stop the compiler from committing suicide #include "src/Core/util/DisableStupidWarnings.h" +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDACC __CUDACC__ +#endif + +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + // Handle NVCC/CUDA/SYCL -#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) +#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) // Do not try asserts on CUDA and SYCL! #ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG @@ -30,7 +38,7 @@ #endif // All functions callable from CUDA code must be qualified with __device__ - #ifdef __CUDACC__ + #ifdef EIGEN_CUDACC // Do not try to vectorize on CUDA and SYCL! #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE @@ -50,13 +58,13 @@ // When compiling CUDA device code with NVCC, pull in math functions from the // global namespace. In host mode, and when device doee with clang, use the // std versions. -#if defined(__CUDA_ARCH__) && defined(__NVCC__) +#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__) #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; #else #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) #define EIGEN_EXCEPTIONS #endif @@ -233,7 +241,7 @@ #define EIGEN_HAS_FP16_C #endif -#if defined __CUDACC__ +#if defined EIGEN_CUDACC #define EIGEN_VECTORIZE_CUDA #include #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index b206b0a7a..95dadfea8 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -379,7 +379,7 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template template @@ -412,7 +412,7 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // __CUDACC__ +#endif // EIGEN_CUDACC /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. * diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d19d5bbd2..30878eda6 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -299,7 +299,7 @@ template EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -526,7 +526,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 75f34aa91..a906b7629 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH template struct imag_impl > { @@ -778,7 +778,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #ifdef EIGEN_CUDA_ARCH return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -793,7 +793,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #ifdef EIGEN_CUDA_ARCH return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -808,7 +808,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #ifdef EIGEN_CUDA_ARCH return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -874,7 +874,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1088,7 +1088,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1146,7 +1146,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1167,7 +1167,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1225,7 +1225,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1253,7 +1253,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1283,7 +1283,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1303,7 +1303,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1323,7 +1323,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1343,7 +1343,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1363,7 +1363,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1393,7 +1393,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1422,7 +1422,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1451,7 +1451,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1472,7 +1472,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1492,7 +1492,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1510,12 +1510,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1535,7 +1535,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 200e57741..908d35dfe 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -160,7 +160,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template EIGEN_DEVICE_FUNC const Product diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index c42725dbd..86966abdb 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -851,7 +851,7 @@ struct product_evaluator, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -895,7 +895,7 @@ struct product_evaluator, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index ca0aaed32..57d1201f4 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, clang does not treat them as device diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index e4e639fcd..e99847e24 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -140,7 +140,7 @@ struct half : public half_impl::half_base { namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 // Intrinsics for native fp16 support. Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 @@ -281,7 +281,7 @@ union FP32 { }; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __float2half(ff); #elif defined(EIGEN_HAS_FP16_C) @@ -336,7 +336,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) @@ -370,7 +370,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hexp(a)); #else return half(::expf(float(a))); @@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return half(::hlog(a)); #else return half(::logf(float(a))); @@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -446,7 +446,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -455,7 +455,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -610,14 +610,14 @@ struct hash { // Add the missing shfl_xor intrinsic -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { return static_cast(__shfl_xor(static_cast(var), laneMask, width)); } #endif // ldg() has an overload for __half, but we also need one for Eigen::half. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -625,7 +625,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) namespace Eigen { namespace numext { diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 987a5291c..ff6256ce0 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog(const float4& a) { diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 8c46af09b..97a8abe59 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -196,7 +196,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return __ldg((const float4*)from); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -204,7 +204,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return __ldg((const double2*)from); #else return make_double2(from[0], from[1]); @@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -221,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); #else return make_double2(from[0], from[1]); diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index f4ae3c3c5..4a730a0e0 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { // Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 template<> struct is_arithmetic { enum { value = true }; }; @@ -69,7 +69,7 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 +#if EIGEN_CUDA_ARCH >= 350 return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); @@ -78,7 +78,7 @@ template<> template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 +#if EIGEN_CUDA_ARCH >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); @@ -116,7 +116,7 @@ ptranspose(PacketBlock& kernel) { } template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; @@ -125,7 +125,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& } template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hadd2(a, b); #else float a1 = __low2float(a); @@ -139,7 +139,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hsub2(a, b); #else float a1 = __low2float(a); @@ -153,7 +153,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hneg2(a); #else float a1 = __low2float(a); @@ -165,7 +165,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hmul2(a, b); #else float a1 = __low2float(a); @@ -179,7 +179,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hfma2(a, b, c); #else float a1 = __low2float(a); @@ -225,7 +225,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hadd(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); @@ -235,7 +235,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); return __hgt(first, second) ? first : second; @@ -247,7 +247,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); return __hlt(first, second) ? first : second; @@ -259,7 +259,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hmul(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); @@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index aa5fbce8e..30f870c3d 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -19,7 +19,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __float2half(a); #else return Eigen::half(a); @@ -37,7 +37,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __float2half(static_cast(a)); #else return Eigen::half(static_cast(a)); @@ -55,7 +55,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(a); #else return static_cast(a); @@ -69,7 +69,7 @@ struct functor_traits > -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 template <> struct type_casting_traits { diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 4153b877c..1077d8eb0 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 755646795..322e8d899 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -427,7 +427,7 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR -#if defined(__CUDACC__) +#if defined(EIGEN_CUDACC) // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 @@ -669,7 +669,7 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if (defined __CUDACC__) +#if (defined EIGEN_CUDACC) #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) #elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) @@ -990,7 +990,7 @@ namespace Eigen { # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef __CUDA_ARCH__ +# ifdef EIGEN_CUDA_ARCH # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") # else diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 8de605500..0fa818008 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -11,7 +11,7 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) #include #include #endif @@ -169,7 +169,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -523,13 +523,13 @@ template struct scalar_product_traits namespace numext { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) using internal::device::numeric_limits; #else using std::numeric_limits; diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index d7a4271cb..b8c41c560 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1211,7 +1211,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif }//end deflation -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 2d4498b03..f7111fe2e 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -327,7 +327,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data()); m_isEtreeOk = true; - m_R.resize(m, n); + m_R.resize(diagSize, n); m_Q.resize(m, diagSize); // Allocate space for nonzero elements : rough estimation diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index c04b784a4..428b18499 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -12,7 +12,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) namespace Eigen { @@ -1382,5 +1382,5 @@ struct TensorEvaluator struct GetKernelSize { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index be8d69386..ded7129da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -211,7 +211,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) @@ -239,7 +239,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); @@ -265,7 +265,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH) cudaError_t err = cudaStreamSynchronize(stream_->stream()); if (err != cudaSuccess) { std::cerr << "Error detected in CUDA stream: " @@ -304,7 +304,7 @@ struct GpuDevice { // This function checks if the CUDA runtime recorded an error for the // underlying stream device. inline bool ok() const { -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC cudaError_t error = cudaStreamQuery(stream_->stream()); return (error == cudaSuccess) || (error == cudaErrorNotReady); #else @@ -323,9 +323,9 @@ struct GpuDevice { // FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t status = cudaDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6cb2..341889e88 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,7 +35,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH // Running on the host CPU return 1; #else @@ -45,7 +45,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +55,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) // Running single threaded on the host CPU return l3CacheSize(); #else @@ -65,13 +65,13 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; #else // Running on a CUDA device - return __CUDA_ARCH__ / 100; + return EIGEN_CUDA_ARCH / 100; #endif } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index fcf330b10..2264be391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -131,7 +131,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f01d77c0a..0ffe68ab3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -201,7 +201,7 @@ class TensorExecutor { }; -#if defined(__CUDACC__) +#if defined(EIGEN_CUDACC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE @@ -264,7 +264,7 @@ inline void TensorExecutor::run( evaluator.cleanup(); } -#endif // __CUDACC__ +#endif // EIGEN_CUDACC #endif // EIGEN_USE_GPU // SYCL Executor policy diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ef1c9c42c..fb6454623 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -35,7 +35,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return __clz(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -53,7 +53,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return __clzll(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -90,7 +90,7 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) return __umulhi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast(b)); @@ -101,7 +101,7 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) return __umul64hi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast(b)); @@ -124,7 +124,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -203,7 +203,7 @@ class TensorIntDivisor { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return (__umulhi(magic, n) >> shift); #elif defined(__SYCL_DEVICE_ONLY__) return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index f92e39d69..c9e61f359 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 77c9c6c6e..5431eb740 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) template <> struct PacketType { typedef half2 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index f108c349f..230915db2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -16,7 +16,7 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH // We don't support 3d kernels since we currently only use 1 and // 2d kernels. assert(threadIdx.z == 0); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 69079805d..da0ffe728 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -334,7 +334,7 @@ struct OuterReducer { }; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); @@ -694,7 +694,7 @@ struct TensorEvaluator, #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); @@ -781,7 +781,7 @@ struct TensorEvaluator, Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) static const bool RunningOnGPU = internal::is_same::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 24a55a3d5..974eb7deb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) // Full reducers for GPU, don't vectorize for now // Reducer function that enables multiple cuda thread to safely accumulate at the same @@ -23,7 +23,7 @@ namespace internal { // updated the content of the output address it will try again. template __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 if (sizeof(T) == 4) { unsigned int oldval = *reinterpret_cast(output); @@ -102,7 +102,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer&) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 atomicAdd(output, accum); #else // __CUDA_ARCH__ >= 300 assert(0 && "Shouldn't be called on unsupported device"); @@ -124,7 +124,7 @@ template __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output, unsigned int* semaphore) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; if (gridDim.x == 1) { @@ -372,7 +372,7 @@ template __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 2a85ed840..1f545ef1a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -242,7 +242,7 @@ struct ScanLauncher { } }; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) // GPU implementation of scan // TODO(ibab) This placeholder implementation performs multiple scans in @@ -281,7 +281,7 @@ struct ScanLauncher { LAUNCH_CUDA_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); } }; -#endif // EIGEN_USE_GPU && __CUDACC__ +#endif // EIGEN_USE_GPU && EIGEN_CUDACC } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 573ca435a..96b3a8261 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -15,7 +15,7 @@ // The array class is only available starting with cxx11. Emulate our own here // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // Moreover, CUDA doesn't support the STL containers, so we use our own instead. -#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY) namespace Eigen { template class array { diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 369ad97b4..5d1b8fcc2 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -121,7 +121,7 @@ template <> struct lgamma_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgammaf_r(x, &dummy); #else @@ -134,7 +134,7 @@ template <> struct lgamma_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgamma_r(x, &dummy); #else diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h index ec4fa8448..e0e3a8be6 100644 --- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h +++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plgamma(const float4& a) From 9f8136ff747088b9a74c89ab0b3d89ac1081e83d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 10:43:18 +0200 Subject: [PATCH 02/30] disable nvcc boolean-expr-is-constant warning --- Eigen/src/Core/util/DisableStupidWarnings.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index b91d1d1af..8ef0f3594 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -55,6 +55,7 @@ #endif #if defined __NVCC__ + #pragma diag_suppress boolean_controlling_expr_is_constant // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message From 3182bdbae68b11aa8278107ca71c5df2de789e66 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 11:01:28 +0200 Subject: [PATCH 03/30] Disable vectorization when compiled by nvcc, even is EIGEN_NO_CUDA is defined --- Eigen/Core | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index d2edbd2bc..f613ee9aa 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -55,6 +55,10 @@ #define EIGEN_DEVICE_FUNC #endif +#ifdef __NVCC__ +#define EIGEN_DONT_VECTORIZE +#endif + // When compiling CUDA device code with NVCC, pull in math functions from the // global namespace. In host mode, and when device doee with clang, use the // std versions. From a74b9ba7cda08b8fbcb187aa5e96f0e99cf9b684 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 11:05:26 +0200 Subject: [PATCH 04/30] Update documentation for CUDA --- doc/PreprocessorDirectives.dox | 2 ++ doc/UsingNVCC.dox | 12 +++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index f01b39aec..0919d4190 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -120,6 +120,8 @@ run time. However, these assertions do cost time and can thus be turned off. - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB. + - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only, + and never called from device code. - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default. diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox index f8e755b79..9bcdf0bfc 100644 --- a/doc/UsingNVCC.dox +++ b/doc/UsingNVCC.dox @@ -3,18 +3,16 @@ namespace Eigen { /** \page TopicCUDA Using Eigen in CUDA kernels -\b Disclaimer: this page is about an \b experimental feature in %Eigen. - -Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost). -A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels. -To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC. +Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code. +This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header. +This might be usefull to disable some warnings when a .cu file makes use of Eigen on the host side only. +However, in both cases, host's SIMD vectorization has to be disabled in .cu files. +It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files. Known issues: - \c nvcc with MS Visual Studio does not work (patch welcome) - - \c nvcc with \c clang does not work (patch welcome) - - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \ header file. To workaround this, you can add the following before including any other files: \code // workaround issue between gcc >= 4.7 and cuda 5.5 From cda47c42c23035130488fd9a4437b2c1910d0bab Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 21:08:20 +0200 Subject: [PATCH 05/30] Fix compilation in c++98 mode. --- Eigen/src/Core/MathFunctions.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index a906b7629..5ba5293a0 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1378,13 +1378,14 @@ T acos(const T &x) { return acos(x); } - +#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T &x) { EIGEN_USING_STD_MATH(acosh); return acosh(x); } +#endif #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); } @@ -1408,12 +1409,14 @@ T asin(const T &x) { return asin(x); } +#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T &x) { EIGEN_USING_STD_MATH(asinh); return asinh(x); } +#endif #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); } @@ -1437,12 +1440,14 @@ T atan(const T &x) { return atan(x); } +#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T &x) { EIGEN_USING_STD_MATH(atanh); return atanh(x); } +#endif #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); } From 55d7181557aebe43fb009ff8669f32741e15cfba Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Jul 2017 09:47:28 +0200 Subject: [PATCH 06/30] Fix lazyness of operator* with CUDA --- Eigen/src/Core/GeneralProduct.h | 21 +++++++++++---------- Eigen/src/Core/MatrixBase.h | 9 --------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 95dadfea8..dec24848d 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -24,12 +24,17 @@ template struct product_type_selector; template struct product_size_category { - enum { is_large = MaxSize == Dynamic || - Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || - (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), - value = is_large ? Large - : Size == 1 ? 1 - : Small + enum { + #ifndef EIGEN_CUDA_ARCH + is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + #else + is_large = 0, + #endif + value = is_large ? Large + : Size == 1 ? 1 + : Small }; }; @@ -379,8 +384,6 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef EIGEN_CUDACC - template template inline const Product @@ -412,8 +415,6 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // EIGEN_CUDACC - /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. * * The returned product will behave like any other expressions: the coefficients of the product will be diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 908d35dfe..5a2146455 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -160,20 +160,11 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef EIGEN_CUDACC template EIGEN_DEVICE_FUNC - const Product - operator*(const MatrixBase &other) const - { return this->lazyProduct(other); } -#else - - template const Product operator*(const MatrixBase &other) const; -#endif - template EIGEN_DEVICE_FUNC const Product From d580a90c9ab5ed5521a79670f73bcea5ee755fe0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Jul 2017 10:03:54 +0200 Subject: [PATCH 07/30] Disable BDCSVD preallocation check. --- test/bdcsvd.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index f9f687aac..6c7b09696 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -104,7 +104,8 @@ void test_bdcsvd() CALL_SUBTEST_7( BDCSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - CALL_SUBTEST_9( svd_preallocate() ); + // Disbaled because not supported by BDCSVD + // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); } From 1f4b24d2df6200c1074c2fca45d3905a33c54a3b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Jul 2017 10:13:48 +0200 Subject: [PATCH 08/30] Do not preallocate more space than the matrix size (when the sparse matrix boils down to a vector --- Eigen/src/SparseCore/SparseAssign.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 18352a847..113463258 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) // eval without temporary dst.resize(src.rows(), src.cols()); dst.setZero(); - dst.reserve((std::max)(src.rows(),src.cols())*2); + dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); for (Index j=0; j Date: Thu, 17 Aug 2017 11:51:22 +0200 Subject: [PATCH 09/30] Make NoAlias and JacobiRotation compatible with CUDA. --- Eigen/src/Core/MatrixBase.h | 4 +++- Eigen/src/Core/NoAlias.h | 1 + Eigen/src/Jacobi/Jacobi.h | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 5a2146455..5786f76e0 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -296,7 +296,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } - NoAlias noalias(); + NoAlias EIGEN_DEVICE_FUNC noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround. @@ -428,8 +428,10 @@ template class MatrixBase ///////// Jacobi module ///////// template + EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template + EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 33908010b..41fae5096 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -33,6 +33,7 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index c30326e1d..75595517a 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -37,17 +37,20 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ + EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ + EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - Scalar& c() { return m_c; } - Scalar c() const { return m_c; } - Scalar& s() { return m_s; } - Scalar s() const { return m_s; } + EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } + EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } + EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } + EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } /** Concatenates two planar rotation */ + EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -56,19 +59,26 @@ template class JacobiRotation } /** Returns the transposed transformation */ + EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ + EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template + EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); + EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0); protected: + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type); Scalar m_c, m_s; @@ -264,6 +274,7 @@ namespace internal { * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } From 89c01a494aff2bd03b48a9858eed95a4a7ce9556 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 11:55:00 +0200 Subject: [PATCH 10/30] Add unit test for has_ReturnType --- test/meta.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/meta.cpp b/test/meta.cpp index b8dea68e8..bd505762e 100644 --- a/test/meta.cpp +++ b/test/meta.cpp @@ -15,6 +15,10 @@ bool check_is_convertible(const From&, const To&) return internal::is_convertible::value; } +struct FooReturnType { + typedef int ReturnType; +}; + void test_meta() { VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value)); @@ -75,6 +79,11 @@ void test_meta() VERIFY((!check_is_convertible(A*B, f) )); VERIFY(( check_is_convertible(A*B, A) )); } + + VERIFY(( internal::has_ReturnType::value )); + VERIFY(( internal::has_ReturnType >::value )); + VERIFY(( !internal::has_ReturnType::value )); + VERIFY(( !internal::has_ReturnType::value )); VERIFY(internal::meta_sqrt<1>::ret == 1); #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt::ret == int(std::sqrt(double(X)))) From b95f92843c58a914c46ab091009146288b8b775c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 12:07:10 +0200 Subject: [PATCH 11/30] Fix support for MKL's BLAS when using MKL_DIRECT_CALL. --- .../GeneralMatrixMatrixTriangular_BLAS.h | 8 +++- .../Core/products/GeneralMatrixMatrix_BLAS.h | 19 +++++--- .../Core/products/GeneralMatrixVector_BLAS.h | 19 +++++--- .../products/SelfadjointMatrixMatrix_BLAS.h | 48 ++++++++++++------- .../products/SelfadjointMatrixVector_BLAS.h | 9 +++- .../products/TriangularMatrixMatrix_BLAS.h | 39 ++++++++++----- .../products/TriangularMatrixVector_BLAS.h | 46 +++++++++++------- .../products/TriangularSolverMatrix_BLAS.h | 40 ++++++++++------ Eigen/src/Core/util/MKL_support.h | 8 ++-- 9 files changed, 157 insertions(+), 79 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 41e18ff07..9176a1382 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ EIGTYPE beta(1); \ - BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ } \ }; @@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index e3a5d5892..6e36c2b3c 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float) EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) -#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ template \ struct general_matrix_vector_product_gemv \ { \ @@ -113,14 +113,21 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) -EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) -EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index a45238d69..9a5318507 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -81,13 +81,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -144,20 +144,26 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -197,13 +203,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -259,15 +265,21 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 38f23accf..1238345e3 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif } // end namespace internal diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index aecded6bb..a25197ab0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm \ @@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_RM(double, double, d, d) -EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_RM(float, float, f, s) -EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_RM(double, double, d, d,) +EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_RM(float, float, f, s,) +EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_RM(double, double, d, d,_) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_) +EIGEN_BLAS_TRMV_RM(float, float, f, s,_) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h index 88c0fb794..f0775116a 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -38,7 +38,7 @@ namespace Eigen { namespace internal { // implements LeftSide op(triangular)^-1 * general -#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \ +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \ template \ struct triangular_solve_matrix \ { \ @@ -80,18 +80,24 @@ struct triangular_solve_matrix \ struct triangular_solve_matrix \ { \ @@ -133,16 +139,22 @@ struct triangular_solve_matrix /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ @@ -108,6 +109,10 @@ #endif #endif +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#include "../../misc/blas.h" +#endif + namespace Eigen { typedef std::complex dcomplex; @@ -121,8 +126,5 @@ typedef int BlasIndex; } // end namespace Eigen -#if defined(EIGEN_USE_BLAS) -#include "../../misc/blas.h" -#endif #endif // EIGEN_MKL_SUPPORT_H From 8c858bd8919936f250d2e7b090c0d17f00dbb85b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 12:17:45 +0200 Subject: [PATCH 12/30] Clarify doc regarding the usage of MKL_DIRECT_CALL --- Eigen/src/Core/util/MKL_support.h | 2 +- doc/UsingIntelMKL.dox | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index e656799bf..b7d6ecc76 100755 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -49,7 +49,7 @@ #define EIGEN_USE_LAPACKE #endif -#if defined(EIGEN_USE_MKL_VML) +#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL) #define EIGEN_USE_MKL #endif diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index a1a3a18f2..53e5de42c 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -63,6 +63,8 @@ In addition you can choose which parts will be substituted by defining one or mu \c EIGEN_USE_MKL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML +In order to be able to use \b MKL_DIRECT_CALL for BLAS level 2 and 3 routines, then the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. This is needed to tell Eigen that the BLAS backend is the MKL and that the MKL interface must be used instead of the generic F77 one. + Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details. Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module. From f727844658f8c9c01302b5cb08d81c62c572b82b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 21:58:39 +0200 Subject: [PATCH 13/30] use MKL's lapacke.h header when using MKL --- Eigen/Cholesky | 4 ++++ Eigen/Eigenvalues | 4 ++++ Eigen/LU | 4 ++++ Eigen/QR | 4 ++++ Eigen/SVD | 4 ++++ 5 files changed, 20 insertions(+) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index 369d1f5ec..bb899a2ce 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -31,7 +31,11 @@ #include "src/Cholesky/LLT.h" #include "src/Cholesky/LDLT.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Cholesky/LLT_LAPACKE.h" #endif diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues index 009e529e1..f3f661b07 100644 --- a/Eigen/Eigenvalues +++ b/Eigen/Eigenvalues @@ -45,7 +45,11 @@ #include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" diff --git a/Eigen/LU b/Eigen/LU index 6f6c55629..6418a86e1 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -28,7 +28,11 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include "src/LU/Determinant.h" diff --git a/Eigen/QR b/Eigen/QR index 80838e3bd..c7e914469 100644 --- a/Eigen/QR +++ b/Eigen/QR @@ -36,7 +36,11 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif diff --git a/Eigen/SVD b/Eigen/SVD index 86143c23d..5d0e75f7f 100644 --- a/Eigen/SVD +++ b/Eigen/SVD @@ -37,7 +37,11 @@ #include "src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/SVD/JacobiSVD_LAPACKE.h" #endif From 2810ba194be85af0012f786e6c032b2bfe432be9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 22:12:26 +0200 Subject: [PATCH 14/30] Clarify MKL_DIRECT_CALL doc. --- doc/UsingIntelMKL.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index 53e5de42c..6de14afad 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -63,7 +63,7 @@ In addition you can choose which parts will be substituted by defining one or mu \c EIGEN_USE_MKL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML -In order to be able to use \b MKL_DIRECT_CALL for BLAS level 2 and 3 routines, then the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. This is needed to tell Eigen that the BLAS backend is the MKL and that the MKL interface must be used instead of the generic F77 one. +The options can be combined with \b MKL_DIRECT_CALL to enable MKL direct call feature. This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details. From e6021cc8cc6298196026119e8486c67ea2604376 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 11:32:55 +0200 Subject: [PATCH 15/30] bug #1458: fix documentation of LLT and LDLT info() method. --- Eigen/src/Cholesky/LDLT.h | 2 +- Eigen/src/Cholesky/LLT.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 9b4fdb414..968427b3a 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -248,7 +248,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const { diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index e6c02d803..3b0b9faf4 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -177,7 +177,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const { From a6e7a41a553d3663cefc45a5d2b509494d8adb37 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 11:37:32 +0200 Subject: [PATCH 16/30] bug #1455: Cholesky module depends on Jacobi for rank-updates. --- Eigen/Cholesky | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index bb899a2ce..1332b540d 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -9,6 +9,7 @@ #define EIGEN_CHOLESKY_MODULE_H #include "Core" +#include "Jacobi" #include "src/Core/util/DisableStupidWarnings.h" From 2c3d70d915a939d0da33ca22742a26c271adcb82 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 12:04:09 +0200 Subject: [PATCH 17/30] Re-enable hidden doc in LLT --- Eigen/src/Cholesky/LLT.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 3b0b9faf4..3de94f726 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -24,7 +24,7 @@ template struct LLT_Traits; * * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. - * The other triangular part won't be read. + * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * matrix A such that A = LL^* = U^*U, where L is lower triangular. @@ -43,12 +43,11 @@ template struct LLT_Traits; * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * + * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. + * Therefore, the strict lower part does not have to store correct values. + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ - /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) - * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, - * the strict lower part does not have to store correct values. - */ template class LLT { public: From 21d0a0bcf5eef2fb89f1ca48b65d52ec03e97272 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 12:46:35 +0200 Subject: [PATCH 18/30] bug #1456: add perf recommendation for LLT and storage format --- Eigen/src/Cholesky/LLT.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 3de94f726..52d101aff 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -41,6 +41,11 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * + * \b Performance: for best performance, it is recommended to use a column-major storage format + * with the Lower triangular part (the default), or, equivalently, a row-major storage format, + * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization + * step, and rank-updates can be up to 3 times slower. + * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. From e27f17bf5c921dca73b4d2dc1a90863b36292fdc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 13:27:37 +0200 Subject: [PATCH 19/30] Gub 1453: fix Map with non-default inner-stride but no outer-stride. --- Eigen/src/Cholesky/LLT.h | 2 +- Eigen/src/Core/Map.h | 15 ++++++++--- test/mapstride.cpp | 57 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 52d101aff..7f29dad9c 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -42,7 +42,7 @@ template struct LLT_Traits; * Output: \verbinclude LLT_example.out * * \b Performance: for best performance, it is recommended to use a column-major storage format - * with the Lower triangular part (the default), or, equivalently, a row-major storage format, + * with the Lower triangular part (the default), or, equivalently, a row-major storage format * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization * step, and rank-updates can be up to 3 times slower. * diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 06d196702..7ca6a9280 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -20,11 +20,17 @@ struct traits > { typedef traits TraitsBase; enum { + PlainObjectTypeInnerSize = ((traits::Flags&RowMajorBit)==RowMajorBit) + ? PlainObjectType::ColsAtCompileTime + : PlainObjectType::RowsAtCompileTime, + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) : int(StrideType::InnerStrideAtCompileTime), OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) + ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic + ? Dynamic + : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize)) : int(StrideType::OuterStrideAtCompileTime), Alignment = int(MapOptions)&int(AlignedMask), Flags0 = TraitsBase::Flags & (~NestByRefBit), @@ -108,9 +114,10 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : IsVectorAtCompileTime ? this->size() - : int(Flags)&RowMajorBit ? this->cols() - : this->rows(); + : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime + : IsVectorAtCompileTime ? (this->size() * innerStride()) + : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) + : (this->rows() * innerStride()); } /** Constructor in the fixed-size case. diff --git a/test/mapstride.cpp b/test/mapstride.cpp index 4858f8fea..de77dc5de 100644 --- a/test/mapstride.cpp +++ b/test/mapstride.cpp @@ -58,7 +58,7 @@ template void map_class_matrix(const MatrixTy MatrixType m = MatrixType::Random(rows,cols); Scalar s1 = internal::random(); - Index arraysize = 2*(rows+4)*(cols+4); + Index arraysize = 4*(rows+4)*(cols+4); Scalar* a_array1 = internal::aligned_new(arraysize+1); Scalar* array1 = a_array1; @@ -143,9 +143,62 @@ template void map_class_matrix(const MatrixTy VERIFY_IS_APPROX(map,s1*m); } + // test inner stride and no outer stride + for(int k=0; k<2; ++k) + { + if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2) + break; + Scalar* array = (k==0 ? array1 : array2); + + Map > map(array, rows, cols, InnerStride(2)); + map = m; + VERIFY(map.outerStride() == map.innerSize()*2); + for(int i = 0; i < m.outerSize(); ++i) + for(int j = 0; j < m.innerSize(); ++j) + { + VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j)); + VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + } + VERIFY_IS_APPROX(s1*map,s1*m); + map *= s1; + VERIFY_IS_APPROX(map,s1*m); + } + internal::aligned_delete(a_array1, arraysize+1); } +// Additional tests for inner-stride but no outer-stride +template +void bug1453() +{ + const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + typedef Matrix RowMatrixXi; + typedef Matrix ColMatrix23i; + typedef Matrix ColMatrix32i; + typedef Matrix RowMatrix23i; + typedef Matrix RowMatrix32i; + + VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); + + VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + + VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); + VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); +} + void test_mapstride() { for(int i = 0; i < g_repeat; i++) { @@ -175,6 +228,8 @@ void test_mapstride() CALL_SUBTEST_5( map_class_matrix(MatrixXi(internal::random(1,maxn),internal::random(1,maxn))) ); CALL_SUBTEST_6( map_class_matrix(MatrixXcd(internal::random(1,maxn),internal::random(1,maxn))) ); CALL_SUBTEST_6( map_class_matrix(MatrixXcd(internal::random(1,maxn),internal::random(1,maxn))) ); + + CALL_SUBTEST_5( bug1453<0>() ); TEST_SET_BUT_UNUSED_VARIABLE(maxn); } From be281e528967ed00ed52f50a476beef10ff0dec3 Mon Sep 17 00:00:00 2001 From: Jim Radford Date: Wed, 4 Jan 2017 14:43:56 -0800 Subject: [PATCH 20/30] LLT: avoid making a copy when decomposing in place --- Eigen/src/Cholesky/LLT.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 7f29dad9c..c7683232d 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -428,7 +428,8 @@ LLT& LLT::compute(const EigenBase eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); - m_matrix = a.derived(); + if (!internal::is_same_dense(m_matrix, a.derived())) + m_matrix = a.derived(); // Compute matrix L1 norm = max abs column sum. m_l1_norm = RealScalar(0); From 0c226644d8cf21d35cfcf46c60ce66d2183f530e Mon Sep 17 00:00:00 2001 From: Jim Radford Date: Wed, 4 Jan 2017 14:42:57 -0800 Subject: [PATCH 21/30] LLT: const the arg to solveInPlace() to allow passing .transpose(), .block(), etc. --- Eigen/src/Cholesky/LLT.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index c7683232d..152837bfd 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -150,7 +150,7 @@ template class LLT } template - void solveInPlace(MatrixBase &bAndX) const; + void solveInPlace(const MatrixBase &bAndX) const; template LLT& compute(const EigenBase& matrix); @@ -493,7 +493,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const */ template template -void LLT::solveInPlace(MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); From b223918ea99dcff9f6a3f8d017e7bd79ff4a7db7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 14:12:47 +0200 Subject: [PATCH 22/30] Doc: warn about constness in LLT::solveInPlace --- Eigen/src/Cholesky/LLT.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 152837bfd..814174d47 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -489,6 +489,9 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * This version avoids a copy when the right hand side matrix b is not needed anymore. * + * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. + * This function will const_cast it, so constness isn't honored here. + * * \sa LLT::solve(), MatrixBase::llt() */ template From fc39d5954b72ca2307921beb8a784cd78c2a8d10 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 12:17:37 +0000 Subject: [PATCH 23/30] Merged in dtrebbien/eigen/patch-1 (pull request PR-312) Work around a compilation error seen with nvcc V8.0.61 --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 6ec5a8a0b..539b6c0c6 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); From bc91a2df8b9f1c5fa47bfeb9b03c2036890570b5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 15:10:42 +0200 Subject: [PATCH 24/30] bug #1461: fix compilation of Map::x() --- Eigen/src/Geometry/Quaternion.h | 29 +++++++++++++++++------------ test/geo_quaternion.cpp | 13 +++++++++++++ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 3e5a9badb..c3fd8c3e0 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; + typedef typename Coefficients::CoeffReturnType CoeffReturnType; + typedef typename internal::conditional::Flags&LvalueBit), + Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + + enum { Flags = Eigen::internal::traits::Flags }; @@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); } + EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } - /** \returns a reference to the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); } - /** \returns a reference to the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); } - /** \returns a reference to the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); } - /** \returns a reference to the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); } + /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } + /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } + /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } + /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } /** \returns a read-only vector expression of the imaginary part (x,y,z) */ EIGEN_DEVICE_FUNC inline const VectorBlock vec() const { return coeffs().template head<3>(); } diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 96889e722..8ee8fdb27 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -231,6 +231,19 @@ template void mapQuaternion(void){ VERIFY_IS_APPROX(mq3*mq2, q3*q2); VERIFY_IS_APPROX(mcq1*mq2, q1*q2); VERIFY_IS_APPROX(mcq3*mq2, q3*q2); + + // Bug 1461, compilation issue with Map::w(), and other reference/constness checks: + VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum()); + VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum()); + mq3.w() = 1; + const Quaternionx& cq3(q3); + VERIFY( &cq3.x() == &q3.x() ); + const MQuaternionUA& cmq3(mq3); + VERIFY( &cmq3.x() == &mq3.x() ); + // FIXME the following should be ok. The problem is that currently the LValueBit flag + // is used to determine wether we can return a coeff by reference or not, which is not enough for Map. + //const MCQuaternionUA& cmcq3(mcq3); + //VERIFY( &cmcq3.x() == &mcq3.x() ); } template void quaternionAlignment(void){ From bc4dae9aeb84cc3d3114ee496d55654cc7256584 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 15:59:08 +0200 Subject: [PATCH 25/30] bug #1449: fix redux_3 unit test --- test/redux.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/redux.cpp b/test/redux.cpp index 989e1057b..2bade3735 100644 --- a/test/redux.cpp +++ b/test/redux.cpp @@ -9,6 +9,8 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define TEST_ENABLE_TEMPORARY_TRACKING +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +// ^^ see bug 1449 #include "main.h" From 9deee79922c38415125e4d6c2cd34cd05bda7889 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 16:48:07 +0200 Subject: [PATCH 26/30] bug #1457: add setUnit() methods for consistency. --- Eigen/src/Core/CwiseNullaryOp.h | 36 +++++++++++++++++++++++++++++++++ Eigen/src/Core/MatrixBase.h | 2 ++ doc/QuickReference.dox | 10 ++++++++- test/nullary.cpp | 18 +++++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 144608ec2..b1923da0f 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -861,6 +861,42 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } +/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector + * + * \param i index of the unique coefficient to be set to 1 + * + * \only_for_vectors + * + * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setUnit(Index i); + EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index 44f5410db..59d7d05e4 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -261,6 +261,8 @@ x.setIdentity(); Vector3f::UnitX() // 1 0 0 Vector3f::UnitY() // 0 1 0 Vector3f::UnitZ() // 0 0 1 +Vector4f::Unit(i) +x.setUnit(i); \endcode @@ -278,6 +280,7 @@ N/A VectorXf::Unit(size,i) +x.setUnit(size,i); VectorXf::Unit(4,1) == Vector4f(0,1,0,0) == Vector4f::UnitY() \endcode @@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0) - +Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes. +For instance: +\code +MatrixXi M(3,3); +M.setIdentity(); +\endcode \subsection QuickRef_Map Mapping external arrays diff --git a/test/nullary.cpp b/test/nullary.cpp index acd55506e..22ec92352 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -191,6 +191,24 @@ void testVectorType(const VectorType& base) } } } + + // test setUnit() + if(m.size()>0) + { + for(Index k=0; k<10; ++k) + { + Index i = internal::random(0,m.size()-1); + m.setUnit(i); + VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) ); + } + if(VectorType::SizeAtCompileTime==Dynamic) + { + Index i = internal::random(0,2*m.size()-1); + m.setUnit(2*m.size(),i); + VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) ); + } + } + } template From 600e52fc7f574504fa832d67c9d94c991e504bdc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 17:06:57 +0200 Subject: [PATCH 27/30] Add missing scalar conversion --- Eigen/src/Geometry/AngleAxis.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index 0af3c1b08..83ee1be46 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const Quaterni if (n != Scalar(0)) { m_angle = Scalar(2)*atan2(n, abs(q.w())); - if(q.w() < 0) + if(q.w() < Scalar(0)) n = -n; m_axis = q.vec() / n; } From 39864ebe1eb7c8028769cf5d8750faaabce22446 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 17:18:43 +0200 Subject: [PATCH 28/30] bug #336: improve doc for PlainObjectBase::Map --- Eigen/src/Core/PlainObjectBase.h | 4 ++++ doc/snippets/Matrix_Map_stride.cpp | 7 +++++++ 2 files changed, 11 insertions(+) create mode 100644 doc/snippets/Matrix_Map_stride.cpp diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 77f4f6066..1dc7e223a 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. * + * Here is an example using strides: + * \include Matrix_Map_stride.cpp + * Output: \verbinclude Matrix_Map_stride.out + * * \see class Map */ //@{ diff --git a/doc/snippets/Matrix_Map_stride.cpp b/doc/snippets/Matrix_Map_stride.cpp new file mode 100644 index 000000000..ae42a127a --- /dev/null +++ b/doc/snippets/Matrix_Map_stride.cpp @@ -0,0 +1,7 @@ +Matrix4i A; +A << 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16; + +std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl; From 12249849b5ef7ec0c64f74440690fb00708b8da6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Aug 2017 10:43:21 +0200 Subject: [PATCH 29/30] Make the threshold from gemm to coeff-based-product configurable, and add some explanations. --- Eigen/src/Core/GeneralProduct.h | 10 ++++++++++ Eigen/src/Core/products/GeneralMatrixMatrix.h | 12 +++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index dec24848d..483277fe6 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -18,6 +18,16 @@ enum { Small = 3 }; +// Define the threshold value to fallback from the generic matrix-matrix product +// implementation (heavy) to the lightweight coeff-based product one. +// See generic_product_impl +// in products/GeneralMatrixMatrix.h for more details. +// TODO This threshold should also be used in the compile-time selector below. +#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD +// This default value has been obtained on a Haswell architecture. +#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20 +#endif + namespace internal { template struct product_type_selector; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 6440e1d09..ed4d3182b 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -427,7 +427,13 @@ struct generic_product_impl template static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs. + // I'm not sure it is still required. + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::evalTo(dst, lhs, rhs); else { @@ -439,7 +445,7 @@ struct generic_product_impl template static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::addTo(dst, lhs, rhs); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); @@ -448,7 +454,7 @@ struct generic_product_impl template static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::subTo(dst, lhs, rhs); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); From 21633e585b61564159d9cfbfbbad9006b8a09d64 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Aug 2017 11:06:47 +0200 Subject: [PATCH 30/30] bug #1462: remove all occurences of the deprecated __CUDACC_VER__ macro by introducing EIGEN_CUDACC_VER --- Eigen/Core | 10 +++++++++- Eigen/src/Core/arch/CUDA/Half.h | 12 ++++++------ Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 2 +- Eigen/src/Core/util/Macros.h | 7 ++++--- test/cuda_basic.cu | 2 +- unsupported/test/cxx11_tensor_argmax_cuda.cu | 2 +- unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 2 +- unsupported/test/cxx11_tensor_complex_cuda.cu | 2 +- .../test/cxx11_tensor_complex_cwise_ops_cuda.cu | 2 +- unsupported/test/cxx11_tensor_contract_cuda.cu | 2 +- unsupported/test/cxx11_tensor_cuda.cu | 2 +- unsupported/test/cxx11_tensor_device.cu | 2 +- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 2 +- unsupported/test/cxx11_tensor_random_cuda.cu | 2 +- unsupported/test/cxx11_tensor_reduction_cuda.cu | 2 +- unsupported/test/cxx11_tensor_scan_cuda.cu | 2 +- 16 files changed, 32 insertions(+), 23 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index f613ee9aa..f6fe4b0ec 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,14 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) +#define EIGEN_CUDACC_VER __CUDACC_VER__ +#else +#define EIGEN_CUDACC_VER 0 +#endif + // Handle NVCC/CUDA/SYCL #if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) // Do not try asserts on CUDA and SYCL! @@ -248,7 +256,7 @@ #if defined EIGEN_CUDACC #define EIGEN_VECTORIZE_CUDA #include - #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 + #if EIGEN_CUDACC_VER >= 70500 #define EIGEN_HAS_CUDA_FP16 #endif #endif diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index e99847e24..8cedd65ad 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hexp(a)); #else return half(::expf(float(a))); @@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return half(::hlog(a)); #else return half(::logf(float(a))); @@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 4a730a0e0..ba6a7f920 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 322e8d899..b63ea2697 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -413,7 +413,7 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -429,7 +429,7 @@ #if defined(EIGEN_CUDACC) // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) +#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ @@ -837,7 +837,8 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) + // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index cb2e4167a..0ff13477d 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -20,7 +20,7 @@ #include #include -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 653443dc5..0dfd6cfe1 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -12,7 +12,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 88c233994..83a740e7a 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index 87cf28920..cbff5a9b2 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -11,7 +11,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu index 2baf5eaad..9133fce5a 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -11,7 +11,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index dd68430ce..0b2f3f0f4 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -14,7 +14,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 0ba9d52e9..ad8c9662f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -12,7 +12,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index fde20ddf2..ae21f492a 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 908a5e5a9..0ba7657b8 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index b3be199e1..94d5f4e5a 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 6858b43a7..fd09d013b 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -12,7 +12,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if dEIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu index 5f146f3c9..46571cfea 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h"