From f7362772e3236cdb8ae4d9be175f83a0b19902a0 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 24 Dec 2015 21:15:38 -0800 Subject: [PATCH 001/102] Add digamma for CPU + CUDA. Includes tests. --- Eigen/src/Core/GenericPacketMath.h | 5 + Eigen/src/Core/GlobalFunctions.h | 1 + Eigen/src/Core/SpecialFunctions.h | 426 +++++++++++++++--- Eigen/src/Core/arch/CUDA/MathFunctions.h | 14 + Eigen/src/Core/arch/CUDA/PacketMath.h | 2 + Eigen/src/Core/functors/UnaryFunctors.h | 22 + Eigen/src/plugins/ArrayCwiseUnaryOps.h | 14 + test/array.cpp | 19 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 6 + 9 files changed, 447 insertions(+), 62 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8ad51bad5..4c7d1d848 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -75,6 +75,7 @@ struct default_packet_traits HasCosh = 0, HasTanh = 0, HasLGamma = 0, + HasDiGamma = 0, HasErf = 0, HasErfc = 0, @@ -439,6 +440,10 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } +/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } + /** \internal \returns the erf(\a a) (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perf(const Packet& a) { using numext::erf; return erf(a); } diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 62fec7008..396da8e71 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -50,6 +50,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index d43cf23a1..9fff3d74b 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -13,79 +13,390 @@ namespace Eigen { namespace internal { +namespace cephes { + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. 
+ * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template +struct polevl { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_STATIC_ASSERT(N > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + return polevl::run(x, coef) * x + coef[N]; + } +}; + +template +struct polevl { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar run(const Scalar, const Scalar coef[]) { + return coef[0]; + } +}; + +} // end namespace cephes + /**************************************************************************** * Implementation of lgamma * ****************************************************************************/ -template -struct lgamma_impl -{ +template +struct lgamma_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar&) - { + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); } }; -template -struct lgamma_retval -{ +template +struct lgamma_retval { typedef Scalar type; }; #ifdef EIGEN_HAS_C99_MATH -template<> -struct lgamma_impl -{ +template <> +struct lgamma_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE double run(const float& x) { return ::lgammaf(x); } + static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); } }; -template<> -struct lgamma_impl -{ +template <> +struct lgamma_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); } + static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); } }; #endif +/**************************************************************************** + * Implementation of digamma (psi) * + ****************************************************************************/ + +template +struct digamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct digamma_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template <> +struct digamma_impl { + /* + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * float x, y, psif(); + * + * y = psif( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. + * + * ACCURACY: + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1984, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + EIGEN_DEVICE_FUNC + static float run(float xx) { + float p, q, nz, x, s, w, y, z; + bool negative; + + // Some necessary constants + const float PIF = 3.141592653589793238; + const float MAXNUMF = std::numeric_limits::infinity(); + + const float A[] = { + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + x = xx; + nz = 0.0f; + negative = 0; + if (x <= 0.0f) { + negative = 1; + q = x; + p = ::floor(q); + if (p == q) { + return (MAXNUMF); + } + nz = q - p; + if (nz != 0.5f) { + if (nz > 0.5f) { + p += 1.0f; + nz = q - p; + } + nz = PIF / ::tan(PIF * nz); + } else { + nz = 0.0f; + } + x = 1.0f - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = 0.0f; + while (s < 10.0f) { + w += 1.0f / s; + s += 1.0f; + } + + if (s < 1.0e8f) { + z = 1.0f / (s * s); + y = z * cephes::polevl::run(z, A); + } else + y = 0.0f; + + y = ::log(s) - (0.5f / s) - y - w; + + return (negative) ? y - nz : y; + } +}; + +template <> +struct digamma_impl { + EIGEN_DEVICE_FUNC + static double run(double x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. + * + * ACCURACY: + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + + /* + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ + double p, q, nz, s, w, y, z; + bool negative; + + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + const double MAXNUM = std::numeric_limits::infinity(); + const double PI = 3.14159265358979323846; + + negative = 0; + nz = 0.0; + + if (x <= 0.0) { + negative = 1; + q = x; + p = ::floor(q); + if (p == q) { + return MAXNUM; + } + /* Remove the zeros of tan(PI x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != 0.5) { + if (nz > 0.5) { + p += 1.0; + nz = q - p; + } + nz = PI / ::tan(PI * nz); + } + else { + nz = 0.0; + } + x = 1.0 - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = 0.0; + while (s < 10.0) { + w += 1.0 / s; + s += 1.0; + } + + if (s < 1.0e17) { + z = 1.0 / (s * s); + y = z * cephes::polevl::run(z, A); + } + else + y = 0.0; + + y = ::log(s) - (0.5 / s) - y - w; + + return (negative) ? 
y - nz : y; + } +}; + +#endif + /**************************************************************************** * Implementation of erf * ****************************************************************************/ -template -struct erf_impl -{ +template +struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar&) - { + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); } }; -template -struct erf_retval -{ +template +struct erf_retval { typedef Scalar type; }; #ifdef EIGEN_HAS_C99_MATH -template<> -struct erf_impl -{ +template <> +struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); } + static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); } }; -template<> -struct erf_impl -{ +template <> +struct erf_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); } + static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); } }; #endif // EIGEN_HAS_C99_MATH @@ -93,35 +404,30 @@ struct erf_impl * Implementation of erfc * ****************************************************************************/ -template -struct erfc_impl -{ +template +struct erfc_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Scalar&) - { + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); } }; -template -struct erfc_retval -{ +template +struct erfc_retval { typedef Scalar type; }; #ifdef EIGEN_HAS_C99_MATH -template<> -struct erfc_impl -{ +template <> +struct erfc_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } }; -template<> -struct erfc_impl -{ +template <> +struct erfc_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } }; @@ -129,27 +435,29 @@ struct erfc_impl } // end namespace internal - namespace numext { -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x) -{ +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); } -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x) -{ +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); } -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x) -{ +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); } diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index ecd5c444e..a2c06a817 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -78,6 +78,20 @@ double2 plgamma(const double2& a) return make_double2(lgamma(a.x), lgamma(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 perf(const float4& a) { diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 9d5773106..d3d9f910e 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -40,6 +40,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasLGamma = 1, + HasDiGamma = 1, HasErf = 1, HasErfc = 1, @@ -63,6 +64,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasLGamma = 1, + HasDiGamma = 1, HasErf = 1, HasErfc = 1, diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 01727f250..897ab04ba 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -427,6 +427,28 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute psi, the derivative of lgamma of a scalar. + * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasDiGamma + }; +}; + /** \internal * \brief Template functor to compute the Gauss error function of a * scalar diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 01432e2f3..e818ac588 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -22,6 +22,7 @@ typedef CwiseUnaryOp, const Derived> TanhReturn typedef CwiseUnaryOp, const Derived> SinhReturnType; typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> LgammaReturnType; +typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; typedef CwiseUnaryOp, const Derived> PowReturnType; @@ -318,6 +319,19 @@ lgamma() const return LgammaReturnType(derived()); } +/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma). + * + * Example: \include Cwise_digamma.cpp + * Output: \verbinclude Cwise_digamma.out + * + * \sa cos(), sin(), tan() + */ +inline const DigammaReturnType +digamma() const +{ + return DigammaReturnType(derived()); +} + /** \returns an expression of the coefficient-wise Gauss error * function of *this. 
* diff --git a/test/array.cpp b/test/array.cpp index 6adedfb06..9366b73fd 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -219,6 +219,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); #ifdef EIGEN_HAS_C99_MATH VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.digamma(), digamma(m1)); VERIFY_IS_APPROX(m1.erf(), erf(m1)); VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); #endif // EIGEN_HAS_C99_MATH @@ -309,7 +310,20 @@ template void array_real(const ArrayType& m) s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); - + + // check special functions (comparing against numpy implementation) + if (!NumTraits::IsComplex) { + VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329)); + VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645)); + VERIFY_IS_APPROX(numext::digamma(Scalar(4)), RealScalar(1.2561176684318)); + VERIFY_IS_APPROX(numext::digamma(Scalar(-10.5)), RealScalar(2.398239129535781)); + VERIFY_IS_APPROX(numext::digamma(Scalar(10000.5)), RealScalar(9.210340372392849)); + VERIFY_IS_EQUAL(numext::digamma(Scalar(0)), + std::numeric_limits::infinity()); + VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), + std::numeric_limits::infinity()); + } + // check inplace transpose m3 = m1; m3.transposeInPlace(); @@ -336,8 +350,6 @@ template void array_complex(const ArrayType& m) Array m3(rows, cols); - Scalar s1 = internal::random(); - for (Index i = 0; i < m.rows(); ++i) for (Index j = 0; j < m.cols(); ++j) m2(i,j) = sqrt(m1(i,j)); @@ -410,6 +422,7 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1); // scalar by array division + Scalar s1 = internal::random(); const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 392acf302..cca716d6f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -128,6 +128,12 @@ class TensorBase return unaryExpr(internal::scalar_lgamma_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> erf() const { From 14897600b77ce8400780f0f34a7bb3661ce5db62 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 24 Dec 2015 21:28:18 -0800 Subject: [PATCH 002/102] Protect digamma tests behind a EIGEN_HAS_C99_MATH check. 
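
The new digamma assertions call numext::digamma, whose float/double
specializations only exist when EIGEN_HAS_C99_MATH is defined, so they
must sit behind the same guard that already protects the lgamma, erf
and erfc checks in array_real(). A minimal sketch of the guarded
pattern (illustrative only; the exact expectations live in
test/array.cpp below):

    #ifdef EIGEN_HAS_C99_MATH
    // digamma(1) equals minus the Euler-Mascheroni constant.
    VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329));
    #endif  // EIGEN_HAS_C99_MATH
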
--- test/array.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/array.cpp b/test/array.cpp index 9366b73fd..96aef31c7 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -312,6 +312,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); // check special functions (comparing against numpy implementation) +#ifdef EIGEN_HAS_C99_MATH if (!NumTraits::IsComplex) { VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329)); VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645)); @@ -323,6 +324,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); } +#endif // EIGEN_HAS_C99_MATH // check inplace transpose m3 = m1; From afb35385bf565d3ddaee50b1da5b664422818934 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 28 Dec 2015 17:34:06 -0800 Subject: [PATCH 003/102] Change PI* to M_PI* in SpecialFunctions to avoid possible breakage with external DEFINEs. --- Eigen/src/Core/SpecialFunctions.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 9fff3d74b..a5f9cb62a 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -189,7 +189,7 @@ struct digamma_impl { bool negative; // Some necessary constants - const float PIF = 3.141592653589793238; + const float M_PIF = 3.141592653589793238; const float MAXNUMF = std::numeric_limits::infinity(); const float A[] = { @@ -215,7 +215,7 @@ struct digamma_impl { p += 1.0f; nz = q - p; } - nz = PIF / ::tan(PIF * nz); + nz = M_PIF / ::tan(M_PIF * nz); } else { nz = 0.0f; } @@ -315,7 +315,7 @@ struct digamma_impl { }; const double MAXNUM = std::numeric_limits::infinity(); - const double PI = 3.14159265358979323846; + const double M_PI = 3.14159265358979323846; negative = 0; nz = 0.0; @@ -327,7 +327,7 @@ struct digamma_impl { if (p == q) { return MAXNUM; } - /* Remove the zeros of tan(PI x) + /* Remove the zeros of tan(M_PI x) * by subtracting the nearest integer from x */ nz = q - p; @@ -336,7 +336,7 @@ struct digamma_impl { p += 1.0; nz = q - p; } - nz = PI / ::tan(PI * nz); + nz = M_PI / ::tan(M_PI * nz); } else { nz = 0.0; From f2471f31e0d65203c9e098727facdfda2ff7a076 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 28 Dec 2015 17:48:38 -0800 Subject: [PATCH 004/102] Modify constants in SpecialFunctions to lowercase (avoid name conflicts). 
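
For the record, the failure mode that motivates the rename (a
hypothetical reproduction, not code from the tree): on platforms where
<cmath> defines M_PI as a preprocessor macro, a local constant of the
same name fails to compile, because the macro is substituted before
the declaration is parsed:

    #include <cmath>  // may #define M_PI 3.14159265358979323846

    double shifted_pi() {
      // The macro expands first, yielding
      // "const double 3.14159... = 3.14159...;" -- a syntax error.
      const double M_PI = 3.14159265358979323846;
      return M_PI;
    }

Lower-case locals (m_pi, m_pif, maxnum, maxnumf) avoid colliding with
the standard macro names.
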
--- Eigen/src/Core/SpecialFunctions.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index a5f9cb62a..8cf26f4d1 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -189,8 +189,8 @@ struct digamma_impl { bool negative; // Some necessary constants - const float M_PIF = 3.141592653589793238; - const float MAXNUMF = std::numeric_limits::infinity(); + const float m_pif = 3.141592653589793238; + const float maxnumf = std::numeric_limits::infinity(); const float A[] = { -4.16666666666666666667E-3, @@ -207,7 +207,7 @@ struct digamma_impl { q = x; p = ::floor(q); if (p == q) { - return (MAXNUMF); + return (maxnumf); } nz = q - p; if (nz != 0.5f) { @@ -215,7 +215,7 @@ struct digamma_impl { p += 1.0f; nz = q - p; } - nz = M_PIF / ::tan(M_PIF * nz); + nz = m_pif / ::tan(m_pif * nz); } else { nz = 0.0f; } @@ -314,8 +314,8 @@ struct digamma_impl { 8.33333333333333333333E-2 }; - const double MAXNUM = std::numeric_limits::infinity(); - const double M_PI = 3.14159265358979323846; + const double maxnum = std::numeric_limits::infinity(); + const double m_pi = 3.14159265358979323846; negative = 0; nz = 0.0; @@ -325,9 +325,9 @@ struct digamma_impl { q = x; p = ::floor(q); if (p == q) { - return MAXNUM; + return maxnum; } - /* Remove the zeros of tan(M_PI x) + /* Remove the zeros of tan(m_pi x) * by subtracting the nearest integer from x */ nz = q - p; @@ -336,7 +336,7 @@ struct digamma_impl { p += 1.0; nz = q - p; } - nz = M_PI / ::tan(M_PI * nz); + nz = m_pi / ::tan(m_pi * nz); } else { nz = 0.0; From 6a75e7e0d5505bac0e1a5be39d37f2717ab03134 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Fri, 15 Jan 2016 16:32:21 -0800 Subject: [PATCH 005/102] Digamma cleanup * Added permission from cephes author to use his code * Cleanup in ArrayCwiseUnaryOps --- Eigen/src/Core/SpecialFunctions.h | 32 ++++++++++++++++++-------- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 3 --- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 8cf26f4d1..bd022946c 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -13,6 +13,29 @@ namespace Eigen { namespace internal { +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. +// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. +// +// Good luck with your project, +// Steve + namespace cephes { /* polevl (modified for Eigen) @@ -178,11 +201,6 @@ struct digamma_impl { * message condition value returned * psi singularity x integer <=0 INFINITY */ - /* - Cephes Math Library Release 2.2: June, 1992 - Copyright 1984, 1987, 1992 by Stephen L. 
Moshier - Direct inquiries to 30 Frost Street, Cambridge, MA 02140 - */ EIGEN_DEVICE_FUNC static float run(float xx) { float p, q, nz, x, s, w, y, z; @@ -297,10 +315,6 @@ struct digamma_impl { * psi singularity x integer <=0 INFINITY */ - /* - * Cephes Math Library Release 2.8: June, 2000 - * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier - */ double p, q, nz, s, w, y, z; bool negative; diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index e818ac588..2ce7414a1 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -320,9 +320,6 @@ lgamma() const } /** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma). - * - * Example: \include Cwise_digamma.cpp - * Output: \verbinclude Cwise_digamma.out * * \sa cos(), sin(), tan() */ From 63fb66f53a576e4ae7bd6b28d011a7e33b7757de Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Sun, 17 Jan 2016 21:25:36 -0700 Subject: [PATCH 006/102] Add ctor for long --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 4f2adb671..19352eb5e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -40,6 +40,12 @@ struct TensorUInt128 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(unsigned int x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(long x) : high(0), low(x) { + eigen_assert(x >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(unsigned long x) : high(0), low(x) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(int64_t x) : high(0), low(x) { eigen_assert(x >= 0); } From 2832175a689313ba08523489a1a1b8bb6458ac5c Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Tue, 19 Jan 2016 20:12:17 -0700 Subject: [PATCH 007/102] Use explicitly 32 bit integer types in constructors. 
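
The widths of plain int and long vary across data models, so
overloading on them yields a platform-dependent constructor set: on
typical LP64 targets long and int64_t often name the same 64-bit type
and the two overloads collide, while on LLP64 targets long is 32 bits
and the same call resolves differently. A sketch of the idea (assuming
the usual LP64/LLP64 conventions):

    // Fragile: redundant wherever long == int64_t.
    TensorUInt128(long x);
    TensorUInt128(int64_t x);

    // Portable: exactly one overload per fixed width.
    TensorUInt128(int32_t x);
    TensorUInt128(uint32_t x);
    TensorUInt128(int64_t x);
    TensorUInt128(uint64_t x);
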
--- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 19352eb5e..f43f64cde 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -34,17 +34,11 @@ struct TensorUInt128 LOW low; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(int x) : high(0), low(x) { + TensorUInt128(int32_t x) : high(0), low(x) { eigen_assert(x >= 0); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned int x) : high(0), low(x) { } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(long x) : high(0), low(x) { - eigen_assert(x >= 0); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned long x) : high(0), low(x) { } + TensorUInt128(uint32_t x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(int64_t x) : high(0), low(x) { eigen_assert(x >= 0); From 915e7667cd64d45fe6b07d40b0c04d2209a5f6de Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Tue, 19 Jan 2016 21:17:29 -0700 Subject: [PATCH 008/102] Remove executable bit from header files --- Eigen/src/Core/AssignEvaluator.h | 0 Eigen/src/Core/Assign_MKL.h | 0 Eigen/src/Core/ProductEvaluators.h | 0 Eigen/src/Core/VectorwiseOp.h | 0 Eigen/src/Core/arch/AltiVec/PacketMath.h | 0 Eigen/src/Core/arch/SSE/PacketMath.h | 0 Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 0 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h | 0 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h | 0 Eigen/src/Core/util/BlasUtil.h | 0 Eigen/src/Core/util/DisableStupidWarnings.h | 0 Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 0 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 0 Eigen/src/Eigenvalues/RealQZ.h | 0 Eigen/src/Eigenvalues/RealSchur_MKL.h | 0 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0 Eigen/src/PardisoSupport/PardisoSupport.h | 0 Eigen/src/QR/ColPivHouseholderQR_MKL.h | 0 Eigen/src/SVD/JacobiSVD.h | 0 Eigen/src/SparseLU/SparseLU.h | 0 bench/btl/generic_bench/timers/portable_timer.hh | 0 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 0 22 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 Eigen/src/Core/AssignEvaluator.h mode change 100755 => 100644 Eigen/src/Core/Assign_MKL.h mode change 100755 => 100644 Eigen/src/Core/ProductEvaluators.h mode change 100755 => 100644 Eigen/src/Core/VectorwiseOp.h mode change 100755 => 100644 Eigen/src/Core/arch/AltiVec/PacketMath.h mode change 100755 => 100644 Eigen/src/Core/arch/SSE/PacketMath.h mode change 100755 => 100644 Eigen/src/Core/products/GeneralMatrixVector_MKL.h mode change 100755 => 100644 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h mode change 100755 => 100644 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h mode change 100755 => 100644 Eigen/src/Core/util/BlasUtil.h mode change 100755 => 100644 Eigen/src/Core/util/DisableStupidWarnings.h mode change 100755 => 100644 Eigen/src/Eigenvalues/ComplexSchur_MKL.h mode change 100755 => 100644 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h mode change 100755 => 100644 Eigen/src/Eigenvalues/RealQZ.h mode change 100755 => 100644 Eigen/src/Eigenvalues/RealSchur_MKL.h mode change 100755 => 100644 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h mode change 100755 => 100644 Eigen/src/PardisoSupport/PardisoSupport.h mode change 100755 => 100644 Eigen/src/QR/ColPivHouseholderQR_MKL.h mode change 100755 => 100644 
Eigen/src/SVD/JacobiSVD.h mode change 100755 => 100644 Eigen/src/SparseLU/SparseLU.h mode change 100755 => 100644 bench/btl/generic_bench/timers/portable_timer.hh mode change 100755 => 100644 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h old mode 100755 new mode 100644 diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100755 new mode 100644 diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100755 new mode 100644 diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100755 new mode 100644 diff --git a/bench/btl/generic_bench/timers/portable_timer.hh b/bench/btl/generic_bench/timers/portable_timer.hh old mode 100755 new mode 100644 diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100755 new mode 100644 From 3aeeca32af00b1921b4424d7be2e03bbaeaa05b4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 22 Jan 2016 16:36:30 -0800 Subject: [PATCH 009/102] Leverage the new blocking code in the tensor contraction code. 
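
Both the single-threaded and the thread-pool contraction evaluators
now derive their panel sizes from the shared TensorContractionBlocking
helper instead of the generic matrix-product blocking. The resulting
call pattern, as used in the diff below (template arguments elided
here; this is a sketch, not the full declaration):

    // Pick cache-friendly panel sizes for an m-by-n result of depth k.
    internal::TensorContractionBlocking<...> blocking(k, m, n, num_threads);
    const Index kc = blocking.kc();
    const Index mc = numext::mini(m, blocking.mc());
    const Index nc = numext::mini(n, blocking.nc());

Each mc-by-kc lhs panel and kc-by-nc rhs panel is then packed and
multiplied exactly as before; only the size computation changes.
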
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +--- .../Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 5 +++-- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 624e814e2..e6a008ba7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -582,10 +582,8 @@ struct TensorEvaluator BlockingType; - // Sizes of the blocks to load in cache. See the Goto paper for details. - BlockingType blocking(m, n, k, 1, true); + internal::TensorContractionBlocking blocking(k, m, n, 1); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 9b6d18090..63c8ae126 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -426,15 +426,16 @@ class TensorContractionSubMapper { }; -template class TensorContractionInputMapper - : public BaseTensorContractionMapper { + : public BaseTensorContractionMapper { public: + typedef Scalar_ Scalar; typedef BaseTensorContractionMapper Base; typedef TensorContractionSubMapper SubMapper; typedef SubMapper VectorMapper; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 576bea295..51a3b9490 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -176,10 +176,10 @@ struct TensorEvaluatorm_device.numThreads(); - Index mc = m; - Index nc = n; - Index kc = k; - internal::computeProductBlockingSizes(kc, mc, nc, num_threads); + internal::TensorContractionBlocking blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); From 9f94e030c1c0f334de812cd5220dbb95a0a1e145 Mon Sep 17 00:00:00 2001 From: Ville Kallioniemi Date: Fri, 22 Jan 2016 20:08:45 -0700 Subject: [PATCH 010/102] Re-add executable flags to minimize changeset. 
--- Eigen/src/Core/AssignEvaluator.h | 0 Eigen/src/Core/Assign_MKL.h | 0 Eigen/src/Core/ProductEvaluators.h | 0 Eigen/src/Core/VectorwiseOp.h | 0 Eigen/src/Core/arch/AltiVec/PacketMath.h | 0 Eigen/src/Core/arch/SSE/PacketMath.h | 0 Eigen/src/Core/products/GeneralMatrixVector_MKL.h | 0 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h | 0 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h | 0 Eigen/src/Core/util/BlasUtil.h | 0 Eigen/src/Core/util/DisableStupidWarnings.h | 0 Eigen/src/Eigenvalues/ComplexSchur_MKL.h | 0 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 0 Eigen/src/Eigenvalues/RealQZ.h | 0 Eigen/src/Eigenvalues/RealSchur_MKL.h | 0 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0 Eigen/src/PardisoSupport/PardisoSupport.h | 0 Eigen/src/QR/ColPivHouseholderQR_MKL.h | 0 Eigen/src/SVD/JacobiSVD.h | 0 Eigen/src/SparseLU/SparseLU.h | 0 bench/btl/generic_bench/timers/portable_timer.hh | 0 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 0 22 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Eigen/src/Core/AssignEvaluator.h mode change 100644 => 100755 Eigen/src/Core/Assign_MKL.h mode change 100644 => 100755 Eigen/src/Core/ProductEvaluators.h mode change 100644 => 100755 Eigen/src/Core/VectorwiseOp.h mode change 100644 => 100755 Eigen/src/Core/arch/AltiVec/PacketMath.h mode change 100644 => 100755 Eigen/src/Core/arch/SSE/PacketMath.h mode change 100644 => 100755 Eigen/src/Core/products/GeneralMatrixVector_MKL.h mode change 100644 => 100755 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h mode change 100644 => 100755 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h mode change 100644 => 100755 Eigen/src/Core/util/BlasUtil.h mode change 100644 => 100755 Eigen/src/Core/util/DisableStupidWarnings.h mode change 100644 => 100755 Eigen/src/Eigenvalues/ComplexSchur_MKL.h mode change 100644 => 100755 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h mode change 100644 => 100755 Eigen/src/Eigenvalues/RealQZ.h mode change 100644 => 100755 Eigen/src/Eigenvalues/RealSchur_MKL.h mode change 100644 => 100755 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h mode change 100644 => 100755 Eigen/src/PardisoSupport/PardisoSupport.h mode change 100644 => 100755 Eigen/src/QR/ColPivHouseholderQR_MKL.h mode change 100644 => 100755 Eigen/src/SVD/JacobiSVD.h mode change 100644 => 100755 Eigen/src/SparseLU/SparseLU.h mode change 100644 => 100755 bench/btl/generic_bench/timers/portable_timer.hh mode change 100644 => 100755 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100644 new mode 100755 diff --git 
a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h old mode 100644 new mode 100755 diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100644 new mode 100755 diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100644 new mode 100755 diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100644 new mode 100755 diff --git a/bench/btl/generic_bench/timers/portable_timer.hh b/bench/btl/generic_bench/timers/portable_timer.hh old mode 100644 new mode 100755 diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100644 new mode 100755 From 0caa4b1531def27bde0ffeb942cc10f9917a47c6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Jan 2016 22:13:54 +0100 Subject: [PATCH 011/102] bug #1150: make IncompleteCholesky more robust by iteratively increase the shift until the factorization succeed (with at most 10 attempts). --- .../IncompleteCholesky.h | 186 ++++++++++-------- test/incomplete_cholesky.cpp | 30 ++- 2 files changed, 135 insertions(+), 81 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 18e9d1466..babc14d9e 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -240,7 +240,7 @@ void IncompleteCholesky::factorize(const _MatrixType else m_scale(j) = 1; - // FIXME disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster) + // TODO disable scaling if not needed, i.e., if it is roughly uniform? 
(this will make solve() faster) // Scale and compute the shift for the matrix RealScalar mindiag = NumTraits::highest(); @@ -251,96 +251,122 @@ void IncompleteCholesky::factorize(const _MatrixType eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored"); mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag); } + + FactorType L_save = m_L; RealScalar shift = 0; if(mindiag <= RealScalar(0.)) shift = m_initialShift - mindiag; - // Apply the shift to the diagonal elements of the matrix - for (Index j = 0; j < n; j++) - vals[colPtr[j]] += shift; - - // jki version of the Cholesky factorization - for (Index j=0; j < n; ++j) - { - // Left-looking factorization of the j-th column - // First, load the j-th column into col_vals - Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored - col_nnz = 0; - for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++) + m_info = NumericalIssue; + + // Try to perform the incomplete factorization using the current shift + int iter = 0; + do + { + // Apply the shift to the diagonal elements of the matrix + for (Index j = 0; j < n; j++) + vals[colPtr[j]] += shift; + + // jki version of the Cholesky factorization + Index j=0; + for (; j < n; ++j) { - StorageIndex l = rowIdx[i]; - col_vals(col_nnz) = vals[i]; - col_irow(col_nnz) = l; - col_pattern(l) = col_nnz; - col_nnz++; - } - { - typename std::list::iterator k; - // Browse all previous columns that will update column j - for(k = listCol[j].begin(); k != listCol[j].end(); k++) + // Left-looking factorization of the j-th column + // First, load the j-th column into col_vals + Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored + col_nnz = 0; + for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++) { - Index jk = firstElt(*k); // First element to use in the column - eigen_internal_assert(rowIdx[jk]==j); - Scalar v_j_jk = numext::conj(vals[jk]); - - jk += 1; - for (Index i = jk; i < colPtr[*k+1]; i++) - { - StorageIndex l = rowIdx[i]; - if(col_pattern[l]<0) - { - col_vals(col_nnz) = vals[i] * v_j_jk; - col_irow[col_nnz] = l; - col_pattern(l) = col_nnz; - col_nnz++; - } - else - col_vals(col_pattern[l]) -= vals[i] * v_j_jk; - } - updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol); + StorageIndex l = rowIdx[i]; + col_vals(col_nnz) = vals[i]; + col_irow(col_nnz) = l; + col_pattern(l) = col_nnz; + col_nnz++; } + { + typename std::list::iterator k; + // Browse all previous columns that will update column j + for(k = listCol[j].begin(); k != listCol[j].end(); k++) + { + Index jk = firstElt(*k); // First element to use in the column + eigen_internal_assert(rowIdx[jk]==j); + Scalar v_j_jk = numext::conj(vals[jk]); + + jk += 1; + for (Index i = jk; i < colPtr[*k+1]; i++) + { + StorageIndex l = rowIdx[i]; + if(col_pattern[l]<0) + { + col_vals(col_nnz) = vals[i] * v_j_jk; + col_irow[col_nnz] = l; + col_pattern(l) = col_nnz; + col_nnz++; + } + else + col_vals(col_pattern[l]) -= vals[i] * v_j_jk; + } + updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol); + } + } + + // Scale the current column + if(numext::real(diag) <= 0) + { + if(++iter>=10) + return; + + // increase shift + shift = numext::maxi(m_initialShift,RealScalar(2)*shift); + // restore m_L, col_pattern, and listCol + vals = Map(L_save.valuePtr(), nnz); + rowIdx = Map(L_save.innerIndexPtr(), nnz); + colPtr = Map(L_save.outerIndexPtr(), n+1); + col_pattern.fill(-1); + for(Index i=0; i cvals = col_vals.head(col_nnz); + Ref cirow = 
col_irow.head(col_nnz); + internal::QuickSplit(cvals,cirow, p); + // Insert the largest p elements in the matrix + Index cpt = 0; + for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++) + { + vals[i] = col_vals(cpt); + rowIdx[i] = col_irow(cpt); + // restore col_pattern: + col_pattern(col_irow(cpt)) = -1; + cpt++; + } + // Get the first smallest row index and put it after the diagonal element + Index jk = colPtr(j)+1; + updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); } - - // Scale the current column - if(numext::real(diag) <= 0) + + if(j==n) { - m_info = NumericalIssue; - return; + m_factorizationIsOk = true; + m_info = Success; } - - RealScalar rdiag = sqrt(numext::real(diag)); - vals[colPtr[j]] = rdiag; - for (Index k = 0; k cvals = col_vals.head(col_nnz); - Ref cirow = col_irow.head(col_nnz); - internal::QuickSplit(cvals,cirow, p); - // Insert the largest p elements in the matrix - Index cpt = 0; - for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++) - { - vals[i] = col_vals(cpt); - rowIdx[i] = col_irow(cpt); - // restore col_pattern: - col_pattern(col_irow(cpt)) = -1; - cpt++; - } - // Get the first smallest row index and put it after the diagonal element - Index jk = colPtr(j)+1; - updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); - } - m_factorizationIsOk = true; - m_info = Success; + } while(m_info!=Success); } template diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 7acad9872..59ffe9259 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2015 Gael Guennebaud +// Copyright (C) 2015-2016 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -34,4 +34,32 @@ void test_incomplete_cholesky() CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); CALL_SUBTEST_3(( test_incomplete_cholesky_T() )); + +#ifdef EIGEN_TEST_PART_1 + // regression for bug 1150 + for(int N = 1; N<20; ++N) + { + Eigen::MatrixXd b( N, N ); + b.setOnes(); + + Eigen::SparseMatrix m( N, N ); + m.reserve(Eigen::VectorXi::Constant(N,4)); + for( int i = 0; i < N; ++i ) + { + m.insert( i, i ) = 1; + m.coeffRef( i, i / 2 ) = 2; + m.coeffRef( i, i / 3 ) = 2; + m.coeffRef( i, i / 4 ) = 2; + } + + Eigen::SparseMatrix A; + A = m * m.transpose(); + + Eigen::ConjugateGradient, + Eigen::Lower | Eigen::Upper, + Eigen::IncompleteCholesky > solver( A ); + VERIFY(solver.preconditioner().info() == Eigen::Success); + VERIFY(solver.info() == Eigen::Success); + } +#endif } From 369d6d1ae31c3e1a0f03196ccb9c792c6913ed76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Jan 2016 22:16:03 +0100 Subject: [PATCH 012/102] Add link to reference paper. --- Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index babc14d9e..adf3686b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -185,6 +185,10 @@ class IncompleteCholesky : public SparseSolverBase colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); }; +// Based on the following paper: +// C-J. Lin and J. J. 
Moré, Incomplete Cholesky Factorizations with +// Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 +// http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf template template void IncompleteCholesky::factorize(const _MatrixType& mat) @@ -316,7 +320,7 @@ void IncompleteCholesky::factorize(const _MatrixType { if(++iter>=10) return; - + // increase shift shift = numext::maxi(m_initialShift,RealScalar(2)*shift); // restore m_L, col_pattern, and listCol From 1cf85bd875ecbcfa1240b4ec08122d40d79101fd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Jan 2016 22:40:11 +0100 Subject: [PATCH 013/102] bug #977: add stableNormalize[d] methods: they are analogues to normalize[d] but with carefull handling of under/over-flow --- Eigen/src/Core/Dot.h | 46 +++++++++++++++++++++++++++++++++++++ Eigen/src/Core/MatrixBase.h | 2 ++ test/stable_norm.cpp | 14 +++++++++++ 3 files changed, 62 insertions(+) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 221fc3224..82d58fc0b 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -142,6 +142,52 @@ inline void MatrixBase::normalize() derived() /= numext::sqrt(z); } +/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow. + * + * \only_for_vectors + * + * This method is analogue to the normalized() method, but it reduces the risk of + * underflow and overflow when computing the norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. + * + * \sa stableNorm(), stableNormalize(), normalized() + */ +template +inline const typename MatrixBase::PlainObject +MatrixBase::stableNormalized() const +{ + typedef typename internal::nested_eval::type _Nested; + _Nested n(derived()); + RealScalar w = n.cwiseAbs().maxCoeff(); + RealScalar z = (n/w).squaredNorm(); + if(z>RealScalar(0)) + return n / (numext::sqrt(z)*w); + else + return n; +} + +/** Normalizes the vector while avoid underflow and overflow + * + * \only_for_vectors + * + * This method is analogue to the normalize() method, but it reduces the risk of + * underflow and overflow when computing the norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. 
+ * + * \sa stableNorm(), stableNormalized(), normalize() + */ +template +inline void MatrixBase::stableNormalize() +{ + RealScalar w = cwiseAbs().maxCoeff(); + RealScalar z = (derived()/w).squaredNorm(); + if(z>RealScalar(0)) + derived() /= numext::sqrt(z)*w; +} + //---------- implementation of other norms ---------- namespace internal { diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index f3935802d..338879c73 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -204,7 +204,9 @@ template class MatrixBase RealScalar blueNorm() const; RealScalar hypotNorm() const; EIGEN_DEVICE_FUNC const PlainObject normalized() const; + EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const; EIGEN_DEVICE_FUNC void normalize(); + EIGEN_DEVICE_FUNC void stableNormalize(); EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const; EIGEN_DEVICE_FUNC void adjointInPlace(); diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index 7561ae8be..9f12320e0 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp @@ -163,6 +163,20 @@ template void stable_norm(const MatrixType& m) VERIFY(!(numext::isfinite)(v.blueNorm())); VERIFY((numext::isnan)(v.blueNorm())); VERIFY(!(numext::isfinite)(v.hypotNorm())); VERIFY((numext::isnan)(v.hypotNorm())); } + + // stableNormalize[d] + { + VERIFY_IS_APPROX(vrand.stableNormalized(), vrand.normalized()); + MatrixType vcopy(vrand); + vcopy.stableNormalize(); + VERIFY_IS_APPROX(vcopy, vrand.normalized()); + VERIFY_IS_APPROX((vrand.stableNormalized()).norm(), RealScalar(1)); + VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1)); + VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1)); + VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1)); + VERIFY_IS_APPROX(vbig, vbig.stableNorm() * vbig.stableNormalized()); + VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized()); + } } void test_stable_norm() From 19e437daf09e7c449bb644a072e6caad261f5d36 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Sun, 24 Jan 2016 15:50:36 +0100 Subject: [PATCH 014/102] Copyedit documentation: typos, spelling --- doc/TutorialReductionsVisitorsBroadcasting.dox | 14 ++++++-------- doc/UsingIntelMKL.dox | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox index 6d25ff0ea..f5322b4a6 100644 --- a/doc/TutorialReductionsVisitorsBroadcasting.dox +++ b/doc/TutorialReductionsVisitorsBroadcasting.dox @@ -101,17 +101,16 @@ row and column position are to be stored. These variables should be of type \verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out -Note that both functions also return the value of the minimum or maximum coefficient if needed, -as if it was a typical reduction operation. +Both functions also return the value of the minimum or maximum coefficient. \section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions Partial reductions are reductions that can operate column- or row-wise on a Matrix or Array, applying the reduction operation on each column or row and -returning a column or row-vector with the corresponding values. Partial reductions are applied +returning a column or row vector with the corresponding values. Partial reductions are applied with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink. 
 A simple example is obtaining the maximum of the elements
-in each column in a given matrix, storing the result in a row-vector:
+in each column in a given matrix, storing the result in a row vector:
@@ -133,8 +132,7 @@ The same operation can be performed row-wise:
 \verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
 <tr><th>Example:</th><th>Output:</th></tr>
-Note that column-wise operations return a 'row-vector' while row-wise operations -return a 'column-vector' +Note that column-wise operations return a row vector, while row-wise operations return a column vector. \subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations It is also possible to use the result of a partial reduction to do further processing. @@ -176,7 +174,7 @@ The concept behind broadcasting is similar to partial reductions, with the diffe constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in one direction. -A simple example is to add a certain column-vector to each column in a matrix. +A simple example is to add a certain column vector to each column in a matrix. This can be accomplished with: @@ -253,7 +251,7 @@ is a new matrix whose size is the same as matrix m: \f[ \f] - (m.colwise() - v).colwise().squaredNorm() is a partial reduction, computing the squared norm column-wise. The result of -this operation is a row-vector where each coefficient is the squared Euclidean distance between each column in m and v: \f[ +this operation is a row vector where each coefficient is the squared Euclidean distance between each column in m and v: \f[ \mbox{(m.colwise() - v).colwise().squaredNorm()} = \begin{bmatrix} 1 & 505 & 32 & 50 diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index 84db992b6..02c62ad85 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -52,7 +52,7 @@ When doing so, a number of Eigen's algorithms are silently substituted with call These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex, and \c complex. Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms. -In addition you can coarsely select choose which parts will be substituted by defining one or multiple of the following macros: +In addition you can choose which parts will be substituted by defining one or multiple of the following macros:
From cc482e32f1b6026d965aaa6b5bcab63a96701049 Mon Sep 17 00:00:00 2001
From: Lars Buitinck
Date: Sun, 24 Jan 2016 15:50:59 +0100
Subject: [PATCH 015/102] Method is called visit, not visitor
---
 Eigen/src/Core/Visitor.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 7aac0b6e1..d71dfc968 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -197,7 +197,7 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
 /** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
   * \warning the result is undefined if \c *this contains NaN.
   *
-  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
+  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
@@ -215,7 +215,7 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
   * \warning the result is undefined if \c *this contains NaN.
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
@@ -233,7 +233,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
 /** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
   * \warning the result is undefined if \c *this contains NaN.
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
From bd207ce11e8133874d5a12573921ea93874a0f9e Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Sun, 24 Jan 2016 20:36:05 -0800
Subject: [PATCH 016/102] Added missing EIGEN_DEVICE_FUNC qualifier
---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index e7daf7304..bd83d5de8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
From e3a15a03a4fe758ed0a00f3a2b083d7ca58ca16b Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Sun, 24 Jan 2016 23:04:50 -0800
Subject: [PATCH 017/102] Don't explicitly evaluate the subexpression from
 TensorForcedEval::evalSubExprsIfNeeded, as it will be done when executing the
 EvalTo subexpression
---
 unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c9b0b2f28..58b864787 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -106,7 +106,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool
evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); const Index numValues = m_impl.dimensions().TotalSize(); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. @@ -119,7 +118,6 @@ struct TensorEvaluator, Device> EvalTo evalToTmp(m_buffer, m_op); const bool PacketAccess = internal::IsVectorizable::value; internal::TensorExecutor::run(evalToTmp, m_device); - m_impl.cleanup(); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { From 869b4443ac4a55c09a0632e2dbf621587749e164 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 11:55:39 +0100 Subject: [PATCH 018/102] Add SparseVector::conservativeResize() method. --- Eigen/src/SparseCore/SparseVector.h | 18 ++++++++++ test/sparse_vector.cpp | 54 ++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 7ec73a365..4db3003d2 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -222,6 +222,24 @@ class SparseVector m_data.clear(); } + /** Resizes the sparse vector to \a newSize, while leaving old values untouched. + * + * If the size of the vector is decreased, then the storage of the out-of bounds coefficients is kept and reserved. + * Call .data().squeeze() to free extra memory. + * + * \sa reserve(), setZero() + */ + void conservativeResize(Index newSize) + { + if (newSize < m_size) + { + Index i = 0; + while (i void sparse_vector(int rows, int cols) +template void sparse_vector(int rows, int cols) { double densityMat = (std::max)(8./(rows*cols), 0.01); double densityVec = (std::max)(8./float(rows), 0.1); typedef Matrix DenseMatrix; typedef Matrix DenseVector; - typedef SparseVector SparseVectorType; - typedef SparseMatrix SparseMatrixType; + typedef SparseVector SparseVectorType; + typedef SparseMatrix SparseMatrixType; Scalar eps = 1e-6; SparseMatrixType m1(rows,rows); @@ -87,8 +87,10 @@ template void sparse_vector(int rows, int cols) VERIFY_IS_APPROX(m1*v2, refM1*refV2); VERIFY_IS_APPROX(v1.dot(m1*v2), refV1.dot(refM1*refV2)); - int i = internal::random(0,rows-1); - VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i))); + { + int i = internal::random(0,rows-1); + VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i))); + } VERIFY_IS_APPROX(v1.squaredNorm(), refV1.squaredNorm()); @@ -111,15 +113,51 @@ template void sparse_vector(int rows, int cols) VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense()); VERIFY_IS_APPROX(DenseVector(v1),v1.toDense()); + // test conservative resize + { + std::vector inc; + if(rows > 3) + inc.push_back(-3); + inc.push_back(0); + inc.push_back(3); + inc.push_back(1); + inc.push_back(10); + + for(std::size_t i = 0; i< inc.size(); i++) { + StorageIndex incRows = inc[i]; + SparseVectorType vec1(rows); + DenseVector refVec1 = DenseVector::Zero(rows); + initSparse(densityVec, refVec1, vec1); + + vec1.conservativeResize(rows+incRows); + refVec1.conservativeResize(rows+incRows); + if (incRows > 0) refVec1.tail(incRows).setZero(); + + VERIFY_IS_APPROX(vec1, refVec1); + + // Insert new values + if (incRows > 0) + vec1.insert(vec1.rows()-1) = refVec1(refVec1.rows()-1) = 1; + + VERIFY_IS_APPROX(vec1, refVec1); + } + } + } void test_sparse_vector() { for(int i = 0; i < g_repeat; i++) { + int r = Eigen::internal::random(1,500), c = Eigen::internal::random(1,500); + if(Eigen::internal::random(0,4) 
== 0) { + r = c; // check square matrices in 25% of tries + } + EIGEN_UNUSED_VARIABLE(r+c); + CALL_SUBTEST_1(( sparse_vector(8, 8) )); - CALL_SUBTEST_2(( sparse_vector, int>(16, 16) )); - CALL_SUBTEST_1(( sparse_vector(299, 535) )); - CALL_SUBTEST_1(( sparse_vector(299, 535) )); + CALL_SUBTEST_2(( sparse_vector, int>(r, c) )); + CALL_SUBTEST_1(( sparse_vector(r, c) )); + CALL_SUBTEST_1(( sparse_vector(r, c) )); } } From b114e6fd3b61c7ef93f6b94e194d316f0ab19036 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 11:56:25 +0100 Subject: [PATCH 019/102] Improve documentation. --- Eigen/src/SparseCore/SparseMatrix.h | 7 ++++++- Eigen/src/SparseCore/SparseVector.h | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 91bada40f..8b57445e6 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -538,7 +538,12 @@ class SparseMatrix } /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched. - * \sa reserve(), setZero() + * + * If the sizes of the matrix are decreased, then the matrix is turned to \b uncompressed-mode + * and the storage of the out of bounds coefficients is kept and reserved. + * Call makeCompressed() to pack the entries and squeeze extra memory. + * + * \sa reserve(), setZero(), makeCompressed() */ void conservativeResize(Index rows, Index cols) { diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 4db3003d2..40a754300 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -205,17 +205,30 @@ class SparseVector inline void finalize() {} + /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { m_data.prune(reference,epsilon); } + /** Resizes the sparse vector to \a rows x \a cols + * + * This method is provided for compatibility with matrices. + * For a column vector, \a cols must be equal to 1. + * For a row vector, \a rows must be equal to 1. + * + * \sa resize(Index) + */ void resize(Index rows, Index cols) { eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1"); resize(IsColVector ? rows : cols); } + /** Resizes the sparse vector to \a newSize + * This method deletes all entries, thus leaving an empty sparse vector + * + * \sa conservativeResize(), setZero() */ void resize(Index newSize) { m_size = newSize; From c10021c00a6cb6033bc479a46aef058c48836efd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 15:50:55 +0100 Subject: [PATCH 020/102] bug #1144: clarify the doc about aliasing in case of resizing and matrix product. --- doc/TopicAliasing.dox | 30 ++++++++++++++++++++++++---- doc/snippets/TopicAliasing_mult4.cpp | 5 +++++ doc/snippets/TopicAliasing_mult5.cpp | 5 +++++ 3 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 doc/snippets/TopicAliasing_mult4.cpp create mode 100644 doc/snippets/TopicAliasing_mult5.cpp diff --git a/doc/TopicAliasing.dox b/doc/TopicAliasing.dox index c2654aed2..a8f164428 100644 --- a/doc/TopicAliasing.dox +++ b/doc/TopicAliasing.dox @@ -153,10 +153,11 @@ not necessary to evaluate the right-hand side explicitly. \section TopicAliasingMatrixMult Aliasing and matrix multiplication -Matrix multiplication is the only operation in %Eigen that assumes aliasing by default. 
Thus, if \c matA is a
-matrix, then the statement <tt>matA = matA * matA;</tt> is safe. All other operations in %Eigen assume that
-there are no aliasing problems, either because the result is assigned to a different matrix or because it is a
-component-wise operation.
+Matrix multiplication is the only operation in %Eigen that assumes aliasing by default, under the
+condition that the destination matrix is not resized.
+Thus, if \c matA is a \b square matrix, then the statement <tt>matA = matA * matA;</tt> is safe.
+All other operations in %Eigen assume that there are no aliasing problems,
+either because the result is assigned to a different matrix or because it is a component-wise operation.
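To make the paragraph above concrete, a minimal sketch using the documented API (variable names and values are illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXf matA(2,2), matB(2,2);
      matA << 2, 0,
              0, 2;

      // Safe: the product assumes aliasing, so it is evaluated into a
      // temporary before the result is copied into matA.
      matA = matA * matA;

      // matB does not alias the operands, so noalias() can safely skip
      // that temporary and write the product directly into matB.
      matB.noalias() = matA * matA;

      std::cout << matA << "\n\n" << matB << std::endl;
      return 0;
    }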
@@ -198,6 +199,27 @@ may get wrong results:
 \verbinclude TopicAliasing_mult3.out
 </td></tr></table>
 
+Moreover, starting in Eigen 3.3, aliasing is \b not assumed if the destination matrix is resized and the product is not directly assigned to the destination.
+Therefore, the following example is also wrong:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult4.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult4.out
+</td></tr></table>
+
+As for any aliasing issue, you can resolve it by explicitly evaluating the expression prior to assignment:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult5.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult5.out
+</td></tr></table>
+
\section TopicAliasingSummary Summary diff --git a/doc/snippets/TopicAliasing_mult4.cpp b/doc/snippets/TopicAliasing_mult4.cpp new file mode 100644 index 000000000..8a8992f6c --- /dev/null +++ b/doc/snippets/TopicAliasing_mult4.cpp @@ -0,0 +1,5 @@ +MatrixXf A(2,2), B(3,2); +B << 2, 0, 0, 3, 1, 1; +A << 2, 0, 0, -2; +A = (B * A).cwiseAbs(); +cout << A; \ No newline at end of file diff --git a/doc/snippets/TopicAliasing_mult5.cpp b/doc/snippets/TopicAliasing_mult5.cpp new file mode 100644 index 000000000..1a36defde --- /dev/null +++ b/doc/snippets/TopicAliasing_mult5.cpp @@ -0,0 +1,5 @@ +MatrixXf A(2,2), B(3,2); +B << 2, 0, 0, 3, 1, 1; +A << 2, 0, 0, -2; +A = (B * A).eval().cwiseAbs(); +cout << A; From e58827d2ed32cee5362e4d7d007da06a2bdc7309 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 17:16:33 +0100 Subject: [PATCH 021/102] bug #51: make general_matrix_matrix_triangular_product use L3-blocking helper so that general symmetric rank-updates and general-matrix-to-triangular products do not trigger dynamic memory allocation for fixed size matrices. --- .../products/GeneralMatrixMatrixTriangular.h | 44 ++++++++++++++----- Eigen/src/Core/products/SelfadjointProduct.h | 24 +++++++--- test/nomalloc.cpp | 9 ++-- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index a36eb2fe0..f80f3b410 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -42,13 +42,14 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); } }; @@ -58,7 +59,8 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; @@ -73,12 +75,20 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc, 1); + + kc = blocking.kc(); + mc = (std::min)(size,blocking.mc()); + nc = (std::min)(size,blocking.nc()); + // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*size; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -256,13 +266,27 @@ struct general_product_to_triangular_selector typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + enum { + IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, + LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualLhs.cols(); + + typedef internal::gemm_blocking_space BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product - ::run(mat.cols(), actualLhs.cols(), + typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + IsRowMajor ? RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 2af00058d..f038d686f 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -92,15 +92,27 @@ struct selfadjoint_product_selector Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); - enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0 }; + enum { + IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, + OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualOther.cols(); + + typedef internal::gemm_blocking_space BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product::IsComplex, - Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> - ::run(mat.cols(), actualOther.cols(), + Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits::IsComplex, + Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, + IsRowMajor ? 
RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 060276a20..d85e9e5bc 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -81,11 +81,12 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1.row(0),-1); // The following fancy matrix-matrix products are not safe yet regarding static allocation -// m1 += m1.template triangularView() * m2.col(; -// m1.template selfadjointView().rankUpdate(m2); -// m1 += m1.template triangularView() * m2; +// m1.col(1) += m1.template triangularView() * m2.col(0); + m2.template selfadjointView().rankUpdate(m1); + m2 += m2.template triangularView() * m1; + m2.template triangularView() = m2 * m2; // m1 += m1.template selfadjointView() * m2; -// VERIFY_IS_APPROX(m1,m1); + VERIFY_IS_APPROX(m2,m2); } template From 2f9e6314b183d333ad75ec578815073ed9fb390e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 21:56:05 +0100 Subject: [PATCH 022/102] update BLAS interface to general_matrix_matrix_triangular_product --- blas/level3_impl.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 6a6b00728..835e53680 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -315,7 +315,7 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp { // std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; #if !ISCOMPLEX - typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&); + typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); static functype func[8]; static bool init = false; @@ -381,8 +381,10 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp matrix(c, *n, *n, *ldc).triangularView() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda); } #else + internal::gemm_blocking_space blocking(*n,*n,*k,1,false); + int code = OP(*op) | (UPLO(*uplo) << 2); - func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha); + func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking); #endif return 0; @@ -516,7 +518,7 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp { // std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; - typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&); + typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); static functype func[8]; static bool init = false; @@ -571,7 +573,8 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp if(*k>0 && alpha!=RealScalar(0)) { - func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha); + internal::gemm_blocking_space blocking(*n,*n,*k,1,false); + func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking); matrix(c, 
*n, *n, *ldc).diagonal().imag().setZero(); } return 0; From 8328caa618731fb2a5802daaf8088db4175567a2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 22:06:42 +0100 Subject: [PATCH 023/102] bug #51: add block preallocation mechanism to selfadjoit*matrix product. --- .../Core/products/SelfadjointMatrixMatrix.h | 46 ++++++++++--------- blas/level3_impl.h | 23 ++++++---- test/nomalloc.cpp | 2 +- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index f84f54982..ba8ee1d53 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -291,7 +291,7 @@ struct product_selfadjoint_matrix& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; @@ -314,7 +314,7 @@ struct product_selfadjoint_matrix& blocking); }; template & blocking) { Index size = rows; @@ -340,17 +340,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc, 1); - // kc must smaller than mc + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // kc must be smaller than mc kc = (std::min)(kc,mc); + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; @@ -410,7 +409,7 @@ struct product_selfadjoint_matrix& blocking); }; template & blocking) { Index size = cols; @@ -432,14 +431,12 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc, 1); + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; @@ -498,6 +495,11 @@ struct selfadjoint_product_impl Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; + + BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); + internal::product_selfadjoint_matrix::Flags &RowMajorBit) ? 
RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), @@ -509,7 +511,7 @@ struct selfadjoint_product_impl &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + actualAlpha, blocking // alpha ); } }; diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 835e53680..b2772b190 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -275,9 +275,9 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa return 1; } + int size = (SIDE(*side)==LEFT) ? (*m) : (*n); #if ISCOMPLEX // FIXME add support for symmetric complex matrix - int size = (SIDE(*side)==LEFT) ? (*m) : (*n); Matrix matA(size,size); if(UPLO(*uplo)==UP) { @@ -294,13 +294,15 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa else if(SIDE(*side)==RIGHT) matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA; #else + internal::gemm_blocking_space blocking(*m,*n,size,1,false); + if(SIDE(*side)==LEFT) - if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); + else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else return 0; else if(SIDE(*side)==RIGHT) - if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); + else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); else return 0; else return 0; @@ -488,20 +490,23 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa return 1; } + int size = (SIDE(*side)==LEFT) ? 
(*m) : (*n); + internal::gemm_blocking_space blocking(*m,*n,size,1,false); + if(SIDE(*side)==LEFT) { if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix - ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix - ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else return 0; } else if(SIDE(*side)==RIGHT) { if(UPLO(*uplo)==UP) matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView();/*internal::product_selfadjoint_matrix - ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);*/ + ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);*/ else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix - ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); else return 0; } else diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index d85e9e5bc..077ecae55 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -85,7 +85,7 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1); m2 += m2.template triangularView() * m1; m2.template triangularView() = m2 * m2; -// m1 += m1.template selfadjointView() * m2; + m1 += m1.template selfadjointView() * m2; VERIFY_IS_APPROX(m2,m2); } From 5eb2790be093cc23d4c4808b5c53d79eba22ecb0 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Mon, 25 Jan 2016 22:17:52 +0100 Subject: [PATCH 024/102] Fixed minor typo in SplineFitting. --- unsupported/Eigen/src/Splines/SplineFitting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h index d3c245fa9..8e6a5aaed 100644 --- a/unsupported/Eigen/src/Splines/SplineFitting.h +++ b/unsupported/Eigen/src/Splines/SplineFitting.h @@ -167,7 +167,7 @@ namespace Eigen derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(), temporaryKnots.data()); - // Number of control points (one for each point and derivative) plus spline order. + // Number of knots (one for each point and derivative) plus spline order. DenseIndex numKnots = numParameters + numDerivatives + degree + 1; knots.resize(numKnots); From 44d4674955213013e7db7871c99d7968b8555075 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 26 Jan 2016 16:45:33 +0100 Subject: [PATCH 025/102] bug #1153: Don't rely on __GXX_EXPERIMENTAL_CXX0X__ to detect C++11 support --- Eigen/src/Core/util/Macros.h | 1 - Eigen/src/Core/util/StaticAssert.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9b4f8faa7..cf6b03ec7 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -336,7 +336,6 @@ // Do we support r-value references? 
#if (__has_feature(cxx_rvalue_references) || \ (defined(__cplusplus) && __cplusplus >= 201103L) || \ - defined(__GXX_EXPERIMENTAL_CXX0X__) || \ (EIGEN_COMP_MSVC >= 1600)) #define EIGEN_HAVE_RVALUE_REFERENCES #endif diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index e174509e0..afae2e51e 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -26,7 +26,7 @@ #ifndef EIGEN_NO_STATIC_ASSERT - #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600) + #if __has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600) // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); From 639b1d864a3d8e3cd5f2f060af9e1a5fdae27008 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 26 Jan 2016 11:44:16 -0500 Subject: [PATCH 026/102] bug #1152: Fix data race in static initialization of blas --- blas/level2_cplx_impl.h | 95 ++++--------- blas/level2_impl.h | 303 ++++++++++++++++++++------------------- blas/level2_real_impl.h | 123 ++++------------ blas/level3_impl.h | 305 ++++++++++++++++++++++------------------ 4 files changed, 388 insertions(+), 438 deletions(-) diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index 9b845de22..2edc51596 100644 --- a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -19,19 +19,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) { typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_matrix_vector_product::run); - func[LO] = (internal::selfadjoint_matrix_vector_product::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_matrix_vector_product::run), + // array index: LO + (internal::selfadjoint_matrix_vector_product::run), + }; Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); @@ -111,19 +104,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, RealScalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_packed_rank1_update::run); - func[LO] = (internal::selfadjoint_packed_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_packed_rank1_update::run), + // array index: LO + (internal::selfadjoint_packed_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* ap = reinterpret_cast(pap); @@ -162,19 +148,12 @@ int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::packed_rank2_update_selector::run); - func[LO] = 
(internal::packed_rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::packed_rank2_update_selector::run), + // array index: LO + (internal::packed_rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); @@ -217,19 +196,12 @@ int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda) { typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (selfadjoint_rank1_update::run); - func[LO] = (selfadjoint_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (selfadjoint_rank1_update::run), + // array index: LO + (selfadjoint_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* a = reinterpret_cast(pa); @@ -271,19 +243,12 @@ int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda) { typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::rank2_update_selector::run); - func[LO] = (internal::rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::rank2_update_selector::run), + // array index: LO + (internal::rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 917f2e372..d09db0cc6 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -26,20 +26,15 @@ struct general_matrix_vector_product_wrapper int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc) { typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar); - static functype func[4]; - - static bool init = false; - if(!init) - { - for(int k=0; k<4; ++k) - func[k] = 0; - - func[NOTR] = (general_matrix_vector_product_wrapper::run); - func[TR ] = (general_matrix_vector_product_wrapper::run); - func[ADJ ] = (general_matrix_vector_product_wrapper::run); - - init = true; - } + static const functype func[4] = { + // array index: NOTR + (general_matrix_vector_product_wrapper::run), + // array index: TR + (general_matrix_vector_product_wrapper::run), + // array index: ADJ + (general_matrix_vector_product_wrapper::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -90,32 +85,36 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb) { typedef void (*functype)(int, const Scalar *, int, Scalar *); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (UP << 2) | (NUNIT << 3)] = 
(internal::triangular_solve_vector::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::triangular_solve_vector::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::triangular_solve_vector::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -145,32 +144,36 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb) { typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (UNIT << 3)] = 
(internal::triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::triangular_matrix_vector_product::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -346,32 +349,36 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx) { typedef void (*functype)(int, int, const Scalar *, int, Scalar *); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int i=0; i<16; ++i) - func[i] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + 
(internal::band_solve_triangular_selector::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::band_solve_triangular_selector::run), + 0, + }; Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); @@ -416,32 +423,36 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx) { typedef void (*functype)(int, const Scalar*, const Scalar*, Scalar*, Scalar); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + 
(internal::packed_triangular_matrix_vector_product::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_matrix_vector_product::run), + 0 + }; Scalar* ap = reinterpret_cast(pap); Scalar* x = reinterpret_cast(px); @@ -487,32 +498,36 @@ int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx) { typedef void (*functype)(int, const Scalar*, Scalar*); - static functype func[16]; - - static bool init = false; - if(!init) - { - for(int k=0; k<16; ++k) - func[k] = 0; - - func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector::run); - - init = true; - } + static const functype func[16] = { + // array index: NOTR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (NUNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0, + // array index: NOTR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (UP << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0, + // array index: NOTR | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: TR | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + // array index: ADJ | (LO << 2) | (UNIT << 3) + (internal::packed_triangular_solve_vector::run), + 0 + }; Scalar* ap = reinterpret_cast(pap); Scalar* x = reinterpret_cast(px); diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h index cac89b268..4896a03d9 100644 --- a/blas/level2_real_impl.h +++ b/blas/level2_real_impl.h @@ -13,19 +13,12 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) { typedef void (*functype)(int, const 
Scalar*, int, const Scalar*, Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_matrix_vector_product::run); - func[LO] = (internal::selfadjoint_matrix_vector_product::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_matrix_vector_product::run), + // array index: LO + (internal::selfadjoint_matrix_vector_product::run), + }; Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); @@ -71,34 +64,13 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc) { -// typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar); -// static functype func[2]; - -// static bool init = false; -// if(!init) -// { -// for(int k=0; k<2; ++k) -// func[k] = 0; -// -// func[UP] = (internal::selfadjoint_product::run); -// func[LO] = (internal::selfadjoint_product::run); - -// init = true; -// } typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (selfadjoint_rank1_update::run); - func[LO] = (selfadjoint_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (selfadjoint_rank1_update::run), + // array index: LO + (selfadjoint_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* c = reinterpret_cast(pc); @@ -131,34 +103,13 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, // C := alpha*x*y' + alpha*y*x' + C int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc) { -// typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar); -// static functype func[2]; -// -// static bool init = false; -// if(!init) -// { -// for(int k=0; k<2; ++k) -// func[k] = 0; -// -// func[UP] = (internal::selfadjoint_product::run); -// func[LO] = (internal::selfadjoint_product::run); -// -// init = true; -// } typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::rank2_update_selector::run); - func[LO] = (internal::rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::rank2_update_selector::run), + // array index: LO + (internal::rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); @@ -234,19 +185,12 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::selfadjoint_packed_rank1_update::run); - func[LO] = (internal::selfadjoint_packed_rank1_update::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::selfadjoint_packed_rank1_update::run), + // array index: LO + 
(internal::selfadjoint_packed_rank1_update::run), + }; Scalar* x = reinterpret_cast(px); Scalar* ap = reinterpret_cast(pap); @@ -285,19 +229,12 @@ int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *in int EIGEN_BLAS_FUNC(spr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap) { typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar); - static functype func[2]; - - static bool init = false; - if(!init) - { - for(int k=0; k<2; ++k) - func[k] = 0; - - func[UP] = (internal::packed_rank2_update_selector::run); - func[LO] = (internal::packed_rank2_update_selector::run); - - init = true; - } + static const functype func[2] = { + // array index: UP + (internal::packed_rank2_update_selector::run), + // array index: LO + (internal::packed_rank2_update_selector::run), + }; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index b2772b190..267a727ef 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -13,24 +13,29 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal { // std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking&, Eigen::internal::GemmParallelInfo*); - static functype func[12]; - - static bool init = false; - if(!init) - { - for(int i=0; i<12; ++i) - func[i] = 0; - func[NOTR | (NOTR << 2)] = (internal::general_matrix_matrix_product::run); - func[TR | (NOTR << 2)] = (internal::general_matrix_matrix_product::run); - func[ADJ | (NOTR << 2)] = (internal::general_matrix_matrix_product::run); - func[NOTR | (TR << 2)] = (internal::general_matrix_matrix_product::run); - func[TR | (TR << 2)] = (internal::general_matrix_matrix_product::run); - func[ADJ | (TR << 2)] = (internal::general_matrix_matrix_product::run); - func[NOTR | (ADJ << 2)] = (internal::general_matrix_matrix_product::run); - func[TR | (ADJ << 2)] = (internal::general_matrix_matrix_product::run); - func[ADJ | (ADJ << 2)] = (internal::general_matrix_matrix_product::run); - init = true; - } + static const functype func[12] = { + // array index: NOTR | (NOTR << 2) + (internal::general_matrix_matrix_product::run), + // array index: TR | (NOTR << 2) + (internal::general_matrix_matrix_product::run), + // array index: ADJ | (NOTR << 2) + (internal::general_matrix_matrix_product::run), + 0, + // array index: NOTR | (TR << 2) + (internal::general_matrix_matrix_product::run), + // array index: TR | (TR << 2) + (internal::general_matrix_matrix_product::run), + // array index: ADJ | (TR << 2) + (internal::general_matrix_matrix_product::run), + 0, + // array index: NOTR | (ADJ << 2) + (internal::general_matrix_matrix_product::run), + // array index: TR | (ADJ << 2) + (internal::general_matrix_matrix_product::run), + // array index: ADJ | (ADJ << 2) + (internal::general_matrix_matrix_product::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -73,49 +78,64 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, { // std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n"; typedef 
void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking&); - static functype func[32]; - - static bool init = false; - if(!init) - { - for(int i=0; i<32; ++i) - func[i] = 0; - - func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix::run); - - - func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix::run); - - init = true; - } + static const functype func[32] = { + // array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run),\ + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 
3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::triangular_solve_matrix::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -162,47 +182,64 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, { // std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); - static functype func[32]; - static bool init = false; - if(!init) - { - for(int k=0; k<32; ++k) - func[k] = 0; - - func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | 
(NUNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix::run); - - init = true; - } + static const functype func[32] = { + // array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (LEFT << 
2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0, + // array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + // array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4) + (internal::product_triangular_matrix_matrix::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); @@ -318,24 +355,22 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp // std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; #if !ISCOMPLEX typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); - static functype func[8]; - - static bool init = false; - if(!init) - { - for(int i=0; i<8; ++i) - func[i] = 0; - - func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[TR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[TR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - init = true; - } + static const functype func[8] = { + // array index: NOTR | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: TR | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: ADJ | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: NOTR | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: TR | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + // array index: ADJ | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0 + }; #endif Scalar* a = reinterpret_cast(pa); @@ -524,22 +559,20 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp // std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking&); - static functype func[8]; - - static bool init = false; - if(!init) - { - for(int i=0; i<8; ++i) - func[i] = 0; - - func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product::run); - - init = true; - } + static const functype func[8] = { + // array index: NOTR | (UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: ADJ | 
(UP << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: NOTR | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0, + // array index: ADJ | (LO << 2) + (internal::general_matrix_matrix_triangular_product::run), + 0 + }; Scalar* a = reinterpret_cast(pa); Scalar* c = reinterpret_cast(pc); From 5b0a9ee0032bbdbf6c8f26e788e9b992c9532432 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:30:24 +0100 Subject: [PATCH 027/102] Make sure that block sizes are smaller than input matrix sizes. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 229e96ceb..d2e6f26c8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -252,7 +252,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = 576; + max_mc = (std::min)(576,max_mc); } Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; From aa8c6a251e0c988aa4d8f6914d2052a9d2c9cbc2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:31:48 +0100 Subject: [PATCH 028/102] Make sure that micro-panel-size is smaller than blocking sizes (otherwise we might get a buffer overflow) --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 39ab87df8..8a2f7cd78 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -126,6 +126,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix(actual_kc-k1, SmallPanelWidth); + Index actualPanelWidth = std::min(actual_kc-k1, panelWidth); Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; Index startBlock = actual_k2+k1; Index blockBOffset = k1; From 6850eab33b71556f62c29ff29bbcdf0240cfd8e2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:32:48 +0100 Subject: [PATCH 029/102] Re-enable blocking on rows in non-l3 blocking mode. --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d77fc2630..a39c7808c 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -352,9 +352,8 @@ class gemm_blocking_spacem_mc; Index n = this->m_nc; - computeProductBlockingSizes(this->m_kc, m, n, num_threads); + computeProductBlockingSizes(this->m_kc, this->m_mc, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; From cfa21f812339fce7531eb9f27f6d2c5287de661e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:33:15 +0100 Subject: [PATCH 030/102] Remove dead code. 
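The dead code was the call to computeProductBlockingSizes inside general_matrix_matrix_triangular_product: its results were immediately overwritten by the values already stored in the level3_blocking object passed in by the caller, so the call computed nothing that was kept (and the nc value turned out not to be needed at all). As the hunk below shows, kc and mc are now simply read back from the blocking object.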
--- .../src/Core/products/GeneralMatrixMatrixTriangular.h | 10 ++-------- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index f80f3b410..2c6798d63 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -71,14 +71,8 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc, 1); - - kc = blocking.kc(); - mc = (std::min)(size,blocking.mc()); - nc = (std::min)(size,blocking.nc()); + Index kc = blocking.kc(); + Index mc = (std::min)(size,blocking.mc()); // !!! mc must be a multiple of nr: if(mc > Traits::nr) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index ba8ee1d53..da6f82abc 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -344,10 +344,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Date: Tue, 26 Jan 2016 23:34:48 +0100 Subject: [PATCH 031/102] Doc: add flip* and arrayfun MatLab equivalent. --- doc/AsciiQuickReference.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index c604e575c..9599df60b 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -44,7 +44,7 @@ C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi - low,low+step*(size-1)) + low,low+step*(size-1)) // // Matrix slicing and blocks. All expressions listed here are read/write. @@ -94,6 +94,8 @@ R.transpose() // R.' or conj(R') // Read-write R.diagonal() // diag(R) // Read-write x.asDiagonal() // diag(x) R.transpose().colwise().reverse() // rot90(R) // Read-write +R.rowwise().reverse() // fliplr(R) +R.colwise().reverse() // flipud(R) R.replicate(i,j) // repmat(P,i,j) @@ -139,6 +141,7 @@ R.cwiseAbs2() // abs(P.^2) R.array().abs2() // abs(P.^2) (R.array() < s).select(P,Q ); // (R < s ? P : Q) R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0) +R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P) // with: scalar func(const scalar &x); // Reductions. From 412bb5a631a3ff32ff7256e9473f97e40023b2e7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Jan 2016 23:35:30 +0100 Subject: [PATCH 032/102] Remove redundant test. 
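The removed line was already commented out, so it exercised nothing at run time; dropping it keeps the list of products that are still unsafe with respect to static allocation accurate.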
--- test/nomalloc.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 077ecae55..743ee6aa8 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -81,7 +81,6 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1.row(0),-1); // The following fancy matrix-matrix products are not safe yet regarding static allocation -// m1.col(1) += m1.template triangularView() * m2.col(0); m2.template selfadjointView().rankUpdate(m1); m2 += m2.template triangularView() * m1; m2.template triangularView() = m2 * m2; From fecea26d93a9e1f4178986791943796363391a06 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 15:55:15 +0100 Subject: [PATCH 033/102] Extend doc on shifting strategy --- Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index adf3686b4..e45c272b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -37,6 +37,8 @@ namespace Eigen { * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then the factorization is directly performed * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta|) I \f$ where * \f$ \sigma \f$ is the initial shift value as returned and set by the setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$. + * If the factorization fails, then the shift is doubled until it succeeds or a maximum of ten attempts is reached. If it still fails, as reported by + * the info() method, then you can either increase the initial shift or, better, use another preconditioning technique. * */ template Date: Wed, 27 Jan 2016 17:11:39 +0100 Subject: [PATCH 034/102] Add meta_least_common_multiple helper. --- Eigen/src/Core/util/Meta.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 617ba0a65..e3e6d763d 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -326,6 +326,22 @@ class meta_sqrt template class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; }; + +/** \internal Computes the least common multiple of two positive integers A and B + * at compile-time. It implements a naive algorithm testing all multiples of A. + * It thus works better if A>=B. + */ +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; +template +struct meta_least_common_multiple +{ + enum { ret = A*K }; +}; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { From 9801c959e685a0341fa35c5843ad16a150018f39 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 17:12:25 +0100 Subject: [PATCH 035/102] Fix tri = complex * real product, and add respective unit test. 
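The root cause was the BlockSize used by tribb_kernel for its temporary diagonal blocks: the buffer has to hold a whole number of both the lhs and rhs register-block sizes mr and nr, and EIGEN_PLAIN_ENUM_MAX(mr,nr) is not a common multiple of the two whenever neither value divides the other, which can happen in mixed real/complex products where the two sides use different scalar types. The block size is now computed with the meta_least_common_multiple helper introduced in the previous patch. As a minimal sketch of the helper's behavior (the values 4 and 6 are hypothetical, picked only because their maximum is not a common multiple):

  #include <Eigen/Core>

  int main()
  {
    // max(4,6) = 6 is not a multiple of 4, whereas lcm(4,6) = 12
    // is a multiple of both block sizes.
    EIGEN_STATIC_ASSERT((Eigen::internal::meta_least_common_multiple<4,6>::ret == 12),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);
    return 0;
  }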
--- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- test/mixingtypes.cpp | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 2c6798d63..831089dee 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -140,7 +140,7 @@ struct tribb_kernel typedef typename Traits::ResScalar ResScalar; enum { - BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) + BlockSize = meta_least_common_multiple::ret }; void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 32d9d0be9..0a583897d 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -44,6 +44,7 @@ template void mixingtypes(int size = SizeAtCompileType) Mat_d md = mf.template cast(); Mat_cf mcf = Mat_cf::Random(size,size); Mat_cd mcd = mcf.template cast >(); + Mat_cd rcd = mcd; Vec_f vf = Vec_f::Random(size,1); Vec_d vd = vf.template cast(); Vec_cf vcf = Vec_cf::Random(size,1); @@ -103,7 +104,6 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(mcd.array() *= md.array(), mcd2.array() *= md.array().template cast >()); // check matrix-matrix products - VERIFY_IS_APPROX(sd*md*mcd, (sd*md).template cast().eval()*mcd); VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast()); VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast().eval()*mcd); @@ -147,6 +147,16 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(scd*vcd.adjoint()*md, scd*vcd.adjoint()*md.template cast().eval()); VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast().eval()*mcd); + + rcd.setZero(); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = sd * mcd * md), + Mat_cd((sd * mcd * md.template cast().eval()).template triangularView())); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = sd * md * mcd), + Mat_cd((sd * md.template cast().eval() * mcd).template triangularView())); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = scd * mcd * md), + Mat_cd((scd * mcd * md.template cast().eval()).template triangularView())); + VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = scd * md * mcd), + Mat_cd((scd * md.template cast().eval() * mcd).template triangularView())); } void test_mixingtypes() From 6da5d87f9258c27a1fb142d7cb5ec915497fbeeb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 17:26:48 +0100 Subject: [PATCH 036/102] add nomalloc unit test for rank2 updates --- test/nomalloc.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 743ee6aa8..50756c2fb 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -78,7 +78,8 @@ template void nomalloc(const MatrixType& m) VERIFY_IS_APPROX(m2,m2); m2.template selfadjointView().rankUpdate(m1.col(0),-1); - m2.template selfadjointView().rankUpdate(m1.row(0),-1); + m2.template selfadjointView().rankUpdate(m1.row(0),-1); + m2.template selfadjointView().rankUpdate(m1.col(0), m1.col(0)); // rank-2 // The following fancy matrix-matrix products are not safe yet regarding static allocation m2.template selfadjointView().rankUpdate(m1); From 9ac8e8c6a17c596de3c8afb1a4fc2fc198cb8323 Mon Sep 17 00:00:00 2001 From: Gael 
Guennebaud Date: Wed, 27 Jan 2016 17:29:53 +0100 Subject: [PATCH 037/102] Extend mixing type unit test with trmv, and the following not yet supported products: trmm, symv, symm --- test/mixingtypes.cpp | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 0a583897d..a3b469af8 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -108,19 +108,19 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast()); VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast().eval()*mcd); VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast()); - + VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast()*mcf); VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast()); VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast()*mcf); VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast()); - + VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast().eval().adjoint()*mcd); VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast()); VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast().eval().adjoint()*mcd.adjoint()); VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast().adjoint()); VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast().eval()*mcd.adjoint()); VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast().adjoint()); - + VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast().eval().adjoint()*mcf); VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast()); VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast().eval().adjoint()*mcf.adjoint()); @@ -148,6 +148,29 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast().eval()*mcd); + VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView(), sd*vcd.adjoint()*md.template cast().eval().template triangularView()); + VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView(), scd*vcd.adjoint()*md.template cast().eval().template triangularView()); + VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView(), scd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + + // Not supported yet: trmm +// VERIFY_IS_APPROX(sd*mcd*md.template triangularView(), sd*mcd*md.template cast().eval().template triangularView()); +// VERIFY_IS_APPROX(scd*mcd*md.template triangularView(), scd*mcd*md.template cast().eval().template triangularView()); +// VERIFY_IS_APPROX(sd*md*mcd.template triangularView(), sd*md.template cast().eval()*mcd.template triangularView()); +// VERIFY_IS_APPROX(scd*md*mcd.template triangularView(), scd*md.template cast().eval()*mcd.template triangularView()); + + // Not supported yet: symv +// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView(), sd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView(), scd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView(), sd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); +// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView(), 
scd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); + + // Not supported yet: symm +// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView(), sd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView(), scd*vcd.adjoint()*md.template cast().eval().template selfadjointView()); +// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView(), sd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); +// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView(), scd*vd.adjoint().template cast().eval()*mcd.template selfadjointView()); + rcd.setZero(); VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView() = sd * mcd * md), Mat_cd((sd * mcd * md.template cast().eval()).template triangularView())); From 9aa6fae123053cac30ca55ccaf9f1832d30e4b99 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 18:03:51 +0100 Subject: [PATCH 038/102] bug #1154: move to dynamic scheduling for spmv products. --- Eigen/src/SparseCore/SparseDenseProduct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 87c946b9b..c9da8a2bb 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -48,7 +48,7 @@ struct sparse_time_dense_product_impl1 && lhsEval.nonZerosEstimate() > 20000) { - #pragma omp parallel for schedule(static) num_threads(threads) + #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) for(Index i=0; i Date: Wed, 27 Jan 2016 18:34:42 +0100 Subject: [PATCH 039/102] bug #1156: fix several function declarations whose arguments were passed by value instead of being passed by reference --- Eigen/src/Core/MathFunctions.h | 6 +++--- Eigen/src/Geometry/ParametrizedLine.h | 2 +- Eigen/src/Geometry/Translation.h | 2 +- Eigen/src/SparseCore/SparseView.h | 2 +- unsupported/Eigen/AlignedVector3 | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 1c7b28a4b..e47070a46 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1080,21 +1080,21 @@ struct scalar_fuzzy_impl : scalar_fuzzy_default_impl:: template EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, - typename NumTraits::Real precision = NumTraits::dummy_precision()) + const typename NumTraits::Real &precision = NumTraits::dummy_precision()) { return scalar_fuzzy_impl::template isMuchSmallerThan(x, y, precision); } template EIGEN_DEVICE_FUNC inline bool isApprox(const Scalar& x, const Scalar& y, - typename NumTraits::Real precision = NumTraits::dummy_precision()) + const typename NumTraits::Real &precision = NumTraits::dummy_precision()) { return scalar_fuzzy_impl::isApprox(x, y, precision); } template EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, - typename NumTraits::Real precision = NumTraits::dummy_precision()) + const typename NumTraits::Real &precision = NumTraits::dummy_precision()) { return scalar_fuzzy_impl::isApproxOrLessThan(x, y, precision); } diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h index fdcd69760..c43dce773 100644 --- a/Eigen/src/Geometry/ParametrizedLine.h +++ b/Eigen/src/Geometry/ParametrizedLine.h @@ -129,7 +129,7 @@ public: * determined by \a prec. 
* * \sa MatrixBase::isApprox() */ - bool isApprox(const ParametrizedLine& other, typename NumTraits::Real prec = NumTraits::dummy_precision()) const + bool isApprox(const ParametrizedLine& other, const typename NumTraits::Real& prec = NumTraits::dummy_precision()) const { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); } protected: diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 87ea445e9..82d7777f0 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -162,7 +162,7 @@ public: * determined by \a prec. * * \sa MatrixBase::isApprox() */ - bool isApprox(const Translation& other, typename NumTraits::Real prec = NumTraits::dummy_precision()) const + bool isApprox(const Translation& other, const typename NumTraits::Real& prec = NumTraits::dummy_precision()) const { return m_coeffs.isApprox(other.m_coeffs, prec); } }; diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index c945c4dab..b867877d8 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -38,7 +38,7 @@ public: typedef typename internal::remove_all::type NestedExpression; explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0), - RealScalar epsilon = NumTraits::dummy_precision()) + const RealScalar &epsilon = NumTraits::dummy_precision()) : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {} inline Index rows() const { return m_matrix.rows(); } diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3 index f5c40a189..135eec572 100644 --- a/unsupported/Eigen/AlignedVector3 +++ b/unsupported/Eigen/AlignedVector3 @@ -188,7 +188,7 @@ template class AlignedVector3 } template - inline bool isApprox(const MatrixBase& other, RealScalar eps=NumTraits::dummy_precision()) const + inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const { return m_coeffs.template head<3>().isApprox(other,eps); } From c8d94ae944407c05ae7600347afb6a532783c962 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 27 Jan 2016 09:52:29 -0800 Subject: [PATCH 040/102] digamma special function: merge shared code. Moved type-specific code into a helper class digamma_impl_maybe_poly. --- Eigen/src/Core/SpecialFunctions.h | 219 +++++++++++------------------- 1 file changed, 82 insertions(+), 137 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index bd022946c..21583e6f5 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -134,8 +134,23 @@ struct lgamma_impl { * Implementation of digamma (psi) * ****************************************************************************/ +#ifdef EIGEN_HAS_C99_MATH + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. + * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. 
+ * + * + */ template -struct digamma_impl { +struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { EIGEN_STATIC_ASSERT((internal::is_same::value == false), @@ -144,72 +159,11 @@ struct digamma_impl { } }; -template -struct digamma_retval { - typedef Scalar type; -}; -#ifdef EIGEN_HAS_C99_MATH template <> -struct digamma_impl { - /* - * Psi (digamma) function (modified for Eigen) - * - * - * SYNOPSIS: - * - * float x, y, psif(); - * - * y = psif( x ); - * - * - * DESCRIPTION: - * - * d - - * psi(x) = -- ln | (x) - * dx - * - * is the logarithmic derivative of the gamma function. - * For integer x, - * n-1 - * - - * psi(n) = -EUL + > 1/k. - * - - * k=1 - * - * If x is negative, it is transformed to a positive argument by the - * reflection formula psi(1-x) = psi(x) + pi cot(pi x). - * For general positive x, the argument is made greater than 10 - * using the recurrence psi(x+1) = psi(x) + 1/x. - * Then the following asymptotic expansion is applied: - * - * inf. B - * - 2k - * psi(x) = log(x) - 1/2x - > ------- - * - 2k - * k=1 2k x - * - * where the B2k are Bernoulli numbers. - * - * ACCURACY: - * Absolute error, relative when |psi| > 1 : - * arithmetic domain # trials peak rms - * IEEE -33,0 30000 8.2e-7 1.2e-7 - * IEEE 0,33 100000 7.3e-7 7.7e-8 - * - * ERROR MESSAGES: - * message condition value returned - * psi singularity x integer <=0 INFINITY - */ +struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC - static float run(float xx) { - float p, q, nz, x, s, w, y, z; - bool negative; - - // Some necessary constants - const float m_pif = 3.141592653589793238; - const float maxnumf = std::numeric_limits::infinity(); - + static EIGEN_STRONG_INLINE float run(const float s) { const float A[] = { -4.16666666666666666667E-3, 3.96825396825396825397E-3, @@ -217,53 +171,49 @@ struct digamma_impl { 8.33333333333333333333E-2 }; - x = xx; - nz = 0.0f; - negative = 0; - if (x <= 0.0f) { - negative = 1; - q = x; - p = ::floor(q); - if (p == q) { - return (maxnumf); - } - nz = q - p; - if (nz != 0.5f) { - if (nz > 0.5f) { - p += 1.0f; - nz = q - p; - } - nz = m_pif / ::tan(m_pif * nz); - } else { - nz = 0.0f; - } - x = 1.0f - x; - } - - /* use the recurrence psi(x+1) = psi(x) + 1/x. */ - s = x; - w = 0.0f; - while (s < 10.0f) { - w += 1.0f / s; - s += 1.0f; - } - + float z; if (s < 1.0e8f) { z = 1.0f / (s * s); - y = z * cephes::polevl::run(z, A); - } else - y = 0.0f; - - y = ::log(s) - (0.5f / s) - y - w; - - return (negative) ? y - nz : y; + return z * cephes::polevl::run(z, A); + } else return 0.0f; } }; template <> -struct digamma_impl { +struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC - static double run(double x) { + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * cephes::polevl::run(z, A); + } + else return 0.0; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +template +struct digamma_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { /* * * Psi (digamma) function (modified for Eigen) @@ -304,38 +254,38 @@ struct digamma_impl { * * where the B2k are Bernoulli numbers. 
* - * ACCURACY: + * ACCURACY (double): * Relative error (except absolute when |psi| < 1): * arithmetic domain # trials peak rms * IEEE 0,30 30000 1.3e-15 1.4e-16 * IEEE -30,0 40000 1.5e-15 2.2e-16 * + * ACCURACY (float): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * * ERROR MESSAGES: * message condition value returned * psi singularity x integer <=0 INFINITY */ - double p, q, nz, s, w, y, z; + Scalar p, q, nz, s, w, y; bool negative; - const double A[] = { - 8.33333333333333333333E-2, - -2.10927960927960927961E-2, - 7.57575757575757575758E-3, - -4.16666666666666666667E-3, - 3.96825396825396825397E-3, - -8.33333333333333333333E-3, - 8.33333333333333333333E-2 - }; - - const double maxnum = std::numeric_limits::infinity(); - const double m_pi = 3.14159265358979323846; + const Scalar maxnum = std::numeric_limits::infinity(); + const Scalar m_pi = 3.14159265358979323846; negative = 0; nz = 0.0; - if (x <= 0.0) { - negative = 1; + const Scalar zero = 0.0; + const Scalar one = 1.0; + const Scalar half = 0.5; + + if (x <= zero) { + negative = one; q = x; p = ::floor(q); if (p == q) { @@ -345,41 +295,36 @@ struct digamma_impl { * by subtracting the nearest integer from x */ nz = q - p; - if (nz != 0.5) { - if (nz > 0.5) { - p += 1.0; + if (nz != half) { + if (nz > half) { + p += one; nz = q - p; } nz = m_pi / ::tan(m_pi * nz); } else { - nz = 0.0; + nz = zero; } - x = 1.0 - x; + x = one - x; } /* use the recurrence psi(x+1) = psi(x) + 1/x. */ s = x; - w = 0.0; - while (s < 10.0) { - w += 1.0 / s; - s += 1.0; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; } - if (s < 1.0e17) { - z = 1.0 / (s * s); - y = z * cephes::polevl::run(z, A); - } - else - y = 0.0; + y = digamma_impl_maybe_poly::run(s); - y = ::log(s) - (0.5 / s) - y - w; + y = ::log(s) - (half / s) - y - w; return (negative) ? 
y - nz : y; } }; -#endif +#endif // EIGEN_HAS_C99_MATH /**************************************************************************** * Implementation of erf * From 5973bcf939be278f81b20d6250405aaaa0791b9d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 12:04:42 -0800 Subject: [PATCH 041/102] Properly specify the namespace when calling cout/endl --- unsupported/test/cxx11_tensor_contract_cuda.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp index 035a093e6..ac447dd7b 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cpp +++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp @@ -24,7 +24,7 @@ typedef Tensor::DimensionPair DimPair; template static void test_cuda_contraction(int m_size, int k_size, int n_size) { - cout<<"Calling with ("<= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] + << " vs " << t_result_gpu.data()[i] << std::endl; assert(false); } } @@ -83,7 +83,7 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) void test_cxx11_tensor_cuda() { - cout<<"Calling contraction tests"<(128, 128, 128)); CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); for (int k = 32; k < 256; k++) { From 9dfbd4fe8d752f96be55dc0d847aa6df172d3f7e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 12:22:17 -0800 Subject: [PATCH 042/102] Made the cuda tests compile using make check --- unsupported/test/CMakeLists.txt | 28 +++++++++++++------ ...x_cuda.cpp => cxx11_tensor_argmax_cuda.cu} | 0 ...cuda.cpp => cxx11_tensor_contract_cuda.cu} | 0 ...1_tensor_cuda.cpp => cxx11_tensor_cuda.cu} | 0 ...nsor_device.cpp => cxx11_tensor_device.cu} | 0 ...m_cuda.cpp => cxx11_tensor_random_cuda.cu} | 0 ...ion_cuda.cpp => cxx11_tensor_reduction.cu} | 0 7 files changed, 20 insertions(+), 8 deletions(-) rename unsupported/test/{cxx11_tensor_argmax_cuda.cpp => cxx11_tensor_argmax_cuda.cu} (100%) rename unsupported/test/{cxx11_tensor_contract_cuda.cpp => cxx11_tensor_contract_cuda.cu} (100%) rename unsupported/test/{cxx11_tensor_cuda.cpp => cxx11_tensor_cuda.cu} (100%) rename unsupported/test/{cxx11_tensor_device.cpp => cxx11_tensor_device.cu} (100%) rename unsupported/test/{cxx11_tensor_random_cuda.cpp => cxx11_tensor_random_cuda.cu} (100%) rename unsupported/test/{cxx11_tensor_reduction_cuda.cpp => cxx11_tensor_reduction.cu} (100%) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 97257b183..5c383aab6 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -148,12 +148,24 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_fft "-std=c++0x") ei_add_test(cxx11_tensor_ifft "-std=c++0x") - # These tests needs nvcc -# ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x") - endif() + +# These tests needs nvcc +find_package(CUDA 7) +if(CUDA_FOUND) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() + 
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + ei_add_test(cxx11_tensor_device "-std=c++11") + ei_add_test(cxx11_tensor_cuda "-std=c++11") + ei_add_test(cxx11_tensor_contract_cuda "-std=c++11") + ei_add_test(cxx11_tensor_reduction_cuda "-std=c++11") + ei_add_test(cxx11_tensor_random_cuda "-std=c++11") + ei_add_test(cxx11_tensor_argmax_cuda "-std=c++11 -I/opt-cuda-7.0/include") + + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) +endif(CUDA_FOUND) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cu similarity index 100% rename from unsupported/test/cxx11_tensor_argmax_cuda.cpp rename to unsupported/test/cxx11_tensor_argmax_cuda.cu diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cu similarity index 100% rename from unsupported/test/cxx11_tensor_contract_cuda.cpp rename to unsupported/test/cxx11_tensor_contract_cuda.cu diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cu similarity index 100% rename from unsupported/test/cxx11_tensor_cuda.cpp rename to unsupported/test/cxx11_tensor_cuda.cu diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cu similarity index 100% rename from unsupported/test/cxx11_tensor_device.cpp rename to unsupported/test/cxx11_tensor_device.cu diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cu similarity index 100% rename from unsupported/test/cxx11_tensor_random_cuda.cpp rename to unsupported/test/cxx11_tensor_random_cuda.cu diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction.cu similarity index 100% rename from unsupported/test/cxx11_tensor_reduction_cuda.cpp rename to unsupported/test/cxx11_tensor_reduction.cu From 4865e1e73265e12d564f8b4d9069a2159f777d90 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Jan 2016 22:48:40 +0100 Subject: [PATCH 043/102] Update link to suitesparse. --- Eigen/CholmodSupport | 2 +- Eigen/SPQRSupport | 2 +- Eigen/UmfPackSupport | 2 +- Eigen/src/OrderingMethods/Amd.h | 2 +- Eigen/src/OrderingMethods/Eigen_Colamd.h | 6 +----- doc/SparseLinearSystems.dox | 6 +++--- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport index 83e2c1da4..bed8924d3 100644 --- a/Eigen/CholmodSupport +++ b/Eigen/CholmodSupport @@ -19,7 +19,7 @@ extern "C" { /** \ingroup Support_modules * \defgroup CholmodSupport_Module CholmodSupport module * - * This module provides an interface to the Cholmod library which is part of the suitesparse package. + * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport index f9489dcd8..f70390c17 100644 --- a/Eigen/SPQRSupport +++ b/Eigen/SPQRSupport @@ -17,7 +17,7 @@ /** \ingroup Support_modules * \defgroup SPQRSupport_Module SuiteSparseQR module * - * This module provides an interface to the SPQR library, which is part of the suitesparse package. 
+ * This module provides an interface to the SPQR library, which is part of the suitesparse package. * * \code * #include diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport index 4a9f46a1e..00eec8087 100644 --- a/Eigen/UmfPackSupport +++ b/Eigen/UmfPackSupport @@ -19,7 +19,7 @@ extern "C" { /** \ingroup Support_modules * \defgroup UmfPackSupport_Module UmfPackSupport module * - * This module provides an interface to the UmfPack library which is part of the suitesparse package. + * This module provides an interface to the UmfPack library which is part of the suitesparse package. * It provides the following factorization class: * - class UmfPackLU: a multifrontal sequential LU factorization. * diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h index d1d08ca57..f91ecb24e 100644 --- a/Eigen/src/OrderingMethods/Amd.h +++ b/Eigen/src/OrderingMethods/Amd.h @@ -8,7 +8,7 @@ NOTE: this routine has been adapted from the CSparse library: Copyright (c) 2006, Timothy A. Davis. -http://www.cise.ufl.edu/research/sparse/CSparse +http://www.suitesparse.com CSparse is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 70c987afa..960df4a46 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -41,12 +41,8 @@ // // The colamd/symamd library is available at // -// http://www.cise.ufl.edu/research/sparse/colamd/ +// http://www.suitesparse.com -// This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h -// file. It is required by the colamd.c, colamdmex.c, and symamdmex.c -// files, and by any C code that calls the routines whose prototypes are -// listed below, or that uses the colamd/symamd definitions listed below. #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 61cc50afa..ee4f53a4e 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -65,17 +65,17 @@ They are summarized in the following tables: Requires the PaStiX package, \b CeCILL-C optimized for tough problems and symmetric patterns CholmodSupernodalLLT\link CholmodSupport_Module CholmodSupport \endlinkDirect LLt factorizationSPDFill-in reducing, Leverage fast dense algebra - Requires the SuiteSparse package, \b GPL + Requires the SuiteSparse package, \b GPL UmfPackLU\link UmfPackSupport_Module UmfPackSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra - Requires the SuiteSparse package, \b GPL + Requires the SuiteSparse package, \b GPL SuperLU\link SuperLUSupport_Module SuperLUSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra Requires the SuperLU library, (BSD-like) SPQR\link SPQRSupport_Module SPQRSupport \endlink QR factorization Any, rectangularfill-in reducing, multithreaded, fast dense algebra - requires the SuiteSparse package, \b GPL recommended for linear least-squares problems, has a rank-revealing feature + requires the SuiteSparse package, \b GPL recommended for linear least-squares problems, has a rank-revealing feature Here \c SPD means symmetric positive definite. From 55a5204319f119f3609e0522302ff24e69ab60ae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 14:46:34 -0800 Subject: [PATCH 044/102] Fixed the flags passed to nvcc to compile the tensor code. 
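Once CUDA_PROPAGATE_HOST_FLAGS is OFF, the per-test "-std=c++11" strings only reach the host compiler and are not forwarded to nvcc, so the .cu files were effectively compiled without C++11 support. The flag is therefore moved into CUDA_NVCC_FLAGS, together with an architecture switch (-arch compute_30) that is presumably the minimum required by the tensor kernels; the ad-hoc -I/opt-cuda-7.0/include passed to the argmax test also becomes unnecessary and is dropped.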
--- unsupported/test/CMakeLists.txt | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5c383aab6..3a90a5371 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -157,15 +157,16 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() + set(CUDA_NVCC_FLAGS "-std=c++11 -arch compute_30") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") - set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - ei_add_test(cxx11_tensor_device "-std=c++11") - ei_add_test(cxx11_tensor_cuda "-std=c++11") - ei_add_test(cxx11_tensor_contract_cuda "-std=c++11") - ei_add_test(cxx11_tensor_reduction_cuda "-std=c++11") - ei_add_test(cxx11_tensor_random_cuda "-std=c++11") - ei_add_test(cxx11_tensor_argmax_cuda "-std=c++11 -I/opt-cuda-7.0/include") + ei_add_test(cxx11_tensor_device) + ei_add_test(cxx11_tensor_cuda) + ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_reduction_cuda) + ei_add_test(cxx11_tensor_random_cuda) + ei_add_test(cxx11_tensor_argmax_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif(CUDA_FOUND) From 47ca9dc809801b60c1bfe49b391a37bd62eb8888 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 14:58:48 -0800 Subject: [PATCH 045/102] Fixed the tensor_cuda test --- unsupported/test/cxx11_tensor_cuda.cu | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 49e1894ab..79f1c5315 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -131,8 +131,7 @@ void test_cuda_reduction() cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_in1(d_in1, 72,53,97,113); @@ -189,8 +188,7 @@ static void test_cuda_contraction() cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -214,7 +212,7 @@ static void test_cuda_contraction() for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } } @@ -243,8 +241,7 @@ static void test_cuda_convolution_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input, 74,37,11,137); @@ -293,8 +290,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, 
cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input,74,9,11,7); @@ -343,8 +339,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input, 7,9,11,74); @@ -394,8 +389,7 @@ static void test_cuda_convolution_2d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input,74,37,11,137); @@ -455,8 +449,7 @@ static void test_cuda_convolution_3d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > gpu_input(d_input,74,37,11,137,17); @@ -644,10 +637,6 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST(test_cuda_erfc(0.01f)); CALL_SUBTEST(test_cuda_erfc(0.001f)); - CALL_SUBTEST(test_cuda_tanh(1.0)); - CALL_SUBTEST(test_cuda_tanh(100.0)); - CALL_SUBTEST(test_cuda_tanh(0.01)); - CALL_SUBTEST(test_cuda_tanh(0.001)); CALL_SUBTEST(test_cuda_lgamma(1.0)); CALL_SUBTEST(test_cuda_lgamma(100.0)); CALL_SUBTEST(test_cuda_lgamma(0.01)); From 291069e885dccad6059e4bda34aad30ab69cbd85 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 15:37:03 -0800 Subject: [PATCH 046/102] Fixed some compilation problems with nvcc + clang --- Eigen/src/Core/util/Memory.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 823e077af..415bc48cb 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -526,9 +526,9 @@ template EIGEN_DEVICE_FUNC inline void conditional_align template EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) { - static const Index ScalarSize = sizeof(Scalar); - static const Index AlignmentSize = Alignment / ScalarSize; - static const Index AlignmentMask = AlignmentSize-1; + const Index ScalarSize = sizeof(Scalar); + const Index AlignmentSize = Alignment / ScalarSize; + const Index AlignmentMask = AlignmentSize-1; if(AlignmentSize<=1) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 09ee0c2c6..7a5dfbfea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -346,7 +346,7 @@ struct InnerReducer { static const bool HasOptimizedImplementation = false; static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false 
&& "Not implemented"); + eigen_assert(false && "Not implemented"); } }; @@ -356,7 +356,7 @@ struct OuterReducer { static const bool HasOptimizedImplementation = false; static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false && "Not implemented"); + eigen_assert(false && "Not implemented"); } }; From 4bf9eaf77aa8c9a75b5d60c781d5d86b833b93d1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Jan 2016 17:09:30 -0800 Subject: [PATCH 047/102] Deleted an invalid assertion that prevented the assignment of empty tensors. --- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 1 - unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_empty.cpp | 36 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_empty.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 98631fc7f..18a916e46 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -105,7 +105,6 @@ class TensorStorage, Options_> EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) { - eigen_assert(size >= 1); const Index currentSz = internal::array_prod(m_dimensions); if(size != currentSz) { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 3a90a5371..d70bf2b88 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -147,6 +147,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_sugar "-std=c++0x") ei_add_test(cxx11_tensor_fft "-std=c++0x") ei_add_test(cxx11_tensor_ifft "-std=c++0x") + ei_add_test(cxx11_tensor_empty "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp new file mode 100644 index 000000000..ca03a297c --- /dev/null +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + + +static void test_empty_tensor() +{ + Tensor source; + Tensor tgt1 = source; + Tensor tgt2; + tgt2 = source; +} + +static void test_empty_fixed_size_tensor() +{ + TensorFixedSize> source; + TensorFixedSize> tgt1 = source; + TensorFixedSize> tgt2; + tgt2 = source; +} + + +void test_cxx11_tensor_empty() +{ + CALL_SUBTEST(test_empty_tensor()); + CALL_SUBTEST(test_empty_fixed_size_tensor()); +} From 7802a6bb1cc6477810dff0e83ec90af954784612 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 09:35:37 +0100 Subject: [PATCH 048/102] Fix unit test filename. 
--- unsupported/test/CMakeLists.txt | 4 ++-- ...x11_tensor_reduction.cu => cxx11_tensor_reduction_cuda.cu} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename unsupported/test/{cxx11_tensor_reduction.cu => cxx11_tensor_reduction_cuda.cu} (100%) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d70bf2b88..d16c42656 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -152,7 +152,7 @@ if(EIGEN_TEST_CXX11) endif() # These tests needs nvcc -find_package(CUDA 7) +find_package(CUDA 7.0) if(CUDA_FOUND) set(CUDA_PROPAGATE_HOST_FLAGS OFF) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") @@ -170,4 +170,4 @@ if(CUDA_FOUND) ei_add_test(cxx11_tensor_argmax_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) -endif(CUDA_FOUND) +endif() diff --git a/unsupported/test/cxx11_tensor_reduction.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu similarity index 100% rename from unsupported/test/cxx11_tensor_reduction.cu rename to unsupported/test/cxx11_tensor_reduction_cuda.cu From 2bad3e78d9e23dec7e4ad31c4ad2bdc761b8f3b6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 12:12:06 +0100 Subject: [PATCH 049/102] bug #96, bug #1006: fix by value argument in result_of. --- Eigen/src/Core/CwiseBinaryOp.h | 4 ++-- Eigen/src/Core/CwiseUnaryOp.h | 2 +- Eigen/src/Core/CwiseUnaryView.h | 2 +- Eigen/src/Core/VectorwiseOp.h | 2 +- Eigen/src/Core/util/Meta.h | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index f94629e6d..39820fd7d 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -32,8 +32,8 @@ struct traits > // we still want to handle the case when the result type is different. 
typedef typename result_of< BinaryOp( - typename Lhs::Scalar, - typename Rhs::Scalar + const typename Lhs::Scalar&, + const typename Rhs::Scalar& ) >::type Scalar; typedef typename cwise_promote_storage_type::StorageKind, diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 5a809cf21..22db783b5 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -19,7 +19,7 @@ struct traits > : traits { typedef typename result_of< - UnaryOp(typename XprType::Scalar) + UnaryOp(const typename XprType::Scalar&) >::type Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 5a7db2b19..a9252eddf 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -18,7 +18,7 @@ struct traits > : traits { typedef typename result_of< - ViewOp(typename traits::Scalar) + ViewOp(const typename traits::Scalar&) >::type Scalar; typedef typename MatrixType::Nested MatrixTypeNested; typedef typename remove_all::type _MatrixTypeNested; diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 95bcaa86f..193891189 100755 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -124,7 +124,7 @@ struct member_lpnorm { template struct member_redux { typedef typename result_of< - BinaryOp(Scalar,Scalar) + BinaryOp(const Scalar&,const Scalar&) >::type result_type; template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index e3e6d763d..b01437d88 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -257,7 +257,7 @@ struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; template -struct unary_result_of_select {typedef ArgType type;}; +struct unary_result_of_select {typedef typename internal::remove_all::type type;}; template struct unary_result_of_select {typedef typename Func::result_type type;}; @@ -279,7 +279,7 @@ struct result_of { }; template -struct binary_result_of_select {typedef ArgType0 type;}; +struct binary_result_of_select {typedef typename internal::remove_all::type type;}; template struct binary_result_of_select From b4d87fff4a47797154b661129a0b73d31688d582 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 12:12:30 +0100 Subject: [PATCH 050/102] Fix MSVC warning. 
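On MSVC, _isnan() (declared in <float.h>) returns int rather than bool, so
returning its result straight out of a bool function forces an implicit
int-to-bool conversion that the compiler reports as warning C4800 ("forcing
value to bool 'true' or 'false' (performance warning)"). Comparing against 0
produces a genuine bool and keeps the behavior identical. A minimal sketch of
the pattern, with isnan_like as a hypothetical stand-in for the wrappers
touched below:

    #include <float.h>  // MSVC's _isnan

    inline bool isnan_like(double x) {
      // return _isnan(x);      // int -> bool: MSVC emits warning C4800
      return _isnan(x) != 0;    // explicit comparison: already a bool
    }
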
--- Eigen/src/Core/MathFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e47070a46..e87b60f8f 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -748,9 +748,9 @@ template EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) } //MSVC defines a _isnan builtin function, but for double only -EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); } -EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x); } -EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x); } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; } EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); } EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); } From 9bcadb7fd1d8852dbc74fe054878b0a12f4aed4e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 12:14:16 +0100 Subject: [PATCH 051/102] Disable stupid MSVC warning --- Eigen/src/Core/util/DisableStupidWarnings.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 747232938..91c61fcf2 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -15,10 +15,11 @@ // 4522 - 'class' : multiple assignment operators specified // 4700 - uninitialized local variable 'xyz' used // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow + // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning) #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 ) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) // ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body From df15fbc4520402a00b053bd02c782b77a5b72f61 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 13:16:30 +0100 Subject: [PATCH 052/102] bug #1158: PartialReduxExpr is a vector expression, and it thus must expose the LinearAccessBit flag --- Eigen/src/Core/CoreEvaluators.h | 2 +- test/vectorwiseop.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 8bd73b814..7776948d1 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -994,7 +994,7 @@ struct evaluator > CoeffReadCost = TraversalSize==Dynamic ? 
HugeCost : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))), + Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized }; diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 87476f95b..3cc198772 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -210,6 +210,9 @@ template void vectorwiseop_matrix(const MatrixType& m) VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm()); VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm()); + // regression for bug 1158 + VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum()); + // test normalized m2 = m1.colwise().normalized(); VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized()); From f50bb1e6f327055a6113787e9a417bc9c192c4e4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 13:25:26 +0100 Subject: [PATCH 053/102] Fix compilation with gcc --- Eigen/src/Core/SpecialFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 21583e6f5..9f89e184d 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -81,7 +81,7 @@ template struct polevl { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar run(const Scalar x, const Scalar coef[]) { - EIGEN_STATIC_ASSERT(N > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); return polevl::run(x, coef) * x + coef[N]; } From c4e47630b16a716d01dc20b36afa8882b03681a1 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 28 Jan 2016 10:35:14 -0800 Subject: [PATCH 054/102] benchmark modifications to make it compilable in a standalone fashion. --- bench/tensors/benchmark.h | 48 ++++++ bench/tensors/benchmark_main.cc | 215 +++++++++++++++++++++++++ bench/tensors/tensor_benchmarks.h | 87 +++++----- bench/tensors/tensor_benchmarks_cpu.cc | 20 +-- bench/tensors/tensor_benchmarks_gpu.cc | 4 +- 5 files changed, 318 insertions(+), 56 deletions(-) create mode 100644 bench/tensors/benchmark.h create mode 100644 bench/tensors/benchmark_main.cc diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h new file mode 100644 index 000000000..d8b4fd4c6 --- /dev/null +++ b/bench/tensors/benchmark.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +namespace testing { +class Benchmark { + public: + Benchmark(const char* name, void (*fn)(int)) { + Register(name, fn, NULL); + } + Benchmark(const char* name, void (*fn_range)(int, int)) { + Register(name, NULL, fn_range); + } + Benchmark* Arg(int x); + Benchmark* Range(int lo, int hi); + const char* Name(); + bool ShouldRun(int argc, char* argv[]); + void Run(); + private: + const char* name_; + void (*fn_)(int); + void (*fn_range_)(int, int); + std::vector args_; + void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)); + void RunRepeatedlyWithArg(int iterations, int arg); + void RunWithArg(int arg); +}; +} // namespace testing +void SetBenchmarkBytesProcessed(int64_t); +void StopBenchmarkTiming(); +void StartBenchmarkTiming(); +#define BENCHMARK(f) \ + static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ + (new ::testing::Benchmark(#f, f)) \ No newline at end of file diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc new file mode 100644 index 000000000..0fc12960e --- /dev/null +++ b/bench/tensors/benchmark_main.cc @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "benchmark.h" +#include +#include +#include +#include +#include +#include +#include + +static int64_t g_bytes_processed; +static int64_t g_benchmark_total_time_ns; +static int64_t g_benchmark_start_time_ns; +typedef std::map BenchmarkMap; +typedef BenchmarkMap::iterator BenchmarkMapIt; +static BenchmarkMap g_benchmarks; +static int g_name_column_width = 20; +static int Round(int n) { + int base = 1; + while (base*10 < n) { + base *= 10; + } + if (n < 2*base) { + return 2*base; + } + if (n < 5*base) { + return 5*base; + } + return 10*base; +} +static int64_t NanoTime() { + struct timespec t; + t.tv_sec = t.tv_nsec = 0; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast(t.tv_sec) * 1000000000LL + t.tv_nsec; +} +namespace testing { +Benchmark* Benchmark::Arg(int arg) { + args_.push_back(arg); + return this; +} + +Benchmark* Benchmark::Range(int lo, int hi) { + const int kRangeMultiplier = 8; + if (hi < lo) { + int temp = hi; + hi = lo; + lo = temp; + } + while (lo < hi) { + args_.push_back(lo); + lo *= kRangeMultiplier; + } + // We always run the hi number. + args_.push_back(hi); + return this; +} + +const char* Benchmark::Name() { + return name_; +} +bool Benchmark::ShouldRun(int argc, char* argv[]) { + if (argc == 1) { + return true; // With no arguments, we run all benchmarks. + } + // Otherwise, we interpret each argument as a regular expression and + // see if any of our benchmarks match. 
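+ // regcomp() is called with cflags == 0, i.e. POSIX basic regular
+ // expression syntax; a single matching pattern is enough to run.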
+ for (int i = 1; i < argc; i++) { + regex_t re; + if (regcomp(&re, argv[i], 0) != 0) { + fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]); + exit(EXIT_FAILURE); + } + int match = regexec(&re, name_, 0, NULL, 0); + regfree(&re); + if (match != REG_NOMATCH) { + return true; + } + } + return false; +} +void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) { + name_ = name; + fn_ = fn; + fn_range_ = fn_range; + if (fn_ == NULL && fn_range_ == NULL) { + fprintf(stderr, "%s: missing function\n", name_); + exit(EXIT_FAILURE); + } + g_benchmarks.insert(std::make_pair(name, this)); +} +void Benchmark::Run() { + if (fn_ != NULL) { + RunWithArg(0); + } else { + if (args_.empty()) { + fprintf(stderr, "%s: no args!\n", name_); + exit(EXIT_FAILURE); + } + for (size_t i = 0; i < args_.size(); ++i) { + RunWithArg(args_[i]); + } + } +} +void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) { + g_bytes_processed = 0; + g_benchmark_total_time_ns = 0; + g_benchmark_start_time_ns = NanoTime(); + if (fn_ != NULL) { + fn_(iterations); + } else { + fn_range_(iterations, arg); + } + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } +} +void Benchmark::RunWithArg(int arg) { + // run once in case it's expensive + int iterations = 1; + RunRepeatedlyWithArg(iterations, arg); + while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) { + int last = iterations; + if (g_benchmark_total_time_ns/iterations == 0) { + iterations = 1e9; + } else { + iterations = 1e9 / (g_benchmark_total_time_ns/iterations); + } + iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last)); + iterations = Round(iterations); + RunRepeatedlyWithArg(iterations, arg); + } + char throughput[100]; + throughput[0] = '\0'; + if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) { + double mib_processed = static_cast(g_bytes_processed)/1e6; + double seconds = static_cast(g_benchmark_total_time_ns)/1e9; + snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds); + } + char full_name[100]; + if (fn_range_ != NULL) { + if (arg >= (1<<20)) { + snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20)); + } else if (arg >= (1<<10)) { + snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10)); + } else { + snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg); + } + } else { + snprintf(full_name, sizeof(full_name), "%s", name_); + } + printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name, + iterations, g_benchmark_total_time_ns/iterations, throughput); + fflush(stdout); +} +} // namespace testing +void SetBenchmarkBytesProcessed(int64_t x) { + g_bytes_processed = x; +} +void StopBenchmarkTiming() { + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } + g_benchmark_start_time_ns = 0; +} +void StartBenchmarkTiming() { + if (g_benchmark_start_time_ns == 0) { + g_benchmark_start_time_ns = NanoTime(); + } +} +int main(int argc, char* argv[]) { + if (g_benchmarks.empty()) { + fprintf(stderr, "No benchmarks registered!\n"); + exit(EXIT_FAILURE); + } + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + int name_width = static_cast(strlen(it->second->Name())); + g_name_column_width = std::max(g_name_column_width, name_width); + } + bool need_header = true; + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + 
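+ // Run every benchmark whose name matches one of the command-line
+ // patterns (all benchmarks when none are given), printing the
+ // column header once before the first result.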
::testing::Benchmark* b = it->second; + if (b->ShouldRun(argc, argv)) { + if (need_header) { + printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op"); + fflush(stdout); + need_header = false; + } + b->Run(); + } + } + if (need_header) { + fprintf(stderr, "No matching benchmarks!\n"); + fprintf(stderr, "Available benchmarks:\n"); + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + fprintf(stderr, " %s\n", it->second->Name()); + } + exit(EXIT_FAILURE); + } + return 0; +} \ No newline at end of file diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 525b9acda..a1696afda 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -4,12 +4,23 @@ typedef int TensorIndex; #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "testing/base/public/benchmark.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "benchmark.h" + +#define BENCHMARK_RANGE(bench, lo, hi) \ + BENCHMARK(bench)->Range(lo, hi) + +template +std::string StrCat(const Args... args) { + std::stringstream ss; + StrCatRecursive(ss, args...); + return ss.str(); +} using Eigen::Tensor; using Eigen::TensorMap; +typedef int64_t int64; // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. @@ -43,7 +54,7 @@ template class BenchmarkSuite { void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); @@ -56,16 +67,16 @@ template class BenchmarkSuite { void slicing(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - const Eigen::DSizes quarter_sizes(Eigen::array(m_/2, m_/2)); - const Eigen::DSizes first_quadrant(Eigen::array(0, 0)); - const Eigen::DSizes second_quadrant(Eigen::array(0, m_/2)); - const Eigen::DSizes third_quadrant(Eigen::array(m_/2, 0)); - const Eigen::DSizes fourth_quadrant(Eigen::array(m_/2, m_/2)); + const Eigen::DSizes quarter_sizes(m_/2, m_/2); + const Eigen::DSizes first_quadrant(0, 0); + const Eigen::DSizes second_quadrant(0, m_/2); + const Eigen::DSizes third_quadrant(m_/2, 0); + const Eigen::DSizes fourth_quadrant(m_/2, m_/2); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -85,12 +96,12 @@ template class BenchmarkSuite { void shuffling(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array size_a(m_, k_); + const Eigen::array size_a = {{m_, k_}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(k_, m_); + const Eigen::array size_b = {{k_, m_}}; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array shuffle(1, 0); + const Eigen::array shuffle = {{1, 0}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -102,9 +113,9 @@ template class BenchmarkSuite { void padding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a(m_, k_-3); + const Eigen::array size_a = {{m_, k_-3}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(k_, m_); + const Eigen::array size_b = {{k_, m_}}; TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array, 2> paddings; @@ -121,12 +132,12 @@ template class BenchmarkSuite { void 
striding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a(m_, k_); + const Eigen::array size_a = {{m_, k_}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(m_, k_ / 2); + const Eigen::array size_b = {{m_, k_ / 2}}; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array strides(1, 2); + const Eigen::array strides = {{1, 2}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -137,14 +148,14 @@ template class BenchmarkSuite { } void broadcasting(int num_iters) { - const Eigen::array size_a(m_, 1); + const Eigen::array size_a = {{m_, 1}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_c(m_, n_); + const Eigen::array size_c = {{m_, n_}}; TensorMap, Eigen::Aligned> C(c_, size_c); -#if defined(__CUDACC__) +#ifndef EIGEN_HAS_INDEX_LIST // nvcc doesn't support cxx11 - const Eigen::array broadcast(1, n_); + const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -162,7 +173,7 @@ template class BenchmarkSuite { void coeffWiseOp(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -178,7 +189,7 @@ template class BenchmarkSuite { void algebraicFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -194,7 +205,7 @@ template class BenchmarkSuite { void transcendentalFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -210,12 +221,12 @@ template class BenchmarkSuite { // Simple reduction void reduction(int num_iters) { - const Eigen::array input_size(k_, n_); + const Eigen::array input_size = {{k_, n_}}; const TensorMap, Eigen::Aligned> B(b_, input_size); - const Eigen::array output_size(n_); + const Eigen::array output_size = {{n_}}; TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim(0); + const Eigen::array sum_along_dim = {{0}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -228,16 +239,16 @@ template class BenchmarkSuite { // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - const Eigen::array sizeA(m_, k_); - const Eigen::array sizeB(k_, n_); - const Eigen::array sizeC(m_, n_); + const Eigen::array sizeA = {{m_, k_}}; + const Eigen::array sizeB = {{k_, n_}}; + const Eigen::array sizeC = {{m_, n_}}; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; - const Eigen::array dims(DimPair(1, 0)); + const Eigen::array dims = {{DimPair(1, 0)}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -249,14 +260,14 @@ template class BenchmarkSuite { } void convolution(int num_iters, int kernel_x, int kernel_y) { - const Eigen::array input_sizes(m_, n_); + const Eigen::array input_sizes = {{m_, n_}}; TensorMap, 
Eigen::Aligned> A(a_, input_sizes); - const Eigen::array kernel_sizes(kernel_x, kernel_y); + const Eigen::array kernel_sizes = {{kernel_x, kernel_y}}; TensorMap, Eigen::Aligned> B(b_, kernel_sizes); - const Eigen::array result_sizes( - m_ - kernel_x + 1, n_ - kernel_y + 1); + const Eigen::array result_sizes = + {{m_ - kernel_x + 1, n_ - kernel_y + 1}}; TensorMap, Eigen::Aligned> C(c_, result_sizes); - Eigen::array::Index, 2> dims(0, 1); + Eigen::array::Index, 2> dims = {{0, 1}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -280,7 +291,7 @@ template class BenchmarkSuite { device_.memset(b_, 23, k_ * n_ * sizeof(float)); device_.memset(c_, 31, m_ * n_ * sizeof(float)); - BenchmarkUseRealTime(); + //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64 num_items) { @@ -290,7 +301,7 @@ template class BenchmarkSuite { } #endif StopBenchmarkTiming(); - SetBenchmarkItemsProcessed(num_items); + SetBenchmarkBytesProcessed(num_items); } diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 68653ba15..248a63861 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -1,19 +1,12 @@ #define EIGEN_USE_THREADS -#include "base/sysinfo.h" -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" -#include "thread/threadpool.h" +#include + +#include "tensor_benchmarks.h" -#ifdef __ANDROID__ #define CREATE_THREAD_POOL(threads) \ -Eigen::ThreadPoolDevice device(threads); -#else -#define CREATE_THREAD_POOL(threads) \ -ThreadPool tp(threads); \ -tp.StartWorkers(); \ -Eigen::ThreadPoolDevice device(&tp, threads); -#endif +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); // Simple functions #define BM_FuncCPU(FUNC, THREADS) \ @@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads); CREATE_THREAD_POOL(THREADS); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); @@ -84,7 +76,6 @@ BM_FuncCPU(reduction, 12); BenchmarkSuite suite(device, D1, D2, D3); \ suite.FUNC(iters); \ } \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); @@ -127,7 +118,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); CREATE_THREAD_POOL(THREADS); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters, DIM1, DIM2); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc index adea754ad..9fe8f84d9 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -3,10 +3,8 @@ #include #include #include -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" - +#include "tensor_benchmarks.h" // Simple functions #define BM_FuncGPU(FUNC) \ From 270c4e1ecd8fd10c42760dd67adbbab0b1387da2 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 28 Jan 2016 11:11:45 -0800 Subject: [PATCH 055/102] bugfix --- bench/tensors/benchmark.h | 4 +++- bench/tensors/benchmark_main.cc | 21 ++++++++++++------- bench/tensors/tensor_benchmarks.h | 13 +++--------- ...hmarks_gpu.cc => tensor_benchmarks_gpu.cu} | 12 +++-------- 4 files changed, 23 insertions(+), 27 deletions(-) rename bench/tensors/{tensor_benchmarks_gpu.cc => 
tensor_benchmarks_gpu.cu} (81%) diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h index d8b4fd4c6..2c06075e0 100644 --- a/bench/tensors/benchmark.h +++ b/bench/tensors/benchmark.h @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -45,4 +46,5 @@ void StopBenchmarkTiming(); void StartBenchmarkTiming(); #define BENCHMARK(f) \ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ - (new ::testing::Benchmark(#f, f)) \ No newline at end of file + (new ::testing::Benchmark(#f, f)) + diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index 0fc12960e..b2f457c96 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,8 +28,14 @@ static int64_t g_benchmark_total_time_ns; static int64_t g_benchmark_start_time_ns; typedef std::map BenchmarkMap; typedef BenchmarkMap::iterator BenchmarkMapIt; -static BenchmarkMap g_benchmarks; + +BenchmarkMap& gBenchmarks() { + static BenchmarkMap g_benchmarks; + return g_benchmarks; +} + static int g_name_column_width = 20; + static int Round(int n) { int base = 1; while (base*10 < n) { @@ -101,7 +108,7 @@ void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int fprintf(stderr, "%s: missing function\n", name_); exit(EXIT_FAILURE); } - g_benchmarks.insert(std::make_pair(name, this)); + gBenchmarks().insert(std::make_pair(name, this)); } void Benchmark::Run() { if (fn_ != NULL) { @@ -183,16 +190,16 @@ void StartBenchmarkTiming() { } } int main(int argc, char* argv[]) { - if (g_benchmarks.empty()) { + if (gBenchmarks().empty()) { fprintf(stderr, "No benchmarks registered!\n"); exit(EXIT_FAILURE); } - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { int name_width = static_cast(strlen(it->second->Name())); g_name_column_width = std::max(g_name_column_width, name_width); } bool need_header = true; - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { ::testing::Benchmark* b = it->second; if (b->ShouldRun(argc, argv)) { if (need_header) { @@ -206,10 +213,10 @@ int main(int argc, char* argv[]) { if (need_header) { fprintf(stderr, "No matching benchmarks!\n"); fprintf(stderr, "Available benchmarks:\n"); - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { fprintf(stderr, " %s\n", it->second->Name()); } exit(EXIT_FAILURE); } return 0; -} \ No newline at end of file +} diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index a1696afda..071326aa7 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -10,13 +10,6 @@ typedef int TensorIndex; #define BENCHMARK_RANGE(bench, lo, hi) \ BENCHMARK(bench)->Range(lo, hi) -template -std::string StrCat(const Args... 
args) { - std::stringstream ss; - StrCatRecursive(ss, args...); - return ss.str(); -} - using Eigen::Tensor; using Eigen::TensorMap; @@ -305,9 +298,9 @@ template class BenchmarkSuite { } - size_t m_; - size_t k_; - size_t n_; + TensorIndex m_; + TensorIndex k_; + TensorIndex n_; float* a_; float* b_; float* c_; diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cu similarity index 81% rename from bench/tensors/tensor_benchmarks_gpu.cc rename to bench/tensors/tensor_benchmarks_gpu.cu index 9fe8f84d9..fbb486efd 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -10,13 +10,11 @@ #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); @@ -35,13 +33,11 @@ BM_FuncGPU(reduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); @@ -55,13 +51,11 @@ BM_FuncWithInputDimsGPU(contraction, N, 64, N); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters, DIM1, DIM2); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); From c1d900af61561194ad01b16a89da9f9ce1a09723 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 21:43:20 +0100 Subject: [PATCH 056/102] bug #178: remove additional const on nested expression, and remove several const_cast. 
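Holding the nested expression through a const-qualified reference meant that
every write accessor had to const_cast its way back to mutability, which both
cluttered the code and opened a const-correctness loophole: a wrapper built
around a genuinely const object could still be written through. Switching
lvalue expression types to ref_selector<T>::non_const_type lets coeffRef(),
data() and writePacket() forward directly. A minimal sketch of the difference,
where Mat and the two wrappers are hypothetical stand-ins rather than the real
Eigen types:

    struct Mat { float v[4]; float& coeffRef(int i) { return v[i]; } };

    struct WrapperBefore {            // const nested reference
      const Mat& m_expr;
      float& coeffRef(int i) {
        return const_cast<Mat&>(m_expr).coeffRef(i);   // cast on every write
      }
    };

    struct WrapperAfter {             // non-const nested reference
      Mat& m_expr;
      float& coeffRef(int i) { return m_expr.coeffRef(i); }
    };
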
--- Eigen/src/Core/ArrayWrapper.h | 38 +++++++++++++++---------------- Eigen/src/Core/Block.h | 31 ++++++++++--------------- Eigen/src/Core/CwiseUnaryOp.h | 9 ++++---- Eigen/src/Core/CwiseUnaryView.h | 7 +++--- Eigen/src/Core/Diagonal.h | 14 ++++++------ Eigen/src/Core/SelfAdjointView.h | 6 ++--- Eigen/src/Core/Transpose.h | 10 ++++---- Eigen/src/Core/Transpositions.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 9 ++++---- Eigen/src/Core/util/XprHelper.h | 10 ++++---- test/array_for_matrix.cpp | 10 ++++++++ test/diagonal.cpp | 7 ++++++ 12 files changed, 83 insertions(+), 70 deletions(-) diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h index 4e484f290..6013d4d85 100644 --- a/Eigen/src/Core/ArrayWrapper.h +++ b/Eigen/src/Core/ArrayWrapper.h @@ -52,7 +52,7 @@ class ArrayWrapper : public ArrayBase > const Scalar >::type ScalarWithConstIfNotLvalue; - typedef typename internal::ref_selector::type NestedExpressionType; + typedef typename internal::ref_selector::non_const_type NestedExpressionType; EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} @@ -67,7 +67,7 @@ class ArrayWrapper : public ArrayBase > inline Index innerStride() const { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); } + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); } @@ -80,13 +80,13 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC @@ -98,13 +98,13 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } template @@ -116,7 +116,7 @@ class ArrayWrapper : public ArrayBase > template inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(rowId, colId, val); + m_expression.template writePacket(rowId, colId, val); } template @@ -128,7 +128,7 @@ class ArrayWrapper : public ArrayBase > template inline void writePacket(Index index, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(index, val); + m_expression.template writePacket(index, val); } template @@ -145,11 +145,11 @@ class ArrayWrapper : public ArrayBase > /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index) */ EIGEN_DEVICE_FUNC - void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); } + void resize(Index newSize) { m_expression.resize(newSize); } /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index,Index)*/ EIGEN_DEVICE_FUNC - void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); } + void resize(Index rows, Index cols) { 
m_expression.resize(rows,cols); } protected: NestedExpressionType m_expression; @@ -195,7 +195,7 @@ class MatrixWrapper : public MatrixBase > const Scalar >::type ScalarWithConstIfNotLvalue; - typedef typename internal::ref_selector::type NestedExpressionType; + typedef typename internal::ref_selector::non_const_type NestedExpressionType; EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} @@ -210,7 +210,7 @@ class MatrixWrapper : public MatrixBase > inline Index innerStride() const { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); } + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); } @@ -223,7 +223,7 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC @@ -241,13 +241,13 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } template @@ -259,7 +259,7 @@ class MatrixWrapper : public MatrixBase > template inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(rowId, colId, val); + m_expression.template writePacket(rowId, colId, val); } template @@ -271,7 +271,7 @@ class MatrixWrapper : public MatrixBase > template inline void writePacket(Index index, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket(index, val); + m_expression.template writePacket(index, val); } EIGEN_DEVICE_FUNC @@ -284,11 +284,11 @@ class MatrixWrapper : public MatrixBase > /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index) */ EIGEN_DEVICE_FUNC - void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); } + void resize(Index newSize) { m_expression.resize(newSize); } /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index,Index)*/ EIGEN_DEVICE_FUNC - void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); } + void resize(Index rows, Index cols) { m_expression.resize(rows,cols); } protected: NestedExpressionType m_expression; diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 599b714cc..cee5591f2 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -221,15 +221,13 @@ template inline PacketScalar packet(Index rowId, Index colId) const { - return m_xpr.template packet - (rowId + m_startRow.value(), colId + m_startCol.value()); + return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_xpr.const_cast_derived().template writePacket - (rowId + m_startRow.value(), colId + m_startCol.value(), val); + m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template @@ -288,7 +281,7 @@ template inline void writePacket(Index index, const PacketScalar& val) { 
- m_xpr.const_cast_derived().template writePacket + m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val); } @@ -320,7 +313,7 @@ template m_startRow; const internal::variable_if_dynamic m_startCol; const internal::variable_if_dynamic m_blockRows; diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 22db783b5..8c182303c 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -58,6 +58,7 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp) + typedef typename internal::ref_selector::type XprTypeNested; typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC @@ -75,16 +76,16 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type& + const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + typename internal::remove_all::type& + nestedExpression() { return m_xpr; } protected: - typename XprType::Nested m_xpr; + XprTypeNested m_xpr; const UnaryOp m_functor; }; diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index a9252eddf..271033056 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -61,6 +61,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView) + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; typedef typename internal::remove_all::type NestedExpression; explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) @@ -75,15 +76,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl::type& + const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - typename internal::remove_all::type& + typename internal::remove_reference::type& nestedExpression() { return m_matrix.const_cast_derived(); } protected: - typename internal::ref_selector::type m_matrix; + MatrixTypeNested m_matrix; ViewOp m_functor; }; diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index fa3176266..bfea0584b 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -103,21 +103,21 @@ template class Diagonal >::type ScalarWithConstIfNotLvalue; EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); } + inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } EIGEN_DEVICE_FUNC - inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); } + inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index) { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset()); + return m_matrix.coeffRef(row+rowOffset(), row+colOffset()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index) const { - return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset()); + return m_matrix.coeffRef(row+rowOffset(), row+colOffset()); } EIGEN_DEVICE_FUNC @@ -130,13 +130,13 @@ template class Diagonal 
inline Scalar& coeffRef(Index idx) { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset()); + return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index idx) const { - return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset()); + return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset()); } EIGEN_DEVICE_FUNC @@ -159,7 +159,7 @@ template class Diagonal } protected: - typename MatrixType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; const internal::variable_if_dynamicindex m_index; private: diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index e709eb213..9fda02691 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -32,7 +32,7 @@ namespace internal { template struct traits > : traits { - typedef typename ref_selector::type MatrixTypeNested; + typedef typename ref_selector::non_const_type MatrixTypeNested; typedef typename remove_all::type MatrixTypeNestedCleaned; typedef MatrixType ExpressionType; typedef typename MatrixType::PlainObject FullMatrixType; @@ -97,7 +97,7 @@ template class SelfAdjointView { EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView); Base::check_coordinates_internal(row, col); - return m_matrix.const_cast_derived().coeffRef(row, col); + return m_matrix.coeffRef(row, col); } /** \internal */ @@ -107,7 +107,7 @@ template class SelfAdjointView EIGEN_DEVICE_FUNC const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; } EIGEN_DEVICE_FUNC - MatrixTypeNestedCleaned& nestedExpression() { return *const_cast(&m_matrix); } + MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; } /** Efficient triangular matrix times vector/matrix product */ template diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index f199d1086..bc232526a 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -54,6 +54,8 @@ template class Transpose { public: + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename TransposeImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose) typedef typename internal::remove_all::type NestedExpression; @@ -68,16 +70,16 @@ template class Transpose /** \returns the nested expression */ EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& + const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + typename internal::remove_reference::type& + nestedExpression() { return m_matrix; } protected: - typename MatrixType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; }; namespace internal { diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 678ba3288..19c17bb4a 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -325,7 +325,7 @@ class TranspositionsWrapper protected: - const typename IndicesType::Nested m_indices; + typename IndicesType::Nested m_indices; }; diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 27845e89c..f55b42eed 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -168,7 +168,7 @@ namespace internal { template struct traits > : traits { - typedef 
typename ref_selector::type MatrixTypeNested; + typedef typename ref_selector::non_const_type MatrixTypeNested; typedef typename remove_reference::type MatrixTypeNestedNonRef; typedef typename remove_all::type MatrixTypeNestedCleaned; typedef typename MatrixType::PlainObject FullMatrixType; @@ -213,7 +213,6 @@ template class TriangularView IsVectorAtCompileTime = false }; - // FIXME This, combined with const_cast_derived in transpose() leads to a const-correctness loophole EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} @@ -235,7 +234,7 @@ template class TriangularView /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC - NestedExpression& nestedExpression() { return *const_cast(&m_matrix); } + NestedExpression& nestedExpression() { return m_matrix; } typedef TriangularView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ @@ -255,7 +254,7 @@ template class TriangularView inline TransposeReturnType transpose() { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - typename MatrixType::TransposeReturnType tmp(m_matrix.const_cast_derived()); + typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } @@ -418,7 +417,7 @@ template class TriangularViewImpl<_Mat { EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType); Base::check_coordinates_internal(row, col); - return derived().nestedExpression().const_cast_derived().coeffRef(row, col); + return derived().nestedExpression().coeffRef(row, col); } /** Assigns a triangular matrix to a triangular part of a dense matrix */ diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 8b71e2c62..9fe8cfcd1 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -466,17 +466,17 @@ struct special_scalar_op_base : public BaseType template struct special_scalar_op_base : public BaseType { - const CwiseUnaryOp, Derived> + const CwiseUnaryOp, const Derived> operator*(const OtherScalar& scalar) const { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN #endif - return CwiseUnaryOp, Derived> + return CwiseUnaryOp, const Derived> (*static_cast(this), scalar_multiple2_op(scalar)); } - inline friend const CwiseUnaryOp, Derived> + inline friend const CwiseUnaryOp, const Derived> operator*(const OtherScalar& scalar, const Derived& matrix) { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN @@ -485,13 +485,13 @@ struct special_scalar_op_base : publi return static_cast(matrix).operator*(scalar); } - const CwiseUnaryOp, Derived> + const CwiseUnaryOp, const Derived> operator/(const OtherScalar& scalar) const { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN #endif - return CwiseUnaryOp, Derived> + return CwiseUnaryOp, const Derived> (*static_cast(this), scalar_quotient2_op(scalar)); } }; diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index 9667e1f14..db5f3b34a 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -68,6 +68,16 @@ template void array_for_matrix(const MatrixType& m) const Scalar& ref_a2 = m.array().matrix().coeffRef(0,0); VERIFY(&ref_a1 == &ref_m1); VERIFY(&ref_a2 == &ref_m2); + + // Check write accessors: + m1.array().coeffRef(0,0) = 1; + VERIFY_IS_APPROX(m1(0,0),Scalar(1)); + m1.array()(0,0) = 2; + VERIFY_IS_APPROX(m1(0,0),Scalar(2)); + m1.array().matrix().coeffRef(0,0) = 3; + VERIFY_IS_APPROX(m1(0,0),Scalar(3)); + m1.array().matrix()(0,0) = 4; + VERIFY_IS_APPROX(m1(0,0),Scalar(4)); } template void comparisons(const 
MatrixType& m) diff --git a/test/diagonal.cpp b/test/diagonal.cpp index 53814a588..ee00cad55 100644 --- a/test/diagonal.cpp +++ b/test/diagonal.cpp @@ -20,6 +20,8 @@ template void diagonal(const MatrixType& m) MatrixType m1 = MatrixType::Random(rows, cols), m2 = MatrixType::Random(rows, cols); + Scalar s1 = internal::random(); + //check diagonal() VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal()); m2.diagonal() = 2 * m1.diagonal(); @@ -58,6 +60,11 @@ template void diagonal(const MatrixType& m) VERIFY_IS_APPROX(m2.template diagonal(), static_cast(2) * m1.diagonal(N2)); m2.diagonal(N2)[0] *= 3; VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast(6) * m1.diagonal(N2)[0]); + + m2.diagonal(N2).x() = s1; + VERIFY_IS_APPROX(m2.diagonal(N2).x(), s1); + m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1; + VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1); } } From b908e071a80fce910efc82c1c50dd6be1e226dcd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 28 Jan 2016 22:11:18 +0100 Subject: [PATCH 057/102] bug #178: get rid of some const_cast in SparseCore --- Eigen/src/SparseCore/SparseBlock.h | 37 +++++++++----------- Eigen/src/SparseCore/SparseSelfAdjointView.h | 12 +++---- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 10be84856..43fe788d9 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -100,11 +100,11 @@ protected: enum { OuterSize = IsRowMajor ? BlockRows : BlockCols }; public: - inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index i) + inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i) : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize) {} - inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols)) {} @@ -112,7 +112,7 @@ public: inline BlockType& operator=(const SparseMatrixBase& other) { typedef typename internal::remove_all::type _NestedMatrixType; - _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);; + _NestedMatrixType& matrix = m_matrix; // This assignment is slow if this vector set is not empty // and/or it is not at the end of the nonzeros of the underlying matrix. @@ -209,28 +209,28 @@ public: inline const Scalar* valuePtr() const { return m_matrix.valuePtr(); } inline Scalar* valuePtr() - { return m_matrix.const_cast_derived().valuePtr(); } + { return m_matrix.valuePtr(); } inline const StorageIndex* innerIndexPtr() const { return m_matrix.innerIndexPtr(); } inline StorageIndex* innerIndexPtr() - { return m_matrix.const_cast_derived().innerIndexPtr(); } + { return m_matrix.innerIndexPtr(); } inline const StorageIndex* outerIndexPtr() const { return m_matrix.outerIndexPtr() + m_outerStart; } inline StorageIndex* outerIndexPtr() - { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; } + { return m_matrix.outerIndexPtr() + m_outerStart; } inline const StorageIndex* innerNonZeroPtr() const { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); } inline StorageIndex* innerNonZeroPtr() - { return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); } + { return isCompressed() ? 
0 : (m_matrix.innerNonZeroPtr()+m_outerStart); } bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } inline Scalar& coeffRef(Index row, Index col) { - return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); + return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); } inline const Scalar coeff(Index row, Index col) const @@ -264,7 +264,7 @@ public: protected: - typename SparseMatrixType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; Index m_outerStart; const internal::variable_if_dynamic m_outerSize; @@ -373,7 +373,7 @@ public: /** Column or Row constructor */ - inline BlockImpl(const XprType& xpr, Index i) + inline BlockImpl(XprType& xpr, Index i) : m_matrix(xpr), m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0), m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0), @@ -383,7 +383,7 @@ public: /** Dynamic-size constructor */ - inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols)) {} @@ -392,8 +392,7 @@ public: inline Scalar& coeffRef(Index row, Index col) { - return m_matrix.const_cast_derived() - .coeffRef(row + m_startRow.value(), col + m_startCol.value()); + return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value()); } inline const Scalar coeff(Index row, Index col) const @@ -403,16 +402,14 @@ public: inline Scalar& coeffRef(Index index) { - return m_matrix.const_cast_derived() - .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } inline const Scalar coeff(Index index) const { - return m_matrix - .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? 
index : 0)); } inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } @@ -430,7 +427,7 @@ public: EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - typename XprType::Nested m_matrix; + typename internal::ref_selector::non_const_type m_matrix; const internal::variable_if_dynamic m_startRow; const internal::variable_if_dynamic m_startCol; const internal::variable_if_dynamic m_blockRows; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 975cefd28..402733cce 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -55,10 +55,10 @@ template class SparseSelfAdjointView typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix VectorI; - typedef typename MatrixType::Nested MatrixTypeNested; + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; - explicit inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix) + explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix) { eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices"); } @@ -68,7 +68,7 @@ template class SparseSelfAdjointView /** \internal \returns a reference to the nested matrix */ const _MatrixTypeNested& matrix() const { return m_matrix; } - _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); } + typename internal::remove_reference::type& matrix() { return m_matrix; } /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs. * @@ -158,7 +158,7 @@ template class SparseSelfAdjointView protected: - typename MatrixType::Nested m_matrix; + MatrixTypeNested m_matrix; //mutable VectorI m_countPerRow; //mutable VectorI m_countPerCol; private: @@ -194,9 +194,9 @@ SparseSelfAdjointView::rankUpdate(const SparseMatrixBase tmp = u * u.adjoint(); if(alpha==Scalar(0)) - m_matrix.const_cast_derived() = tmp.template triangularView(); + m_matrix = tmp.template triangularView(); else - m_matrix.const_cast_derived() += alpha * tmp.template triangularView(); + m_matrix += alpha * tmp.template triangularView(); return *this; } From 7b3044d086f413fdaf65acd30fc3bc469d43ccc6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 15:36:34 -0800 Subject: [PATCH 058/102] Made sure to call nvcc with the relaxed-constexpr flag. 
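Context for this change: parts of the tensor code reached from device code call constexpr host functions, which nvcc only accepts in relaxed-constexpr mode. A minimal sketch of the failure mode the flag addresses (hypothetical kernel, not taken from this patch; note that recent CUDA toolkits spell the option --expt-relaxed-constexpr):

// num_lanes() is a plain host constexpr function; calling it from a
// __global__ function is an nvcc error unless relaxed constexpr is on.
constexpr int num_lanes(int packet_bytes) { return packet_bytes / 4; }

__global__ void fill(float* out) {
  // Accepted under the relaxed-constexpr flag, rejected otherwise.
  out[threadIdx.x] = static_cast<float>(num_lanes(16));
}
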
--- unsupported/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d16c42656..eed724bcf 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -158,7 +158,7 @@ if(CUDA_FOUND) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() - set(CUDA_NVCC_FLAGS "-std=c++11 -arch compute_30") + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") From c8d5f21941a41556f94e937ea5a91badb7fb9353 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 16:20:36 -0800 Subject: [PATCH 059/102] Added extra tensor benchmarks --- bench/tensors/tensor_benchmarks.h | 87 +++++++++++++++++++++++--- bench/tensors/tensor_benchmarks_cpu.cc | 28 ++++++++- bench/tensors/tensor_benchmarks_gpu.cu | 7 ++- 3 files changed, 111 insertions(+), 11 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 071326aa7..6b9d13446 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -45,6 +45,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void typeCasting(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array sizes = {{m_, k_}}; + const TensorMap, Eigen::Aligned> A(a_, sizes); + TensorMap, Eigen::Aligned> B((int*)b_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.cast(); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * k_ * num_iters); + } + void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); const Eigen::array sizes = {{m_, m_}}; @@ -87,6 +101,34 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void rowChip(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + + void colChip(int num_iters) { + const Eigen::array input_size= {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } + // Record the number of values copied from the rhs chip to the lhs. 
+ finalizeBenchmark(n_ * num_iters); + } + void shuffling(int num_iters) { eigen_assert(m_ == n_); const Eigen::array size_a = {{m_, k_}}; @@ -147,7 +189,6 @@ template class BenchmarkSuite { TensorMap, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - // nvcc doesn't support cxx11 const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to @@ -212,14 +253,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } - // Simple reduction - void reduction(int num_iters) { + // Row reduction + void rowReduction(int num_iters) { const Eigen::array input_size = {{k_, n_}}; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); const Eigen::array output_size = {{n_}}; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim = {{0}}; +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim(0); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList> sum_along_dim; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -227,7 +274,33 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(k_ * n_ * num_iters); + } + + // Column reduction + void colReduction(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B( + b_, input_size); + const Eigen::array output_size = {{k_}}; + TensorMap, Eigen::Aligned> C( + c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim = {{1}}; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
+ Eigen::IndexList> sum_along_dim; +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(k_ * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 248a63861..6754e1a32 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4); BM_FuncCPU(memcpy, 8); BM_FuncCPU(memcpy, 12); +BM_FuncCPU(typeCasting, 4); +BM_FuncCPU(typeCasting, 8); +BM_FuncCPU(typeCasting, 12); + BM_FuncCPU(random, 4); BM_FuncCPU(random, 8); BM_FuncCPU(random, 12); @@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4); BM_FuncCPU(slicing, 8); BM_FuncCPU(slicing, 12); +BM_FuncCPU(rowChip, 4); +BM_FuncCPU(rowChip, 8); +BM_FuncCPU(rowChip, 12); + +BM_FuncCPU(colChip, 4); +BM_FuncCPU(colChip, 8); +BM_FuncCPU(colChip, 12); + BM_FuncCPU(shuffling, 4); BM_FuncCPU(shuffling, 8); BM_FuncCPU(shuffling, 12); @@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4); BM_FuncCPU(transcendentalFunc, 8); BM_FuncCPU(transcendentalFunc, 12); -BM_FuncCPU(reduction, 4); -BM_FuncCPU(reduction, 8); -BM_FuncCPU(reduction, 12); +BM_FuncCPU(rowReduction, 4); +BM_FuncCPU(rowReduction, 8); +BM_FuncCPU(rowReduction, 12); + +BM_FuncCPU(colReduction, 4); +BM_FuncCPU(colReduction, 8); +BM_FuncCPU(colReduction, 12); // Contractions @@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); + BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index fbb486efd..fe807d2ab 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -19,6 +19,7 @@ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(shuffling); @@ -26,7 +27,10 @@ BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -BM_FuncGPU(reduction); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); // Contractions @@ -45,6 +49,7 @@ BM_FuncGPU(reduction); BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); // Convolutions From a68864b6bce5e00fdec07a9d4dae7376dedb654e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 16:51:40 -0800 Subject: [PATCH 060/102] Updated the benchmarking code to print the number of flops processed instead of the number of bytes. 
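The substance of the change is the unit of the reported throughput: each benchmark now records the number of floating point operations it performed, and the driver divides that by the elapsed wall-clock time. A self-contained sketch of the reporting computation (illustrative names, mirroring the printf path in benchmark_main.cc below):

#include <cstdint>
#include <cstdio>

// Report throughput in MFlops/s from a recorded flop count and a
// duration measured in nanoseconds.
void report(const char* name, int64_t flops_processed, int64_t total_time_ns) {
  const double mflops  = static_cast<double>(flops_processed) / 1e6;
  const double seconds = static_cast<double>(total_time_ns) / 1e9;
  std::printf("%s %8.2f MFlops/s\n", name, mflops / seconds);
}
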
--- bench/tensors/benchmark.h | 3 +-- bench/tensors/benchmark_main.cc | 14 +++++++------- bench/tensors/tensor_benchmarks.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h index 2c06075e0..f115b54ad 100644 --- a/bench/tensors/benchmark.h +++ b/bench/tensors/benchmark.h @@ -41,10 +41,9 @@ class Benchmark { void RunWithArg(int arg); }; } // namespace testing -void SetBenchmarkBytesProcessed(int64_t); +void SetBenchmarkFlopsProcessed(int64_t); void StopBenchmarkTiming(); void StartBenchmarkTiming(); #define BENCHMARK(f) \ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ (new ::testing::Benchmark(#f, f)) - diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index b2f457c96..65dbd89bb 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -23,7 +23,7 @@ #include #include -static int64_t g_bytes_processed; +static int64_t g_flops_processed; static int64_t g_benchmark_total_time_ns; static int64_t g_benchmark_start_time_ns; typedef std::map BenchmarkMap; @@ -124,7 +124,7 @@ void Benchmark::Run() { } } void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) { - g_bytes_processed = 0; + g_flops_processed = 0; g_benchmark_total_time_ns = 0; g_benchmark_start_time_ns = NanoTime(); if (fn_ != NULL) { @@ -153,10 +153,10 @@ void Benchmark::RunWithArg(int arg) { } char throughput[100]; throughput[0] = '\0'; - if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) { - double mib_processed = static_cast(g_bytes_processed)/1e6; + if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) { + double mflops_processed = static_cast(g_flops_processed)/1e6; double seconds = static_cast(g_benchmark_total_time_ns)/1e9; - snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds); + snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds); } char full_name[100]; if (fn_range_ != NULL) { @@ -175,8 +175,8 @@ void Benchmark::RunWithArg(int arg) { fflush(stdout); } } // namespace testing -void SetBenchmarkBytesProcessed(int64_t x) { - g_bytes_processed = x; +void SetBenchmarkFlopsProcessed(int64_t x) { + g_flops_processed = x; } void StopBenchmarkTiming() { if (g_benchmark_start_time_ns != 0) { diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 6b9d13446..ba7e7eb48 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -367,7 +367,7 @@ template class BenchmarkSuite { } #endif StopBenchmarkTiming(); - SetBenchmarkBytesProcessed(num_items); + SetBenchmarkFlopsProcessed(num_items); } From 120e13b1b68763f50b30bb83733dcefa9ed966c4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:06:00 -0800 Subject: [PATCH 061/102] Added a readme to explain how to compile the tensor benchmarks. --- bench/tensors/README | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 bench/tensors/README diff --git a/bench/tensors/README b/bench/tensors/README new file mode 100644 index 000000000..9e0ac0ce8 --- /dev/null +++ b/bench/tensors/README @@ -0,0 +1,8 @@ +Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. 
+ +To compile the CPU benchmarks, simply call: +g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG-pthread -mavx -o benchmarks_cpu + +To compile the GPU benchmarks, simply call: +nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu + From bd2e5a788ac074535b4f973ac81ac61d4a166288 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:10:40 -0800 Subject: [PATCH 062/102] Made sure the number of floating point operations done by a benchmark is computed using 64 bit integers to avoid overflows. --- bench/tensors/tensor_benchmarks.h | 40 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index ba7e7eb48..365504009 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -13,8 +13,6 @@ typedef int TensorIndex; using Eigen::Tensor; using Eigen::TensorMap; -typedef int64_t int64; - // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. template class BenchmarkSuite { @@ -42,7 +40,7 @@ template class BenchmarkSuite { device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); } // Record the number of values copied per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast(m_) * m_ * num_iters); } void typeCasting(int num_iters) { @@ -56,7 +54,7 @@ template class BenchmarkSuite { B.device(device_) = A.cast(); } // Record the number of values copied per second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast(m_) * k_ * num_iters); } void random(int num_iters) { @@ -69,7 +67,7 @@ template class BenchmarkSuite { C.device(device_) = C.random(); } // Record the number of random numbers generated per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast(m_) * m_ * num_iters); } void slicing(int num_iters) { @@ -98,7 +96,7 @@ template class BenchmarkSuite { } // Record the number of values copied from the rhs slice to the lhs slice // each second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast(m_) * m_ * num_iters); } void rowChip(int num_iters) { @@ -112,7 +110,7 @@ template class BenchmarkSuite { C.device(device_) = B.chip(iter % k_, 0); } // Record the number of values copied from the rhs chip to the lhs. - finalizeBenchmark(n_ * num_iters); + finalizeBenchmark(static_cast(n_) * num_iters); } void colChip(int num_iters) { @@ -126,7 +124,7 @@ template class BenchmarkSuite { C.device(device_) = B.chip(iter % n_, 1); } // Record the number of values copied from the rhs chip to the lhs. 
- finalizeBenchmark(n_ * num_iters); + finalizeBenchmark(static_cast(n_) * num_iters); } void shuffling(int num_iters) { @@ -143,7 +141,7 @@ template class BenchmarkSuite { B.device(device_) = A.shuffle(shuffle); } // Record the number of values shuffled from A and copied to B each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast(m_) * k_ * num_iters); } void padding(int num_iters) { @@ -162,7 +160,7 @@ template class BenchmarkSuite { B.device(device_) = A.pad(paddings); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast(m_) * k_ * num_iters); } void striding(int num_iters) { @@ -179,7 +177,7 @@ template class BenchmarkSuite { B.device(device_) = A.stride(strides); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast(m_) * k_ * num_iters); } void broadcasting(int num_iters) { @@ -202,7 +200,7 @@ template class BenchmarkSuite { C.device(device_) = A.broadcast(broadcast); } // Record the number of values broadcasted from A and copied to C each second - finalizeBenchmark(m_ * n_ * num_iters); + finalizeBenchmark(static_cast(m_) * n_ * num_iters); } void coeffWiseOp(int num_iters) { @@ -218,7 +216,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (2 multiplications and // 1 addition per value) - finalizeBenchmark(3 * m_ * m_ * num_iters); + finalizeBenchmark(static_cast(3) * m_ * m_ * num_iters); } void algebraicFunc(int num_iters) { @@ -234,7 +232,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast(m_) * m_ * num_iters); } void transcendentalFunc(int num_iters) { @@ -250,7 +248,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast(m_) * m_ * num_iters); } // Row reduction @@ -274,7 +272,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(k_ * n_ * num_iters); + finalizeBenchmark(static_cast(k_) * n_ * num_iters); } // Column reduction @@ -300,7 +298,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(k_ * n_ * num_iters); + finalizeBenchmark(static_cast(k_) * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication @@ -322,7 +320,7 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (size_ multiplications and // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -341,8 +339,8 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) - finalizeBenchmark( - (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + finalizeBenchmark(static_cast(2) * + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: @@ -360,7 +358,7 @@ template 
class BenchmarkSuite { //BenchmarkUseRealTime(); } - inline void finalizeBenchmark(int64 num_items) { + inline void finalizeBenchmark(int64_t num_items) { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) if (Eigen::internal::is_same::value) { device_.synchronize(); From 211d350fc332a86e5eeb1c9a4ab598756c2eddbf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 17:13:04 -0800 Subject: [PATCH 063/102] Fixed a typo --- bench/tensors/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/README b/bench/tensors/README index 9e0ac0ce8..6b51fe878 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -1,7 +1,7 @@ Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. To compile the CPU benchmarks, simply call: -g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG-pthread -mavx -o benchmarks_cpu +g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu To compile the GPU benchmarks, simply call: nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu From 3fde202215875568219a7287415385e57f58413f Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 28 Jan 2016 21:27:00 -0500 Subject: [PATCH 064/102] Making ceil() functor generic w.r.t packet type --- Eigen/src/Core/functors/UnaryFunctors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 897ab04ba..531beead6 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -666,7 +666,7 @@ struct functor_traits > template struct scalar_ceil_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); } - typedef typename packet_traits::type Packet; + template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } }; template From d3f533b395e56b740c9f7c6f7272d3384c10222a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 20:09:45 -0800 Subject: [PATCH 065/102] Fixed compilation warning --- Eigen/src/Core/GenericPacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 4c7d1d848..8f63af7cb 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -285,7 +285,7 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template inline void prefetch(const Scalar* addr) +template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { #ifdef __CUDA_ARCH__ #if defined(__LP64__) From 10bea90c4add286b8d10473ba272660ef4210083 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 20:52:08 -0800 Subject: [PATCH 066/102] Fixed clang related compilation error --- bench/tensors/tensor_benchmarks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 365504009..f3ec70a9e 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -259,7 +259,7 @@ template class BenchmarkSuite { TensorMap, Eigen::Aligned> C(c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array sum_along_dim(0); + const 
Eigen::array sum_along_dim = {{0}}; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. From e4f83bae5df854319f917fb15ee781f4b960e77c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 21:08:07 -0800 Subject: [PATCH 067/102] Fixed the tensor benchmarks on apple devices --- bench/tensors/benchmark_main.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index 65dbd89bb..1efa0dbad 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -49,12 +49,27 @@ static int Round(int n) { } return 10*base; } + +#ifdef __APPLE__ + #include + static mach_timebase_info_data_t g_time_info; + static void __attribute__((constructor)) init_info() { + mach_timebase_info(&g_time_info); + } +#endif + static int64_t NanoTime() { +#if defined(__APPLE__) + uint64_t t = mach_absolute_time(); + return t * g_time_info.numer / g_time_info.denom; +#else struct timespec t; t.tv_sec = t.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t); return static_cast(t.tv_sec) * 1000000000LL + t.tv_nsec; +#endif } + namespace testing { Benchmark* Benchmark::Arg(int arg) { args_.push_back(arg); From c5d25bf1d014f7ef87d55901b591d24a32ee8f4a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 23:15:45 -0800 Subject: [PATCH 068/102] Fixed a couple of compilation warnings. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 7a5dfbfea..a03b52629 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -345,7 +345,7 @@ template struct InnerReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { eigen_assert(false && "Not implemented"); } }; @@ -355,7 +355,7 @@ template struct OuterReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { eigen_assert(false && "Not implemented"); } }; From 963f2d2a8f33eebf90b3ae1944423aa875281469 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 23:37:48 -0800 Subject: [PATCH 069/102] Marked several methods EIGEN_DEVICE_FUNC --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e6a008ba7..1adb68894 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase } template - void evalGemv(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const 
Index cols = m_k_size; @@ -516,7 +516,7 @@ struct TensorEvaluator - void evalProduct(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { this->template evalGemv(buffer); return; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 78ed5038f..3d3f6904f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,7 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -41,9 +41,9 @@ class TensorContractionBlocking { } } - EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } private: Index kc_; From d8d37349c3149bffd304057512ee8e6b0f42bc5a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 12:44:49 +0100 Subject: [PATCH 070/102] bug #696: enable zero-sized block at compile-time by relaxing the respective assertion --- Eigen/src/Core/Block.h | 4 ++-- test/zerosized.cpp | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index cee5591f2..cf962aed1 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -129,8 +129,8 @@ template class : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) - eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows() - && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols()); + eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows() + && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols()); } /** Dynamic-size constructor diff --git a/test/zerosized.cpp b/test/zerosized.cpp index da7dd0481..85c553453 100644 --- a/test/zerosized.cpp +++ b/test/zerosized.cpp @@ -25,6 +25,7 @@ template void zeroReduction(const MatrixType& m) { template void zeroSizedMatrix() { MatrixType t1; + typedef typename MatrixType::Scalar Scalar; if (MatrixType::SizeAtCompileTime == Dynamic || MatrixType::SizeAtCompileTime == 0) { @@ -45,6 +46,23 @@ template void zeroSizedMatrix() VERIFY(t1==t2); } } + + if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0) + { + Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random(1,10) : MatrixType::RowsAtCompileTime; + Index cols = MatrixType::ColsAtCompileTime==Dynamic ? 
internal::random(1,10) : MatrixType::ColsAtCompileTime; + MatrixType m(rows,cols); + zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols)); + zeroReduction(m.template block(0,0,rows,0)); + zeroReduction(m.template block<0,1>(0,0)); + zeroReduction(m.template block<1,0>(0,0)); + Matrix prod = m.template block(0,0,rows,0) * m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols); + VERIFY(prod.rows()==rows && prod.cols()==cols); + VERIFY(prod.isZero()); + prod = m.template block<1,0>(0,0) * m.template block<0,1>(0,0); + VERIFY(prod.size()==1); + VERIFY(prod.isZero()); + } } template void zeroSizedVector() From d4a9e615699bd7f26864d57d2b28021b9f64b6ff Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 22:07:56 +0100 Subject: [PATCH 071/102] Extend SparseView to allow keeping explicit zeros. This is equivalent to sparseView(1,-1) but faster because the test is removed at compile-time. --- Eigen/src/Core/util/ForwardDeclarations.h | 2 +- Eigen/src/SparseCore/SparseUtil.h | 1 - Eigen/src/SparseCore/SparseView.h | 30 +++++++++++------------ 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 483af876f..e6ed965ca 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -126,7 +126,7 @@ template class TriangularBase; template class TriangularView; template class SelfAdjointView; -template class SparseView; +template class SparseView; template class WithFormat; template struct CommaInitializer; template class ReturnByValue; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 74df0d496..3b1cf03ab 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -56,7 +56,6 @@ template class template class SparseSelfAdjointView; template class SparseDiagonalProduct; -template class SparseView; template class SparseSparseProduct; template class SparseTimeDenseProduct; diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index b867877d8..f2af1b5d9 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011-2014 Gael Guennebaud +// Copyright (C) 2011-2015 Gael Guennebaud // Copyright (C) 2010 Daniel Lowengrub // // This Source Code Form is subject to the terms of the Mozilla @@ -15,8 +15,8 @@ namespace Eigen { namespace internal { -template -struct traits > : traits +template +struct traits > : traits { typedef typename MatrixType::StorageIndex StorageIndex; typedef Sparse StorageKind; @@ -27,8 +27,8 @@ struct traits > : traits } // end namespace internal -template -class SparseView : public SparseMatrixBase > +template +class SparseView : public SparseMatrixBase > { typedef typename MatrixType::Nested MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; @@ -66,13 +66,13 @@ namespace internal { // This is tricky because implementing an inner iterator on top of an IndexBased evaluator is // not easy because the evaluators do not expose the sizes of the underlying expression. 
-template -struct unary_evaluator, IteratorBased> - : public evaluator_base > +template +struct unary_evaluator, IteratorBased> + : public evaluator_base > { typedef typename evaluator::InnerIterator EvalIterator; public: - typedef SparseView XprType; + typedef SparseView XprType; class InnerIterator : public EvalIterator { @@ -88,7 +88,7 @@ struct unary_evaluator, IteratorBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { EvalIterator::operator++(); - incrementToNonZero(); + if(!KeepZeros) incrementToNonZero(); return *this; } @@ -119,12 +119,12 @@ struct unary_evaluator, IteratorBased> const XprType &m_view; }; -template -struct unary_evaluator, IndexBased> - : public evaluator_base > +template +struct unary_evaluator, IndexBased> + : public evaluator_base > { public: - typedef SparseView XprType; + typedef SparseView XprType; protected: enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit }; typedef typename XprType::Scalar Scalar; @@ -144,7 +144,7 @@ struct unary_evaluator, IndexBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; - incrementToNonZero(); + if(!KeepZeros) incrementToNonZero(); return *this; } From 15084cf1ac1f58085cd0635676aa1d28efb268de Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 22:09:45 +0100 Subject: [PATCH 072/102] bug #632: add support for "dense +/- sparse" operations. The current implementation is based on SparseView to make the dense subexpression compatible with the sparse one. --- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 30 ++++++++++++++++++++++ test/sparse_basic.cpp | 5 ++++ 2 files changed, 35 insertions(+) diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index d9420ac63..06c6d0e4d 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -60,6 +60,8 @@ namespace internal { // Generic "sparse OP sparse" +template struct binary_sparse_evaluator; + template struct binary_evaluator, IteratorBased, IteratorBased> : evaluator_base > @@ -428,6 +430,34 @@ SparseMatrixBase::cwiseProduct(const MatrixBase &other) c return typename CwiseProductDenseReturnType::Type(derived(), other.derived()); } +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > +operator+(const SparseMatrixBase &a, const MatrixBase &b) +{ + return a.derived() + SparseView(b.derived()); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +operator+(const MatrixBase &a, const SparseMatrixBase &b) +{ + return SparseView(a.derived()) + b.derived(); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > +operator-(const SparseMatrixBase &a, const MatrixBase &b) +{ + return a.derived() - SparseView(b.derived()); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +operator-(const MatrixBase &a, const SparseMatrixBase &b) +{ + return SparseView(a.derived()) - b.derived(); +} + } // end namespace Eigen #endif // EIGEN_SPARSE_CWISE_BINARY_OP_H diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index d803e7dae..5a5650705 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -192,6 +192,11 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3)); // VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4); + VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3); + VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4); 
+ VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3); + VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4); + // test aliasing VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1)); VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval())); From 699634890afdce914553862464450966ead40ad0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Jan 2016 23:02:22 +0100 Subject: [PATCH 073/102] bug #946: generalize Cholmod::solve to handle any rhs expression --- Eigen/src/CholmodSupport/CholmodSupport.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index c7c521b95..b8020a92c 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -273,9 +273,10 @@ class CholmodBase : public SparseSolverBase const Index size = m_cholmodFactor->n; EIGEN_UNUSED_VARIABLE(size); eigen_assert(size==b.rows()); + + // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref. + Ref > b_ref(b.derived()); - // note: cd stands for Cholmod Dense - Rhs& b_ref(b.const_cast_derived()); cholmod_dense b_cd = viewAsCholmod(b_ref); cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod); if(!x_cd) From 8ed1553d20c3837d6365e1a87f6ed11570fc7a26 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 14:39:50 +0100 Subject: [PATCH 074/102] bug #632: implement general coefficient-wise "dense op sparse" operations through specialized evaluators instead of using SparseView. This permits to deal with arbitrary storage order, and to by-pass the more complex iterator of the sparse-sparse case. --- Eigen/src/Core/util/XprHelper.h | 23 ++- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 219 ++++++++++++++++++--- 2 files changed, 205 insertions(+), 37 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 9fe8cfcd1..a001c473a 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -526,22 +526,21 @@ template struct promote_storage_type * the functor. * The default rules are as follows: * \code - * A op A -> A - * A op dense -> dense - * dense op B -> dense - * A * dense -> A - * dense * B -> B + * A op A -> A + * A op dense -> dense + * dense op B -> dense + * sparse op dense -> sparse + * dense op sparse -> sparse * \endcode */ template struct cwise_promote_storage_type; -template struct cwise_promote_storage_type { typedef A ret; }; -template struct cwise_promote_storage_type { typedef Dense ret; }; -template struct cwise_promote_storage_type > { typedef Dense ret; }; -template struct cwise_promote_storage_type { typedef Dense ret; }; -template struct cwise_promote_storage_type { typedef Dense ret; }; -template struct cwise_promote_storage_type > { typedef A ret; }; -template struct cwise_promote_storage_type > { typedef B ret; }; +template struct cwise_promote_storage_type { typedef A ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Sparse ret; }; +template struct cwise_promote_storage_type { typedef Sparse ret; }; /** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B. 
* The template parameter ProductTag permits to specialize the resulting storage kind wrt to diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 06c6d0e4d..c57d9ac59 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -49,15 +49,6 @@ class CwiseBinaryOpImpl namespace internal { -template::StorageKind, - typename _RhsStorageMode = typename traits::StorageKind> -class sparse_cwise_binary_op_inner_iterator_selector; - -} // end namespace internal - -namespace internal { - // Generic "sparse OP sparse" template struct binary_sparse_evaluator; @@ -155,6 +146,182 @@ protected: evaluator m_rhsImpl; }; +// dense op sparse +template +struct binary_evaluator, IndexBased, IteratorBased> + : evaluator_base > +{ +protected: + typedef typename evaluator::InnerIterator RhsIterator; + typedef CwiseBinaryOp XprType; + typedef typename traits::Scalar Scalar; + typedef typename XprType::StorageIndex StorageIndex; +public: + + class ReverseInnerIterator; + class InnerIterator + { + enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit }; + public: + + EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer) + : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize()) + { + this->operator++(); + } + + EIGEN_STRONG_INLINE InnerIterator& operator++() + { + ++m_id; + if(m_id &m_lhsEval; + RhsIterator m_rhsIter; + const BinaryOp& m_functor; + Scalar m_value; + StorageIndex m_id; + StorageIndex m_innerSize; + }; + + + enum { + CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) + }; + + explicit binary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + m_lhsImpl(xpr.lhs()), + m_rhsImpl(xpr.rhs()), + m_expr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + inline Index nonZerosEstimate() const { + return m_expr.size(); + } + +protected: + const BinaryOp m_functor; + evaluator m_lhsImpl; + evaluator m_rhsImpl; + const XprType &m_expr; +}; + +// sparse op dense +template +struct binary_evaluator, IteratorBased, IndexBased> + : evaluator_base > +{ +protected: + typedef typename evaluator::InnerIterator LhsIterator; + typedef CwiseBinaryOp XprType; + typedef typename traits::Scalar Scalar; + typedef typename XprType::StorageIndex StorageIndex; +public: + + class ReverseInnerIterator; + class InnerIterator + { + enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit }; + public: + + EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer) + : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize()) + { + this->operator++(); + } + + EIGEN_STRONG_INLINE InnerIterator& operator++() + { + ++m_id; + if(m_id &m_rhsEval; + const BinaryOp& m_functor; + Scalar m_value; + StorageIndex m_id; + StorageIndex m_innerSize; + }; + + + enum { + CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) + }; + + explicit binary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + 
m_lhsImpl(xpr.lhs()), + m_rhsImpl(xpr.rhs()), + m_expr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + inline Index nonZerosEstimate() const { + return m_expr.size(); + } + +protected: + const BinaryOp m_functor; + evaluator m_lhsImpl; + evaluator m_rhsImpl; + const XprType &m_expr; +}; + // "sparse .* sparse" template struct binary_evaluator, Lhs, Rhs>, IteratorBased, IteratorBased> @@ -289,7 +456,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) }; explicit binary_evaluator(const XprType& xpr) @@ -362,7 +530,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) }; explicit binary_evaluator(const XprType& xpr) @@ -430,32 +599,32 @@ SparseMatrixBase::cwiseProduct(const MatrixBase &other) c return typename CwiseProductDenseReturnType::Type(derived(), other.derived()); } -template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > -operator+(const SparseMatrixBase &a, const MatrixBase &b) -{ - return a.derived() + SparseView(b.derived()); -} - template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const DenseDerived, const SparseDerived> operator+(const MatrixBase &a, const SparseMatrixBase &b) { - return SparseView(a.derived()) + b.derived(); + return CwiseBinaryOp, const DenseDerived, const SparseDerived>(a.derived(), b.derived()); } template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const SparseView > -operator-(const SparseMatrixBase &a, const MatrixBase &b) +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const DenseDerived> +operator+(const SparseMatrixBase &a, const MatrixBase &b) { - return a.derived() - SparseView(b.derived()); + return CwiseBinaryOp, const SparseDerived, const DenseDerived>(a.derived(), b.derived()); } template -EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseView, const SparseDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const DenseDerived, const SparseDerived> operator-(const MatrixBase &a, const SparseMatrixBase &b) { - return SparseView(a.derived()) - b.derived(); + return CwiseBinaryOp, const DenseDerived, const SparseDerived>(a.derived(), b.derived()); +} + +template +EIGEN_STRONG_INLINE const CwiseBinaryOp, const SparseDerived, const DenseDerived> +operator-(const SparseMatrixBase &a, const MatrixBase &b) +{ + return CwiseBinaryOp, const SparseDerived, const DenseDerived>(a.derived(), b.derived()); } } // end namespace Eigen From 1bc207c528bcfc4d9fb27ada28a8aaf1b9e8d3f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 14:43:21 +0100 Subject: [PATCH 075/102] backout changeset d4a9e615699bd7f26864d57d2b28021b9f64b6ff : the extended SparseView is not needed anymore --- Eigen/src/Core/util/ForwardDeclarations.h | 2 +- Eigen/src/SparseCore/SparseUtil.h | 1 + Eigen/src/SparseCore/SparseView.h | 30 +++++++++++------------ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index e6ed965ca..483af876f 100644 --- 
a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -126,7 +126,7 @@ template class TriangularBase; template class TriangularView; template class SelfAdjointView; -template class SparseView; +template class SparseView; template class WithFormat; template struct CommaInitializer; template class ReturnByValue; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 3b1cf03ab..74df0d496 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -56,6 +56,7 @@ template class template class SparseSelfAdjointView; template class SparseDiagonalProduct; +template class SparseView; template class SparseSparseProduct; template class SparseTimeDenseProduct; diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index f2af1b5d9..b867877d8 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011-2015 Gael Guennebaud +// Copyright (C) 2011-2014 Gael Guennebaud // Copyright (C) 2010 Daniel Lowengrub // // This Source Code Form is subject to the terms of the Mozilla @@ -15,8 +15,8 @@ namespace Eigen { namespace internal { -template -struct traits > : traits +template +struct traits > : traits { typedef typename MatrixType::StorageIndex StorageIndex; typedef Sparse StorageKind; @@ -27,8 +27,8 @@ struct traits > : traits } // end namespace internal -template -class SparseView : public SparseMatrixBase > +template +class SparseView : public SparseMatrixBase > { typedef typename MatrixType::Nested MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; @@ -66,13 +66,13 @@ namespace internal { // This is tricky because implementing an inner iterator on top of an IndexBased evaluator is // not easy because the evaluators do not expose the sizes of the underlying expression. 
-template -struct unary_evaluator, IteratorBased> - : public evaluator_base > +template +struct unary_evaluator, IteratorBased> + : public evaluator_base > { typedef typename evaluator::InnerIterator EvalIterator; public: - typedef SparseView XprType; + typedef SparseView XprType; class InnerIterator : public EvalIterator { @@ -88,7 +88,7 @@ struct unary_evaluator, IteratorBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { EvalIterator::operator++(); - if(!KeepZeros) incrementToNonZero(); + incrementToNonZero(); return *this; } @@ -119,12 +119,12 @@ struct unary_evaluator, IteratorBased> const XprType &m_view; }; -template -struct unary_evaluator, IndexBased> - : public evaluator_base > +template +struct unary_evaluator, IndexBased> + : public evaluator_base > { public: - typedef SparseView XprType; + typedef SparseView XprType; protected: enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit }; typedef typename XprType::Scalar Scalar; @@ -144,7 +144,7 @@ struct unary_evaluator, IndexBased> EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; - if(!KeepZeros) incrementToNonZero(); + incrementToNonZero(); return *this; } From 102fa96a9610ccee4f246f8c1030c0bdc380a429 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 14:58:21 +0100 Subject: [PATCH 076/102] Extend doc on dense+sparse --- doc/TutorialSparse.dox | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index fb07adaa2..1f0be387d 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -257,7 +257,14 @@ Binary coefficient wise operators can also mix sparse and dense expressions: \code sm2 = sm1.cwiseProduct(dm1); dm2 = sm1 + dm1; +dm2 = dm1 - sm1; \endcode +Performance-wise, the adding/subtracting sparse and dense matrices is better performed in two steps. For instance, instead of doing dm2 = sm1 + dm1, better write: +\code +dm2 = dm1; +dm2 += sm1; +\endcode +This version has the advantage to fully exploit the higher performance of dense storage (no indirection, SIMD, etc.), and to pay the cost of slow sparse evaluation on the few non-zeros of the sparse matrix only. %Sparse expressions also support transposition: From 4281eb1e2c3c2fc563aad9625e90d7b3e6fe1e75 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:20:43 -0800 Subject: [PATCH 077/102] Added 2 benchmarks to the suite of tensor benchmarks running on GPU --- bench/tensors/tensor_benchmarks_gpu.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index fe807d2ab..611e8197b 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -22,6 +22,8 @@ BM_FuncGPU(memcpy); BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); BM_FuncGPU(shuffling); BM_FuncGPU(padding); BM_FuncGPU(striding); From ba27c8a7ded7bd766d724ab74d5730d4195d6722 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:28:43 -0800 Subject: [PATCH 078/102] Made the CUDA contract test more robust to numerical noise. 
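The fixed absolute threshold was failing spuriously on large contraction results, so the test now accepts a value when it passes either an absolute or a relative tolerance. A stand-alone sketch of the combined check (illustrative helper, not from the test; the relative part mirrors Eigen's scalar isApprox, which compares against the tolerance times the smaller magnitude):

#include <algorithm>
#include <cmath>

bool close_enough(float expected, float actual, float tol = 1e-4f) {
  if (std::fabs(expected - actual) < tol)   // absolute check, for values near zero
    return true;
  return std::fabs(expected - actual) <=    // relative check, for large values
         tol * std::min(std::fabs(expected), std::fabs(actual));
}
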
--- unsupported/test/cxx11_tensor_contract_cuda.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index ac447dd7b..cbd902d6a 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -24,7 +24,7 @@ typedef Tensor::DimensionPair DimPair; template static void test_cuda_contraction(int m_size, int k_size, int n_size) { - std::cout<<"Calling with ("<= 1e-4) { - std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << std::endl; - assert(false); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); } cudaFree((void*)d_t_left); @@ -83,7 +87,7 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) void test_cxx11_tensor_cuda() { - std::cout<<"Calling contraction tests"<(128, 128, 128)); CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); for (int k = 32; k < 256; k++) { From d0db95f730b84e59bbad7fce24eb4becef106b9e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:43:57 -0800 Subject: [PATCH 079/102] Sharded the tensor thread pool test --- unsupported/test/cxx11_tensor_thread_pool.cpp | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e28cf55e2..8ee644ecc 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,7 +17,7 @@ using Eigen::Tensor; -static void test_multithread_elementwise() +void test_multithread_elementwise() { Tensor in1(2,3,7); Tensor in2(2,3,7); @@ -40,7 +40,7 @@ static void test_multithread_elementwise() } -static void test_multithread_compound_assignment() +void test_multithread_compound_assignment() { Tensor in1(2,3,7); Tensor in2(2,3,7); @@ -64,7 +64,7 @@ static void test_multithread_compound_assignment() } template -static void test_multithread_contraction() +void test_multithread_contraction() { Tensor t_left(30, 50, 37, 31); Tensor t_right(37, 31, 70, 2, 10); @@ -99,7 +99,7 @@ static void test_multithread_contraction() } template -static void test_contraction_corner_cases() +void test_contraction_corner_cases() { Tensor t_left(32, 500); Tensor t_right(32, 28*28); @@ -186,7 +186,7 @@ static void test_contraction_corner_cases() } template -static void test_multithread_contraction_agrees_with_singlethread() { +void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random(1, 5000); Tensor left(internal::random(1, 80), @@ -229,7 +229,7 @@ static void test_multithread_contraction_agrees_with_singlethread() { template -static void test_multithreaded_reductions() { +void test_multithreaded_reductions() { const int num_threads = internal::random(3, 11); ThreadPool thread_pool(num_threads); Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); @@ -251,7 +251,7 @@ static void test_multithreaded_reductions() { } -static void test_memcpy() { +void test_memcpy() { for (int i = 0; i < 5; ++i) { const int num_threads = internal::random(3, 11); @@ -270,7 +270,7 @@ static 
void test_memcpy() { } -static void test_multithread_random() +void test_multithread_random() { Eigen::ThreadPool tp(2); Eigen::ThreadPoolDevice device(&tp, 2); @@ -281,23 +281,22 @@ static void test_multithread_random() void test_cxx11_tensor_thread_pool() { - CALL_SUBTEST(test_multithread_elementwise()); - CALL_SUBTEST(test_multithread_compound_assignment()); + CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction()); - CALL_SUBTEST(test_multithread_contraction()); + CALL_SUBTEST_2(test_multithread_contraction()); + CALL_SUBTEST_2(test_multithread_contraction()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); // Exercise various cases that have been problematic in the past. - CALL_SUBTEST(test_contraction_corner_cases()); - CALL_SUBTEST(test_contraction_corner_cases()); + CALL_SUBTEST_4(test_contraction_corner_cases()); + CALL_SUBTEST_4(test_contraction_corner_cases()); - CALL_SUBTEST(test_multithreaded_reductions()); - CALL_SUBTEST(test_multithreaded_reductions()); + CALL_SUBTEST_5(test_multithreaded_reductions()); + CALL_SUBTEST_5(test_multithreaded_reductions()); - CALL_SUBTEST(test_memcpy()); - - CALL_SUBTEST(test_multithread_random()); + CALL_SUBTEST_6(test_memcpy()); + CALL_SUBTEST_6(test_multithread_random()); } From 2053478c5677b53c87b302d962a0545d08833a72 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:46:36 -0800 Subject: [PATCH 080/102] Made sure to use a tensor of rank 0 to store the result of a full reduction in the tensor thread pool test --- unsupported/test/cxx11_tensor_thread_pool.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 8ee644ecc..8e5636bb7 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -239,15 +239,15 @@ void test_multithreaded_reductions() { Tensor t1(num_rows, num_cols); t1.setRandom(); - Tensor full_redux(1); + Tensor full_redux; full_redux = t1.sum(); - Tensor full_redux_tp(1); + Tensor full_redux_tp; full_redux_tp.device(thread_pool_device) = t1.sum(); // Check that the single threaded and the multi threaded reductions return // the same result. - VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0)); + VERIFY_IS_APPROX(full_redux(), full_redux_tp()); } From 32088c06a169ed8d1286c491ed21a20321ae58a5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:51:14 -0800 Subject: [PATCH 081/102] Made the comparison between single and multithreaded contraction results more resistant to numerical noise to prevent spurious test failures. 
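The mismatches are benign: the thread pool splits the contraction into blocks whose partial sums are combined in a different order than in the single-threaded loop, and floating-point addition is not associative. A toy illustration of the effect (the values below are made up and unrelated to the test data):

#include <cstdio>

int main() {
  float a = 1e8f, b = -1e8f, c = 1.0f;
  // Summing the same three terms in two orders gives different results:
  float left  = (a + b) + c;  // the large terms cancel first, so c survives: 1
  float right = a + (b + c);  // c is rounded away inside (b + c), leaving: 0
  std::printf("left = %g, right = %g\n", left, right);
  return 0;
}

Hence the test now tolerates small absolute or relative deviations instead of requiring exact agreement.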
--- unsupported/test/cxx11_tensor_thread_pool.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 8e5636bb7..aa76009b7 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -91,10 +91,15 @@ void test_multithread_contraction() for (ptrdiff_t i = 0; i < t_result.size(); i++) { VERIFY(&t_result.data()[i] != &m_result.data()[i]); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; - assert(false); + if (fabs(t_result(i) - m_result(i)) < 1e-4) { + continue; } + if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << m_result(i) << std::endl; + assert(false); } } From 9de155d15320a68182e7f572adf504cad6172419 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 10:56:47 -0800 Subject: [PATCH 082/102] Added a test to cover threaded tensor shuffling --- unsupported/test/cxx11_tensor_thread_pool.cpp | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index aa76009b7..e46197464 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -283,6 +283,31 @@ void test_multithread_random() t.device(device) = t.random>(); } +template +void test_multithread_shuffle() +{ + Tensor tensor(17,5,7,11); + tensor.setRandom(); + + const int num_threads = internal::random(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor shuffle(7,5,11,17); + array shuffles = {{2,1,3,0}}; + shuffle.device(device) = tensor.shuffle(shuffles); + + for (int i = 0; i < 17; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i)); + } + } + } + } +} + void test_cxx11_tensor_thread_pool() { @@ -304,4 +329,6 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_6(test_memcpy()); CALL_SUBTEST_6(test_multithread_random()); + CALL_SUBTEST_6(test_multithread_shuffle()); + CALL_SUBTEST_6(test_multithread_shuffle()); } From bd21aba1817f76f4e72ddf3c55ef23d4a62ed6f7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 11:47:09 -0800 Subject: [PATCH 083/102] Sharded the cxx11_tensor_cuda test and fixed a memory leak --- unsupported/test/cxx11_tensor_cuda.cu | 131 +++++++++++++++++--------- 1 file changed, 88 insertions(+), 43 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 79f1c5315..60f9314a5 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -63,6 +63,10 @@ void test_cuda_elementwise_small() { out(Eigen::array(i)), in1(Eigen::array(i)) + in2(Eigen::array(i))); } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); } void test_cuda_elementwise() @@ -113,6 +117,11 @@ void test_cuda_elementwise() } } } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); } void test_cuda_reduction() @@ -158,10 +167,13 @@ void test_cuda_reduction() VERIFY_IS_APPROX(out(i,j), expected); } } + + cudaFree(d_in1); + cudaFree(d_out); } template -static void 
test_cuda_contraction() +void test_cuda_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -216,10 +228,14 @@ static void test_cuda_contraction() assert(false); } } + + cudaFree(d_t_left); + cudaFree(d_t_right); + cudaFree(d_t_result); } template -static void test_cuda_convolution_1d() +void test_cuda_convolution_1d() { Tensor input(74,37,11,137); Tensor kernel(4); @@ -266,9 +282,13 @@ static void test_cuda_convolution_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_col_major_1d() +void test_cuda_convolution_inner_dim_col_major_1d() { Tensor input(74,9,11,7); Tensor kernel(4); @@ -315,9 +335,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_row_major_1d() +void test_cuda_convolution_inner_dim_row_major_1d() { Tensor input(7,9,11,74); Tensor kernel(4); @@ -364,10 +388,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template -static void test_cuda_convolution_2d() +void test_cuda_convolution_2d() { Tensor input(74,37,11,137); Tensor kernel(3,4); @@ -424,10 +452,14 @@ static void test_cuda_convolution_2d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template -static void test_cuda_convolution_3d() +void test_cuda_convolution_3d() { Tensor input(Eigen::array(74,37,11,137,17)); Tensor kernel(3,4,2); @@ -498,6 +530,10 @@ static void test_cuda_convolution_3d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } @@ -535,6 +571,9 @@ void test_cuda_lgamma(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } template @@ -571,6 +610,9 @@ void test_cuda_erf(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } template @@ -607,47 +649,50 @@ void test_cuda_erfc(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_elementwise_small()); - CALL_SUBTEST(test_cuda_elementwise()); - CALL_SUBTEST(test_cuda_reduction()); - CALL_SUBTEST(test_cuda_contraction()); - CALL_SUBTEST(test_cuda_contraction()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_lgamma(1.0f)); - CALL_SUBTEST(test_cuda_lgamma(100.0f)); - CALL_SUBTEST(test_cuda_lgamma(0.01f)); - CALL_SUBTEST(test_cuda_lgamma(0.001f)); - CALL_SUBTEST(test_cuda_erf(1.0f)); - CALL_SUBTEST(test_cuda_erf(100.0f)); - CALL_SUBTEST(test_cuda_erf(0.01f)); - CALL_SUBTEST(test_cuda_erf(0.001f)); - CALL_SUBTEST(test_cuda_erfc(1.0f)); + CALL_SUBTEST_1(test_cuda_elementwise_small()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_reduction()); + CALL_SUBTEST_2(test_cuda_contraction()); + CALL_SUBTEST_2(test_cuda_contraction()); + CALL_SUBTEST_3(test_cuda_convolution_1d()); + 
CALL_SUBTEST_3(test_cuda_convolution_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_2d()); + CALL_SUBTEST_3(test_cuda_convolution_2d()); + CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_4(test_cuda_lgamma(1.0f)); + CALL_SUBTEST_4(test_cuda_lgamma(100.0f)); + CALL_SUBTEST_4(test_cuda_lgamma(0.01f)); + CALL_SUBTEST_4(test_cuda_lgamma(0.001f)); + CALL_SUBTEST_4(test_cuda_erf(1.0f)); + CALL_SUBTEST_4(test_cuda_erf(100.0f)); + CALL_SUBTEST_4(test_cuda_erf(0.01f)); + CALL_SUBTEST_4(test_cuda_erf(0.001f)); + CALL_SUBTEST_4(test_cuda_erfc(1.0f)); // CALL_SUBTEST(test_cuda_erfc(100.0f)); - CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc(0.01f)); - CALL_SUBTEST(test_cuda_erfc(0.001f)); - CALL_SUBTEST(test_cuda_lgamma(1.0)); - CALL_SUBTEST(test_cuda_lgamma(100.0)); - CALL_SUBTEST(test_cuda_lgamma(0.01)); - CALL_SUBTEST(test_cuda_lgamma(0.001)); - CALL_SUBTEST(test_cuda_erf(1.0)); - CALL_SUBTEST(test_cuda_erf(100.0)); - CALL_SUBTEST(test_cuda_erf(0.01)); - CALL_SUBTEST(test_cuda_erf(0.001)); - CALL_SUBTEST(test_cuda_erfc(1.0)); + CALL_SUBTEST_4(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc(0.01f)); + CALL_SUBTEST_4(test_cuda_erfc(0.001f)); + CALL_SUBTEST_4(test_cuda_lgamma(1.0)); + CALL_SUBTEST_4(test_cuda_lgamma(100.0)); + CALL_SUBTEST_4(test_cuda_lgamma(0.01)); + CALL_SUBTEST_4(test_cuda_lgamma(0.001)); + CALL_SUBTEST_4(test_cuda_erf(1.0)); + CALL_SUBTEST_4(test_cuda_erf(100.0)); + CALL_SUBTEST_4(test_cuda_erf(0.01)); + CALL_SUBTEST_4(test_cuda_erf(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(1.0)); // CALL_SUBTEST(test_cuda_erfc(100.0)); - CALL_SUBTEST(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc(0.01)); - CALL_SUBTEST(test_cuda_erfc(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc(0.01)); + CALL_SUBTEST_4(test_cuda_erfc(0.001)); } From 483082ef6e4ff25d43cba03e1b1f2ed15000ac3b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 30 Jan 2016 11:59:22 -0800 Subject: [PATCH 084/102] Fixed a few memory leaks in the cuda tests --- unsupported/test/cxx11_tensor_argmax_cuda.cu | 10 ++++++++++ unsupported/test/cxx11_tensor_reduction_cuda.cu | 3 +++ 2 files changed, 13 insertions(+) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index d37490d15..48cec510d 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -56,6 +56,10 @@ void test_cuda_simple_argmax() VERIFY_IS_EQUAL(out_max(Eigen::array(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array(0)), 0); + + cudaFree(d_in); + cudaFree(d_out_max); + cudaFree(d_out_min); } template @@ -141,6 +145,9 @@ void test_cuda_argmax_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } @@ -227,6 +234,9 @@ void test_cuda_argmin_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu 
b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 9e06eb126..417242586 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -48,6 +48,9 @@ static void test_full_reductions() { // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); } void test_cxx11_tensor_reduction_cuda() { From 3ba8a3ab1a77fd2f58448187c6881d13bf51f430 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 22:14:04 +0100 Subject: [PATCH 085/102] Disable underflow unit test on the i387 FPU. --- test/adjoint.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/adjoint.cpp b/test/adjoint.cpp index b1e69c2e5..9c895e0ac 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -45,12 +45,14 @@ template<> struct adjoint_specific { // check null inputs VERIFY_IS_APPROX((v1*0).normalized(), (v1*0)); +#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE) RealScalar very_small = (std::numeric_limits::min)(); VERIFY( (v1*very_small).norm() == 0 ); VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small)); v3 = v1*very_small; v3.normalize(); VERIFY_IS_APPROX(v3, (v1*very_small)); +#endif // check compatibility of dot and adjoint ref = NumTraits::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm())); From a4e4542b89092eb2ed2984aae6f15bbcc43d7ed6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 30 Jan 2016 22:26:17 +0100 Subject: [PATCH 086/102] Avoid overflow in unit test. --- test/stable_norm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index 9f12320e0..c3eb5ff31 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp @@ -174,7 +174,8 @@ template void stable_norm(const MatrixType& m) VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1)); VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1)); VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1)); - VERIFY_IS_APPROX(vbig, vbig.stableNorm() * vbig.stableNormalized()); + RealScalar big_scaling = ((std::numeric_limits::max)() * RealScalar(1e-4)); + VERIFY_IS_APPROX(vbig/big_scaling, (vbig.stableNorm() * vbig.stableNormalized()).eval()/big_scaling); VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized()); } } From d1421659427a5bb237bbba1c86781267b98ce235 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 31 Jan 2016 16:34:10 +0100 Subject: [PATCH 087/102] bug #667: declare several critical functions as FORCE_INLINE to make ICC happier.
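ICC tends to give up on inlining through the deep call chains of the evaluator framework, which noticeably hurts simple expressions; tagging the hot entry points EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE nudges it along. As a rough, paraphrased sketch of what such a macro amounts to (STRONG_INLINE is a stand-in name, and the compiler guards shown are approximate; the authoritative definition lives in Eigen/src/Core/util/Macros.h):

#include <cstdio>

// STRONG_INLINE mirrors the spirit of EIGEN_STRONG_INLINE: MSVC and ICC honor
// __forceinline much more reliably than plain 'inline'; on other compilers
// 'inline' plus the usual optimization levels is generally sufficient.
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define STRONG_INLINE __forceinline
#else
#define STRONG_INLINE inline
#endif

STRONG_INLINE int twice(int x) { return 2 * x; }

int main() {
  std::printf("%d\n", twice(21)); // the call site is a strong inlining candidate
  return 0;
}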
--- Eigen/src/Core/ArrayBase.h | 16 +-- Eigen/src/Core/AssignEvaluator.h | 34 ++++-- Eigen/src/Core/CoreEvaluators.h | 197 +++++++++++++++++++++---------- Eigen/src/Core/CwiseUnaryOp.h | 16 +-- Eigen/src/Core/DenseBase.h | 8 +- Eigen/src/Core/MatrixBase.h | 8 +- 6 files changed, 183 insertions(+), 96 deletions(-) diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index b4c24a27a..0443e3032 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -103,7 +103,7 @@ template class ArrayBase /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const ArrayBase& other) { internal::call_assignment(derived(), other.derived()); @@ -112,28 +112,28 @@ template class ArrayBase /** Set all the entries to \a value. * \sa DenseBase::setConstant(), DenseBase::fill() */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Scalar &value) { Base::setConstant(value); return derived(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const Scalar& scalar); - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const Scalar& scalar); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const ArrayBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const ArrayBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const ArrayBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const ArrayBase& other); public: diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index f6632de69..5b65bfb0c 100755 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -637,7 +637,7 @@ protected: ***************************************************************************/ template -EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -654,7 +654,7 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S } template -EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) { call_dense_assignment_loop(dst, src, internal::assign_op()); } @@ -688,26 +688,30 @@ struct Assignment; // does not has to bother about these annoying details.
template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } template -EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(const Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } // Deal with "assume-aliasing" template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) { typename plain_matrix_type::type tmp(src); call_assignment_no_alias(dst, tmp, func); } template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) { call_assignment_no_alias(dst, src, func); } @@ -715,14 +719,16 @@ EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& fun // by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template class StorageBase, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment(NoAlias& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(NoAlias& dst, const Src& src, const Func& func) { call_assignment_no_alias(dst.expression(), src, func); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) { enum { NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) @@ -747,13 +753,15 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Assignment::run(actualDst, src, func); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src) { call_assignment_no_alias(dst, src, internal::assign_op()); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) { Index dstRows = src.rows(); Index dstCols = src.cols(); @@ -767,7 +775,8 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src Assignment::run(dst, src, func); } template -EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) { call_assignment_no_alias_no_transpose(dst, src, internal::assign_op()); } @@ -779,7 +788,8 @@ template void check_for_aliasing(const Dst &dst, con template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment { - EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void run(DstXprType &dst, const SrcXprType 
&src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 7776948d1..a729e0454 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -148,7 +148,8 @@ struct evaluator > EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) return m_data[row * m_outerStride.value() + col]; @@ -156,12 +157,14 @@ struct evaluator > return m_data[row + col * m_outerStride.value()]; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) return const_cast(m_data)[row * m_outerStride.value() + col]; @@ -169,12 +172,14 @@ struct evaluator > return const_cast(m_data)[row + col * m_outerStride.value()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return const_cast(m_data)[index]; } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { if (IsRowMajor) @@ -184,12 +189,14 @@ struct evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return ploadt(m_data + index); } template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { if (IsRowMajor) @@ -201,6 +208,7 @@ struct evaluator > } template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { return pstoret(const_cast(m_data) + index, x); @@ -260,45 +268,53 @@ struct unary_evaluator, IndexBased> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(col, row); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(col, row); } - EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename XprType::Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet(col, row); } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_argImpl.template packet(index); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { m_argImpl.template writePacket(col, row, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { m_argImpl.template writePacket(index, x); @@ -338,23 +354,27 @@ struct evaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType 
coeff(Index row, Index col) const { return m_functor(row, col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(index); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.template packetOp(row, col); } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.template packetOp(index); @@ -380,7 +400,8 @@ struct unary_evaluator, IndexBased > Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { @@ -390,23 +411,27 @@ struct unary_evaluator, IndexBased > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(m_argImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.packetOp(m_argImpl.template packet(row, col)); } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); @@ -466,17 +491,20 @@ struct binary_evaluator, IndexBased, IndexBase typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.packetOp(m_lhsImpl.template packet(row, col), @@ -484,6 +512,7 @@ struct binary_evaluator, IndexBased, IndexBase } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.packetOp(m_lhsImpl.template packet(index), @@ -523,22 +552,26 @@ struct unary_evaluator, IndexBased> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_unaryOp(m_argImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_unaryOp(m_argImpl.coeff(index)); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_unaryOp(m_argImpl.coeffRef(row, col)); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_unaryOp(m_argImpl.coeffRef(index)); } @@ -578,47 +611,55 @@ struct mapbase_evaluator : evaluator_base EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - 
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()]; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_data[index * m_xpr.innerStride()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_data[index * m_xpr.innerStride()]; } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride(); return internal::ploadt(ptr); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return internal::ploadt(m_data + index * m_xpr.innerStride()); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride(); return internal::pstoret(ptr, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { internal::pstoret(m_data + index * m_xpr.innerStride(), x); @@ -767,46 +808,54 @@ struct unary_evaluator, IndexBa RowsAtCompileTime = XprType::RowsAtCompileTime }; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return packet(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { return writePacket(RowsAtCompileTime == 1 ? 
0 : index, @@ -859,7 +908,7 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - inline EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) @@ -869,7 +918,8 @@ struct evaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; - inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { if (m_conditionImpl.coeff(row, col)) return m_thenImpl.coeff(row, col); @@ -877,7 +927,8 @@ struct evaluator > return m_elseImpl.coeff(row, col); } - inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { if (m_conditionImpl.coeff(index)) return m_thenImpl.coeff(index); @@ -921,7 +972,8 @@ struct unary_evaluator > m_cols(replicate.nestedExpression().cols()) {} - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { // try to avoid using modulo; this is a pure optimization strategy const Index actual_row = internal::traits::RowsAtCompileTime==1 ? 0 @@ -934,7 +986,8 @@ struct unary_evaluator > return m_argImpl.coeff(actual_row, actual_col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { // try to avoid using modulo; this is a pure optimization strategy const Index actual_index = internal::traits::RowsAtCompileTime==1 @@ -945,6 +998,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { const Index actual_row = internal::traits::RowsAtCompileTime==1 ? 
0 @@ -958,6 +1012,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { const Index actual_index = internal::traits::RowsAtCompileTime==1 @@ -1008,7 +1063,8 @@ struct evaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const { if (Direction==Vertical) return m_functor(m_arg.col(j)); @@ -1016,7 +1072,8 @@ struct evaluator > return m_functor(m_arg.row(i)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const { if (Direction==Vertical) return m_functor(m_arg.col(index)); @@ -1051,45 +1108,53 @@ struct evaluator_wrapper_base typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(row, col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(row, col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet(row, col); } - template + template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_argImpl.template packet(index); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { m_argImpl.template writePacket(row, col, x); } - template + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { m_argImpl.template writePacket(index, x); @@ -1164,29 +1229,34 @@ struct unary_evaluator > m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) { } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? 
m_cols.value() - col - 1 : col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1); } template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { enum { @@ -1201,6 +1271,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE PacketType packet(Index index) const { enum { PacketSize = unpacket_traits::size }; @@ -1208,6 +1279,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { // FIXME we could factorize some code with packet(i,j) @@ -1224,6 +1296,7 @@ struct unary_evaluator > } template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { enum { PacketSize = unpacket_traits::size }; @@ -1267,22 +1340,26 @@ struct evaluator > typedef typename internal::conditional::value, typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index) const { return m_argImpl.coeff(row + rowOffset(), row + colOffset()); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index + rowOffset(), index + colOffset()); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index) { return m_argImpl.coeffRef(row + rowOffset(), row + colOffset()); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index + rowOffset(), index + colOffset()); } diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index 8c182303c..ff1391996 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -61,26 +61,26 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type XprTypeNested; typedef typename internal::remove_all::type NestedExpression; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { return m_xpr.cols(); } /** \returns the functor representing the unary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::remove_all::type& nestedExpression() { return m_xpr; } diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 2c5c0ad28..ea8283cea 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -275,13 +275,13 @@ template class DenseBase /** Copies \a other into *this. \returns a reference to *this. 
*/ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); template @@ -388,9 +388,9 @@ template class DenseBase inline bool hasNaN() const; inline bool allFinite() const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE inline Derived& operator*=(const Scalar& other); - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE inline Derived& operator/=(const Scalar& other); typedef typename internal::add_const_on_value_type::type>::type EvalReturnType; diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 338879c73..3770ab257 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -135,14 +135,14 @@ template class MatrixBase /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const MatrixBase& other); // We cannot inherit here via Base::operator= since it is causing // trouble with MSVC. template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); template @@ -154,10 +154,10 @@ template class MatrixBase Derived& operator=(const ReturnByValue& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const MatrixBase& other); template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); #ifdef __CUDACC__ From 4a2ddfb81dbbb07c2f399d7bc9664f66abc1e65a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 10:44:15 -0800 Subject: [PATCH 088/102] Sharded the CUDA argmax tensor test --- unsupported/test/cxx11_tensor_argmax_cuda.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 48cec510d..45311d4f7 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -242,10 +242,10 @@ void test_cuda_argmin_dim() void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_simple_argmax()); - CALL_SUBTEST(test_cuda_simple_argmax()); - CALL_SUBTEST(test_cuda_argmax_dim()); - CALL_SUBTEST(test_cuda_argmax_dim()); - CALL_SUBTEST(test_cuda_argmin_dim()); - CALL_SUBTEST(test_cuda_argmin_dim()); + CALL_SUBTEST_1(test_cuda_simple_argmax()); + CALL_SUBTEST_1(test_cuda_simple_argmax()); + CALL_SUBTEST_2(test_cuda_argmax_dim()); + CALL_SUBTEST_2(test_cuda_argmax_dim()); + CALL_SUBTEST_3(test_cuda_argmin_dim()); + CALL_SUBTEST_3(test_cuda_argmin_dim()); } From 3f1ee458333ab59218342d595b60536aee760f6a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 10:48:49 -0800 Subject: [PATCH 089/102] Fixed compilation errors triggered by duplicate inline declaration --- Eigen/src/Core/CwiseUnaryOp.h | 2 +- Eigen/src/Core/DenseBase.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index ff1391996..1d2dd19f2 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -62,7 +62,7 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type NestedExpression; 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index ea8283cea..5a38e5f22 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -389,9 +389,9 @@ template class DenseBase inline bool allFinite() const; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - inline Derived& operator*=(const Scalar& other); + Derived& operator*=(const Scalar& other); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - inline Derived& operator/=(const Scalar& other); + Derived& operator/=(const Scalar& other); typedef typename internal::add_const_on_value_type::type>::type EvalReturnType; /** \returns the matrix or vector obtained by evaluating this expression. From 6720b38fbf60d750393af7d63777b06438ba5d81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 16:48:50 -0800 Subject: [PATCH 090/102] Fixed a few compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 5 ++++- unsupported/test/cxx11_tensor_empty.cpp | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 18a916e46..ed933b6ac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -41,7 +41,10 @@ class TensorStorage private: static const std::size_t Size = FixedDimensions::total_size; - EIGEN_ALIGN_MAX T m_data[Size]; + // Allocate an array of size at least one to prevent compiler warnings. + static const std::size_t MinSize = max_n_1::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + FixedDimensions m_dimensions; public: diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp index ca03a297c..9130fff35 100644 --- a/unsupported/test/cxx11_tensor_empty.cpp +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -16,16 +16,20 @@ static void test_empty_tensor() { Tensor source; Tensor tgt1 = source; - Tensor tgt2; - tgt2 = source; + Tensor tgt2(source); + Tensor tgt3; + tgt3 = tgt1; + tgt3 = tgt2; } static void test_empty_fixed_size_tensor() { TensorFixedSize> source; TensorFixedSize> tgt1 = source; - TensorFixedSize> tgt2; - tgt2 = source; + TensorFixedSize> tgt2(source); + TensorFixedSize> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; } From e80ed948e14c2de929a97bfbacab0b3a9172a59e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 31 Jan 2016 20:09:41 -0800 Subject: [PATCH 091/102] Fixed a number of compilation warnings generated by the cuda tests --- .../Eigen/CXX11/src/Core/util/EmulateArray.h | 39 +++++++++++++++++-- .../CXX11/src/Tensor/TensorConvolution.h | 8 ++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 4 +- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 456b34d0b..89aeb03e7 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -25,6 +25,16 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { return values[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& 
front() const { return values[0]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { return values[n-1]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static std::size_t size() { return n; } @@ -123,13 +133,33 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[] (size_t) { eigen_assert(false && "Can't index a zero size array"); - return *static_cast(NULL); + return dummy; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t) const { eigen_assert(false && "Can't index a zero size array"); - return *static_cast(NULL); + return dummy; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; } static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } @@ -142,6 +172,9 @@ template class array { eigen_assert(l.size() == 0); } #endif + + private: + T dummy; }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 367a152a0..67c797802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { -template +template class IndexMapper { public: IndexMapper(const InputDims& input_dims, const array& kernel_dims, @@ -123,7 +123,7 @@ class IndexMapper { } inputIndex += p * m_inputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -147,7 +147,7 @@ class IndexMapper { } outputIndex += p * m_outputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -206,7 +206,7 @@ class IndexMapper { } private: - static const size_t NumDims = internal::array_size::value; + static const int NumDims = internal::array_size::value; array m_inputStrides; array m_outputStrides; array m_cudaInputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index a03b52629..22aea5ea4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -463,7 +463,7 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides.back() = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; } @@ -479,7 +479,7 @@ struct TensorEvaluator, Device> input_strides[i] = input_strides[i-1] * input_dims[i-1]; } } else { - input_strides[NumInputDims - 1] = 1; + input_strides.back() = 1; for (int i = NumInputDims - 2; i >= 0; --i) { input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; } From 2c3224924b8a290cbc33847d20103ec0db479828 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: 
Mon, 1 Feb 2016 10:23:45 +0100 Subject: [PATCH 092/102] Fix warning and replace min/max macros by calls to mini/maxi --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 960df4a46..933cd564b 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -98,9 +98,6 @@ namespace internal { /* === Definitions ========================================================== */ /* ========================================================================== */ -#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b)) -#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b)) - #define ONES_COMPLEMENT(r) (-(r)-1) /* -------------------------------------------------------------------------- */ @@ -735,8 +732,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; - dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -800,7 +797,7 @@ static void init_scoring else { /* keep track of max degree of remaining rows */ - max_deg = COLAMD_MAX (max_deg, deg) ; + max_deg = numext::maxi(max_deg, deg) ; } } COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ; @@ -838,7 +835,7 @@ static void init_scoring /* add row's external degree */ score += Row [row].shared1.degree - 1 ; /* guard against integer overflow */ - score = COLAMD_MIN (score, n_col) ; + score = numext::mini(score, n_col) ; } /* determine pruned column length */ col_length = (IndexType) (new_cp - &A [Col [c].start]) ; @@ -910,7 +907,7 @@ static void init_scoring head [score] = c ; /* see if this score is less than current min */ - min_score = COLAMD_MIN (min_score, score) ; + min_score = numext::mini(min_score, score) ; } @@ -1036,7 +1033,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Garbage_collection, if necessary ============================= */ - needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ; + needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; @@ -1095,7 +1092,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* clear tag on pivot column */ Col [pivot_col].shared1.thickness = pivot_col_thickness ; - max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ; + max_deg = numext::maxi(max_deg, pivot_row_degree) ; /* === Kill all rows used to construct pivot row ==================== */ @@ -1269,7 +1266,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* add set difference */ cur_score += row_mark - tag_mark ; /* integer overflow... 
*/ - cur_score = COLAMD_MIN (cur_score, n_col) ; + cur_score = numext::mini(cur_score, n_col) ; } /* recompute the column's length */ @@ -1382,7 +1379,7 @@ static IndexType find_ordering /* return the number of garbage collections */ cur_score -= Col [col].shared1.thickness ; /* make sure score is less or equal than the max score */ - cur_score = COLAMD_MIN (cur_score, max_score) ; + cur_score = numext::mini(cur_score, max_score) ; COLAMD_ASSERT (cur_score >= 0) ; /* store updated score */ @@ -1405,7 +1402,7 @@ static IndexType find_ordering /* return the number of garbage collections */ head [cur_score] = col ; /* see if this score is less than current min */ - min_score = COLAMD_MIN (min_score, cur_score) ; + min_score = numext::mini(min_score, cur_score) ; } From e1d219e5c9ea782550882aa8eb131b107f05105e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 14:25:34 +0100 Subject: [PATCH 093/102] bug #698: fix linspaced for integer types. --- Eigen/src/Core/functors/NullaryFunctors.h | 63 ++++++++++++++++++----- test/nullary.cpp | 31 +++++++---- 2 files changed, 70 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index cd9fbf267..71629af4c 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,7 +37,7 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; // linear access for packet ops: // 1) initialization @@ -48,12 +48,12 @@ template struct linspaced_ // TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp) // in order to avoid the padd() in operator() ? template -struct linspaced_op_impl +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& step) : - m_low(low), m_step(step), - m_packetStep(pset1(unpacket_traits::size*step)), - m_base(padd(pset1(low), pmul(pset1(step),plset(-unpacket_traits::size)))) {} + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + m_packetStep(pset1(unpacket_traits::size*m_step)), + m_base(padd(pset1(low), pmul(pset1(m_step),plset(-unpacket_traits::size)))) {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const @@ -75,11 +75,11 @@ struct linspaced_op_impl // 1) each step // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) template -struct linspaced_op_impl +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& step) : - m_low(low), m_step(step), - m_lowPacket(pset1(m_low)), m_stepPacket(pset1(m_step)), m_interPacket(plset(0)) {} + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_step(num_steps==1 ? 
Scalar() : (high-low)/Scalar(num_steps-1)), + m_lowPacket(pset1(m_low)), m_stepPacket(pset1(m_step)), m_interPacket(plset(0)) {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } @@ -95,6 +95,31 @@ struct linspaced_op_impl const Packet m_interPacket; }; +template +struct linspaced_op_impl +{ + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_length(high-low), m_numSteps(num_steps), m_interPacket(plset(0)) + {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar operator() (Index i) const { + return m_low + (m_length*Scalar(i))/(m_numSteps-1); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Packet packetOp(Index i) const { + return internal::padd(pset1(m_low), pdiv(pmul(pset1(m_length), padd(pset1(Scalar(i)),m_interPacket)), + pset1(m_numSteps-1))); } + + const Scalar m_low; + const Scalar m_length; + const Index m_numSteps; + const Packet m_interPacket; +}; + // ----- Linspace functor ---------------------------------------------------------------- // Forward declaration (we default to random access which does not really give @@ -102,10 +127,20 @@ struct linspaced_op_impl // nested expressions). template struct linspaced_op; template struct functor_traits< linspaced_op > -{ enum { Cost = 1, PacketAccess = packet_traits::HasSetLinear, IsRepeatable = true }; }; +{ + enum + { + Cost = 1, + PacketAccess = packet_traits::HasSetLinear + && ((!NumTraits::IsInteger) || packet_traits::HasDiv), + IsRepeatable = true + }; +}; template struct linspaced_op { - linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {} + linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + : impl((num_steps==1 ? high : low),high,num_steps) + {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); } @@ -134,7 +169,9 @@ template struct linspa // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the // correct piping to size 2/4 packet operations. - const linspaced_op_impl impl; + // As long as we don't have a Bresenham-like implementation for linear-access and integer types, + // we have to by-pass RandomAccess for integer types. See bug 698. + const linspaced_op_impl::IsInteger?true:RandomAccess),NumTraits::IsInteger> impl; }; // all functors allow linear access, except scalar_identity_op. So we fix here a quick meta diff --git a/test/nullary.cpp b/test/nullary.cpp index 4844f2952..8d65910eb 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -48,30 +48,32 @@ void testVectorType(const VectorType& base) VectorType m(base); m.setLinSpaced(size,low,high); + if(!NumTraits::IsInteger) + { + VectorType n(size); + for (int i=0; i::epsilon() ); - - // These guys sometimes fail! This is not good. Any ideas how to fix them!? - //VERIFY( m(m.size()-1) == high ); - //VERIFY( m(0) == low ); + VERIFY( internal::isApprox(m(m.size()-1),high) ); + VERIFY( size==1 || internal::isApprox(m(0),low) ); // sequential access version m = VectorType::LinSpaced(Sequential,size,low,high); VERIFY_IS_APPROX(m,n); - // These guys sometimes fail! This is not good. Any ideas how to fix them!? 
- //VERIFY( m(m.size()-1) == high ); - //VERIFY( m(0) == low ); + VERIFY( internal::isApprox(m(m.size()-1),high) ); + VERIFY( size==1 || internal::isApprox(m(0),low) ); // check whether everything works with row and col major vectors Matrix row_vector(size); @@ -126,5 +128,12 @@ void test_nullary() CALL_SUBTEST_8( testVectorType(Vector4f()) ); CALL_SUBTEST_8( testVectorType(Matrix()) ); CALL_SUBTEST_8( testVectorType(Matrix()) ); + + CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,300))) ); } + +#ifdef EIGEN_TEST_PART_6 + // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79). + VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits::epsilon() ); +#endif } From 6e0a86194ce6664e83d8035cbdd6047e5a27ed43 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 15:00:04 +0100 Subject: [PATCH 094/102] Fix integer path for num_steps==1 --- Eigen/src/Core/functors/NullaryFunctors.h | 8 ++++---- test/nullary.cpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index 71629af4c..c5836d048 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -99,24 +99,24 @@ template struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_length(high-low), m_numSteps(num_steps), m_interPacket(plset(0)) + m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset(0)) {} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { - return m_low + (m_length*Scalar(i))/(m_numSteps-1); + return m_low + (m_length*Scalar(i))/m_divisor; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return internal::padd(pset1(m_low), pdiv(pmul(pset1(m_length), padd(pset1(Scalar(i)),m_interPacket)), - pset1(m_numSteps-1))); } + pset1(m_divisor))); } const Scalar m_low; const Scalar m_length; - const Index m_numSteps; + const Index m_divisor; const Packet m_interPacket; }; diff --git a/test/nullary.cpp b/test/nullary.cpp index 8d65910eb..cb87695ee 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -130,6 +130,7 @@ void test_nullary() CALL_SUBTEST_8( testVectorType(Matrix()) ); CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,300))) ); + CALL_SUBTEST_9( testVectorType(Matrix()) ); } #ifdef EIGEN_TEST_PART_6 From ec469700dcf82fd9b5668fe3c82d9dac49d147df Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Feb 2016 15:04:33 +0100 Subject: [PATCH 095/102] bug #557: make InnerIterator of sparse storage types more versatile by adding default-ctor, copy-ctor/assignment --- Eigen/src/SparseCore/SparseCompressedBase.h | 21 +++++++++++++++- test/sparse_basic.cpp | 27 +++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index f78d7c24d..ea71b41d1 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -117,6 +117,24 @@ template class SparseCompressedBase::InnerIterator { public: + InnerIterator() + : m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0) + {} + + InnerIterator(const InnerIterator& other) + : m_values(other.m_values), m_indices(other.m_indices), m_outer(other.m_outer), m_id(other.m_id), m_end(other.m_end) + {} + + InnerIterator& 
operator=(const InnerIterator& other)
+    {
+      m_values = other.m_values;
+      m_indices = other.m_indices;
+      const_cast<OuterType&>(m_outer).setValue(other.m_outer.value());
+      m_id = other.m_id;
+      m_end = other.m_end;
+      return *this;
+    }
+
     InnerIterator(const SparseCompressedBase& mat, Index outer)
       : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
     {
@@ -162,7 +180,8 @@ class SparseCompressedBase::InnerIterator
   protected:
     const Scalar* m_values;
     const StorageIndex* m_indices;
-    const internal::variable_if_dynamic m_outer;
+    typedef internal::variable_if_dynamic OuterType;
+    const OuterType m_outer;
     Index m_id;
     Index m_end;
   private:
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 5a5650705..0a06c828b 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -460,6 +460,33 @@ template void sparse_basic(const SparseMatrixType& re
     refMat1.setIdentity();
     VERIFY_IS_APPROX(m1, refMat1);
   }
+
+  // test array/vector of InnerIterator
+  {
+    typedef typename SparseMatrixType::InnerIterator IteratorType;
+
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    IteratorType static_array[2];
+    static_array[0] = IteratorType(m2,0);
+    static_array[1] = IteratorType(m2,m2.outerSize()-1);
+    VERIFY( static_array[0] || m2.innerVector(static_array[0].outer()).nonZeros() == 0 );
+    VERIFY( static_array[1] || m2.innerVector(static_array[1].outer()).nonZeros() == 0 );
+    if(static_array[0] && static_array[1])
+    {
+      ++(static_array[1]);
+      static_array[1] = IteratorType(m2,0);
+      VERIFY( static_array[1] );
+      VERIFY( static_array[1].index() == static_array[0].index() );
+      VERIFY( static_array[1].outer() == static_array[0].outer() );
+      VERIFY( static_array[1].value() == static_array[0].value() );
+    }
+
+    std::vector<IteratorType> iters(2);
+    iters[0] = IteratorType(m2,0);
+    iters[1] = IteratorType(m2,m2.outerSize()-1);
+  }
 }

From ff1157bcbf1998c80d96612ed201b6a20db2de5f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Mon, 1 Feb 2016 16:09:34 +0100
Subject: [PATCH 096/102] bug #694: document that SparseQR::matrixR is not
 sorted.

---
 Eigen/src/SparseQR/SparseQR.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 0d448d02e..acd7f7e10 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -128,6 +128,17 @@ class SparseQR : public SparseSolverBase >
     inline Index cols() const { return m_pmat.cols();}

     /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
+      * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
+      *          expecting sorted entries will fail. This includes random coefficient accesses (SparseMatrix::coeff()),
+      *          and coefficient-wise operations. Matrix products and triangular solves are fine though.
+      *
+      * To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix
+      * is required, you can copy it again:
+      * \code
+      * SparseMatrix<double>          R  = qr.matrixR();  // column-major, not sorted!
+      * SparseMatrix<double,RowMajor> Rr = qr.matrixR();  // row-major, sorted
+      * SparseMatrix<double>          Rc = Rr;            // column-major, sorted
+      * \endcode
       */
     const QRMatrixType& matrixR() const { return m_R; }

From 11bb71c8fcd1fa0f663cedda707a36d40952eca9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 1 Feb 2016 07:34:59 -0800
Subject: [PATCH 097/102] Sharded the tensor device test

---
 unsupported/test/cxx11_tensor_device.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index ed5dd7505..80cf9ffba 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -383,6 +383,6 @@ static void test_gpu() {

 void test_cxx11_tensor_device()
 {
-  CALL_SUBTEST(test_cpu());
-  CALL_SUBTEST(test_gpu());
+  CALL_SUBTEST_1(test_cpu());
+  CALL_SUBTEST_2(test_gpu());
 }

From 264f8141f86e84312f0eea9e741d2260ed839890 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 1 Feb 2016 07:44:31 -0800
Subject: [PATCH 098/102] Sharded the tensor reduction test

---
 unsupported/test/cxx11_tensor_reduction_cuda.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index 417242586..cad0c08e0 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -54,6 +54,6 @@ static void test_full_reductions() {
 }

 void test_cxx11_tensor_reduction_cuda() {
-  CALL_SUBTEST(test_full_reductions());
-  CALL_SUBTEST(test_full_reductions());
+  CALL_SUBTEST_1(test_full_reductions());
+  CALL_SUBTEST_2(test_full_reductions());
 }

From 6b5dff875e4ba2235f255b7cf0a86b7abed21df0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 1 Feb 2016 12:46:32 -0800
Subject: [PATCH 099/102] Made it possible to limit the number of blocks that
 will be used to evaluate a tensor expression on a CUDA device. This makes it
 possible to set aside streaming multiprocessors for other computations.

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h       | 12 +++++++++---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 5abdc489b..e684ab8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
   // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
     eigen_assert(stream);
   }
-
   // TODO(bsteiner): This is an internal API, we should not expose it.
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
     return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
 #endif
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
  inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {

  private:
   const StreamInterface* stream_;
-
+  int max_blocks_;
 };

 #ifndef __CUDA_ARCH__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index d2ab70f2b..df15c6204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1);

From 922b5f527b56ba2fcf1e5a4da5216a29afdbb312 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 1 Feb 2016 13:30:49 -0800
Subject: [PATCH 100/102] Silenced a few compilation warnings

---
 unsupported/test/cxx11_tensor_device.cu | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index 80cf9ffba..cbe9e6449 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -109,19 +109,19 @@ struct GPUContext {

 // The actual expression to evaluate
 template
-static void test_contextual_eval(Context* context)
+void test_contextual_eval(Context* context)
 {
   context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
 }

 template
-static void test_forced_contextual_eval(Context* context)
+void test_forced_contextual_eval(Context* context)
 {
   context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
 }

 template
-static void test_compound_assignment(Context* context)
+void test_compound_assignment(Context* context)
 {
   context->out().device(context->device()) = context->in1().constant(2.718f);
   context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
 }

 template
-static void test_contraction(Context* context)
+void test_contraction(Context* context)
 {
   Eigen::array, 2> dims;
   dims[0] = std::make_pair(1, 1);
@@ -145,7 +145,7 @@ static void test_contraction(Context* context)

 template
-static void test_1d_convolution(Context* context)
+void test_1d_convolution(Context* context)
 {
   Eigen::DSizes indices(0,0,0);
Eigen::DSizes sizes(40,49,70); @@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context) } template -static void test_2d_convolution(Context* context) +void test_2d_convolution(Context* context) { Eigen::DSizes indices(0,0,0); Eigen::DSizes sizes(40,49,69); @@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context) } template -static void test_3d_convolution(Context* context) +void test_3d_convolution(Context* context) { Eigen::DSizes indices(0,0,0); Eigen::DSizes sizes(39,49,69); @@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context) } -static void test_cpu() { +void test_cpu() { Eigen::Tensor in1(40,50,70); Eigen::Tensor in2(40,50,70); Eigen::Tensor out(40,50,70); @@ -267,7 +267,7 @@ static void test_cpu() { } } -static void test_gpu() { +void test_gpu() { Eigen::Tensor in1(40,50,70); Eigen::Tensor in2(40,50,70); Eigen::Tensor out(40,50,70); From 0ce5d32be583c0a2592158ad59ce7ad11125d645 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 13:33:23 -0800 Subject: [PATCH 101/102] Sharded the cxx11_tensor_contract_cuda test --- .../test/cxx11_tensor_contract_cuda.cu | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index cbd902d6a..2c3cf64a9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -22,7 +22,7 @@ using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; template -static void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_cuda_contraction(int m_size, int k_size, int n_size) { std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is @@ -88,37 +88,39 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) void test_cxx11_tensor_cuda() { std::cout << "Calling contraction tests" << std::endl; - CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); - CALL_SUBTEST(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(128, k, 128)); - CALL_SUBTEST(test_cuda_contraction(128, k, 128)); + CALL_SUBTEST_2(test_cuda_contraction(128, k, 128)); + CALL_SUBTEST_3(test_cuda_contraction(128, k, 128)); } for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(128, 128, k)); - CALL_SUBTEST(test_cuda_contraction(128, 128, k)); + CALL_SUBTEST_4(test_cuda_contraction(128, 128, k)); + CALL_SUBTEST_5(test_cuda_contraction(128, 128, k)); } for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); - CALL_SUBTEST(test_cuda_contraction(k, 128, 128)); + CALL_SUBTEST_6(test_cuda_contraction(k, 128, 128)); + CALL_SUBTEST_7(test_cuda_contraction(k, 128, 128)); } - int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; + static const int m_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; + static const int n_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; + static const int k_sizes[] = {31, 39, 63, 
64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; - for (int i = 0; i <15; i++) - for (int j = 0; j < 15; j++) + for (int i = 0; i <15; i++) { + for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + CALL_SUBTEST_8(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); + CALL_SUBTEST_9(test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k])); } + } + } } From 64ce78c2ec52aa2fd2e408c7c4160b06e8fc1a03 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 Feb 2016 13:57:41 -0800 Subject: [PATCH 102/102] Cleaned up a tensor contraction test --- .../test/cxx11_tensor_contract_cuda.cu | 86 ++++++++++++------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index 2c3cf64a9..6d1ef07f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -24,14 +24,14 @@ typedef Tensor::DimensionPair DimPair; template void test_cuda_contraction(int m_size, int k_size, int n_size) { - std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor t_left(Eigen::array(m_size, k_size)); - Tensor t_right(Eigen::array(k_size, n_size)); - Tensor t_result(Eigen::array(m_size, n_size)); - Tensor t_result_gpu(Eigen::array(m_size, n_size)); + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result(m_size, n_size); + Tensor t_result_gpu(m_size, n_size); Eigen::array dims(DimPair(1, 0)); t_left.setRandom(); @@ -84,43 +84,69 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } - -void test_cxx11_tensor_cuda() -{ - std::cout << "Calling contraction tests" << std::endl; - CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); - CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); +template +void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST_2(test_cuda_contraction(128, k, 128)); - CALL_SUBTEST_3(test_cuda_contraction(128, k, 128)); + test_cuda_contraction(k, 128, 128); + test_cuda_contraction(k, 128, 128); } +} + +template +void test_cuda_contraction_k() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST_4(test_cuda_contraction(128, 128, k)); - CALL_SUBTEST_5(test_cuda_contraction(128, 128, k)); + test_cuda_contraction(128, k, 128); + test_cuda_contraction(128, k, 128); } +} + +template +void test_cuda_contraction_n() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST_6(test_cuda_contraction(k, 128, 128)); - CALL_SUBTEST_7(test_cuda_contraction(k, 128, 128)); + test_cuda_contraction(128, 128, k); + test_cuda_contraction(128, 128, k); } +} - static const int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - static const int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - static const int k_sizes[] = {31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; +template +void test_cuda_contraction_sizes() { + int m_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257 
, 511, 512, 513, 1023, 1024, 1025};
+
+  int n_sizes[] = { 31,  39,   63,  64,   65,
+                   127, 129,  255, 257,  511,
+                   512, 513, 1023, 1024, 1025};
+
+  int k_sizes[] = { 31,  39,   63,  64,   65,
+                    95,  96,  127, 129,  255,
+                   257, 511,  512, 513, 1023,
+                   1024, 1025};
+
+  for (int i = 0; i < 15; i++) {
+    for (int j = 0; j < 15; j++) {
+      for (int k = 0; k < 17; k++) {
+        test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+
+  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+
+  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+
+  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+
+  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+}
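
A note on patch 092 above: replacing the COLAMD_MIN/COLAMD_MAX macros with numext::mini/maxi also removes the double-evaluation hazard inherent to function-like macros. A minimal sketch of the difference; the noisy() helper and the demo() wrapper are illustrative only, not part of the patch:

    #include <Eigen/Core>

    static int calls = 0;
    static int noisy(int x) { ++calls; return x; }

    int demo() {
      calls = 0;
      // The old macro (((a) < (b)) ? (a) : (b)) expands its winning argument
      // twice, so noisy(3) would run twice; the function template evaluates
      // each argument exactly once.
      int m = Eigen::numext::mini(noisy(3), 7);  // m == 3, calls == 1
      return m;
    }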
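The integer path added in patch 093 and corrected in patch 094 evaluates entry i as low + (i * (high - low)) / divisor, with divisor = num_steps - 1 (or 1 when num_steps == 1), in exact integer arithmetic. A minimal sketch of the resulting behavior; the expected values in the comments follow from that formula rather than from a recorded test run:

    #include <iostream>
    #include <Eigen/Core>

    int main() {
      // 4 steps from 0 to 9: entries are 0 + (9*i)/3, so both endpoints are hit exactly.
      Eigen::VectorXi v = Eigen::VectorXi::LinSpaced(4, 0, 9);
      std::cout << v.transpose() << std::endl;   // expected: 0 3 6 9

      // num_steps == 1 no longer divides by zero; the single entry is `high`.
      Eigen::VectorXi w = Eigen::VectorXi::LinSpaced(1, 5, 7);
      std::cout << w << std::endl;               // expected: 7
      return 0;
    }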
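Patch 095 makes InnerIterator default-constructible and assignable, so iterators can now live in plain arrays and standard containers, as the new sparse_basic test exercises. A minimal usage sketch along the same lines; sumEndColumns() is a hypothetical helper, not part of the patch:

    #include <vector>
    #include <Eigen/SparseCore>

    double sumEndColumns(const Eigen::SparseMatrix<double>& m) {
      typedef Eigen::SparseMatrix<double>::InnerIterator Iter;
      std::vector<Iter> its(2);              // default-constructed: a compile error before patch 095
      its[0] = Iter(m, 0);
      its[1] = Iter(m, m.outerSize() - 1);
      double sum = 0;
      for (std::size_t k = 0; k < its.size(); ++k)
        for (Iter it = its[k]; it; ++it)     // copy-construct, then iterate as usual
          sum += it.value();
      return sum;
    }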
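Patch 099's second GpuDevice constructor caps the number of CUDA blocks a tensor expression may occupy. A minimal sketch of how it could be used, assuming compilation with nvcc, device-resident pointers, and the CudaStreamDevice wrapper from TensorDeviceCuda.h; the 16-block cap is an arbitrary illustrative value:

    #include <unsupported/Eigen/CXX11/Tensor>

    // out, a, b point to n floats on the device; s is an initialized stream.
    void scaledSum(float* out, float* a, float* b, int n, const cudaStream_t& s) {
      Eigen::CudaStreamDevice stream_device(&s);
      // Cap this expression at 16 blocks so the remaining streaming
      // multiprocessors stay free for concurrent kernels.
      Eigen::GpuDevice device(&stream_device, /*num_blocks=*/16);
      Eigen::TensorMap<Eigen::Tensor<float, 1> > t_out(out, n), t_a(a, n), t_b(b, n);
      t_out.device(device) = t_a + t_b * 3.14f;
    }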